Update downloader_utils.py

This commit is contained in:
Yuvi9587
2025-06-04 04:01:01 +01:00
parent 5a6474cb8a
commit bd46002684

View File

@@ -70,6 +70,7 @@ FOLDER_NAME_STOP_WORDS = {
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = { CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = {
"poll", "cover", "fan-art", "fanart", "requests", "request", "holiday", "poll", "cover", "fan-art", "fanart", "requests", "request", "holiday",
"batch", "open", "closed", "winner", "loser", # Added new words
# Numbers 1-20 (as strings and words) # Numbers 1-20 (as strings and words)
"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
@@ -85,6 +86,24 @@ CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = {
"fri", "friday", "sat", "saturday", "sun", "sunday" "fri", "friday", "sat", "saturday", "sun", "sunday"
} }
# Regex fragments stripped from titles/filenames *before* they are matched
# against Known.txt entries. Each pattern is applied with re.IGNORECASE and
# every hit is replaced by a single space, so the entries are written in any
# case and rely on \b (or literal brackets) to avoid eating parts of real names.
KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
    r'\bcum\b',
    r'\bnsfw\b',
    r'\bsfw\b',
    r'\bweb\b',
    r'\bhd\b',
    # The separator class accepts whitespace, '-' and '_' so that the
    # hyphenated spellings are really removed; a bare \s* only covers
    # "hi res" / "hires" and silently leaves "hi-res" in the title.
    r'\bhi[\s_-]*res\b',    # hi res, hi-res, hi_res, hires
    r'\bhigh[\s_-]*res\b',  # high res, high-res, high_res, highres
    r'\b\d+p\b',            # 720p, 1080p, 1440p etc.
    r'\b\d+k\b',            # 2k, 4k, 8k etc.
    r'\[OC\]',              # [OC]
    r'\[Requests?\]',       # [Request], [Requests]
    r'\bCommission\b',
    r'\bComm\b',
    r'\bPreview\b',
]
def parse_cookie_string(cookie_string): def parse_cookie_string(cookie_string):
"""Parses a 'name=value; name2=value2' cookie string into a dict.""" """Parses a 'name=value; name2=value2' cookie string into a dict."""
cookies = {} cookies = {}
@@ -192,10 +211,20 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
Each name object in names_to_match is expected to be a dict: Each name object in names_to_match is expected to be a dict:
{'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]} {'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]}
""" """
if not title or not names_to_match: return [] if not title or not names_to_match:
title_lower = title.lower() return []
# Pre-process the title to remove specific patterns before matching
cleaned_title_for_matching = title
for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
cleaned_title_for_matching = re.sub(pat_str, ' ', cleaned_title_for_matching, flags=re.IGNORECASE) # Replace with space
# Condense multiple spaces that might result from removal and strip
cleaned_title_for_matching = re.sub(r'\s+', ' ', cleaned_title_for_matching).strip()
title_lower = cleaned_title_for_matching.lower() # Use the pre-cleaned title for matching
matched_cleaned_names = set() matched_cleaned_names = set()
sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) # Sort by primary name length
for name_obj in sorted_name_objects: for name_obj in sorted_name_objects:
primary_folder_name = name_obj.get("name") primary_folder_name = name_obj.get("name")
aliases = name_obj.get("aliases", []) aliases = name_obj.get("aliases", [])
@@ -1240,40 +1269,85 @@ class PostProcessorWorker:
base_folder_names_for_post_content = [cleaned_primary_folder_name] base_folder_names_for_post_content = [cleaned_primary_folder_name]
self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}") self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
elif not current_character_filters: # No char filters defined, use generic logic elif not current_character_filters: # No char filters defined, use generic logic
# 1. Try to match folder names from Known.txt using the post title # Stage 1: Try to match folder names from Known.txt using the post title
derived_folders_from_known_txt = match_folders_from_title( derived_folders_from_title_via_known_txt = match_folders_from_title(
post_title, post_title,
self.known_names, self.known_names,
effective_unwanted_keywords_for_folder_naming effective_unwanted_keywords_for_folder_naming # Use full ignore list for Known.txt matching
) )
# Filter out any "untitled_folder" that might come from Known.txt if the primary name was problematic, valid_derived_folders_from_title_known_txt = [
# and also filter empty strings. name for name in derived_folders_from_title_via_known_txt
valid_derived_folders = [
name for name in derived_folders_from_known_txt
if name and name.strip() and name.lower() != "untitled_folder" if name and name.strip() and name.lower() != "untitled_folder"
] ]
if valid_derived_folders: if valid_derived_folders_from_title_known_txt:
base_folder_names_for_post_content.extend(valid_derived_folders) base_folder_names_for_post_content.extend(valid_derived_folders_from_title_known_txt)
self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}") self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}")
else: else:
# 2. If no valid folders from Known.txt, fall back to extracting from title directly. # Stage 2: No Known.txt match from title.
extracted_folder_name = extract_folder_name_from_title( # Determine if the title primarily consists of creator-specific ignore words.
post_title,
effective_unwanted_keywords_for_folder_naming # Get a candidate name from title using only generic FOLDER_NAME_STOP_WORDS.
) candidate_name_from_title_basic_clean = extract_folder_name_from_title(
base_folder_names_for_post_content.append(extracted_folder_name) post_title,
self.logger(f" Base folder name(s) for post content (Generic title parsing - no valid Known.txt match): {', '.join(base_folder_names_for_post_content)}") FOLDER_NAME_STOP_WORDS # Only generic stop words
)
title_is_only_creator_ignored_words = False
if candidate_name_from_title_basic_clean and \
candidate_name_from_title_basic_clean.lower() != "untitled_folder" and \
self.creator_download_folder_ignore_words: # Check if specific creator ignore list is active
candidate_title_words = {word.lower() for word in candidate_name_from_title_basic_clean.split()}
if candidate_title_words and candidate_title_words.issubset(self.creator_download_folder_ignore_words):
title_is_only_creator_ignored_words = True
self.logger(f" Title-derived name '{candidate_name_from_title_basic_clean}' consists only of creator-specific ignore words.")
if title_is_only_creator_ignored_words:
# Stage 3: Title is "bad". Try Known.txt match on filenames.
self.logger(f" Attempting Known.txt match on filenames as title was poor ('{candidate_name_from_title_basic_clean}').")
filenames_to_check = [
f_info['_original_name_for_log'] for f_info in all_files_from_post_api_for_char_check # Defined earlier in process()
if f_info.get('_original_name_for_log')
]
derived_folders_from_filenames_known_txt = set()
if filenames_to_check:
for fname in filenames_to_check:
matches = match_folders_from_title(
fname,
self.known_names,
effective_unwanted_keywords_for_folder_naming # Use full ignore list for matching
)
for m in matches:
if m and m.strip() and m.lower() != "untitled_folder":
derived_folders_from_filenames_known_txt.add(m)
if derived_folders_from_filenames_known_txt:
base_folder_names_for_post_content.extend(list(derived_folders_from_filenames_known_txt))
self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Filenames): {', '.join(base_folder_names_for_post_content)}")
else:
final_title_extract = extract_folder_name_from_title(
post_title, effective_unwanted_keywords_for_folder_naming
)
base_folder_names_for_post_content.append(final_title_extract)
self.logger(f" No Known.txt match from filenames. Using title-derived name (with full ignore list): '{final_title_extract}'")
else:
extracted_name_from_title_full_ignore = extract_folder_name_from_title(
post_title, effective_unwanted_keywords_for_folder_naming
)
base_folder_names_for_post_content.append(extracted_name_from_title_full_ignore)
self.logger(f" Base folder name(s) for post content (Generic title parsing - title not solely creator-ignored words): {', '.join(base_folder_names_for_post_content)}")
# 3. Final cleanup: Ensure list is not empty and contains valid, non-empty strings.
base_folder_names_for_post_content = [ base_folder_names_for_post_content = [
name for name in base_folder_names_for_post_content if name and name.strip() name for name in base_folder_names_for_post_content if name and name.strip()
] ]
if not base_folder_names_for_post_content: if not base_folder_names_for_post_content:
final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content") final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content")
base_folder_names_for_post_content = [final_fallback_name] base_folder_names_for_post_content = [final_fallback_name]
self.logger(f" Fallback folder name due to all derivations failing: {final_fallback_name}") self.logger(f" Ultimate fallback folder name: {final_fallback_name}")
if not self.extract_links_only and self.use_subfolders and self.skip_words_list: if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, [] if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, []
for folder_name_to_check in base_folder_names_for_post_content: # type: ignore for folder_name_to_check in base_folder_names_for_post_content: # type: ignore
@@ -1442,10 +1516,10 @@ class PostProcessorWorker:
if not files_to_download_info_list: if not files_to_download_info_list:
self.logger(f" All files for post {post_id} were duplicate original names or skipped earlier.") self.logger(f" All files for post {post_id} were duplicate original names or skipped earlier.")
return 0, total_skipped_this_post, [], [], [] return 0, total_skipped_this_post, [], [], []
num_files_in_this_post_for_naming = len(files_to_download_info_list)
self.logger(f" Identified {num_files_in_this_post_for_naming} unique original file(s) for potential download from post {post_id}.") self.logger(f" Identified {len(files_to_download_info_list)} unique original file(s) for potential download from post {post_id}.")
with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool: with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
futures_list = [] futures_list = [] # type: list[Future]
for file_idx, file_info_to_dl in enumerate(files_to_download_info_list): for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"): break if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"): break
if self.check_cancel(): break if self.check_cancel(): break
@@ -1504,8 +1578,11 @@ class PostProcessorWorker:
self.logger(f" -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.") self.logger(f" -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.")
total_skipped_this_post += 1 total_skipped_this_post += 1
continue continue
current_path_for_file = self.override_output_dir if self.override_output_dir else self.download_root # Use override if provided
if self.use_subfolders: # Determine the set of target base folder names for THIS file
target_base_folders_for_this_file_iteration = [] # type: list[str]
if current_character_filters: # Active character filter from UI
char_title_subfolder_name = None char_title_subfolder_name = None
if self.target_post_id_from_initial_url and self.custom_folder_name: if self.target_post_id_from_initial_url and self.custom_folder_name:
char_title_subfolder_name = self.custom_folder_name char_title_subfolder_name = self.custom_folder_name
@@ -1513,35 +1590,43 @@ class PostProcessorWorker:
char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"]) char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"])
elif char_filter_that_matched_title: elif char_filter_that_matched_title:
char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"]) char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"])
elif base_folder_names_for_post_content: elif char_filter_that_matched_comment:
char_title_subfolder_name = base_folder_names_for_post_content[0] char_title_subfolder_name = clean_folder_name(char_filter_that_matched_comment["name"])
if char_title_subfolder_name: if char_title_subfolder_name:
current_path_for_file = os.path.join(current_path_for_file, char_title_subfolder_name) target_base_folders_for_this_file_iteration.append(char_title_subfolder_name)
if self.use_post_subfolders: else:
cleaned_title_for_subfolder = clean_folder_name(post_title) self.logger(f"⚠️ File '{current_api_original_filename}' candidate by char filter, but no folder name derived. Using post title.")
post_specific_subfolder_name = cleaned_title_for_subfolder # Use only the cleaned title target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
current_path_for_file = os.path.join(current_path_for_file, post_specific_subfolder_name) else: # No active character filter, use base_folder_names_for_post_content (which could be multiple from Known.txt)
target_folder_path_for_this_file = current_path_for_file if base_folder_names_for_post_content:
manga_date_counter_to_pass = None target_base_folders_for_this_file_iteration.extend(base_folder_names_for_post_content)
manga_global_counter_to_pass = None else: # Fallback if base_folder_names_for_post_content was somehow empty
if self.manga_mode_active: target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
if self.manga_filename_style == STYLE_DATE_BASED:
manga_date_counter_to_pass = self.manga_date_file_counter_ref if not target_base_folders_for_this_file_iteration: # Ultimate fallback
elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING: target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title if post_title else "Uncategorized_Post_Content"))
manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_global_file_counter_ref is not None else self.manga_date_file_counter_ref
futures_list.append(file_pool.submit( for target_base_folder_name_for_instance in target_base_folders_for_this_file_iteration:
self._download_single_file, current_path_for_file_instance = self.override_output_dir if self.override_output_dir else self.download_root
file_info_to_dl, if self.use_subfolders and target_base_folder_name_for_instance:
target_folder_path_for_this_file, current_path_for_file_instance = os.path.join(current_path_for_file_instance, target_base_folder_name_for_instance)
headers, if self.use_post_subfolders:
post_id, cleaned_title_for_subfolder_instance = clean_folder_name(post_title)
self.skip_current_file_flag, current_path_for_file_instance = os.path.join(current_path_for_file_instance, cleaned_title_for_subfolder_instance)
post_title=post_title,
manga_date_file_counter_ref=manga_date_counter_to_pass, manga_date_counter_to_pass = self.manga_date_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED else None
manga_global_file_counter_ref=manga_global_counter_to_pass, manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING else None
file_index_in_post=file_idx, # Changed to keyword argument
num_files_in_this_post=num_files_in_this_post_for_naming # Changed to keyword argument futures_list.append(file_pool.submit(
)) self._download_single_file,
file_info=file_info_to_dl, # Pass the original file_info
target_folder_path=current_path_for_file_instance,
headers=headers, original_post_id_for_log=post_id, skip_event=self.skip_current_file_flag,
post_title=post_title, manga_date_file_counter_ref=manga_date_counter_to_pass,
manga_global_file_counter_ref=manga_global_counter_to_pass,
file_index_in_post=file_idx, num_files_in_this_post=len(files_to_download_info_list)
))
for future in as_completed(futures_list): for future in as_completed(futures_list):
if self.check_cancel(): if self.check_cancel():
for f_to_cancel in futures_list: for f_to_cancel in futures_list: