diff --git a/downloader_utils.py b/downloader_utils.py index d718343..f2e2ba5 100644 --- a/downloader_utils.py +++ b/downloader_utils.py @@ -70,6 +70,7 @@ FOLDER_NAME_STOP_WORDS = { CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = { "poll", "cover", "fan-art", "fanart", "requests", "request", "holiday", + "batch", "open", "closed", "winner", "loser", # Added new words # Numbers 1-20 (as strings and words) "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", @@ -85,6 +86,24 @@ CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = { "fri", "friday", "sat", "saturday", "sun", "sunday" } +# New: Patterns to remove from titles/filenames *before* matching against Known.txt +KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [ + r'\bcum\b', + r'\bnsfw\b', + r'\bsfw\b', + r'\bweb\b', + r'\bhd\b', + r'\bhi\s*res\b', # hi res, hi-res, hires + r'\bhigh\s*res\b', # high res, high-res, highres + r'\b\d+p\b', # 720p, 1080p, 1440p etc. + r'\b\d+k\b', # 2k, 4k, 8k etc. + r'\[OC\]', # [OC] + r'\[Request(?:s)?\]', # [Request], [Requests] + r'\bCommission\b', + r'\bComm\b', + r'\bPreview\b', +] + def parse_cookie_string(cookie_string): """Parses a 'name=value; name2=value2' cookie string into a dict.""" cookies = {} @@ -192,10 +211,20 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords): Each name object in names_to_match is expected to be a dict: {'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]} """ - if not title or not names_to_match: return [] - title_lower = title.lower() + if not title or not names_to_match: + return [] + + # Pre-process the title to remove specific patterns before matching + cleaned_title_for_matching = title + for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS: + cleaned_title_for_matching = re.sub(pat_str, ' ', cleaned_title_for_matching, flags=re.IGNORECASE) # Replace with space + + # Condense multiple spaces that might result from removal and strip + cleaned_title_for_matching = re.sub(r'\s+', ' ', cleaned_title_for_matching).strip() + + title_lower = cleaned_title_for_matching.lower() # Use the pre-cleaned title for matching matched_cleaned_names = set() - sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) + sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) # Sort by primary name length for name_obj in sorted_name_objects: primary_folder_name = name_obj.get("name") aliases = name_obj.get("aliases", []) @@ -1240,40 +1269,85 @@ class PostProcessorWorker: base_folder_names_for_post_content = [cleaned_primary_folder_name] self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}") elif not current_character_filters: # No char filters defined, use generic logic - # 1. Try to match folder names from Known.txt using the post title - derived_folders_from_known_txt = match_folders_from_title( + # Stage 1: Try to match folder names from Known.txt using the post title + derived_folders_from_title_via_known_txt = match_folders_from_title( post_title, self.known_names, - effective_unwanted_keywords_for_folder_naming + effective_unwanted_keywords_for_folder_naming # Use full ignore list for Known.txt matching ) - # Filter out any "untitled_folder" that might come from Known.txt if the primary name was problematic, - # and also filter empty strings. - valid_derived_folders = [ - name for name in derived_folders_from_known_txt + valid_derived_folders_from_title_known_txt = [ + name for name in derived_folders_from_title_via_known_txt if name and name.strip() and name.lower() != "untitled_folder" ] - if valid_derived_folders: - base_folder_names_for_post_content.extend(valid_derived_folders) + if valid_derived_folders_from_title_known_txt: + base_folder_names_for_post_content.extend(valid_derived_folders_from_title_known_txt) self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}") else: - # 2. If no valid folders from Known.txt, fall back to extracting from title directly. - extracted_folder_name = extract_folder_name_from_title( + # Stage 2: No Known.txt match from title. + # Determine if the title primarily consists of creator-specific ignore words. + + # Get a candidate name from title using only generic FOLDER_NAME_STOP_WORDS. + candidate_name_from_title_basic_clean = extract_folder_name_from_title( post_title, - effective_unwanted_keywords_for_folder_naming + FOLDER_NAME_STOP_WORDS # Only generic stop words ) - base_folder_names_for_post_content.append(extracted_folder_name) - self.logger(f" Base folder name(s) for post content (Generic title parsing - no valid Known.txt match): {', '.join(base_folder_names_for_post_content)}") - # 3. Final cleanup: Ensure list is not empty and contains valid, non-empty strings. + title_is_only_creator_ignored_words = False + if candidate_name_from_title_basic_clean and \ + candidate_name_from_title_basic_clean.lower() != "untitled_folder" and \ + self.creator_download_folder_ignore_words: # Check if specific creator ignore list is active + + candidate_title_words = {word.lower() for word in candidate_name_from_title_basic_clean.split()} + if candidate_title_words and candidate_title_words.issubset(self.creator_download_folder_ignore_words): + title_is_only_creator_ignored_words = True + self.logger(f" Title-derived name '{candidate_name_from_title_basic_clean}' consists only of creator-specific ignore words.") + + if title_is_only_creator_ignored_words: + # Stage 3: Title is "bad". Try Known.txt match on filenames. + self.logger(f" Attempting Known.txt match on filenames as title was poor ('{candidate_name_from_title_basic_clean}').") + + filenames_to_check = [ + f_info['_original_name_for_log'] for f_info in all_files_from_post_api_for_char_check # Defined earlier in process() + if f_info.get('_original_name_for_log') + ] + + derived_folders_from_filenames_known_txt = set() + if filenames_to_check: + for fname in filenames_to_check: + matches = match_folders_from_title( + fname, + self.known_names, + effective_unwanted_keywords_for_folder_naming # Use full ignore list for matching + ) + for m in matches: + if m and m.strip() and m.lower() != "untitled_folder": + derived_folders_from_filenames_known_txt.add(m) + + if derived_folders_from_filenames_known_txt: + base_folder_names_for_post_content.extend(list(derived_folders_from_filenames_known_txt)) + self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Filenames): {', '.join(base_folder_names_for_post_content)}") + else: + final_title_extract = extract_folder_name_from_title( + post_title, effective_unwanted_keywords_for_folder_naming + ) + base_folder_names_for_post_content.append(final_title_extract) + self.logger(f" No Known.txt match from filenames. Using title-derived name (with full ignore list): '{final_title_extract}'") + else: + extracted_name_from_title_full_ignore = extract_folder_name_from_title( + post_title, effective_unwanted_keywords_for_folder_naming + ) + base_folder_names_for_post_content.append(extracted_name_from_title_full_ignore) + self.logger(f" Base folder name(s) for post content (Generic title parsing - title not solely creator-ignored words): {', '.join(base_folder_names_for_post_content)}") + base_folder_names_for_post_content = [ name for name in base_folder_names_for_post_content if name and name.strip() ] if not base_folder_names_for_post_content: final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content") base_folder_names_for_post_content = [final_fallback_name] - self.logger(f" Fallback folder name due to all derivations failing: {final_fallback_name}") + self.logger(f" Ultimate fallback folder name: {final_fallback_name}") if not self.extract_links_only and self.use_subfolders and self.skip_words_list: if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, [] for folder_name_to_check in base_folder_names_for_post_content: # type: ignore @@ -1442,10 +1516,10 @@ class PostProcessorWorker: if not files_to_download_info_list: self.logger(f" All files for post {post_id} were duplicate original names or skipped earlier.") return 0, total_skipped_this_post, [], [], [] - num_files_in_this_post_for_naming = len(files_to_download_info_list) - self.logger(f" Identified {num_files_in_this_post_for_naming} unique original file(s) for potential download from post {post_id}.") + + self.logger(f" Identified {len(files_to_download_info_list)} unique original file(s) for potential download from post {post_id}.") with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool: - futures_list = [] + futures_list = [] # type: list[Future] for file_idx, file_info_to_dl in enumerate(files_to_download_info_list): if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"): break if self.check_cancel(): break @@ -1504,8 +1578,11 @@ class PostProcessorWorker: self.logger(f" -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.") total_skipped_this_post += 1 continue - current_path_for_file = self.override_output_dir if self.override_output_dir else self.download_root # Use override if provided - if self.use_subfolders: + + # Determine the set of target base folder names for THIS file + target_base_folders_for_this_file_iteration = [] # type: list[str] + + if current_character_filters: # Active character filter from UI char_title_subfolder_name = None if self.target_post_id_from_initial_url and self.custom_folder_name: char_title_subfolder_name = self.custom_folder_name @@ -1513,35 +1590,43 @@ class PostProcessorWorker: char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"]) elif char_filter_that_matched_title: char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"]) - elif base_folder_names_for_post_content: - char_title_subfolder_name = base_folder_names_for_post_content[0] + elif char_filter_that_matched_comment: + char_title_subfolder_name = clean_folder_name(char_filter_that_matched_comment["name"]) if char_title_subfolder_name: - current_path_for_file = os.path.join(current_path_for_file, char_title_subfolder_name) - if self.use_post_subfolders: - cleaned_title_for_subfolder = clean_folder_name(post_title) - post_specific_subfolder_name = cleaned_title_for_subfolder # Use only the cleaned title - current_path_for_file = os.path.join(current_path_for_file, post_specific_subfolder_name) - target_folder_path_for_this_file = current_path_for_file - manga_date_counter_to_pass = None - manga_global_counter_to_pass = None - if self.manga_mode_active: - if self.manga_filename_style == STYLE_DATE_BASED: - manga_date_counter_to_pass = self.manga_date_file_counter_ref - elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING: - manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_global_file_counter_ref is not None else self.manga_date_file_counter_ref - futures_list.append(file_pool.submit( - self._download_single_file, - file_info_to_dl, - target_folder_path_for_this_file, - headers, - post_id, - self.skip_current_file_flag, - post_title=post_title, - manga_date_file_counter_ref=manga_date_counter_to_pass, - manga_global_file_counter_ref=manga_global_counter_to_pass, - file_index_in_post=file_idx, # Changed to keyword argument - num_files_in_this_post=num_files_in_this_post_for_naming # Changed to keyword argument - )) + target_base_folders_for_this_file_iteration.append(char_title_subfolder_name) + else: + self.logger(f"⚠️ File '{current_api_original_filename}' candidate by char filter, but no folder name derived. Using post title.") + target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title)) + else: # No active character filter, use base_folder_names_for_post_content (which could be multiple from Known.txt) + if base_folder_names_for_post_content: + target_base_folders_for_this_file_iteration.extend(base_folder_names_for_post_content) + else: # Fallback if base_folder_names_for_post_content was somehow empty + target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title)) + + if not target_base_folders_for_this_file_iteration: # Ultimate fallback + target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title if post_title else "Uncategorized_Post_Content")) + + for target_base_folder_name_for_instance in target_base_folders_for_this_file_iteration: + current_path_for_file_instance = self.override_output_dir if self.override_output_dir else self.download_root + if self.use_subfolders and target_base_folder_name_for_instance: + current_path_for_file_instance = os.path.join(current_path_for_file_instance, target_base_folder_name_for_instance) + if self.use_post_subfolders: + cleaned_title_for_subfolder_instance = clean_folder_name(post_title) + current_path_for_file_instance = os.path.join(current_path_for_file_instance, cleaned_title_for_subfolder_instance) + + manga_date_counter_to_pass = self.manga_date_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED else None + manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING else None + + futures_list.append(file_pool.submit( + self._download_single_file, + file_info=file_info_to_dl, # Pass the original file_info + target_folder_path=current_path_for_file_instance, + headers=headers, original_post_id_for_log=post_id, skip_event=self.skip_current_file_flag, + post_title=post_title, manga_date_file_counter_ref=manga_date_counter_to_pass, + manga_global_file_counter_ref=manga_global_counter_to_pass, + file_index_in_post=file_idx, num_files_in_this_post=len(files_to_download_info_list) + )) + for future in as_completed(futures_list): if self.check_cancel(): for f_to_cancel in futures_list: