Update downloader_utils.py

2025-12-29 16:14:44 +00:00 · 2025-06-04 04:01:01 +01:00
parent 5a6474cb8a
commit bd46002684
1 changed files with 137 additions and 52 deletions
--- a/downloader_utils.py
+++ b/downloader_utils.py
@@ -70,6 +70,7 @@ FOLDER_NAME_STOP_WORDS = {
 CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = {
    "poll", "cover", "fan-art", "fanart", "requests", "request", "holiday",
    "batch", "open", "closed", "winner", "loser", # Added new words
    # Numbers 1-20 (as strings and words)
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
@@ -85,6 +86,24 @@ CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = {
    "fri", "friday", "sat", "saturday", "sun", "sunday"
 }
 # New: Patterns to remove from titles/filenames *before* matching against Known.txt
 KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
    r'\bcum\b',
    r'\bnsfw\b',
    r'\bsfw\b',
    r'\bweb\b',
    r'\bhd\b',
    r'\bhi\s*res\b',      # hi res, hi-res, hires
    r'\bhigh\s*res\b',    # high res, high-res, highres
    r'\b\d+p\b',          # 720p, 1080p, 1440p etc.
    r'\b\d+k\b',          # 2k, 4k, 8k etc.
    r'\[OC\]',            # [OC]
    r'\[Request(?:s)?\]', # [Request], [Requests]
    r'\bCommission\b',
    r'\bComm\b',
    r'\bPreview\b',
 ]
 def parse_cookie_string(cookie_string):
    """Parses a 'name=value; name2=value2' cookie string into a dict."""
    cookies = {}
@@ -192,10 +211,20 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
    Each name object in names_to_match is expected to be a dict:
    {'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]}
    """    
-    if not title or not names_to_match: return []
+    if not title or not names_to_match: 
-    title_lower = title.lower()
+        return []
    # Pre-process the title to remove specific patterns before matching
    cleaned_title_for_matching = title
    for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
        cleaned_title_for_matching = re.sub(pat_str, ' ', cleaned_title_for_matching, flags=re.IGNORECASE) # Replace with space
    # Condense multiple spaces that might result from removal and strip
    cleaned_title_for_matching = re.sub(r'\s+', ' ', cleaned_title_for_matching).strip()
    title_lower = cleaned_title_for_matching.lower() # Use the pre-cleaned title for matching
    matched_cleaned_names = set()
-    sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True)
+    sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) # Sort by primary name length
    for name_obj in sorted_name_objects:
        primary_folder_name = name_obj.get("name")
        aliases = name_obj.get("aliases", [])
@@ -1240,40 +1269,85 @@ class PostProcessorWorker:
                    base_folder_names_for_post_content = [cleaned_primary_folder_name]          
                self.logger(f"   Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
            elif not current_character_filters: # No char filters defined, use generic logic
-                # 1. Try to match folder names from Known.txt using the post title
+                # Stage 1: Try to match folder names from Known.txt using the post title
-                derived_folders_from_known_txt = match_folders_from_title(
+                derived_folders_from_title_via_known_txt = match_folders_from_title(
                    post_title,
                    self.known_names,
-                    effective_unwanted_keywords_for_folder_naming
+                    effective_unwanted_keywords_for_folder_naming # Use full ignore list for Known.txt matching
                )
-                # Filter out any "untitled_folder" that might come from Known.txt if the primary name was problematic,
+                valid_derived_folders_from_title_known_txt = [
-                # and also filter empty strings.
+                    name for name in derived_folders_from_title_via_known_txt
                valid_derived_folders = [
                    name for name in derived_folders_from_known_txt
                    if name and name.strip() and name.lower() != "untitled_folder"
                ]
-                if valid_derived_folders:
+                if valid_derived_folders_from_title_known_txt:
-                    base_folder_names_for_post_content.extend(valid_derived_folders)
+                    base_folder_names_for_post_content.extend(valid_derived_folders_from_title_known_txt)
                    self.logger(f"   Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}")
                else:
-                    # 2. If no valid folders from Known.txt, fall back to extracting from title directly.
+                    # Stage 2: No Known.txt match from title.
-                    extracted_folder_name = extract_folder_name_from_title(
+                    # Determine if the title primarily consists of creator-specific ignore words.
-                        post_title,
+                    
-                        effective_unwanted_keywords_for_folder_naming
+                    # Get a candidate name from title using only generic FOLDER_NAME_STOP_WORDS.
-                    )
+                    candidate_name_from_title_basic_clean = extract_folder_name_from_title(
-                    base_folder_names_for_post_content.append(extracted_folder_name)
+                        post_title,
-                    self.logger(f"   Base folder name(s) for post content (Generic title parsing - no valid Known.txt match): {', '.join(base_folder_names_for_post_content)}")
+                        FOLDER_NAME_STOP_WORDS # Only generic stop words
                    )
                    title_is_only_creator_ignored_words = False
                    if candidate_name_from_title_basic_clean and \
                       candidate_name_from_title_basic_clean.lower() != "untitled_folder" and \
                       self.creator_download_folder_ignore_words: # Check if specific creator ignore list is active
                        candidate_title_words = {word.lower() for word in candidate_name_from_title_basic_clean.split()}
                        if candidate_title_words and candidate_title_words.issubset(self.creator_download_folder_ignore_words):
                            title_is_only_creator_ignored_words = True
                            self.logger(f"   Title-derived name '{candidate_name_from_title_basic_clean}' consists only of creator-specific ignore words.")
                    if title_is_only_creator_ignored_words:
                        # Stage 3: Title is "bad". Try Known.txt match on filenames.
                        self.logger(f"   Attempting Known.txt match on filenames as title was poor ('{candidate_name_from_title_basic_clean}').")
                        filenames_to_check = [
                            f_info['_original_name_for_log'] for f_info in all_files_from_post_api_for_char_check # Defined earlier in process()
                            if f_info.get('_original_name_for_log')
                        ]
                        derived_folders_from_filenames_known_txt = set()
                        if filenames_to_check:
                            for fname in filenames_to_check:
                                matches = match_folders_from_title(
                                    fname, 
                                    self.known_names, 
                                    effective_unwanted_keywords_for_folder_naming # Use full ignore list for matching
                                )
                                for m in matches:
                                    if m and m.strip() and m.lower() != "untitled_folder":
                                        derived_folders_from_filenames_known_txt.add(m)
                        if derived_folders_from_filenames_known_txt:
                            base_folder_names_for_post_content.extend(list(derived_folders_from_filenames_known_txt))
                            self.logger(f"   Base folder name(s) for post content (Derived from Known.txt & Filenames): {', '.join(base_folder_names_for_post_content)}")
                        else:
                            final_title_extract = extract_folder_name_from_title(
                                post_title, effective_unwanted_keywords_for_folder_naming
                            )
                            base_folder_names_for_post_content.append(final_title_extract)
                            self.logger(f"   No Known.txt match from filenames. Using title-derived name (with full ignore list): '{final_title_extract}'")
                    else:
                        extracted_name_from_title_full_ignore = extract_folder_name_from_title(
                            post_title, effective_unwanted_keywords_for_folder_naming
                        )
                        base_folder_names_for_post_content.append(extracted_name_from_title_full_ignore)
                        self.logger(f"   Base folder name(s) for post content (Generic title parsing - title not solely creator-ignored words): {', '.join(base_folder_names_for_post_content)}")
                # 3. Final cleanup: Ensure list is not empty and contains valid, non-empty strings.
                base_folder_names_for_post_content = [
                    name for name in base_folder_names_for_post_content if name and name.strip()
                ]
                if not base_folder_names_for_post_content:
                    final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content")
                    base_folder_names_for_post_content = [final_fallback_name]
-                    self.logger(f"   Fallback folder name due to all derivations failing: {final_fallback_name}")
+                    self.logger(f"   Ultimate fallback folder name: {final_fallback_name}")
        if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
            if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, []
            for folder_name_to_check in base_folder_names_for_post_content: # type: ignore
@@ -1442,10 +1516,10 @@ class PostProcessorWorker:
        if not files_to_download_info_list:
            self.logger(f"   All files for post {post_id} were duplicate original names or skipped earlier.")
            return 0, total_skipped_this_post, [], [], []
-        num_files_in_this_post_for_naming = len(files_to_download_info_list)
+        
-        self.logger(f"   Identified {num_files_in_this_post_for_naming} unique original file(s) for potential download from post {post_id}.")
+        self.logger(f"   Identified {len(files_to_download_info_list)} unique original file(s) for potential download from post {post_id}.")
        with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
-            futures_list = []
+            futures_list = [] # type: list[Future]
            for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
                if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"): break
                if self.check_cancel(): break
@@ -1504,8 +1578,11 @@ class PostProcessorWorker:
                    self.logger(f"   -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.")
                    total_skipped_this_post += 1
                    continue
-                current_path_for_file = self.override_output_dir if self.override_output_dir else self.download_root # Use override if provided
+
-                if self.use_subfolders:
+                # Determine the set of target base folder names for THIS file
                target_base_folders_for_this_file_iteration = [] # type: list[str]
                if current_character_filters: # Active character filter from UI
                    char_title_subfolder_name = None
                    if self.target_post_id_from_initial_url and self.custom_folder_name:
                        char_title_subfolder_name = self.custom_folder_name
@@ -1513,35 +1590,43 @@ class PostProcessorWorker:
                        char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"])
                    elif char_filter_that_matched_title: 
                        char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"])
-                    elif base_folder_names_for_post_content:
+                    elif char_filter_that_matched_comment:
-                        char_title_subfolder_name = base_folder_names_for_post_content[0]
+                         char_title_subfolder_name = clean_folder_name(char_filter_that_matched_comment["name"])
                    if char_title_subfolder_name:
-                        current_path_for_file = os.path.join(current_path_for_file, char_title_subfolder_name)
+                        target_base_folders_for_this_file_iteration.append(char_title_subfolder_name)
-                if self.use_post_subfolders:
+                    else:
-                    cleaned_title_for_subfolder = clean_folder_name(post_title)
+                        self.logger(f"⚠️ File '{current_api_original_filename}' candidate by char filter, but no folder name derived. Using post title.")
-                    post_specific_subfolder_name = cleaned_title_for_subfolder # Use only the cleaned title
+                        target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
-                    current_path_for_file = os.path.join(current_path_for_file, post_specific_subfolder_name)
+                else: # No active character filter, use base_folder_names_for_post_content (which could be multiple from Known.txt)
-                target_folder_path_for_this_file = current_path_for_file
+                    if base_folder_names_for_post_content:
-                manga_date_counter_to_pass = None
+                        target_base_folders_for_this_file_iteration.extend(base_folder_names_for_post_content)
-                manga_global_counter_to_pass = None
+                    else: # Fallback if base_folder_names_for_post_content was somehow empty
-                if self.manga_mode_active:
+                        target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
-                    if self.manga_filename_style == STYLE_DATE_BASED:
+
-                        manga_date_counter_to_pass = self.manga_date_file_counter_ref
+                if not target_base_folders_for_this_file_iteration: # Ultimate fallback
-                    elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING:
+                    target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title if post_title else "Uncategorized_Post_Content"))
-                        manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_global_file_counter_ref is not None else self.manga_date_file_counter_ref
+
-                futures_list.append(file_pool.submit(
+                for target_base_folder_name_for_instance in target_base_folders_for_this_file_iteration:
-                    self._download_single_file,
+                    current_path_for_file_instance = self.override_output_dir if self.override_output_dir else self.download_root
-                    file_info_to_dl,
+                    if self.use_subfolders and target_base_folder_name_for_instance:
-                    target_folder_path_for_this_file,
+                        current_path_for_file_instance = os.path.join(current_path_for_file_instance, target_base_folder_name_for_instance)
-                    headers,
+                    if self.use_post_subfolders:
-                    post_id,
+                        cleaned_title_for_subfolder_instance = clean_folder_name(post_title)
-                    self.skip_current_file_flag,
+                        current_path_for_file_instance = os.path.join(current_path_for_file_instance, cleaned_title_for_subfolder_instance)
-                    post_title=post_title,
+                    
-                    manga_date_file_counter_ref=manga_date_counter_to_pass,
+                    manga_date_counter_to_pass = self.manga_date_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED else None
-                    manga_global_file_counter_ref=manga_global_counter_to_pass,
+                    manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING else None
-                    file_index_in_post=file_idx, # Changed to keyword argument
+
-                    num_files_in_this_post=num_files_in_this_post_for_naming # Changed to keyword argument
+                    futures_list.append(file_pool.submit(
-                ))
+                        self._download_single_file,
                        file_info=file_info_to_dl, # Pass the original file_info
                        target_folder_path=current_path_for_file_instance,
                        headers=headers, original_post_id_for_log=post_id, skip_event=self.skip_current_file_flag,
                        post_title=post_title, manga_date_file_counter_ref=manga_date_counter_to_pass,
                        manga_global_file_counter_ref=manga_global_counter_to_pass,
                        file_index_in_post=file_idx, num_files_in_this_post=len(files_to_download_info_list)
                    ))
            for future in as_completed(futures_list):
                if self.check_cancel():
                    for f_to_cancel in futures_list: