Commit

2025-12-29 16:14:44 +00:00 · 2025-11-01 10:41:00 +05:30
parent 169ded3fd8
commit 9563ce82db
2 changed files with 99 additions and 13 deletions
--- a/src/core/workers.py
+++ b/src/core/workers.py
@@ -1810,6 +1810,31 @@ class PostProcessorWorker:

            if not all_files_from_post_api:
                self.logger(f"   No files found to download for post {post_id}.")
+                if not self.extract_links_only and should_create_post_subfolder:
+                    path_to_check_for_emptiness = determined_post_save_path_for_history
+                    try:
+                        if os.path.isdir(path_to_check_for_emptiness):
+                            dir_contents = os.listdir(path_to_check_for_emptiness)
+                            # Check if the directory is empty OR only contains our ID file
+                            is_effectively_empty = True
+                            if dir_contents:
+                                if not all(f.startswith('.postid_') for f in dir_contents):
+                                    is_effectively_empty = False
+                            
+                            if is_effectively_empty:
+                                self.logger(f"   🗑️ Removing empty post-specific subfolder (post had no files): '{path_to_check_for_emptiness}'")
+                                if dir_contents:
+                                    for id_file in dir_contents:
+                                        if id_file.startswith('.postid_'):
+                                            try:
+                                                os.remove(os.path.join(path_to_check_for_emptiness, id_file))
+                                            except OSError as e_rm_id:
+                                                self.logger(f"   ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}")
+                                os.rmdir(path_to_check_for_emptiness)
+                    except OSError as e_rmdir:
+                        self.logger(f"   ⚠️ Could not remove effectively empty subfolder (no files) '{path_to_check_for_emptiness}': {e_rmdir}")
+                # --- END NEW CLEANUP LOGIC ---
+
                history_data_for_no_files_post = {
                    'post_title': post_title,
                    'post_id': post_id,
@@ -1823,7 +1848,7 @@ class PostProcessorWorker:
                result_tuple = (0, 0, [], [], [], history_data_for_no_files_post, None)
                self._emit_signal('worker_finished', result_tuple)
                return result_tuple
-
+                
            files_to_download_info_list = []
            processed_original_filenames_in_this_post = set()
            if self.keep_in_post_duplicates:
@@ -2052,9 +2077,27 @@ class PostProcessorWorker:
            if not self.extract_links_only and self.use_post_subfolders and total_downloaded_this_post == 0:
                path_to_check_for_emptiness = determined_post_save_path_for_history
                try:
-                    if os.path.isdir(path_to_check_for_emptiness) and not os.listdir(path_to_check_for_emptiness):
-                        self.logger(f"   🗑️ Removing empty post-specific subfolder: '{path_to_check_for_emptiness}'")
-                        os.rmdir(path_to_check_for_emptiness)
+                    if os.path.isdir(path_to_check_for_emptiness):
+                        dir_contents = os.listdir(path_to_check_for_emptiness)
+                        # Check if the directory is empty OR only contains our ID file
+                        is_effectively_empty = True
+                        if dir_contents:
+                            # If there are files, check if ALL of them are .postid files
+                            if not all(f.startswith('.postid_') for f in dir_contents):
+                                is_effectively_empty = False
+                        
+                        if is_effectively_empty:
+                            self.logger(f"   🗑️ Removing empty post-specific subfolder (no files downloaded): '{path_to_check_for_emptiness}'")
+                            # We must first remove the ID file(s) before removing the dir
+                            if dir_contents:
+                                for id_file in dir_contents:
+                                    if id_file.startswith('.postid_'):
+                                        try:
+                                            os.remove(os.path.join(path_to_check_for_emptiness, id_file))
+                                        except OSError as e_rm_id:
+                                            self.logger(f"   ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}")
+                            
+                            os.rmdir(path_to_check_for_emptiness) # Now the rmdir should work
                except OSError as e_rmdir:
                    self.logger(f"   ⚠️ Could not remove empty post-specific subfolder '{path_to_check_for_emptiness}': {e_rmdir}")

@@ -2066,11 +2109,29 @@ class PostProcessorWorker:
            if not self.extract_links_only and self.use_post_subfolders and total_downloaded_this_post == 0:
                path_to_check_for_emptiness = determined_post_save_path_for_history
                try:
-                    if os.path.isdir(path_to_check_for_emptiness) and not os.listdir(path_to_check_for_emptiness):
-                        self.logger(f"   🗑️ Removing empty post-specific subfolder: '{path_to_check_for_emptiness}'")
-                        os.rmdir(path_to_check_for_emptiness)
+                    if os.path.isdir(path_to_check_for_emptiness):
+                        dir_contents = os.listdir(path_to_check_for_emptiness)
+                        # Check if the directory is empty OR only contains our ID file
+                        is_effectively_empty = True
+                        if dir_contents:
+                            # If there are files, check if ALL of them are .postid files
+                            if not all(f.startswith('.postid_') for f in dir_contents):
+                                is_effectively_empty = False
+                        
+                        if is_effectively_empty:
+                            self.logger(f"   🗑️ Removing empty post-specific subfolder (no files downloaded): '{path_to_check_for_emptiness}'")
+                            # We must first remove the ID file(s) before removing the dir
+                            if dir_contents:
+                                for id_file in dir_contents:
+                                    if id_file.startswith('.postid_'):
+                                        try:
+                                            os.remove(os.path.join(path_to_check_for_emptiness, id_file))
+                                        except OSError as e_rm_id:
+                                            self.logger(f"   ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}")
+                            
+                            os.rmdir(path_to_check_for_emptiness) # Now the rmdir should work
                except OSError as e_rmdir:
-                    self.logger(f"   ⚠️ Could not remove potentially empty subfolder '{path_to_check_for_emptiness}': {e_rmdir}")
+                    self.logger(f"   ⚠️ Could not remove empty post-specific subfolder '{path_to_check_for_emptiness}': {e_rmdir}")
            
            self._emit_signal('worker_finished', result_tuple)
            return result_tuple
--- a/src/utils/text_utils.py
+++ b/src/utils/text_utils.py
@@ -205,6 +205,8 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw
    """
    Matches folder names from a filename, prioritizing longer and more specific aliases.
    It returns immediately after finding the first (longest) match.
+    Aliases are matched boundary-aware first (delimited by string start/end, whitespace, '_', '+' or '-');
+    a plain substring match is only a fallback, used when contains_cjk() is true for the alias or the filename.

    Args:
        filename (str): The filename to check.
@@ -230,10 +232,9 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw
            continue

        for alias in name_obj.get("aliases", []):
-            # <<< MODIFICATION: Ensure alias is not empty before converting to lower case >>>
            if alias: # Check if alias is not None and not an empty string
                alias_lower_val = alias.lower()
-                if alias_lower_val: # Check again after lowercasing (handles case where alias might be just spaces)
+                if alias_lower_val: # Check again after lowercasing
                    alias_map_to_primary.append((alias_lower_val, cleaned_primary_name))

    # Sort by alias length, descending, to match longer aliases first
@@ -241,9 +242,33 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw

    # Return the FIRST match found, which will be the longest
    for alias_lower, primary_name_for_alias in alias_map_to_primary:
-        if alias_lower in filename_lower:
-            # Found the longest possible alias that is a substring. Return immediately.
-            return [primary_name_for_alias]
+        try:
+            # 1. Attempt boundary-aware match first (good for English/Latin)
+            # Matches alias if it's at the start/end or surrounded by common separators
+            # No \b is used: boundaries are string start/end or one of whitespace, '_', '+', '-' (note: '.' is not a separator here)
+            pattern = r'(?:^|[\s_+-])' + re.escape(alias_lower) + r'(?:[\s_+-]|$)'
+            
+            if re.search(pattern, filename_lower):
+                # Found a precise, boundary-aware match. This is the best case.
+                return [primary_name_for_alias]
+
+            # 2. Fallback: Simple substring check (for CJK or other cases)
+            # This executes ONLY if the boundary match above failed.
+            # We check if the alias contains CJK OR if the filename does.
+            # This avoids applying the simple 'in' check for Latin-only aliases in Latin-only filenames.
+            elif (contains_cjk(alias_lower) or contains_cjk(filename_lower)) and alias_lower in filename_lower:
+                # This is the fallback for CJK compatibility.
+                return [primary_name_for_alias]
+                
+            # If alias is "ul" and filename is "sin+título":
+            # 1. re.search(r'(?:^|[\s_+-])ul(?:[\s_+-]|$)', "sin+título") -> Fails (good)
+            # 2. contains_cjk("ul") -> False
+            # 3. contains_cjk("sin+título") -> False
+            # 4. No match is found for "ul". (correct)
+
+        except re.error as e:
+            print(f"Regex error matching alias '{alias_lower}' in filename '{filename_lower}': {e}")
+            continue # Skip this alias if regex fails

    # If the loop finishes without any matches, return an empty list.
    return []