diff --git a/src/core/workers.py b/src/core/workers.py index 90edb9c..2d122c2 100644 --- a/src/core/workers.py +++ b/src/core/workers.py @@ -1810,6 +1810,31 @@ class PostProcessorWorker: if not all_files_from_post_api: self.logger(f" No files found to download for post {post_id}.") + if not self.extract_links_only and should_create_post_subfolder: + path_to_check_for_emptiness = determined_post_save_path_for_history + try: + if os.path.isdir(path_to_check_for_emptiness): + dir_contents = os.listdir(path_to_check_for_emptiness) + # Check if the directory is empty OR only contains our ID file + is_effectively_empty = True + if dir_contents: + if not all(f.startswith('.postid_') for f in dir_contents): + is_effectively_empty = False + + if is_effectively_empty: + self.logger(f" 🗑️ Removing empty post-specific subfolder (post had no files): '{path_to_check_for_emptiness}'") + if dir_contents: + for id_file in dir_contents: + if id_file.startswith('.postid_'): + try: + os.remove(os.path.join(path_to_check_for_emptiness, id_file)) + except OSError as e_rm_id: + self.logger(f" ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}") + os.rmdir(path_to_check_for_emptiness) + except OSError as e_rmdir: + self.logger(f" ⚠️ Could not remove effectively empty subfolder (no files) '{path_to_check_for_emptiness}': {e_rmdir}") + # --- END NEW CLEANUP LOGIC --- + history_data_for_no_files_post = { 'post_title': post_title, 'post_id': post_id, @@ -1823,7 +1848,7 @@ class PostProcessorWorker: result_tuple = (0, 0, [], [], [], history_data_for_no_files_post, None) self._emit_signal('worker_finished', result_tuple) return result_tuple - + files_to_download_info_list = [] processed_original_filenames_in_this_post = set() if self.keep_in_post_duplicates: @@ -2052,9 +2077,27 @@ class PostProcessorWorker: if not self.extract_links_only and self.use_post_subfolders and total_downloaded_this_post == 0: path_to_check_for_emptiness = determined_post_save_path_for_history try: - if 
os.path.isdir(path_to_check_for_emptiness) and not os.listdir(path_to_check_for_emptiness): - self.logger(f" 🗑️ Removing empty post-specific subfolder: '{path_to_check_for_emptiness}'") - os.rmdir(path_to_check_for_emptiness) + if os.path.isdir(path_to_check_for_emptiness): + dir_contents = os.listdir(path_to_check_for_emptiness) + # Check if the directory is empty OR only contains our ID file + is_effectively_empty = True + if dir_contents: + # If there are files, check if ALL of them are .postid files + if not all(f.startswith('.postid_') for f in dir_contents): + is_effectively_empty = False + + if is_effectively_empty: + self.logger(f" 🗑️ Removing empty post-specific subfolder (no files downloaded): '{path_to_check_for_emptiness}'") + # We must first remove the ID file(s) before removing the dir + if dir_contents: + for id_file in dir_contents: + if id_file.startswith('.postid_'): + try: + os.remove(os.path.join(path_to_check_for_emptiness, id_file)) + except OSError as e_rm_id: + self.logger(f" ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}") + + os.rmdir(path_to_check_for_emptiness) # Now the rmdir should work except OSError as e_rmdir: self.logger(f" ⚠️ Could not remove empty post-specific subfolder '{path_to_check_for_emptiness}': {e_rmdir}") @@ -2066,11 +2109,29 @@ class PostProcessorWorker: if not self.extract_links_only and self.use_post_subfolders and total_downloaded_this_post == 0: path_to_check_for_emptiness = determined_post_save_path_for_history try: - if os.path.isdir(path_to_check_for_emptiness) and not os.listdir(path_to_check_for_emptiness): - self.logger(f" 🗑️ Removing empty post-specific subfolder: '{path_to_check_for_emptiness}'") - os.rmdir(path_to_check_for_emptiness) + if os.path.isdir(path_to_check_for_emptiness): + dir_contents = os.listdir(path_to_check_for_emptiness) + # Check if the directory is empty OR only contains our ID file + is_effectively_empty = True + if dir_contents: + # If there are files, check if ALL 
of them are .postid files + if not all(f.startswith('.postid_') for f in dir_contents): + is_effectively_empty = False + + if is_effectively_empty: + self.logger(f" 🗑️ Removing empty post-specific subfolder (no files downloaded): '{path_to_check_for_emptiness}'") + # We must first remove the ID file(s) before removing the dir + if dir_contents: + for id_file in dir_contents: + if id_file.startswith('.postid_'): + try: + os.remove(os.path.join(path_to_check_for_emptiness, id_file)) + except OSError as e_rm_id: + self.logger(f" ⚠️ Could not remove ID file '{id_file}' during cleanup: {e_rm_id}") + + os.rmdir(path_to_check_for_emptiness) # Now the rmdir should work except OSError as e_rmdir: - self.logger(f" ⚠️ Could not remove potentially empty subfolder '{path_to_check_for_emptiness}': {e_rmdir}") + self.logger(f" ⚠️ Could not remove empty post-specific subfolder '{path_to_check_for_emptiness}': {e_rmdir}") self._emit_signal('worker_finished', result_tuple) return result_tuple diff --git a/src/utils/text_utils.py b/src/utils/text_utils.py index 535c74e..4358c46 100644 --- a/src/utils/text_utils.py +++ b/src/utils/text_utils.py @@ -205,6 +205,8 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw """ Matches folder names from a filename, prioritizing longer and more specific aliases. It returns immediately after finding the first (longest) match. + MODIFIED: Prioritizes boundary-aware matches for Latin characters, + falls back to substring search for CJK compatibility. Args: filename (str): The filename to check. 
@@ -230,10 +232,9 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw continue for alias in name_obj.get("aliases", []): - # <<< MODIFICATION: Ensure alias is not empty before converting to lower case >>> if alias: # Check if alias is not None and not an empty string alias_lower_val = alias.lower() - if alias_lower_val: # Check again after lowercasing (handles case where alias might be just spaces) + if alias_lower_val: # Check again after lowercasing alias_map_to_primary.append((alias_lower_val, cleaned_primary_name)) # Sort by alias length, descending, to match longer aliases first @@ -241,9 +242,33 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw # Return the FIRST match found, which will be the longest for alias_lower, primary_name_for_alias in alias_map_to_primary: - if alias_lower in filename_lower: - # Found the longest possible alias that is a substring. Return immediately. - return [primary_name_for_alias] + try: + # 1. Attempt boundary-aware match first (good for English/Latin) + # Matches alias if it's at the start/end or surrounded by common separators + # No \b word boundary is used: the alias must be delimited by the string start/end or by whitespace, '_', '+' or '-' + pattern = r'(?:^|[\s_+-])' + re.escape(alias_lower) + r'(?:[\s_+-]|$)' + + if re.search(pattern, filename_lower): + # Found a precise, boundary-aware match. This is the best case. + return [primary_name_for_alias] + + # 2. Fallback: Simple substring check (for CJK or other cases) + # This executes ONLY if the boundary match above failed. + # We check if the alias contains CJK OR if the filename does. + # This avoids applying the simple 'in' check for Latin-only aliases in Latin-only filenames. + elif (contains_cjk(alias_lower) or contains_cjk(filename_lower)) and alias_lower in filename_lower: + # This is the fallback for CJK compatibility. + return [primary_name_for_alias] + + # If alias is "ul" and filename is "sin+título": + # 1. 
re.search(r'(?:^|[\s_+-])ul(?:[\s_+-]|$)', "sin+título") -> Fails (good) + # 2. contains_cjk("ul") -> False + # 3. contains_cjk("sin+título") -> False + # 4. No match is found for "ul". (correct) + + except re.error as e: + print(f"Regex error matching alias '{alias_lower}' in filename '{filename_lower}': {e}") + continue # Skip this alias if regex fails # If the loop finishes without any matches, return an empty list. return [] \ No newline at end of file