Yuvi63771 2025-10-26 12:08:48 +05:30
parent 0acd433920
commit 7e8e8a59e2
4 changed files with 113 additions and 66 deletions

View File

@@ -1,8 +1,6 @@
-# src/core/Hentai2read_client.py
import re
import os
import time
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -65,12 +63,37 @@ def run_hentai2read_download(start_url, output_dir, progress_callback, overall_p
def _get_series_metadata(start_url, progress_callback, scraper):
    """
    Scrapes the main series page to get the Artist Name, Series Title, and chapter list.
+    Includes a retry mechanism for the initial connection.
    """
-    try:
-        response = scraper.get(start_url, timeout=30)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
+    max_retries = 4  # Total number of attempts (1 initial + 3 retries)
+    last_exception = None
+    soup = None
+    for attempt in range(max_retries):
+        try:
+            if attempt > 0:
+                progress_callback(f" [Hentai2Read] ⚠️ Retrying connection (Attempt {attempt + 1}/{max_retries})...")
+            response = scraper.get(start_url, timeout=30)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # If successful, clear exception and break the loop
+            last_exception = None
+            break
+        except Exception as e:
+            last_exception = e
+            progress_callback(f" [Hentai2Read] ⚠️ Connection attempt {attempt + 1} failed: {e}")
+            if attempt < max_retries - 1:
+                time.sleep(2 * (attempt + 1))  # Wait 2s, 4s, 6s
+                continue  # Try again
+    if last_exception:
+        progress_callback(f" [Hentai2Read] ❌ Error getting series metadata after {max_retries} attempts: {last_exception}")
+        return "Unknown Series", []
+    try:
        series_title = "Unknown Series"
        artist_name = None
        metadata_list = soup.select_one("ul.list.list-simple-mini")
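
The loop above is a standard bounded-retry pattern: a fixed number of attempts, a growing sleep between failures, and one final check of the last exception before giving up. A minimal standalone sketch of the same shape (a hypothetical fetch_with_retries helper, not part of this commit; it assumes any zero-argument callable that may raise):

import time

def fetch_with_retries(fetch, max_retries=4, log=print):
    """Call fetch() up to max_retries times, sleeping 2s, 4s, 6s... between failures."""
    last_exception = None
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                log(f"Retrying (attempt {attempt + 1}/{max_retries})...")
            return fetch()  # success: hand the result back immediately
        except Exception as e:
            last_exception = e
            log(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))  # linear backoff: 2s, 4s, 6s
    raise last_exception  # every attempt failed
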
@@ -107,10 +130,9 @@ def _get_series_metadata(start_url, progress_callback, scraper):
        return top_level_folder_name, chapters_to_process
    except Exception as e:
-        progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}")
+        progress_callback(f" [Hentai2Read] ❌ Error parsing metadata after successful connection: {e}")
        return "Unknown Series", []
-### NEW: This function contains the pipeline logic ###
def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func):
    """
    Uses a producer-consumer pattern to download a chapter.
@@ -120,12 +142,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
    task_queue = queue.Queue()
    num_download_threads = 8
-    # These will be updated by the worker threads
    download_stats = {'downloaded': 0, 'skipped': 0}
    def downloader_worker():
        """The function that each download thread will run."""
-        # Create a unique session for each thread to avoid conflicts
        worker_scraper = cloudscraper.create_scraper()
        while True:
            try:
@@ -153,12 +173,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
            finally:
                task_queue.task_done()
-    # --- Start the downloader threads ---
    executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader')
    for _ in range(num_download_threads):
        executor.submit(downloader_worker)
-    # --- Main thread acts as the scraper (producer) ---
    page_number = 1
    while True:
        if check_pause_func(): break
@@ -168,12 +186,25 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
        page_url_to_check = f"{chapter_url}{page_number}/"
        try:
-            response = scraper.get(page_url_to_check, timeout=30)
-            if response.history or response.status_code != 200:
+            page_response = None
+            page_last_exception = None
+            for page_attempt in range(3):  # 3 attempts for sub-pages
+                try:
+                    page_response = scraper.get(page_url_to_check, timeout=30)
+                    page_last_exception = None
+                    break
+                except Exception as e:
+                    page_last_exception = e
+                    time.sleep(1)  # Short delay for page scraping retries
+            if page_last_exception:
+                raise page_last_exception  # Give up after 3 tries
+            if page_response.history or page_response.status_code != 200:
                progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.")
                break
-            soup = BeautifulSoup(response.text, 'html.parser')
+            soup = BeautifulSoup(page_response.text, 'html.parser')
            img_tag = soup.select_one("img#arf-reader")
            img_src = img_tag.get("src") if img_tag else None
@@ -181,12 +212,11 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
                progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).")
                break
-            normalized_img_src = urljoin(response.url, img_src)
+            normalized_img_src = urljoin(page_response.url, img_src)
            ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg"
            filename = f"{page_number:03d}{ext}"
            filepath = os.path.join(save_path, filename)
-            # Put the download task into the queue for a worker to pick up
            task_queue.put((filepath, normalized_img_src))
            page_number += 1
@@ -195,12 +225,9 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
            progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}")
            break
-    # --- Shutdown sequence ---
-    # Tell all worker threads to exit by sending the sentinel value
    for _ in range(num_download_threads):
        task_queue.put(None)
-    # Wait for all download tasks to be completed
    executor.shutdown(wait=True)
    progress_callback(f" Found and processed {page_number - 1} images for this chapter.")
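
For reference, the chapter downloader keeps the producer-consumer shape described in its docstring: the main thread scrapes page URLs and enqueues tasks, a pool of workers drains the queue, and one None sentinel per worker plus executor.shutdown(wait=True) ends the run. A minimal sketch of that shape using only the standard library (hypothetical names; the real code adds per-thread cloudscraper sessions, skip logic, and progress callbacks):

import queue
from concurrent.futures import ThreadPoolExecutor

def download_all(urls, fetch, num_workers=8):
    """Producer-consumer: the caller enqueues URLs, workers call fetch(url)."""
    task_queue = queue.Queue()

    def worker():
        while True:
            item = task_queue.get()
            try:
                if item is None:   # sentinel: this worker is done
                    return
                fetch(item)
            finally:
                task_queue.task_done()

    executor = ThreadPoolExecutor(max_workers=num_workers)
    for _ in range(num_workers):
        executor.submit(worker)

    for url in urls:               # producer: enqueue every task
        task_queue.put(url)
    for _ in range(num_workers):   # one sentinel per worker
        task_queue.put(None)
    executor.shutdown(wait=True)   # wait for the queue to drain and workers to exit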

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
# Utility Imports
from ...utils.network_utils import prepare_cookies_for_request
-from ...utils.file_utils import clean_folder_name # Keep if needed by any thread init
+from ...utils.file_utils import clean_folder_name
# Downloader Thread Imports (Alphabetical Order Recommended)
from .allcomic_downloader_thread import AllcomicDownloadThread
@@ -16,7 +16,6 @@ from .erome_downloader_thread import EromeDownloadThread
from .external_link_downloader_thread import ExternalLinkDownloadThread
from .fap_nation_downloader_thread import FapNationDownloadThread
from .hentai2read_downloader_thread import Hentai2readDownloadThread
-# ---> ADD IMPORT FOR NEW KEMONO DISCORD THREAD <---
from .kemono_discord_downloader_thread import KemonoDiscordDownloadThread
from .mangadex_downloader_thread import MangaDexDownloadThread
from .nhentai_downloader_thread import NhentaiDownloadThread
@@ -34,7 +33,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
    or None if no special handler is found (indicating fallback to generic BackendDownloadThread).
    """
-    # --- Specific Site/Service Handlers ---
    # Handler for Booru sites (Danbooru, Gelbooru)
    if service in ['danbooru', 'gelbooru']:
@@ -68,7 +66,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
        return MangaDexDownloadThread(api_url, effective_output_dir_for_run, main_app)
    # Handler for Saint2
-    # Check specific domains identified by extract_post_info or common patterns
    is_saint2_url = service == 'saint2' or 'saint2.su' in api_url or 'saint2.pk' in api_url # Add more domains if needed
    if is_saint2_url and api_url.strip().lower() != 'saint2.su': # Exclude batch mode trigger if using URL input
        return Saint2DownloadThread(api_url, effective_output_dir_for_run, main_app)
@@ -93,7 +90,7 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
        main_app.log_signal.emit(" Rule34Video.com URL detected. Starting dedicated downloader.")
        return Rule34VideoDownloadThread(api_url, effective_output_dir_for_run, main_app) # id1 (video_id) is used inside the thread
-    # ---> HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord) <---
+    # HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord)
    elif service == 'discord' and any(domain in api_url for domain in ['kemono.cr', 'kemono.su', 'kemono.party']):
        main_app.log_signal.emit(" Kemono Discord URL detected. Starting dedicated downloader.")
        cookies = prepare_cookies_for_request(
@@ -119,8 +116,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
        token = main_app.remove_from_filename_input.text().strip() # Token is in the "Remove Words" field for Discord
        if not token:
            main_app.log_signal.emit("❌ Official Discord requires an Authorization Token in the 'Remove Words' field.")
-            # Optionally show a message box here
-            # QMessageBox.warning(main_app, "Token Required", "Please enter your Discord Authorization Token in the 'Remove Words from name' field.")
            return None # Or a specific error sentinel
        limit_text = main_app.discord_message_limit_input.text().strip()
@@ -140,7 +135,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
            parent=main_app # Pass main_app for events/signals
        )
-    # Handler for Allcomic/Allporncomic
    # Check specific domains or rely on service name if extract_post_info provides it
    if service == 'allcomic' or 'allcomic.com' in api_url or 'allporncomic.com' in api_url:
        return AllcomicDownloadThread(api_url, effective_output_dir_for_run, main_app)
@@ -164,7 +158,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
    # Handler for nHentai
    if service == 'nhentai':
-        # nHentai requires fetching data *before* creating the thread
        from ...core.nhentai_client import fetch_nhentai_gallery
        main_app.log_signal.emit(f" nHentai gallery ID {id1} detected. Fetching gallery data...")
        gallery_data = fetch_nhentai_gallery(id1, main_app.log_signal.emit)
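
For context, create_downloader_thread is a dispatch factory: it checks the service string and URL domains in a fixed order and returns the matching downloader thread, or None to fall back to the generic BackendDownloadThread. A reduced sketch of that dispatch style (hypothetical handler labels and a trimmed rule set, not the project's full list):

def pick_handler(service, api_url):
    """Return a handler label for (service, api_url), or None for the generic fallback."""
    if service in ('danbooru', 'gelbooru'):
        return 'BooruDownloadThread'
    # Domain checks cover URLs whose service could not be inferred up front.
    if service == 'allcomic' or 'allcomic.com' in api_url or 'allporncomic.com' in api_url:
        return 'AllcomicDownloadThread'
    # Ordering matters: Kemono-hosted Discord must be recognised before plain Discord.
    if service == 'discord' and any(d in api_url for d in ('kemono.cr', 'kemono.su', 'kemono.party')):
        return 'KemonoDiscordDownloadThread'
    if service == 'discord':
        return 'DiscordDownloadThread'
    return None  # generic BackendDownloadThread handles everything else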

View File

@@ -339,11 +339,9 @@ class DownloaderApp (QWidget ):
        self._connect_signals()
        if hasattr(self, 'character_input'):
            self.character_input.setToolTip(self._tr("character_input_tooltip", "Enter character names (comma-separated)..."))
-        self.log_signal.emit(f" Manga filename style loaded: '{self.manga_filename_style}'")
+        self.log_signal.emit(f" filename style loaded: '{self.manga_filename_style}'")
        self.log_signal.emit(f" Skip words scope loaded: '{self.skip_words_scope}'")
        self.log_signal.emit(f" Character filter scope set to default: '{self.char_filter_scope}'")
-        self.log_signal.emit(f" Multi-part download defaults to: {'Enabled' if self.allow_multipart_download_setting else 'Disabled'}")
-        self.log_signal.emit(f" Scan post content for images defaults to: {'Enabled' if self.scan_content_images_setting else 'Disabled'}")
        self.log_signal.emit(f" Application language loaded: '{self.current_selected_language.upper()}' (UI may not reflect this yet).")
        self._retranslate_main_ui()
        self._load_persistent_history()
@@ -831,14 +829,11 @@ class DownloaderApp (QWidget ):
            self.download_btn.setEnabled(False)
            self.pause_btn.setEnabled(False)
        else:
-            # --- START MODIFICATION ---
-            # Check if we are about to download fetched posts and update text accordingly
            if self.is_ready_to_download_fetched:
                num_posts = len(self.fetched_posts_for_download)
                self.download_btn.setText(f"⬇️ Start Download ({num_posts} Posts)")
                self.download_btn.setEnabled(True) # Keep it enabled for the user to click
            else:
-                # Original logic for an active download in other scenarios
                self.download_btn.setText(self._tr("start_download_button_text", "⬇️ Start Download"))
                self.download_btn.setEnabled(False)
@@ -926,11 +921,9 @@ class DownloaderApp (QWidget ):
        args_template = self.last_start_download_args
-        # Update both the character filter list and the domain override in the arguments
        args_template['filter_character_list'] = parsed_filters
        args_template['domain_override'] = domain_override
-        # Manually set the UI to a "downloading" state for reliability
        self.set_ui_enabled(False)
        self.download_btn.setText("⬇️ Downloading...")
        self.download_btn.setEnabled(False)
@@ -938,7 +931,6 @@ class DownloaderApp (QWidget ):
        self.cancel_btn.setEnabled(True)
        self.cancel_btn.setText("❌ Cancel & Reset UI")
        try:
-            # Ensure signals are connected to the correct actions for this state
            self.cancel_btn.clicked.disconnect()
            self.pause_btn.clicked.disconnect()
        except TypeError:
@@ -5626,13 +5618,11 @@ class DownloaderApp (QWidget ):
        api_domain = parsed_api_url.netloc if parsed_api_url.netloc else self._get_domain_for_service(service)
        post_page_url = f"https://{api_domain}/{service}/user/{user_id}/post/{post_id}"
-        # --- NEW LOGIC: Differentiate between loaded files and live session errors ---
        # Initialize variables before the conditional blocks
        target_folder_path_for_download = None
        filename_override_for_download = None
        if job_details.get('is_loaded_from_txt'):
-            # --- BEHAVIOR FOR LOADED FILES: Recalculate everything from current UI settings ---
            self.log_signal.emit(f" Retrying loaded file. Recalculating path and name from current UI settings...")
            # 1. Get all current settings and job data
@@ -6325,10 +6315,8 @@ class DownloaderApp (QWidget ):
        if hasattr(self, 'link_input'):
            self.last_link_input_text_for_queue_sync = self.link_input.text()
-        # --- START: MODIFIED LOGIC ---
        # Manually trigger the UI update now that the queue is populated and the dialog is closed.
        self.update_ui_for_manga_mode(self.manga_mode_checkbox.isChecked() if self.manga_mode_checkbox else False)
-        # --- END: MODIFIED LOGIC ---
    def _load_saved_cookie_settings(self):
        """Loads and applies saved cookie settings on startup."""

View File

@@ -26,6 +26,16 @@ KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
    r'\bPreview\b',
]
+# --- START NEW CODE ---
+# Regular expression to detect CJK characters
+# Covers Hiragana, Katakana, Half/Full width forms, CJK Unified Ideographs, Hangul Syllables, etc.
+cjk_pattern = re.compile(r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9fff\uac00-\ud7af]')
+def contains_cjk(text):
+    """Checks if the text contains any CJK characters."""
+    return bool(cjk_pattern.search(text))
+# --- END NEW CODE ---
# --- Text Matching and Manipulation Utilities ---
def is_title_match_for_character(post_title, character_name_filter):
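
The new contains_cjk helper is just a character-class test covering Hiragana, Katakana, half/full-width forms, CJK Unified Ideographs, and Hangul syllables. A quick usage sketch, assuming the definitions added above are in scope (the sample strings are illustrative only):

print(contains_cjk("Cloud Strife"))   # False - plain ASCII
print(contains_cjk("クラウド"))       # True  - Katakana
print(contains_cjk("雲"))             # True  - CJK ideograph
print(contains_cjk("한국어 title"))   # True  - Hangul syllables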
@@ -42,7 +52,7 @@ def is_title_match_for_character(post_title, character_name_filter):
    """
    if not post_title or not character_name_filter:
        return False
    # Use word boundaries (\b) to match whole words only
    pattern = r"(?i)\b" + re.escape(str(character_name_filter).strip()) + r"\b"
    return bool(re.search(pattern, post_title))
@@ -62,7 +72,7 @@ def is_filename_match_for_character(filename, character_name_filter):
    """
    if not filename or not character_name_filter:
        return False
    return str(character_name_filter).strip().lower() in filename.lower()
@@ -101,16 +111,16 @@ def extract_folder_name_from_title(title, unwanted_keywords):
    """
    if not title:
        return 'Uncategorized'
    title_lower = title.lower()
    # Find all whole words in the title
    tokens = re.findall(r'\b[\w\-]+\b', title_lower)
    for token in tokens:
        clean_token = clean_folder_name(token)
        if clean_token and clean_token.lower() not in unwanted_keywords:
            return clean_token
    # Fallback to cleaning the full title if no single significant word is found
    cleaned_full_title = clean_folder_name(title)
    return cleaned_full_title if cleaned_full_title else 'Uncategorized'
@@ -120,6 +130,7 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
    """
    Matches folder names from a title based on a list of known name objects.
    Each name object is a dict: {'name': 'PrimaryName', 'aliases': ['alias1', ...]}
+    MODIFIED: Uses substring matching for CJK aliases, word boundary for others.
    Args:
        title (str): The post title to check.
@@ -137,10 +148,11 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
    for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
        cleaned_title = re.sub(pat_str, ' ', cleaned_title, flags=re.IGNORECASE)
    cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
+    # Store both original case cleaned title and lower case for different matching
    title_lower = cleaned_title.lower()
    matched_cleaned_names = set()
    # Sort by name length descending to match longer names first (e.g., "Cloud Strife" before "Cloud")
    sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True)
@@ -149,19 +161,43 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
        aliases = name_obj.get("aliases", [])
        if not primary_folder_name or not aliases:
            continue
+        # <<< START MODIFICATION >>>
+        cleaned_primary_name = clean_folder_name(primary_folder_name)
+        if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords:
+            continue # Skip this entry entirely if its primary name is unwanted or empty
+        match_found_for_this_object = False
        for alias in aliases:
+            if not alias: continue
            alias_lower = alias.lower()
+            if not alias_lower: continue
-            # Use word boundaries for accurate matching
-            pattern = r'\b' + re.escape(alias_lower) + r'\b'
-            if re.search(pattern, title_lower):
-                cleaned_primary_name = clean_folder_name(primary_folder_name)
-                if cleaned_primary_name.lower() not in unwanted_keywords:
-                    matched_cleaned_names.add(cleaned_primary_name)
-                break # Move to the next name object once a match is found for this one
+            # Check if the alias contains CJK characters
+            if contains_cjk(alias):
+                # Use simple substring matching for CJK
+                if alias_lower in title_lower:
+                    matched_cleaned_names.add(cleaned_primary_name)
+                    match_found_for_this_object = True
+                    break # Move to the next name object
+            else:
+                # Use original word boundary matching for non-CJK
+                try:
+                    # Compile pattern for efficiency if used repeatedly, though here it changes each loop
+                    pattern = r'\b' + re.escape(alias_lower) + r'\b'
+                    if re.search(pattern, title_lower):
+                        matched_cleaned_names.add(cleaned_primary_name)
+                        match_found_for_this_object = True
+                        break # Move to the next name object
+                except re.error as e:
+                    # Log error if the alias creates an invalid regex (unlikely with escape)
+                    print(f"Regex error for alias '{alias}': {e}") # Or use proper logging
+                    continue
+        # This outer break logic remains the same (though slightly redundant with inner breaks)
+        if match_found_for_this_object:
+            pass # Already added and broke inner loop
+        # <<< END MODIFICATION >>>
    return sorted(list(matched_cleaned_names))
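
The reason for the CJK branch above: CJK titles usually have no delimiters between words, so a leading or trailing \b in the alias pattern often fails even when the alias is plainly present, while Latin aliases still benefit from the stricter word-boundary match. A small illustration of the two paths (hypothetical title and aliases):

import re

title_lower = "クラウドストライフ fanart".lower()

# CJK alias: the character after the alias is also a word character, so \b fails...
assert re.search(r'\b' + re.escape("クラウド") + r'\b', title_lower) is None
# ...but the plain substring test used for CJK aliases succeeds.
assert "クラウド" in title_lower

# Latin alias: the original word-boundary matching still works as before.
assert re.search(r'\b' + re.escape("fanart") + r'\b', title_lower)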
@@ -188,23 +224,26 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw
    for name_obj in names_to_match:
        primary_name = name_obj.get("name")
        if not primary_name: continue
        cleaned_primary_name = clean_folder_name(primary_name)
        if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords:
            continue
        for alias in name_obj.get("aliases", []):
-            if alias.lower(): # <<< MODIFICATION: Ensure alias is not empty before converting to lower case >>>
-                alias_map_to_primary.append((alias.lower(), cleaned_primary_name))
+            if alias: # Check if alias is not None and not an empty string
+                alias_lower_val = alias.lower()
+                if alias_lower_val: # Check again after lowercasing (handles case where alias might be just spaces)
+                    alias_map_to_primary.append((alias_lower_val, cleaned_primary_name))
    # Sort by alias length, descending, to match longer aliases first
    alias_map_to_primary.sort(key=lambda x: len(x[0]), reverse=True)
-    # <<< MODIFICATION: Return the FIRST match found, which will be the longest >>>
+    # Return the FIRST match found, which will be the longest
    for alias_lower, primary_name_for_alias in alias_map_to_primary:
        if alias_lower in filename_lower:
            # Found the longest possible alias that is a substring. Return immediately.
            return [primary_name_for_alias]
    # If the loop finishes without any matches, return an empty list.
    return []
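
One detail worth keeping in mind about the filename matcher above: because alias_map_to_primary is sorted by alias length descending and the loop returns on the first substring hit, the longest alias always wins when several overlap. A compact sketch of that selection rule (hypothetical aliases and filename):

def longest_alias_match(filename, alias_map):
    """alias_map: list of (alias_lower, primary_name) pairs; longest alias is checked first."""
    filename_lower = filename.lower()
    for alias_lower, primary in sorted(alias_map, key=lambda x: len(x[0]), reverse=True):
        if alias_lower in filename_lower:
            return [primary]
    return []

aliases = [("cloud", "Cloud"), ("cloud strife", "Cloud Strife")]
# "cloud strife" is tried before "cloud", so the more specific folder wins.
print(longest_alias_match("cloud strife 01.png", aliases))   # ['Cloud Strife']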