diff --git a/src/core/Hentai2read_client.py b/src/core/Hentai2read_client.py index a144276..e1b34a7 100644 --- a/src/core/Hentai2read_client.py +++ b/src/core/Hentai2read_client.py @@ -1,8 +1,6 @@ -# src/core/Hentai2read_client.py - import re import os -import time +import time import cloudscraper from bs4 import BeautifulSoup from urllib.parse import urljoin @@ -65,12 +63,37 @@ def run_hentai2read_download(start_url, output_dir, progress_callback, overall_p def _get_series_metadata(start_url, progress_callback, scraper): """ Scrapes the main series page to get the Artist Name, Series Title, and chapter list. + Includes a retry mechanism for the initial connection. """ - try: - response = scraper.get(start_url, timeout=30) - response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') + max_retries = 4 # Total number of attempts (1 initial + 3 retries) + last_exception = None + soup = None + for attempt in range(max_retries): + try: + if attempt > 0: + progress_callback(f" [Hentai2Read] ⚠️ Retrying connection (Attempt {attempt + 1}/{max_retries})...") + + response = scraper.get(start_url, timeout=30) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + # If successful, clear exception and break the loop + last_exception = None + break + + except Exception as e: + last_exception = e + progress_callback(f" [Hentai2Read] ⚠️ Connection attempt {attempt + 1} failed: {e}") + if attempt < max_retries - 1: + time.sleep(2 * (attempt + 1)) # Wait 2s, 4s, 6s + continue # Try again + + if last_exception: + progress_callback(f" [Hentai2Read] ❌ Error getting series metadata after {max_retries} attempts: {last_exception}") + return "Unknown Series", [] + + try: series_title = "Unknown Series" artist_name = None metadata_list = soup.select_one("ul.list.list-simple-mini") @@ -107,10 +130,9 @@ def _get_series_metadata(start_url, progress_callback, scraper): return top_level_folder_name, chapters_to_process except Exception as e: - progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}") + progress_callback(f" [Hentai2Read] ❌ Error parsing metadata after successful connection: {e}") return "Unknown Series", [] -### NEW: This function contains the pipeline logic ### def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func): """ Uses a producer-consumer pattern to download a chapter. 
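The retry logic added to `_get_series_metadata` above follows a simple linear-backoff pattern: up to four attempts, waiting 2s, 4s, 6s between failures. As a self-contained reference, here is a minimal sketch of the same idea; `fetch_with_retries`, the `log` callback, and the example URL are illustrative names only, and unlike the patch it raises after the final failure instead of returning a fallback value.

import time
import cloudscraper

def fetch_with_retries(url, max_retries=4, timeout=30, log=print):
    """Return the page body, retrying with a linear backoff (2s, 4s, 6s) between failures."""
    scraper = cloudscraper.create_scraper()
    last_exception = None
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                log(f"Retrying connection (Attempt {attempt + 1}/{max_retries})...")
            response = scraper.get(url, timeout=timeout)
            response.raise_for_status()        # treat 4xx/5xx as failures so they are retried too
            return response.text               # success: stop retrying
        except Exception as e:                 # network error or bad HTTP status
            last_exception = e
            log(f"Connection attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))  # wait 2s, 4s, 6s
    raise last_exception                       # all attempts exhausted

# html = fetch_with_retries("https://example.com/some-series/")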
@@ -120,12 +142,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call task_queue = queue.Queue() num_download_threads = 8 - # These will be updated by the worker threads download_stats = {'downloaded': 0, 'skipped': 0} def downloader_worker(): """The function that each download thread will run.""" - # Create a unique session for each thread to avoid conflicts worker_scraper = cloudscraper.create_scraper() while True: try: @@ -153,12 +173,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call finally: task_queue.task_done() - # --- Start the downloader threads --- executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader') for _ in range(num_download_threads): executor.submit(downloader_worker) - # --- Main thread acts as the scraper (producer) --- page_number = 1 while True: if check_pause_func(): break @@ -168,12 +186,25 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call page_url_to_check = f"{chapter_url}{page_number}/" try: - response = scraper.get(page_url_to_check, timeout=30) - if response.history or response.status_code != 200: + page_response = None + page_last_exception = None + for page_attempt in range(3): # 3 attempts for sub-pages + try: + page_response = scraper.get(page_url_to_check, timeout=30) + page_last_exception = None + break + except Exception as e: + page_last_exception = e + time.sleep(1) # Short delay for page scraping retries + + if page_last_exception: + raise page_last_exception # Give up after 3 tries + + if page_response.history or page_response.status_code != 200: progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.") break - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(page_response.text, 'html.parser') img_tag = soup.select_one("img#arf-reader") img_src = img_tag.get("src") if img_tag else None @@ -181,12 +212,11 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).") break - normalized_img_src = urljoin(response.url, img_src) + normalized_img_src = urljoin(page_response.url, img_src) ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg" filename = f"{page_number:03d}{ext}" filepath = os.path.join(save_path, filename) - # Put the download task into the queue for a worker to pick up task_queue.put((filepath, normalized_img_src)) page_number += 1 @@ -195,12 +225,9 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}") break - # --- Shutdown sequence --- - # Tell all worker threads to exit by sending the sentinel value for _ in range(num_download_threads): task_queue.put(None) - # Wait for all download tasks to be completed executor.shutdown(wait=True) progress_callback(f" Found and processed {page_number - 1} images for this chapter.") diff --git a/src/ui/classes/downloader_factory.py b/src/ui/classes/downloader_factory.py index 249e65c..da1a929 100644 --- a/src/ui/classes/downloader_factory.py +++ b/src/ui/classes/downloader_factory.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse # Utility Imports from ...utils.network_utils import prepare_cookies_for_request -from ...utils.file_utils import clean_folder_name # Keep if needed by any thread init +from ...utils.file_utils import clean_folder_name # 
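`_process_and_download_chapter` above uses a producer-consumer pipeline: the main thread scrapes page URLs and enqueues them, a fixed pool of worker threads drains the queue, and one `None` sentinel per worker triggers shutdown. The sketch below is a minimal, self-contained illustration of that shape, not the patch's implementation; `run_pipeline` and `process` are made-up names, and the actual download and pause/cancel handling are reduced to a placeholder callable.

import queue
from concurrent.futures import ThreadPoolExecutor

def run_pipeline(items, num_workers=8, process=print):
    """Producer-consumer sketch: the main thread enqueues work, worker threads drain it."""
    task_queue = queue.Queue()

    def worker():
        while True:
            task = task_queue.get()
            if task is None:                 # sentinel value: this worker should exit
                task_queue.task_done()
                return
            try:
                process(task)                # e.g. download one image to disk
            except Exception as e:           # keep the worker alive on per-item errors
                print(f"Task failed: {e}")
            finally:
                task_queue.task_done()

    executor = ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix='Downloader')
    for _ in range(num_workers):
        executor.submit(worker)

    # Producer: discover work and feed the queue (the patch scrapes page URLs here).
    for item in items:
        task_queue.put(item)

    # Shutdown: one sentinel per worker, then wait for every thread to finish.
    for _ in range(num_workers):
        task_queue.put(None)
    executor.shutdown(wait=True)

# run_pipeline([f"{n:03d}.jpg" for n in range(1, 6)], num_workers=2)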
Downloader Thread Imports (Alphabetical Order Recommended) from .allcomic_downloader_thread import AllcomicDownloadThread @@ -16,7 +16,6 @@ from .erome_downloader_thread import EromeDownloadThread from .external_link_downloader_thread import ExternalLinkDownloadThread from .fap_nation_downloader_thread import FapNationDownloadThread from .hentai2read_downloader_thread import Hentai2readDownloadThread -# ---> ADD IMPORT FOR NEW KEMONO DISCORD THREAD <--- from .kemono_discord_downloader_thread import KemonoDiscordDownloadThread from .mangadex_downloader_thread import MangaDexDownloadThread from .nhentai_downloader_thread import NhentaiDownloadThread @@ -34,7 +33,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out or None if no special handler is found (indicating fallback to generic BackendDownloadThread). """ - # --- Specific Site/Service Handlers --- # Handler for Booru sites (Danbooru, Gelbooru) if service in ['danbooru', 'gelbooru']: @@ -68,7 +66,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out return MangaDexDownloadThread(api_url, effective_output_dir_for_run, main_app) # Handler for Saint2 - # Check specific domains identified by extract_post_info or common patterns is_saint2_url = service == 'saint2' or 'saint2.su' in api_url or 'saint2.pk' in api_url # Add more domains if needed if is_saint2_url and api_url.strip().lower() != 'saint2.su': # Exclude batch mode trigger if using URL input return Saint2DownloadThread(api_url, effective_output_dir_for_run, main_app) @@ -93,7 +90,7 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out main_app.log_signal.emit("ℹ️ Rule34Video.com URL detected. Starting dedicated downloader.") return Rule34VideoDownloadThread(api_url, effective_output_dir_for_run, main_app) # id1 (video_id) is used inside the thread - # ---> HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord) <--- + # HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord) elif service == 'discord' and any(domain in api_url for domain in ['kemono.cr', 'kemono.su', 'kemono.party']): main_app.log_signal.emit("ℹ️ Kemono Discord URL detected. 
Starting dedicated downloader.") cookies = prepare_cookies_for_request( @@ -119,8 +116,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out token = main_app.remove_from_filename_input.text().strip() # Token is in the "Remove Words" field for Discord if not token: main_app.log_signal.emit("❌ Official Discord requires an Authorization Token in the 'Remove Words' field.") - # Optionally show a message box here - # QMessageBox.warning(main_app, "Token Required", "Please enter your Discord Authorization Token in the 'Remove Words from name' field.") return None # Or a specific error sentinel limit_text = main_app.discord_message_limit_input.text().strip() @@ -140,7 +135,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out parent=main_app # Pass main_app for events/signals ) - # Handler for Allcomic/Allporncomic # Check specific domains or rely on service name if extract_post_info provides it if service == 'allcomic' or 'allcomic.com' in api_url or 'allporncomic.com' in api_url: return AllcomicDownloadThread(api_url, effective_output_dir_for_run, main_app) @@ -164,7 +158,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out # Handler for nHentai if service == 'nhentai': - # nHentai requires fetching data *before* creating the thread from ...core.nhentai_client import fetch_nhentai_gallery main_app.log_signal.emit(f"ℹ️ nHentai gallery ID {id1} detected. Fetching gallery data...") gallery_data = fetch_nhentai_gallery(id1, main_app.log_signal.emit) diff --git a/src/ui/main_window.py b/src/ui/main_window.py index 10906e9..ec84db5 100644 --- a/src/ui/main_window.py +++ b/src/ui/main_window.py @@ -339,11 +339,9 @@ class DownloaderApp (QWidget ): self._connect_signals() if hasattr(self, 'character_input'): self.character_input.setToolTip(self._tr("character_input_tooltip", "Enter character names (comma-separated)...")) - self.log_signal.emit(f"ℹ️ Manga filename style loaded: '{self.manga_filename_style}'") + self.log_signal.emit(f"ℹ️ filename style loaded: '{self.manga_filename_style}'") self.log_signal.emit(f"ℹ️ Skip words scope loaded: '{self.skip_words_scope}'") self.log_signal.emit(f"ℹ️ Character filter scope set to default: '{self.char_filter_scope}'") - self.log_signal.emit(f"ℹ️ Multi-part download defaults to: {'Enabled' if self.allow_multipart_download_setting else 'Disabled'}") - self.log_signal.emit(f"ℹ️ Scan post content for images defaults to: {'Enabled' if self.scan_content_images_setting else 'Disabled'}") self.log_signal.emit(f"ℹ️ Application language loaded: '{self.current_selected_language.upper()}' (UI may not reflect this yet).") self._retranslate_main_ui() self._load_persistent_history() @@ -831,14 +829,11 @@ class DownloaderApp (QWidget ): self.download_btn.setEnabled(False) self.pause_btn.setEnabled(False) else: - # --- START MODIFICATION --- - # Check if we are about to download fetched posts and update text accordingly if self.is_ready_to_download_fetched: num_posts = len(self.fetched_posts_for_download) self.download_btn.setText(f"⬇️ Start Download ({num_posts} Posts)") self.download_btn.setEnabled(True) # Keep it enabled for the user to click else: - # Original logic for an active download in other scenarios self.download_btn.setText(self._tr("start_download_button_text", "⬇️ Start Download")) self.download_btn.setEnabled(False) @@ -926,11 +921,9 @@ class DownloaderApp (QWidget ): args_template = self.last_start_download_args - # Update both the character filter list and the 
domain override in the arguments args_template['filter_character_list'] = parsed_filters args_template['domain_override'] = domain_override - # Manually set the UI to a "downloading" state for reliability self.set_ui_enabled(False) self.download_btn.setText("⬇️ Downloading...") self.download_btn.setEnabled(False) @@ -938,7 +931,6 @@ class DownloaderApp (QWidget ): self.cancel_btn.setEnabled(True) self.cancel_btn.setText("❌ Cancel & Reset UI") try: - # Ensure signals are connected to the correct actions for this state self.cancel_btn.clicked.disconnect() self.pause_btn.clicked.disconnect() except TypeError: @@ -5626,13 +5618,11 @@ class DownloaderApp (QWidget ): api_domain = parsed_api_url.netloc if parsed_api_url.netloc else self._get_domain_for_service(service) post_page_url = f"https://{api_domain}/{service}/user/{user_id}/post/{post_id}" - # --- NEW LOGIC: Differentiate between loaded files and live session errors --- # Initialize variables before the conditional blocks target_folder_path_for_download = None filename_override_for_download = None if job_details.get('is_loaded_from_txt'): - # --- BEHAVIOR FOR LOADED FILES: Recalculate everything from current UI settings --- self.log_signal.emit(f" Retrying loaded file. Recalculating path and name from current UI settings...") # 1. Get all current settings and job data @@ -6325,10 +6315,8 @@ class DownloaderApp (QWidget ): if hasattr(self, 'link_input'): self.last_link_input_text_for_queue_sync = self.link_input.text() - # --- START: MODIFIED LOGIC --- # Manually trigger the UI update now that the queue is populated and the dialog is closed. self.update_ui_for_manga_mode(self.manga_mode_checkbox.isChecked() if self.manga_mode_checkbox else False) - # --- END: MODIFIED LOGIC --- def _load_saved_cookie_settings(self): """Loads and applies saved cookie settings on startup.""" diff --git a/src/utils/text_utils.py b/src/utils/text_utils.py index b16f4c4..535c74e 100644 --- a/src/utils/text_utils.py +++ b/src/utils/text_utils.py @@ -26,6 +26,16 @@ KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [ r'\bPreview\b', ] +# --- START NEW CODE --- +# Regular expression to detect CJK characters +# Covers Hiragana, Katakana, Half/Full width forms, CJK Unified Ideographs, Hangul Syllables, etc. 
+cjk_pattern = re.compile(r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9fff\uac00-\ud7af]') + +def contains_cjk(text): + """Checks if the text contains any CJK characters.""" + return bool(cjk_pattern.search(text)) +# --- END NEW CODE --- + # --- Text Matching and Manipulation Utilities --- def is_title_match_for_character(post_title, character_name_filter): @@ -42,7 +52,7 @@ def is_title_match_for_character(post_title, character_name_filter): """ if not post_title or not character_name_filter: return False - + # Use word boundaries (\b) to match whole words only pattern = r"(?i)\b" + re.escape(str(character_name_filter).strip()) + r"\b" return bool(re.search(pattern, post_title)) @@ -62,7 +72,7 @@ def is_filename_match_for_character(filename, character_name_filter): """ if not filename or not character_name_filter: return False - + return str(character_name_filter).strip().lower() in filename.lower() @@ -101,16 +111,16 @@ def extract_folder_name_from_title(title, unwanted_keywords): """ if not title: return 'Uncategorized' - + title_lower = title.lower() # Find all whole words in the title tokens = re.findall(r'\b[\w\-]+\b', title_lower) - + for token in tokens: clean_token = clean_folder_name(token) if clean_token and clean_token.lower() not in unwanted_keywords: return clean_token - + # Fallback to cleaning the full title if no single significant word is found cleaned_full_title = clean_folder_name(title) return cleaned_full_title if cleaned_full_title else 'Uncategorized' @@ -120,6 +130,7 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords): """ Matches folder names from a title based on a list of known name objects. Each name object is a dict: {'name': 'PrimaryName', 'aliases': ['alias1', ...]} + MODIFIED: Uses substring matching for CJK aliases, word boundary for others. Args: title (str): The post title to check. 
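The reason for adding `contains_cjk` is that `\b` word boundaries are unreliable for CJK aliases, which typically appear inside a title with no surrounding spaces, so the patch falls back to plain substring matching for them. A small illustration of the two strategies used in `match_folders_from_title`; the `alias_matches` helper and the sample strings are made up for demonstration.

import re

# Same character ranges as the cjk_pattern added above.
cjk_pattern = re.compile(r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9fff\uac00-\ud7af]')

def contains_cjk(text):
    """True if the text contains any CJK characters."""
    return bool(cjk_pattern.search(text))

def alias_matches(alias, title):
    """Substring match for CJK aliases, word-boundary match for everything else."""
    alias_lower, title_lower = alias.lower(), title.lower()
    if contains_cjk(alias):
        return alias_lower in title_lower
    return bool(re.search(r'\b' + re.escape(alias_lower) + r'\b', title_lower))

# Hypothetical examples:
print(alias_matches("ティファ", "ティファまとめ"))  # True  - substring match; \b would miss this
print(alias_matches("Tifa", "Tifa Lockhart set"))    # True  - whole-word match
print(alias_matches("Tifa", "Antifa poster"))        # False - word boundary prevents a false hit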
@@ -137,10 +148,11 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords): for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS: cleaned_title = re.sub(pat_str, ' ', cleaned_title, flags=re.IGNORECASE) cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip() + # Store both original case cleaned title and lower case for different matching title_lower = cleaned_title.lower() matched_cleaned_names = set() - + # Sort by name length descending to match longer names first (e.g., "Cloud Strife" before "Cloud") sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True) @@ -149,19 +161,43 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords): aliases = name_obj.get("aliases", []) if not primary_folder_name or not aliases: continue - + + # <<< START MODIFICATION >>> + cleaned_primary_name = clean_folder_name(primary_folder_name) + if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords: + continue # Skip this entry entirely if its primary name is unwanted or empty + + match_found_for_this_object = False for alias in aliases: + if not alias: continue alias_lower = alias.lower() - if not alias_lower: continue - - # Use word boundaries for accurate matching - pattern = r'\b' + re.escape(alias_lower) + r'\b' - if re.search(pattern, title_lower): - cleaned_primary_name = clean_folder_name(primary_folder_name) - if cleaned_primary_name.lower() not in unwanted_keywords: + + # Check if the alias contains CJK characters + if contains_cjk(alias): + # Use simple substring matching for CJK + if alias_lower in title_lower: matched_cleaned_names.add(cleaned_primary_name) - break # Move to the next name object once a match is found for this one - + match_found_for_this_object = True + break # Move to the next name object + else: + # Use original word boundary matching for non-CJK + try: + # Compile pattern for efficiency if used repeatedly, though here it changes each loop + pattern = r'\b' + re.escape(alias_lower) + r'\b' + if re.search(pattern, title_lower): + matched_cleaned_names.add(cleaned_primary_name) + match_found_for_this_object = True + break # Move to the next name object + except re.error as e: + # Log error if the alias creates an invalid regex (unlikely with escape) + print(f"Regex error for alias '{alias}': {e}") # Or use proper logging + continue + + # This outer break logic remains the same (though slightly redundant with inner breaks) + if match_found_for_this_object: + pass # Already added and broke inner loop + # <<< END MODIFICATION >>> + return sorted(list(matched_cleaned_names)) @@ -188,23 +224,26 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw for name_obj in names_to_match: primary_name = name_obj.get("name") if not primary_name: continue - + cleaned_primary_name = clean_folder_name(primary_name) if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords: continue for alias in name_obj.get("aliases", []): - if alias.lower(): - alias_map_to_primary.append((alias.lower(), cleaned_primary_name)) - + # <<< MODIFICATION: Ensure alias is not empty before converting to lower case >>> + if alias: # Check if alias is not None and not an empty string + alias_lower_val = alias.lower() + if alias_lower_val: # Check again after lowercasing (handles case where alias might be just spaces) + alias_map_to_primary.append((alias_lower_val, cleaned_primary_name)) + # Sort by alias length, descending, to match longer aliases first 
alias_map_to_primary.sort(key=lambda x: len(x[0]), reverse=True) - # <<< MODIFICATION: Return the FIRST match found, which will be the longest >>> + # Return the FIRST match found, which will be the longest for alias_lower, primary_name_for_alias in alias_map_to_primary: if alias_lower in filename_lower: # Found the longest possible alias that is a substring. Return immediately. return [primary_name_for_alias] - + # If the loop finishes without any matches, return an empty list. - return [] + return [] \ No newline at end of file
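To make the longest-alias-first behaviour of `match_folders_from_filename_enhanced` concrete, here is a reduced sketch of the same idea: collect (alias, primary name) pairs, sort by alias length descending, and return on the first alias that is a substring of the filename. It deliberately omits `clean_folder_name` and the unwanted-keyword filtering, and returns a single name instead of a list; the function name and sample data are hypothetical.

def match_primary_name(filename, names_to_match):
    """Return the primary name for the longest alias found in the filename, else None."""
    filename_lower = filename.lower()

    # Build (alias_lower, primary_name) pairs, skipping empty or whitespace-only aliases.
    pairs = []
    for obj in names_to_match:
        primary = obj.get("name")
        if not primary:
            continue
        for alias in obj.get("aliases", []):
            if alias and alias.strip():
                pairs.append((alias.lower(), primary))

    # Longest alias first, so "cloud strife" wins over "cloud".
    pairs.sort(key=lambda p: len(p[0]), reverse=True)

    for alias_lower, primary in pairs:
        if alias_lower in filename_lower:
            return primary            # first hit is the longest possible match
    return None

# Hypothetical data:
names = [{"name": "Cloud Strife", "aliases": ["cloud strife"]},
         {"name": "Cloud", "aliases": ["cloud"]}]
print(match_primary_name("cloud strife beach.png", names))  # Cloud Strife (not Cloud)
print(match_primary_name("random_file.png", names))         # None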