This commit is contained in:
Yuvi63771 2025-10-26 12:08:48 +05:30
parent 0acd433920
commit 7e8e8a59e2
4 changed files with 113 additions and 66 deletions

View File

@@ -1,8 +1,6 @@
# src/core/Hentai2read_client.py
import re
import os
import time
import time
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -65,12 +63,37 @@ def run_hentai2read_download(start_url, output_dir, progress_callback, overall_p
def _get_series_metadata(start_url, progress_callback, scraper):
"""
Scrapes the main series page to get the Artist Name, Series Title, and chapter list.
Includes a retry mechanism for the initial connection.
"""
try:
response = scraper.get(start_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
max_retries = 4 # Total number of attempts (1 initial + 3 retries)
last_exception = None
soup = None
for attempt in range(max_retries):
try:
if attempt > 0:
progress_callback(f" [Hentai2Read] ⚠️ Retrying connection (Attempt {attempt + 1}/{max_retries})...")
response = scraper.get(start_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# If successful, clear exception and break the loop
last_exception = None
break
except Exception as e:
last_exception = e
progress_callback(f" [Hentai2Read] ⚠️ Connection attempt {attempt + 1} failed: {e}")
if attempt < max_retries - 1:
time.sleep(2 * (attempt + 1)) # Wait 2s, 4s, 6s
continue # Try again
if last_exception:
progress_callback(f" [Hentai2Read] ❌ Error getting series metadata after {max_retries} attempts: {last_exception}")
return "Unknown Series", []
try:
series_title = "Unknown Series"
artist_name = None
metadata_list = soup.select_one("ul.list.list-simple-mini")
@@ -107,10 +130,9 @@ def _get_series_metadata(start_url, progress_callback, scraper):
return top_level_folder_name, chapters_to_process
except Exception as e:
progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}")
progress_callback(f" [Hentai2Read] ❌ Error parsing metadata after successful connection: {e}")
return "Unknown Series", []
### NEW: This function contains the pipeline logic ###
def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func):
"""
Uses a producer-consumer pattern to download a chapter.
@@ -120,12 +142,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
task_queue = queue.Queue()
num_download_threads = 8
# These will be updated by the worker threads
download_stats = {'downloaded': 0, 'skipped': 0}
def downloader_worker():
"""The function that each download thread will run."""
# Create a unique session for each thread to avoid conflicts
worker_scraper = cloudscraper.create_scraper()
while True:
try:
@@ -153,12 +173,10 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
finally:
task_queue.task_done()
# --- Start the downloader threads ---
executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader')
for _ in range(num_download_threads):
executor.submit(downloader_worker)
# --- Main thread acts as the scraper (producer) ---
page_number = 1
while True:
if check_pause_func(): break
@@ -168,12 +186,25 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
page_url_to_check = f"{chapter_url}{page_number}/"
try:
response = scraper.get(page_url_to_check, timeout=30)
if response.history or response.status_code != 200:
page_response = None
page_last_exception = None
for page_attempt in range(3): # 3 attempts for sub-pages
try:
page_response = scraper.get(page_url_to_check, timeout=30)
page_last_exception = None
break
except Exception as e:
page_last_exception = e
time.sleep(1) # Short delay for page scraping retries
if page_last_exception:
raise page_last_exception # Give up after 3 tries
if page_response.history or page_response.status_code != 200:
progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.")
break
soup = BeautifulSoup(response.text, 'html.parser')
soup = BeautifulSoup(page_response.text, 'html.parser')
img_tag = soup.select_one("img#arf-reader")
img_src = img_tag.get("src") if img_tag else None
@@ -181,12 +212,11 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).")
break
normalized_img_src = urljoin(response.url, img_src)
normalized_img_src = urljoin(page_response.url, img_src)
ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg"
filename = f"{page_number:03d}{ext}"
filepath = os.path.join(save_path, filename)
# Put the download task into the queue for a worker to pick up
task_queue.put((filepath, normalized_img_src))
page_number += 1
@@ -195,12 +225,9 @@ def _process_and_download_chapter(chapter_url, save_path, scraper, progress_call
progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}")
break
# --- Shutdown sequence ---
# Tell all worker threads to exit by sending the sentinel value
for _ in range(num_download_threads):
task_queue.put(None)
# Wait for all download tasks to be completed
executor.shutdown(wait=True)
progress_callback(f" Found and processed {page_number - 1} images for this chapter.")

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
# Utility Imports
from ...utils.network_utils import prepare_cookies_for_request
from ...utils.file_utils import clean_folder_name # Keep if needed by any thread init
from ...utils.file_utils import clean_folder_name
# Downloader Thread Imports (Alphabetical Order Recommended)
from .allcomic_downloader_thread import AllcomicDownloadThread
@@ -16,7 +16,6 @@ from .erome_downloader_thread import EromeDownloadThread
from .external_link_downloader_thread import ExternalLinkDownloadThread
from .fap_nation_downloader_thread import FapNationDownloadThread
from .hentai2read_downloader_thread import Hentai2readDownloadThread
# ---> ADD IMPORT FOR NEW KEMONO DISCORD THREAD <---
from .kemono_discord_downloader_thread import KemonoDiscordDownloadThread
from .mangadex_downloader_thread import MangaDexDownloadThread
from .nhentai_downloader_thread import NhentaiDownloadThread
@@ -34,7 +33,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
or None if no special handler is found (indicating fallback to generic BackendDownloadThread).
"""
# --- Specific Site/Service Handlers ---
# Handler for Booru sites (Danbooru, Gelbooru)
if service in ['danbooru', 'gelbooru']:
@@ -68,7 +66,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
return MangaDexDownloadThread(api_url, effective_output_dir_for_run, main_app)
# Handler for Saint2
# Check specific domains identified by extract_post_info or common patterns
is_saint2_url = service == 'saint2' or 'saint2.su' in api_url or 'saint2.pk' in api_url # Add more domains if needed
if is_saint2_url and api_url.strip().lower() != 'saint2.su': # Exclude batch mode trigger if using URL input
return Saint2DownloadThread(api_url, effective_output_dir_for_run, main_app)
@@ -93,7 +90,7 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
main_app.log_signal.emit(" Rule34Video.com URL detected. Starting dedicated downloader.")
return Rule34VideoDownloadThread(api_url, effective_output_dir_for_run, main_app) # id1 (video_id) is used inside the thread
# ---> HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord) <---
# HANDLER FOR KEMONO DISCORD (Place BEFORE official Discord)
elif service == 'discord' and any(domain in api_url for domain in ['kemono.cr', 'kemono.su', 'kemono.party']):
main_app.log_signal.emit(" Kemono Discord URL detected. Starting dedicated downloader.")
cookies = prepare_cookies_for_request(
@@ -119,8 +116,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
token = main_app.remove_from_filename_input.text().strip() # Token is in the "Remove Words" field for Discord
if not token:
main_app.log_signal.emit("❌ Official Discord requires an Authorization Token in the 'Remove Words' field.")
# Optionally show a message box here
# QMessageBox.warning(main_app, "Token Required", "Please enter your Discord Authorization Token in the 'Remove Words from name' field.")
return None # Or a specific error sentinel
limit_text = main_app.discord_message_limit_input.text().strip()
@@ -140,7 +135,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
parent=main_app # Pass main_app for events/signals
)
# Handler for Allcomic/Allporncomic
# Check specific domains or rely on service name if extract_post_info provides it
if service == 'allcomic' or 'allcomic.com' in api_url or 'allporncomic.com' in api_url:
return AllcomicDownloadThread(api_url, effective_output_dir_for_run, main_app)
@@ -164,7 +158,6 @@ def create_downloader_thread(main_app, api_url, service, id1, id2, effective_out
# Handler for nHentai
if service == 'nhentai':
# nHentai requires fetching data *before* creating the thread
from ...core.nhentai_client import fetch_nhentai_gallery
main_app.log_signal.emit(f" nHentai gallery ID {id1} detected. Fetching gallery data...")
gallery_data = fetch_nhentai_gallery(id1, main_app.log_signal.emit)
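A condensed, hypothetical sketch of the dispatch shape used by create_downloader_thread above, returning None when no specialised handler applies. The Handler dataclass stands in for the project's *DownloadThread classes and is not real project code; the ordering mirrors the note in the diff that the Kemono-hosted Discord check must run before the official Discord handler.

from dataclasses import dataclass


@dataclass
class Handler:  # stand-in for the project's *DownloadThread classes (illustrative only)
    api_url: str
    output_dir: str
    kind: str


def create_downloader_thread(api_url, service, output_dir):
    """Return a specialised handler, or None to signal fallback to the generic downloader."""
    # Kemono-hosted Discord must be checked before the official Discord handler.
    if service == 'discord' and any(d in api_url for d in ('kemono.cr', 'kemono.su', 'kemono.party')):
        return Handler(api_url, output_dir, 'kemono-discord')
    if service == 'discord':
        return Handler(api_url, output_dir, 'official-discord')
    if service in ('danbooru', 'gelbooru'):
        return Handler(api_url, output_dir, 'booru')
    return None  # caller falls back to the generic BackendDownloadThread


print(create_downloader_thread('https://kemono.su/discord/server/123', 'discord', 'downloads').kind)
# -> kemono-discord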

View File

@@ -339,11 +339,9 @@ class DownloaderApp (QWidget ):
self._connect_signals()
if hasattr(self, 'character_input'):
self.character_input.setToolTip(self._tr("character_input_tooltip", "Enter character names (comma-separated)..."))
self.log_signal.emit(f" Manga filename style loaded: '{self.manga_filename_style}'")
self.log_signal.emit(f" filename style loaded: '{self.manga_filename_style}'")
self.log_signal.emit(f" Skip words scope loaded: '{self.skip_words_scope}'")
self.log_signal.emit(f" Character filter scope set to default: '{self.char_filter_scope}'")
self.log_signal.emit(f" Multi-part download defaults to: {'Enabled' if self.allow_multipart_download_setting else 'Disabled'}")
self.log_signal.emit(f" Scan post content for images defaults to: {'Enabled' if self.scan_content_images_setting else 'Disabled'}")
self.log_signal.emit(f" Application language loaded: '{self.current_selected_language.upper()}' (UI may not reflect this yet).")
self._retranslate_main_ui()
self._load_persistent_history()
@@ -831,14 +829,11 @@ class DownloaderApp (QWidget ):
self.download_btn.setEnabled(False)
self.pause_btn.setEnabled(False)
else:
# --- START MODIFICATION ---
# Check if we are about to download fetched posts and update text accordingly
if self.is_ready_to_download_fetched:
num_posts = len(self.fetched_posts_for_download)
self.download_btn.setText(f"⬇️ Start Download ({num_posts} Posts)")
self.download_btn.setEnabled(True) # Keep it enabled for the user to click
else:
# Original logic for an active download in other scenarios
self.download_btn.setText(self._tr("start_download_button_text", "⬇️ Start Download"))
self.download_btn.setEnabled(False)
@@ -926,11 +921,9 @@ class DownloaderApp (QWidget ):
args_template = self.last_start_download_args
# Update both the character filter list and the domain override in the arguments
args_template['filter_character_list'] = parsed_filters
args_template['domain_override'] = domain_override
# Manually set the UI to a "downloading" state for reliability
self.set_ui_enabled(False)
self.download_btn.setText("⬇️ Downloading...")
self.download_btn.setEnabled(False)
@@ -938,7 +931,6 @@ class DownloaderApp (QWidget ):
self.cancel_btn.setEnabled(True)
self.cancel_btn.setText("❌ Cancel & Reset UI")
try:
# Ensure signals are connected to the correct actions for this state
self.cancel_btn.clicked.disconnect()
self.pause_btn.clicked.disconnect()
except TypeError:
@@ -5626,13 +5618,11 @@ class DownloaderApp (QWidget ):
api_domain = parsed_api_url.netloc if parsed_api_url.netloc else self._get_domain_for_service(service)
post_page_url = f"https://{api_domain}/{service}/user/{user_id}/post/{post_id}"
# --- NEW LOGIC: Differentiate between loaded files and live session errors ---
# Initialize variables before the conditional blocks
target_folder_path_for_download = None
filename_override_for_download = None
if job_details.get('is_loaded_from_txt'):
# --- BEHAVIOR FOR LOADED FILES: Recalculate everything from current UI settings ---
self.log_signal.emit(f" Retrying loaded file. Recalculating path and name from current UI settings...")
# 1. Get all current settings and job data
@@ -6325,10 +6315,8 @@ class DownloaderApp (QWidget ):
if hasattr(self, 'link_input'):
self.last_link_input_text_for_queue_sync = self.link_input.text()
# --- START: MODIFIED LOGIC ---
# Manually trigger the UI update now that the queue is populated and the dialog is closed.
self.update_ui_for_manga_mode(self.manga_mode_checkbox.isChecked() if self.manga_mode_checkbox else False)
# --- END: MODIFIED LOGIC ---
def _load_saved_cookie_settings(self):
"""Loads and applies saved cookie settings on startup."""

View File

@@ -26,6 +26,16 @@ KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
r'\bPreview\b',
]
# --- START NEW CODE ---
# Regular expression to detect CJK characters
# Covers Hiragana, Katakana, Half/Full width forms, CJK Unified Ideographs, Hangul Syllables, etc.
cjk_pattern = re.compile(r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9fff\uac00-\ud7af]')
def contains_cjk(text):
"""Checks if the text contains any CJK characters."""
return bool(cjk_pattern.search(text))
# --- END NEW CODE ---
# --- Text Matching and Manipulation Utilities ---
def is_title_match_for_character(post_title, character_name_filter):
@@ -42,7 +52,7 @@ def is_title_match_for_character(post_title, character_name_filter):
"""
if not post_title or not character_name_filter:
return False
# Use word boundaries (\b) to match whole words only
pattern = r"(?i)\b" + re.escape(str(character_name_filter).strip()) + r"\b"
return bool(re.search(pattern, post_title))
@@ -62,7 +72,7 @@ def is_filename_match_for_character(filename, character_name_filter):
"""
if not filename or not character_name_filter:
return False
return str(character_name_filter).strip().lower() in filename.lower()
@@ -101,16 +111,16 @@ def extract_folder_name_from_title(title, unwanted_keywords):
"""
if not title:
return 'Uncategorized'
title_lower = title.lower()
# Find all whole words in the title
tokens = re.findall(r'\b[\w\-]+\b', title_lower)
for token in tokens:
clean_token = clean_folder_name(token)
if clean_token and clean_token.lower() not in unwanted_keywords:
return clean_token
# Fallback to cleaning the full title if no single significant word is found
cleaned_full_title = clean_folder_name(title)
return cleaned_full_title if cleaned_full_title else 'Uncategorized'
@@ -120,6 +130,7 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
"""
Matches folder names from a title based on a list of known name objects.
Each name object is a dict: {'name': 'PrimaryName', 'aliases': ['alias1', ...]}
MODIFIED: Uses substring matching for CJK aliases, word boundary for others.
Args:
title (str): The post title to check.
@@ -137,10 +148,11 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
cleaned_title = re.sub(pat_str, ' ', cleaned_title, flags=re.IGNORECASE)
cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
# Keep the cleaned title and a lower-cased copy for case-insensitive matching
title_lower = cleaned_title.lower()
matched_cleaned_names = set()
# Sort by name length descending to match longer names first (e.g., "Cloud Strife" before "Cloud")
sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True)
@@ -149,19 +161,43 @@ def match_folders_from_title(title, names_to_match, unwanted_keywords):
aliases = name_obj.get("aliases", [])
if not primary_folder_name or not aliases:
continue
# <<< START MODIFICATION >>>
cleaned_primary_name = clean_folder_name(primary_folder_name)
if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords:
continue # Skip this entry entirely if its primary name is unwanted or empty
match_found_for_this_object = False
for alias in aliases:
if not alias: continue
alias_lower = alias.lower()
if not alias_lower: continue
# Use word boundaries for accurate matching
pattern = r'\b' + re.escape(alias_lower) + r'\b'
if re.search(pattern, title_lower):
cleaned_primary_name = clean_folder_name(primary_folder_name)
if cleaned_primary_name.lower() not in unwanted_keywords:
# Check if the alias contains CJK characters
if contains_cjk(alias):
# Use simple substring matching for CJK
if alias_lower in title_lower:
matched_cleaned_names.add(cleaned_primary_name)
break # Move to the next name object once a match is found for this one
match_found_for_this_object = True
break # Move to the next name object
else:
# Use original word boundary matching for non-CJK
try:
# The pattern is different for every alias, so there is nothing to gain from pre-compiling it
pattern = r'\b' + re.escape(alias_lower) + r'\b'
if re.search(pattern, title_lower):
matched_cleaned_names.add(cleaned_primary_name)
match_found_for_this_object = True
break # Move to the next name object
except re.error as e:
# Log error if the alias creates an invalid regex (unlikely with escape)
print(f"Regex error for alias '{alias}': {e}") # Or use proper logging
continue
# The inner breaks already advance to the next name object, so this flag is only informational
if match_found_for_this_object:
pass # Primary name was already added before the inner loop exited
# <<< END MODIFICATION >>>
return sorted(list(matched_cleaned_names))
@@ -188,23 +224,26 @@ def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keyw
for name_obj in names_to_match:
primary_name = name_obj.get("name")
if not primary_name: continue
cleaned_primary_name = clean_folder_name(primary_name)
if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords:
continue
for alias in name_obj.get("aliases", []):
if alias.lower():
alias_map_to_primary.append((alias.lower(), cleaned_primary_name))
# <<< MODIFICATION: Ensure alias is not empty before converting to lower case >>>
if alias: # Check if alias is not None and not an empty string
alias_lower_val = alias.lower()
if alias_lower_val: # Defensive re-check that the lowercased alias is still non-empty
alias_map_to_primary.append((alias_lower_val, cleaned_primary_name))
# Sort by alias length, descending, to match longer aliases first
alias_map_to_primary.sort(key=lambda x: len(x[0]), reverse=True)
# <<< MODIFICATION: Return the FIRST match found, which will be the longest >>>
# Return the FIRST match found, which will be the longest
for alias_lower, primary_name_for_alias in alias_map_to_primary:
if alias_lower in filename_lower:
# Found the longest possible alias that is a substring. Return immediately.
return [primary_name_for_alias]
# If the loop finishes without any matches, return an empty list.
return []
return []
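An illustrative sketch (not part of the commit) of why the modification above switches to substring matching for CJK aliases: \b requires a transition between word and non-word characters, and in an unspaced Japanese title a CJK alias sits between other word characters, so the word-boundary pattern never matches even though the alias is clearly present. The sample title and alias below are hypothetical.

import re

# Same character ranges as the cjk_pattern added in this commit
cjk_pattern = re.compile(r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9fff\uac00-\ud7af]')


def contains_cjk(text):
    return bool(cjk_pattern.search(text))


title_lower = "とある魔術の禁書目録 第1話".lower()  # hypothetical post title
alias_lower = "魔術"                                # hypothetical CJK alias

# Word-boundary search fails: the alias is surrounded by other word characters, so no \b exists
print(bool(re.search(r'\b' + re.escape(alias_lower) + r'\b', title_lower)))  # False

# Plain substring search, used for CJK aliases after this commit, succeeds
print(alias_lower in title_lower)                                            # True

# Non-CJK aliases keep the word-boundary behaviour
print(contains_cjk(alias_lower), contains_cjk("Cloud Strife"))               # True False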