import os
import time
import requests
import re
import threading
import queue
import hashlib
import http.client
import traceback
from concurrent.futures import ThreadPoolExecutor, Future, CancelledError, as_completed
import html
from io import BytesIO
from PyQt5.QtCore import QObject, pyqtSignal, QThread, QMutex, QMutexLocker
from urllib.parse import urlparse

try:
    from PIL import Image
except ImportError:
    print("ERROR: Pillow library not found. Please install it: pip install Pillow")
    Image = None

try:
    from multipart_downloader import download_file_in_parts
    MULTIPART_DOWNLOADER_AVAILABLE = True
except ImportError as e:
    print(f"Warning: multipart_downloader.py not found or import error: {e}. Multi-part downloads will be disabled.")
    MULTIPART_DOWNLOADER_AVAILABLE = False

    def download_file_in_parts(*args, **kwargs):
        """Dummy fallback so call sites stay valid when the module is missing."""
        return False, 0, None, None

STYLE_POST_TITLE = "post_title"
STYLE_ORIGINAL_NAME = "original_name"
STYLE_DATE_BASED = "date_based"  # For manga date-based sequential naming
MANGA_DATE_PREFIX_DEFAULT = ""  # Default for the date-based filename prefix
STYLE_POST_TITLE_GLOBAL_NUMBERING = "post_title_global_numbering"  # Manga post title + global counter

SKIP_SCOPE_FILES = "files"
SKIP_SCOPE_POSTS = "posts"
SKIP_SCOPE_BOTH = "both"

CHAR_SCOPE_TITLE = "title"
CHAR_SCOPE_FILES = "files"
CHAR_SCOPE_BOTH = "both"
CHAR_SCOPE_COMMENTS = "comments"

FILE_DOWNLOAD_STATUS_SUCCESS = "success"
FILE_DOWNLOAD_STATUS_SKIPPED = "skipped"
FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER = "failed_retry_later"
FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION = "failed_permanent_session"  # New status

fastapi_app = None
KNOWN_NAMES = []  # Stores dicts: {'name': str, 'is_group': bool, 'aliases': list[str]}

MIN_SIZE_FOR_MULTIPART_DOWNLOAD = 10 * 1024 * 1024  # 10 MB
MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 15  # Max concurrent connections for a single file
MAX_FILENAME_COMPONENT_LENGTH = 150

IMAGE_EXTENSIONS = {
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp',
    '.heic', '.heif', '.svg', '.ico', '.jfif', '.pjpeg', '.pjp', '.avif',
}
VIDEO_EXTENSIONS = {
    '.mp4', '.mov', '.mkv', '.webm', '.avi', '.wmv', '.flv', '.mpeg',
    '.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob',
}
ARCHIVE_EXTENSIONS = {'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'}
AUDIO_EXTENSIONS = {
    '.mp3', '.wav', '.aac', '.flac', '.ogg', '.wma', '.m4a', '.opus',
    '.aiff', '.ape', '.mid', '.midi',
}

FOLDER_NAME_STOP_WORDS = {
    "a", "alone", "am", "an", "and", "at", "be", "blues", "but", "by", "com",
    "for", "grown", "hard", "he", "her", "his", "hitting", "i", "im", "in",
    "is", "it", "its", "me", "much", "my", "net", "not", "of", "on", "or",
    "org", "our", "please", "right", "s", "she", "so", "technically", "tell",
    "the", "their", "they", "this", "to", "ve", "was", "we", "well", "were",
    "with", "www", "year", "you", "your",
}

CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = {
    "poll", "cover", "fan-art", "fanart", "requests", "request", "holiday",
    "suggest", "suggestions", "batch", "open", "closed", "winner", "loser",
    "minor", "adult", "wip", "update", "news", "discussion", "question",
    "stream", "video", "sketchbook", "artwork",
    # Numbers 1-20 (as digits and words)
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen", "twenty",
    # Months (short and long)
    "jan", "january", "feb", "february", "mar", "march", "apr", "april",
    "may", "jun", "june", "jul", "july", "aug", "august", "sep", "september",
    "oct", "october", "nov", "november", "dec", "december",
    # Weekdays (short and long)
    "mon", "monday", "tue", "tuesday", "wed", "wednesday", "thu", "thursday",
    "fri", "friday", "sat", "saturday", "sun", "sunday",
}

# Patterns removed from titles/filenames *before* matching against Known.txt
KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
    r'\bcum\b',
    r'\bnsfw\b',
    r'\bsfw\b',
    r'\bweb\b',
    r'\bhd\b',
    r'\bhi\s*res\b',       # hi res, hi-res, hires
    r'\bhigh\s*res\b',     # high res, high-res, highres
    r'\b\d+p\b',           # 720p, 1080p, 1440p, etc.
    r'\b\d+k\b',           # 2k, 4k, 8k, etc.
    r'\[OC\]',             # [OC]
    r'\[Request(?:s)?\]',  # [Request], [Requests]
    r'\bCommission\b',
    r'\bComm\b',
    r'\bPreview\b',
]


def parse_cookie_string(cookie_string):
    """Parses a 'name=value; name2=value2' cookie string into a dict."""
    cookies = {}
    if cookie_string:
        for item in cookie_string.split(';'):
            parts = item.split('=', 1)
            if len(parts) == 2:
                name = parts[0].strip()
                value = parts[1].strip()
                if name:  # Ensure name is not empty
                    cookies[name] = value
    return cookies if cookies else None
def load_cookies_from_netscape_file(filepath, logger_func, target_domain_filter=None):
    """Loads cookies from a Netscape-formatted cookies.txt file.

    If target_domain_filter is provided, only cookies for that domain
    (or its subdomains) are returned.
    """
    cookies = {}
    loaded_for_target_domain_count = 0
    total_cookies_in_file = 0
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                parts = line.split('\t')
                total_cookies_in_file += 1  # Count all candidate lines
                if len(parts) == 7:
                    cookie_domain_from_file = parts[0]  # e.g., ".kemono.su" or "kemono.su"
                    name = parts[5]
                    value = parts[6]
                    if name:  # Ensure name is not empty
                        if target_domain_filter:
                            # Standard cookie domain matching: a cookie set for
                            # ".example.com" is valid for "example.com" and any
                            # subdomain; a cookie set for "example.com" is valid
                            # only for "example.com" itself.
                            host_to_match = target_domain_filter.lower()
                            cookie_domain_norm = cookie_domain_from_file.lower()
                            is_match = False
                            if cookie_domain_norm.startswith('.'):
                                if host_to_match == cookie_domain_norm[1:] or host_to_match.endswith(cookie_domain_norm):
                                    is_match = True
                            else:
                                if host_to_match == cookie_domain_norm:
                                    is_match = True
                            if is_match:
                                cookies[name] = value
                                loaded_for_target_domain_count += 1
                        else:  # No target_domain_filter: load everything
                            cookies[name] = value
        if target_domain_filter:
            logger_func(f"   đŸĒ Scanned {total_cookies_in_file} cookies in '{os.path.basename(filepath)}'. "
                        f"Loaded {loaded_for_target_domain_count} for domain '{target_domain_filter}'.")
        else:
            logger_func(f"   đŸĒ Loaded {len(cookies)} cookies from '{os.path.basename(filepath)}' (no domain filter).")
        return cookies if cookies else None  # Return None if empty, even if filtered
    except FileNotFoundError:
        logger_func(f"   đŸĒ Cookie file '{os.path.basename(filepath)}' not found at expected location.")
        return None
    except Exception as e:
        logger_func(f"   đŸĒ Error parsing cookie file '{os.path.basename(filepath)}': {e}")
        return None


def is_title_match_for_character(post_title, character_name_filter):
    if not post_title or not character_name_filter:
        return False
    safe_filter = str(character_name_filter).strip()
    if not safe_filter:
        return False
    pattern = r"(?i)\b" + re.escape(safe_filter) + r"\b"
    return bool(re.search(pattern, post_title))


def is_filename_match_for_character(filename, character_name_filter):
    if not filename or not character_name_filter:
        return False
    safe_filter = str(character_name_filter).strip().lower()
    if not safe_filter:
        return False
    return safe_filter in filename.lower()


def clean_folder_name(name):
    if not isinstance(name, str):
        name = str(name)
    cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
    cleaned = cleaned.strip()                # Initial strip
    cleaned = re.sub(r'\s+', ' ', cleaned)   # Condense multiple spaces
    if cleaned:  # Only process if not empty
        words = cleaned.split(' ')
        filtered_words = [word for word in words if word.lower() not in FOLDER_NAME_STOP_WORDS and word]
        cleaned = ' '.join(filtered_words).strip()  # Strip again in case stop words were at the ends
    if not cleaned:  # Empty after cleaning
        return "untitled_folder"
    if len(cleaned) > MAX_FILENAME_COMPONENT_LENGTH:
        cleaned = cleaned[:MAX_FILENAME_COMPONENT_LENGTH]
    temp_name = cleaned
    while len(temp_name) > 0 and (temp_name.endswith('.') or temp_name.endswith(' ')):
        temp_name = temp_name[:-1]
    return temp_name if temp_name else "untitled_folder"


def clean_filename(name):
    if not isinstance(name, str):
        name = str(name)
    cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
    cleaned = cleaned.strip()                # Remove leading/trailing spaces first
    cleaned = re.sub(r'\s+', ' ', cleaned)   # Replace multiple internal spaces with one
    if not cleaned:
        return "untitled_file"
    base_name, ext = os.path.splitext(cleaned)
    max_base_len = MAX_FILENAME_COMPONENT_LENGTH - len(ext)
    if len(base_name) > max_base_len:
        if max_base_len > 0:  # Room for at least some of the base name
            base_name = base_name[:max_base_len]
        else:  # Extension alone fills (or exceeds) the allowed length
            return cleaned[:MAX_FILENAME_COMPONENT_LENGTH] if cleaned else "untitled_file"
    final_name = base_name + ext
    return final_name if final_name else "untitled_file"


def strip_html_tags(html_text):
    if not html_text:
        return ""
    text = html.unescape(str(html_text))  # Ensure input is a string
    text_after_tag_removal = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r'\s+', ' ', text_after_tag_removal).strip()


def extract_folder_name_from_title(title, unwanted_keywords):
    if not title:
        return 'Uncategorized'
    title_lower = title.lower()
    tokens = re.findall(r'\b[\w\-]+\b', title_lower)
    for token in tokens:
        clean_token = clean_folder_name(token)
        # clean_folder_name returns the "untitled_folder" sentinel for tokens that
        # are pure stop words; that sentinel must not win over later real tokens.
        if clean_token and clean_token != "untitled_folder" and clean_token.lower() not in unwanted_keywords:
            return clean_token
    cleaned_full_title = clean_folder_name(title)
    return cleaned_full_title if cleaned_full_title else 'Uncategorized'
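# Worked example of the cleaning helpers above (values follow directly from the
# rules in clean_folder_name; illustrative, not exercised by the module):
#
#     clean_folder_name("Alice & Bob (Sketch).")
#     # -> "Alice Bob (Sketch)"
#     #    '&' is dropped by the character whitelist, spaces are condensed,
#     #    and the trailing '.' is trimmed by the final while-loop.
#     clean_filename("my file: v2?.png")
#     # -> "my file v2.png"  (':' and '?' removed, extension preserved)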
def match_folders_from_title(title, names_to_match, unwanted_keywords):
    """Matches folder names from a title based on a list of known name objects.

    Each name object in names_to_match is expected to be a dict:
    {'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]}
    """
    if not title or not names_to_match:
        return []
    # Pre-process the title to remove specific patterns before matching
    cleaned_title_for_matching = title
    for pat_str in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
        cleaned_title_for_matching = re.sub(pat_str, ' ', cleaned_title_for_matching, flags=re.IGNORECASE)
    # Condense multiple spaces that may result from removal, then strip
    cleaned_title_for_matching = re.sub(r'\s+', ' ', cleaned_title_for_matching).strip()
    title_lower = cleaned_title_for_matching.lower()  # Match against the pre-cleaned title

    matched_cleaned_names = set()
    # Sort by primary name length (longest first) so more specific names win
    sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True)
    for name_obj in sorted_name_objects:
        primary_folder_name = name_obj.get("name")
        aliases = name_obj.get("aliases", [])
        if not primary_folder_name or not aliases:
            continue
        for alias in aliases:
            alias_lower = alias.lower()
            if not alias_lower:
                continue
            pattern = r'\b' + re.escape(alias_lower) + r'\b'
            if re.search(pattern, title_lower):
                cleaned_primary_name = clean_folder_name(primary_folder_name)
                if cleaned_primary_name.lower() not in unwanted_keywords:
                    matched_cleaned_names.add(cleaned_primary_name)
                break  # Found a match for this primary name via one of its aliases
    return sorted(list(matched_cleaned_names))
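# Illustrative call (the name object mirrors the KNOWN_NAMES entry shape; the
# names themselves are made up):
#
#     names = [{'name': 'Alice', 'is_group': False, 'aliases': ['alice', 'ali-chan']}]
#     match_folders_from_title("Ali-chan beach set [4K]", names, set())
#     # -> ['Alice']
#     #    The "4K" token is blanked by KNOWN_TXT_MATCH_CLEANUP_PATTERNS before
#     #    matching; the alias hit maps back to the primary folder name.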
def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keywords):
    if not filename or not names_to_match:
        return []
    filename_lower = filename.lower()  # Raw filename; KNOWN_TXT_MATCH_CLEANUP_PATTERNS are not applied here
    matched_primary_names = set()

    # Build a flat list of (alias_lower, cleaned_primary_name) and sort by alias
    # length (descending) so longer, more specific aliases are tried first.
    alias_map_to_primary = []
    for name_obj in names_to_match:
        primary_folder_name = name_obj.get("name")
        if not primary_folder_name:
            continue
        cleaned_primary_name = clean_folder_name(primary_folder_name)  # Clean the primary name once here
        # Skip entries whose cleaned primary name is empty or an unwanted keyword
        if not cleaned_primary_name or cleaned_primary_name.lower() in unwanted_keywords:
            continue
        for alias in name_obj.get("aliases", []):
            alias_lower = alias.lower()
            if alias_lower:  # Ensure alias is not empty
                alias_map_to_primary.append((alias_lower, cleaned_primary_name))
    alias_map_to_primary.sort(key=lambda x: len(x[0]), reverse=True)

    for alias_lower, primary_name_for_alias in alias_map_to_primary:
        if filename_lower.startswith(alias_lower):
            matched_primary_names.add(primary_name_for_alias)
    return sorted(list(matched_primary_names))


def is_image(filename):
    if not filename:
        return False
    _, ext = os.path.splitext(filename)
    return ext.lower() in IMAGE_EXTENSIONS


def is_video(filename):
    if not filename:
        return False
    _, ext = os.path.splitext(filename)
    return ext.lower() in VIDEO_EXTENSIONS


def is_zip(filename):
    if not filename:
        return False
    return filename.lower().endswith('.zip')


def is_rar(filename):
    if not filename:
        return False
    return filename.lower().endswith('.rar')


def is_archive(filename):
    if not filename:
        return False
    _, ext = os.path.splitext(filename)
    return ext.lower() in ARCHIVE_EXTENSIONS


def is_audio(filename):
    if not filename:
        return False
    _, ext = os.path.splitext(filename)
    return ext.lower() in AUDIO_EXTENSIONS


def is_post_url(url):
    if not isinstance(url, str):
        return False
    return '/post/' in urlparse(url).path


def extract_post_info(url_string):
    service, user_id, post_id = None, None, None
    if not isinstance(url_string, str) or not url_string.strip():
        return None, None, None
    try:
        parsed_url = urlparse(url_string.strip())
        domain = parsed_url.netloc.lower()
        is_kemono = any(d in domain for d in ['kemono.su', 'kemono.party'])
        is_coomer = any(d in domain for d in ['coomer.su', 'coomer.party'])
        if not (is_kemono or is_coomer):
            return None, None, None
        path_parts = [part for part in parsed_url.path.strip('/').split('/') if part]
        # Website URL shape: /<service>/user/<user_id>[/post/<post_id>]
        if len(path_parts) >= 3 and path_parts[1].lower() == 'user':
            service = path_parts[0]
            user_id = path_parts[2]
            if len(path_parts) >= 5 and path_parts[3].lower() == 'post':
                post_id = path_parts[4]
            return service, user_id, post_id
        # API URL shape: /api/v1/<service>/user/<user_id>[/post/<post_id>]
        if len(path_parts) >= 5 and path_parts[0].lower() == 'api' and \
                path_parts[1].lower() == 'v1' and path_parts[3].lower() == 'user':
            service = path_parts[2]
            user_id = path_parts[4]
            if len(path_parts) >= 7 and path_parts[5].lower() == 'post':
                post_id = path_parts[6]
            return service, user_id, post_id
    except Exception as e:
        print(f"Debug: Exception during extract_post_info for URL '{url_string}': {e}")
    return None, None, None
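# Illustrative results for extract_post_info (URL shapes only; IDs are made up):
#
#     extract_post_info("https://kemono.su/patreon/user/12345")
#     # -> ('patreon', '12345', None)
#     extract_post_info("https://kemono.su/patreon/user/12345/post/98765")
#     # -> ('patreon', '12345', '98765')
#     extract_post_info("https://example.com/whatever")
#     # -> (None, None, None)  (unrecognized domain)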
def prepare_cookies_for_request(use_cookie_flag, cookie_text_input, selected_cookie_file_path_from_ui,
                                app_base_dir, logger_func, target_domain=None):
    """Prepares a cookie dictionary from text input or a cookies.txt file."""
    if not use_cookie_flag:
        return None
    attempted_paths = set()  # Paths already tried, to avoid redundant attempts/logs

    # Attempt 1: UI-selected file, if relevant for the target_domain
    if selected_cookie_file_path_from_ui:
        basename_selected = os.path.basename(selected_cookie_file_path_from_ui)
        is_relevant_selection = False
        if target_domain:  # A specific domain is targeted
            if basename_selected == f"{target_domain}_cookies.txt" or basename_selected == "cookies.txt":
                is_relevant_selection = True
        else:  # No target_domain, so any selected file is relevant (e.g. direct file downloads)
            is_relevant_selection = True
        if is_relevant_selection:
            logger_func(f"   đŸĒ Attempting to load cookies from UI-selected file: '{basename_selected}' for domain '{target_domain or 'any'}'...")
            norm_selected_path = os.path.normpath(selected_cookie_file_path_from_ui)
            attempted_paths.add(norm_selected_path)
            cookies = load_cookies_from_netscape_file(selected_cookie_file_path_from_ui, logger_func, target_domain_filter=target_domain)
            if cookies:
                return cookies
            logger_func(f"   âš ī¸ Failed to load cookies from UI-selected file: '{basename_selected}'.")
        else:
            logger_func(f"   â„šī¸ UI-selected cookie file '{basename_selected}' is neither specific to target domain '{target_domain}' nor generic. Skipping it for this request; other sources will be tried.")

    # Attempt 2: Domain-specific file in app_base_dir (e.g., <domain>_cookies.txt)
    if app_base_dir and target_domain:
        domain_specific_filename = f"{target_domain}_cookies.txt"
        domain_specific_path = os.path.join(app_base_dir, domain_specific_filename)
        norm_domain_specific_path = os.path.normpath(domain_specific_path)
        if os.path.exists(domain_specific_path) and norm_domain_specific_path not in attempted_paths:
            logger_func(f"   đŸĒ Attempting to load domain-specific cookies: '{domain_specific_filename}' for '{target_domain}' from app directory...")
            attempted_paths.add(norm_domain_specific_path)
            cookies = load_cookies_from_netscape_file(domain_specific_path, logger_func, target_domain_filter=target_domain)
            if cookies:
                return cookies
            logger_func(f"   âš ī¸ Failed to load cookies from '{domain_specific_filename}' in app directory.")

    # Attempt 3: Default cookies.txt in app_base_dir
    if app_base_dir:
        default_cookies_filename = "cookies.txt"
        default_cookies_path = os.path.join(app_base_dir, default_cookies_filename)
        norm_default_path = os.path.normpath(default_cookies_path)
        if os.path.exists(default_cookies_path) and norm_default_path not in attempted_paths:
            logger_func(f"   đŸĒ Attempting to load default '{default_cookies_filename}' from app directory for domain '{target_domain or 'any'}'...")
            attempted_paths.add(norm_default_path)
            cookies = load_cookies_from_netscape_file(default_cookies_path, logger_func, target_domain_filter=target_domain)
            if cookies:
                return cookies
            logger_func(f"   âš ī¸ Failed to load cookies from default '{default_cookies_filename}' in app directory.")

    # Attempt 4: UI text input
    if cookie_text_input:
        logger_func(f"   đŸĒ Using cookies from UI text input for domain '{target_domain or 'any'}' (file methods failed or were not applicable).")
        cookies = parse_cookie_string(cookie_text_input)
        if cookies:
            return cookies
        logger_func("   âš ī¸ UI cookie text input was provided but was empty or invalid.")

    logger_func(f"   đŸĒ Cookie usage enabled for domain '{target_domain or 'any'}', but no valid cookies found from any source.")
    return None
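# Cookie source precedence implemented above, in order:
#   1. the UI-selected file (only if named "<domain>_cookies.txt" or "cookies.txt"
#      when a target domain is set),
#   2. "<domain>_cookies.txt" in the app directory,
#   3. "cookies.txt" in the app directory,
#   4. the raw "name=value; ..." text typed into the UI.
# A hypothetical call for kemono.su might look like:
#
#     cookies = prepare_cookies_for_request(True, "", None, app_base_dir, print,
#                                           target_domain="kemono.su")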
def fetch_posts_paginated(api_url_base, headers, offset, logger,
                          cancellation_event=None, pause_event=None, cookies_dict=None):
    if cancellation_event and cancellation_event.is_set():
        logger("   Fetch cancelled before request.")
        raise RuntimeError("Fetch operation cancelled by user.")
    if pause_event and pause_event.is_set():
        logger("   Post fetching paused...")
        while pause_event.is_set():
            if cancellation_event and cancellation_event.is_set():
                logger("   Post fetching cancelled while paused.")
                raise RuntimeError("Fetch operation cancelled by user.")
            time.sleep(0.5)
        logger("   Post fetching resumed.")

    paginated_url = f'{api_url_base}?o={offset}'
    logger(f"   Fetching: {paginated_url} (Page approx. {offset // 50 + 1})")
    try:
        response = requests.get(paginated_url, headers=headers, timeout=(10, 60), cookies=cookies_dict)
        response.raise_for_status()
        if 'application/json' not in response.headers.get('Content-Type', '').lower():
            logger(f"âš ī¸ Unexpected content type from API: {response.headers.get('Content-Type')}. Body: {response.text[:200]}")
            return []
        return response.json()
    except requests.exceptions.Timeout:
        raise RuntimeError(f"Timeout fetching offset {offset} from {paginated_url}")
    except requests.exceptions.RequestException as e:
        err_msg = f"Error fetching offset {offset} from {paginated_url}: {e}"
        if e.response is not None:
            err_msg += f" (Status: {e.response.status_code}, Body: {e.response.text[:200]})"
        if isinstance(e, requests.exceptions.ConnectionError) and \
                ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)):
            err_msg += "\n   💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN."
        raise RuntimeError(err_msg)
    except ValueError as e:  # JSONDecodeError inherits from ValueError
        raise RuntimeError(f"Error decoding JSON from offset {offset} ({paginated_url}): {e}. Response text: {response.text[:200]}")
    except Exception as e:
        raise RuntimeError(f"Unexpected error fetching offset {offset} ({paginated_url}): {e}")


def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger,
                        cancellation_event=None, pause_event=None, cookies_dict=None):
    if cancellation_event and cancellation_event.is_set():
        logger("   Comment fetch cancelled before request.")
        raise RuntimeError("Comment fetch operation cancelled by user.")
    if pause_event and pause_event.is_set():
        logger("   Comment fetching paused...")
        while pause_event.is_set():
            if cancellation_event and cancellation_event.is_set():
                logger("   Comment fetching cancelled while paused.")
                raise RuntimeError("Comment fetch operation cancelled by user.")
            time.sleep(0.5)
        logger("   Comment fetching resumed.")

    comments_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{post_id}/comments"
    logger(f"   Fetching comments: {comments_api_url}")
    try:
        response = requests.get(comments_api_url, headers=headers, timeout=(10, 30), cookies=cookies_dict)
        response.raise_for_status()
        if 'application/json' not in response.headers.get('Content-Type', '').lower():
            logger(f"âš ī¸ Unexpected content type from comments API: {response.headers.get('Content-Type')}. Body: {response.text[:200]}")
            return []  # Return empty list if not JSON
        return response.json()
    except requests.exceptions.Timeout:
        raise RuntimeError(f"Timeout fetching comments for post {post_id} from {comments_api_url}")
    except requests.exceptions.RequestException as e:
        err_msg = f"Error fetching comments for post {post_id} from {comments_api_url}: {e}"
        if e.response is not None:
            err_msg += f" (Status: {e.response.status_code}, Body: {e.response.text[:200]})"
        if isinstance(e, requests.exceptions.ConnectionError) and \
                ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)):
            err_msg += "\n   💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN."
        raise RuntimeError(err_msg)
    except ValueError as e:  # JSONDecodeError inherits from ValueError
        raise RuntimeError(f"Error decoding JSON from comments API for post {post_id} ({comments_api_url}): {e}. Response text: {response.text[:200]}")
    except Exception as e:
        raise RuntimeError(f"Unexpected error fetching comments for post {post_id} ({comments_api_url}): {e}")
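# The API pages by offset: page N (1-based) corresponds to o=(N-1)*50, matching
# the page_size of 50 used by download_from_api below. A hypothetical call:
#
#     base = "https://kemono.su/api/v1/patreon/user/12345"  # made-up creator
#     page_1 = fetch_posts_paginated(base, {'User-Agent': 'Mozilla/5.0'}, 0, print)
#     page_2 = fetch_posts_paginated(base, {'User-Agent': 'Mozilla/5.0'}, 50, print)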
def download_from_api(api_url_input, logger=print, start_page=None, end_page=None,
                      manga_mode=False, cancellation_event=None, pause_event=None,
                      use_cookie=False, cookie_text="", selected_cookie_file=None, app_base_dir=None):
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    service, user_id, target_post_id = extract_post_info(api_url_input)

    if cancellation_event and cancellation_event.is_set():
        logger("   Download_from_api cancelled at start.")
        return

    parsed_input_url_for_domain = urlparse(api_url_input)
    api_domain = parsed_input_url_for_domain.netloc
    if not any(d in api_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
        logger(f"âš ī¸ Unrecognized domain '{api_domain}' from input URL. Defaulting to kemono.su for API calls.")
        api_domain = "kemono.su"  # Default domain if input is unusual

    cookies_for_api = None
    if use_cookie and app_base_dir:  # app_base_dir is needed for the cookies.txt path
        cookies_for_api = prepare_cookies_for_request(use_cookie, cookie_text, selected_cookie_file,
                                                      app_base_dir, logger, target_domain=api_domain)

    if target_post_id:
        direct_post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{target_post_id}"
        logger(f"   Attempting direct fetch for target post: {direct_post_api_url}")
        try:
            direct_response = requests.get(direct_post_api_url, headers=headers, timeout=(10, 30), cookies=cookies_for_api)
            direct_response.raise_for_status()
            direct_post_data = direct_response.json()
            if isinstance(direct_post_data, list) and direct_post_data:
                direct_post_data = direct_post_data[0]
            if isinstance(direct_post_data, dict) and 'post' in direct_post_data and isinstance(direct_post_data['post'], dict):
                direct_post_data = direct_post_data['post']  # Extract the nested post data
            if isinstance(direct_post_data, dict) and direct_post_data.get('id') == target_post_id:
                logger(f"   ✅ Direct fetch successful for post {target_post_id}.")
                yield [direct_post_data]  # Yield the single post as a list
                return  # Exit the generator; no need to paginate
            else:
                response_type = type(direct_post_data).__name__
                response_snippet = str(direct_post_data)[:200]  # Log first 200 chars
                logger(f"   âš ī¸ Direct fetch for post {target_post_id} returned unexpected data (Type: {response_type}, Snippet: '{response_snippet}'). Falling back to pagination.")
        except requests.exceptions.RequestException as e:
            logger(f"   âš ī¸ Direct fetch failed for post {target_post_id}: {e}. Falling back to pagination.")
        except Exception as e:
            logger(f"   âš ī¸ Unexpected error during direct fetch for post {target_post_id}: {e}. Falling back to pagination.")

    if not service or not user_id:
        logger(f"❌ Invalid URL or could not extract service/user: {api_url_input}")
        return
    if target_post_id and (start_page or end_page):
        logger("âš ī¸ Page range (start/end page) is ignored when a specific post URL is provided (searching all pages for the post).")

    is_creator_feed_for_manga = manga_mode and not target_post_id
    api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}"
    page_size = 50

    if is_creator_feed_for_manga:
        logger("   Manga Mode: Fetching posts to sort by date (oldest processed first)...")
        all_posts_for_manga_mode = []
        current_offset_manga = 0
        if start_page and start_page > 1:
            current_offset_manga = (start_page - 1) * page_size
            logger(f"   Manga Mode: Starting fetch from page {start_page} (offset {current_offset_manga}).")
        elif start_page:  # start_page is 1
            logger("   Manga Mode: Starting fetch from page 1 (offset 0).")
        if end_page:
            logger(f"   Manga Mode: Will fetch up to page {end_page}.")

        while True:
            if pause_event and pause_event.is_set():
                logger("   Manga mode post fetching paused...")
                while pause_event.is_set():
                    if cancellation_event and cancellation_event.is_set():
                        logger("   Manga mode post fetching cancelled while paused.")
                        break
                    time.sleep(0.5)
                if not (cancellation_event and cancellation_event.is_set()):
                    logger("   Manga mode post fetching resumed.")
            if cancellation_event and cancellation_event.is_set():
                logger("   Manga mode post fetching cancelled.")
                break
            current_page_num_manga = (current_offset_manga // page_size) + 1
            if end_page and current_page_num_manga > end_page:
                logger(f"   Manga Mode: Reached specified end page ({end_page}). Stopping post fetch.")
                break
            try:
                posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger,
                                                          cancellation_event, pause_event, cookies_dict=cookies_for_api)
                if not isinstance(posts_batch_manga, list):
                    logger(f"❌ API Error (Manga Mode): Expected list of posts, got {type(posts_batch_manga)}.")
                    break
                if not posts_batch_manga:
                    logger("✅ Reached end of posts (Manga Mode fetch all).")
                    if start_page and not end_page and current_page_num_manga < start_page:
                        # Started on a page with no posts
                        logger(f"   Manga Mode: No posts found on or after specified start page {start_page}.")
                    elif end_page and current_page_num_manga <= end_page and not all_posts_for_manga_mode:
                        # Range specified but no posts in it
                        logger(f"   Manga Mode: No posts found within the specified page range ({start_page or 1}-{end_page}).")
                    break  # No more posts from API
                all_posts_for_manga_mode.extend(posts_batch_manga)
                current_offset_manga += page_size  # Next API call's 'o' parameter
                time.sleep(0.6)
            except RuntimeError as e:
                if "cancelled by user" in str(e).lower():
                    logger(f"â„šī¸ Manga mode pagination stopped due to cancellation: {e}")
                else:
                    logger(f"❌ {e}\n   Aborting manga mode pagination.")
                break
            except Exception as e:
                logger(f"❌ Unexpected error during manga mode fetch: {e}")
                traceback.print_exc()
                break

        if cancellation_event and cancellation_event.is_set():
            return
        if all_posts_for_manga_mode:
            logger(f"   Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Sorting by publication date (oldest first)...")

            def sort_key_tuple(post):
                published_date_str = post.get('published')
                added_date_str = post.get('added')
                post_id_str = post.get('id', "0")
                primary_sort_val = "0000-00-00T00:00:00"  # Default for missing dates (effectively oldest)
                if published_date_str:
                    primary_sort_val = published_date_str
                elif added_date_str:
                    logger(f"   âš ī¸ Post ID {post_id_str} missing 'published' date, using 'added' date '{added_date_str}' for primary sorting.")
                    primary_sort_val = added_date_str
                else:
                    logger(f"   âš ī¸ Post ID {post_id_str} missing both 'published' and 'added' dates. Placing at start of sort (using default earliest date).")
                secondary_sort_val = 0  # Default for non-integer IDs
                try:
                    secondary_sort_val = int(post_id_str)
                except ValueError:
                    logger(f"   âš ī¸ Post ID '{post_id_str}' is not a valid integer for secondary sorting, using 0.")
                return (primary_sort_val, secondary_sort_val)

            all_posts_for_manga_mode.sort(key=sort_key_tuple)  # Ascending by (date, id)

        for i in range(0, len(all_posts_for_manga_mode), page_size):
            if cancellation_event and cancellation_event.is_set():
                logger("   Manga mode post yielding cancelled.")
                break
            yield all_posts_for_manga_mode[i:i + page_size]
        return

    current_page_num = 1
    current_offset = 0
    processed_target_post_flag = False
    if start_page and start_page > 1 and not target_post_id:
        # Only apply start_page if not targeting a specific post directly
        current_offset = (start_page - 1) * page_size
        current_page_num = start_page
        logger(f"   Starting from page {current_page_num} (calculated offset {current_offset}).")

    while True:
        if pause_event and pause_event.is_set():
            logger("   Post fetching loop paused...")
            while pause_event.is_set():
                if cancellation_event and cancellation_event.is_set():
                    logger("   Post fetching loop cancelled while paused.")
                    break
                time.sleep(0.5)
            if not (cancellation_event and cancellation_event.is_set()):
                logger("   Post fetching loop resumed.")
        if cancellation_event and cancellation_event.is_set():
            logger("   Post fetching loop cancelled.")
            break
        if target_post_id and processed_target_post_flag:
            break
        if not target_post_id and end_page and current_page_num > end_page:
            logger(f"✅ Reached specified end page ({end_page}) for creator feed. Stopping.")
            break
        try:
            posts_batch = fetch_posts_paginated(api_base_url, headers, current_offset, logger,
                                                cancellation_event, pause_event, cookies_dict=cookies_for_api)
            if not isinstance(posts_batch, list):
                logger(f"❌ API Error: Expected list of posts, got {type(posts_batch)} at page {current_page_num} (offset {current_offset}).")
                break
        except RuntimeError as e:
            if "cancelled by user" in str(e).lower():
                logger(f"â„šī¸ Pagination stopped due to cancellation: {e}")
            else:
                logger(f"❌ {e}\n   Aborting pagination at page {current_page_num} (offset {current_offset}).")
            break
        except Exception as e:
            logger(f"❌ Unexpected error fetching page {current_page_num} (offset {current_offset}): {e}")
            traceback.print_exc()
            break

        if not posts_batch:
            if target_post_id and not processed_target_post_flag:
                # Only log this if we were searching for a specific post
                logger(f"❌ Target post {target_post_id} not found after checking all available pages (API returned no more posts at offset {current_offset}).")
            elif not target_post_id:
                if current_page_num == (start_page or 1):
                    # The very first page of the range/feed is empty
                    logger(f"😕 No posts found on the first page checked (page {current_page_num}, offset {current_offset}).")
                else:
                    logger(f"✅ Reached end of posts (no more content from API at offset {current_offset}).")
            break

        if target_post_id and not processed_target_post_flag:
            matching_post = next((p for p in posts_batch if str(p.get('id')) == str(target_post_id)), None)
            if matching_post:
                logger(f"đŸŽ¯ Found target post {target_post_id} on page {current_page_num} (offset {current_offset}).")
                yield [matching_post]
                processed_target_post_flag = True
        elif not target_post_id:
            yield posts_batch

        if processed_target_post_flag:
            break
        current_offset += page_size  # Next API call's 'o' parameter
        current_page_num += 1
        time.sleep(0.6)

    if target_post_id and not processed_target_post_flag and not (cancellation_event and cancellation_event.is_set()):
        logger(f"❌ Target post {target_post_id} could not be found after checking all relevant pages (final check after loop).")


def get_link_platform(url):
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if 'drive.google.com' in domain:
            return 'google drive'
        if 'mega.nz' in domain or 'mega.io' in domain:
            return 'mega'
        if 'dropbox.com' in domain:
            return 'dropbox'
        if 'patreon.com' in domain:
            return 'patreon'
        if 'instagram.com' in domain:
            return 'instagram'
        if 'twitter.com' in domain or 'x.com' in domain:
            return 'twitter/x'
        # Invite links live in the URL path, not the netloc, so check both pieces.
        if 'discord.gg' in domain or ('discord.com' in domain and parsed.path.lower().startswith('/invite')):
            return 'discord invite'
        if 'pixiv.net' in domain:
            return 'pixiv'
        if 'kemono.su' in domain or 'kemono.party' in domain:
            return 'kemono'
        if 'coomer.su' in domain or 'coomer.party' in domain:
            return 'coomer'
        parts = domain.split('.')
        if len(parts) >= 2:
            if parts[-2] not in ['com', 'org', 'net', 'gov', 'edu', 'co'] or len(parts) == 2:
                return parts[-2]
            elif len(parts) >= 3 and parts[-3] not in ['com', 'org', 'net', 'gov', 'edu', 'co']:
                return parts[-3]
            else:
                return domain
        return 'external'
    except Exception:
        return 'unknown'


class PostProcessorSignals(QObject):
    progress_signal = pyqtSignal(str)
    file_download_status_signal = pyqtSignal(bool)
    external_link_signal = pyqtSignal(str, str, str, str, str)  # Includes decryption_key (str)
    file_progress_signal = pyqtSignal(str, object)
    missed_character_post_signal = pyqtSignal(str, str)  # post_title, reason
class PostProcessorWorker:
    def __init__(self, post_data, download_root, known_names, filter_character_list,
                 emitter,  # Signals object or queue used to report progress
                 unwanted_keywords, filter_mode, skip_zip, skip_rar,
                 use_subfolders, use_post_subfolders, target_post_id_from_initial_url,
                 custom_folder_name, compress_images, download_thumbnails, service, user_id,
                 pause_event, api_url_input, cancellation_event,
                 downloaded_files, downloaded_file_hashes,
                 downloaded_files_lock, downloaded_file_hashes_lock,
                 dynamic_character_filter_holder=None, skip_words_list=None,
                 skip_words_scope=SKIP_SCOPE_FILES,
                 show_external_links=False, extract_links_only=False,
                 num_file_threads=4, skip_current_file_flag=None,
                 manga_mode_active=False, manga_filename_style=STYLE_POST_TITLE,
                 char_filter_scope=CHAR_SCOPE_FILES,
                 remove_from_filename_words_list=None,
                 allow_multipart_download=True,
                 cookie_text="",
                 use_cookie=False,
                 override_output_dir=None,
                 selected_cookie_file=None,
                 app_base_dir=None,  # App's base directory (for cookies.txt lookup)
                 manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT,  # Prefix for date-based naming
                 manga_date_file_counter_ref=None,  # Shared [counter, lock] for date-based manga naming
                 scan_content_for_images=False,  # Scan HTML content for images
                 creator_download_folder_ignore_words=None,  # Words to ignore for folder names
                 manga_global_file_counter_ref=None,  # Shared [counter, lock] for global numbering
                 ):
        self.post = post_data
        self.download_root = download_root
        self.known_names = known_names
        self.filter_character_list_objects_initial = filter_character_list if filter_character_list else []
        self.dynamic_filter_holder = dynamic_character_filter_holder
        self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else set()
        self.filter_mode = filter_mode
        self.skip_zip = skip_zip
        self.skip_rar = skip_rar
        self.use_subfolders = use_subfolders
        self.use_post_subfolders = use_post_subfolders
        self.target_post_id_from_initial_url = target_post_id_from_initial_url
        self.custom_folder_name = custom_folder_name
        self.compress_images = compress_images
        self.download_thumbnails = download_thumbnails
        self.service = service
        self.user_id = user_id
        self.api_url_input = api_url_input
        self.cancellation_event = cancellation_event
        self.pause_event = pause_event
        self.emitter = emitter
        if not self.emitter:
            raise ValueError("PostProcessorWorker requires an emitter (signals object or queue).")
        self.skip_current_file_flag = skip_current_file_flag
        self.downloaded_files = downloaded_files if downloaded_files is not None else set()
        self.downloaded_file_hashes = downloaded_file_hashes if downloaded_file_hashes is not None else set()
        self.downloaded_files_lock = downloaded_files_lock if downloaded_files_lock is not None else threading.Lock()
        self.downloaded_file_hashes_lock = downloaded_file_hashes_lock if downloaded_file_hashes_lock is not None else threading.Lock()
        self.skip_words_list = skip_words_list if skip_words_list is not None else []
        self.skip_words_scope = skip_words_scope
        self.show_external_links = show_external_links
        self.extract_links_only = extract_links_only
        self.num_file_threads = num_file_threads
        self.manga_mode_active = manga_mode_active
        self.manga_filename_style = manga_filename_style
        self.char_filter_scope = char_filter_scope
        self.remove_from_filename_words_list = remove_from_filename_words_list if remove_from_filename_words_list is not None else []
        self.allow_multipart_download = allow_multipart_download
        self.manga_date_file_counter_ref = manga_date_file_counter_ref
        self.selected_cookie_file = selected_cookie_file
        self.app_base_dir = app_base_dir
        self.cookie_text = cookie_text
        self.manga_date_prefix = manga_date_prefix
        self.manga_global_file_counter_ref = manga_global_file_counter_ref
        self.use_cookie = use_cookie
        self.override_output_dir = override_output_dir
        self.scan_content_for_images = scan_content_for_images
        self.creator_download_folder_ignore_words = creator_download_folder_ignore_words

        if self.compress_images and Image is None:
            self.logger("âš ī¸ Image compression disabled: Pillow library not found.")
            self.compress_images = False

    def _emit_signal(self, signal_type_str, *payload_args):
        """Emit a signal either directly (QObject emitter) or via a queue."""
        if isinstance(self.emitter, queue.Queue):
            self.emitter.put({'type': signal_type_str, 'payload': payload_args})
        elif self.emitter and hasattr(self.emitter, f"{signal_type_str}_signal"):
            signal_attr = getattr(self.emitter, f"{signal_type_str}_signal")
            signal_attr.emit(*payload_args)
        else:
            print(f"(Worker Log - Unrecognized Emitter for {signal_type_str}): {payload_args[0] if payload_args else ''}")

    def logger(self, message):
        self._emit_signal('progress', message)

    def check_cancel(self):
        return self.cancellation_event.is_set()

    def _check_pause(self, context_message="Operation"):
        if self.pause_event and self.pause_event.is_set():
            self.logger(f"   {context_message} paused...")
            while self.pause_event.is_set():
                if self.check_cancel():
                    self.logger(f"   {context_message} cancelled while paused.")
                    return True  # Cancellation occurred while paused
                time.sleep(0.5)
            if not self.check_cancel():
                self.logger(f"   {context_message} resumed.")
        return False  # Not cancelled during pause

    def _get_current_character_filters(self):
        if self.dynamic_filter_holder:
            return self.dynamic_filter_holder.get_filters()
        return self.filter_character_list_objects_initial

    def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log,
                              skip_event,  # threading.Event
                              post_title="", file_index_in_post=0, num_files_in_this_post=1,
                              manga_date_file_counter_ref=None,
                              forced_filename_override=None,  # Used for retries
                              manga_global_file_counter_ref=None):  # Used for global numbering
        was_original_name_kept_flag = False
        final_filename_saved_for_return = ""
        retry_later_details = None

        if self._check_pause(f"File download prep for '{file_info.get('name', 'unknown file')}'"):
            return 0, 1, "", False, FILE_DOWNLOAD_STATUS_SKIPPED, None
        if self.check_cancel() or (skip_event and skip_event.is_set()):
            return 0, 1, "", False, FILE_DOWNLOAD_STATUS_SKIPPED, None
        file_url = file_info.get('url')
        cookies_to_use_for_file = None
        if self.use_cookie:  # From the "use cookies" checkbox
            cookies_to_use_for_file = prepare_cookies_for_request(
                self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger)

        api_original_filename = file_info.get('_original_name_for_log', file_info.get('name'))
        filename_to_save_in_main_path = ""

        if forced_filename_override:
            filename_to_save_in_main_path = forced_filename_override
            self.logger(f"   Retrying with forced filename: '{filename_to_save_in_main_path}'")
        else:
            if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_FILES or self.skip_words_scope == SKIP_SCOPE_BOTH):
                filename_to_check_for_skip_words = api_original_filename.lower()
                for skip_word in self.skip_words_list:
                    if skip_word.lower() in filename_to_check_for_skip_words:
                        self.logger(f"   -> Skip File (Keyword in Original Name '{skip_word}'): '{api_original_filename}'. Scope: {self.skip_words_scope}")
                        return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None

            cleaned_original_api_filename = clean_filename(api_original_filename)
            original_filename_cleaned_base, original_ext = os.path.splitext(cleaned_original_api_filename)
            if not original_ext.startswith('.'):
                original_ext = '.' + original_ext if original_ext else ''

            if self.manga_mode_active:
                # Note: duplicate_file_mode is overridden to "Delete" in main.py when manga mode is on
                if self.manga_filename_style == STYLE_ORIGINAL_NAME:
                    filename_to_save_in_main_path = cleaned_original_api_filename  # Already cleaned
                    if self.manga_date_prefix and self.manga_date_prefix.strip():
                        cleaned_prefix = clean_filename(self.manga_date_prefix.strip())
                        if cleaned_prefix:
                            filename_to_save_in_main_path = f"{cleaned_prefix} {filename_to_save_in_main_path}"
                        else:
                            self.logger(f"âš ī¸ Manga Original Name Mode: Provided prefix '{self.manga_date_prefix}' was empty after cleaning. Using original name only.")
                    was_original_name_kept_flag = True
                elif self.manga_filename_style == STYLE_POST_TITLE:
                    if post_title and post_title.strip():
                        cleaned_post_title_base = clean_filename(post_title.strip())
                        if num_files_in_this_post > 1:
                            if file_index_in_post == 0:
                                filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
                            else:
                                filename_to_save_in_main_path = f"{cleaned_post_title_base}_{file_index_in_post}{original_ext}"
                            was_original_name_kept_flag = False  # Name is derived, not original
                        else:
                            filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
                    else:
                        filename_to_save_in_main_path = cleaned_original_api_filename  # Fallback to cleaned original
                        self.logger(f"âš ī¸ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log}. Using cleaned original filename '{filename_to_save_in_main_path}'.")
                elif self.manga_filename_style == STYLE_DATE_BASED:
                    if manga_date_file_counter_ref is not None and len(manga_date_file_counter_ref) == 2:
                        counter_lock = manga_date_file_counter_ref[1]
                        with counter_lock:
                            counter_val_for_filename = manga_date_file_counter_ref[0]
                            manga_date_file_counter_ref[0] += 1
                        base_numbered_name = f"{counter_val_for_filename:03d}"
                        if self.manga_date_prefix and self.manga_date_prefix.strip():
                            cleaned_prefix = clean_filename(self.manga_date_prefix.strip())
                            if cleaned_prefix:  # Prefix survived cleaning
                                filename_to_save_in_main_path = f"{cleaned_prefix} {base_numbered_name}{original_ext}"
                            else:  # Prefix became empty after cleaning
                                filename_to_save_in_main_path = f"{base_numbered_name}{original_ext}"
                                self.logger(f"âš ī¸ Manga Date Mode: Provided prefix '{self.manga_date_prefix}' was empty after cleaning. Using number only.")
                        else:  # No prefix provided
                            filename_to_save_in_main_path = f"{base_numbered_name}{original_ext}"
                    else:
                        self.logger(f"âš ī¸ Manga Date Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_date_file_counter_ref}")
                        filename_to_save_in_main_path = cleaned_original_api_filename
                elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING:
                    if manga_global_file_counter_ref is not None and len(manga_global_file_counter_ref) == 2:
                        counter_lock = manga_global_file_counter_ref[1]
                        with counter_lock:
                            counter_val_for_filename = manga_global_file_counter_ref[0]
                            manga_global_file_counter_ref[0] += 1
                        cleaned_post_title_base_for_global = clean_filename(post_title.strip() if post_title and post_title.strip() else "post")
                        filename_to_save_in_main_path = f"{cleaned_post_title_base_for_global}_{counter_val_for_filename:03d}{original_ext}"
                    else:  # Counter ref not provided or malformed
                        self.logger(f"âš ī¸ Manga Title+GlobalNum Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_global_file_counter_ref}")
                        filename_to_save_in_main_path = cleaned_original_api_filename  # Fallback to cleaned original
                        self.logger(f"âš ī¸ Manga mode (Title+GlobalNum Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path}' for post {original_post_id_for_log}.")
                else:
                    self.logger(f"âš ī¸ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
                    filename_to_save_in_main_path = cleaned_original_api_filename

                if not filename_to_save_in_main_path:
                    filename_to_save_in_main_path = f"manga_file_{original_post_id_for_log}_{file_index_in_post + 1}{original_ext}"
                    self.logger(f"âš ī¸ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path}'.")
                    was_original_name_kept_flag = False
            else:  # Non-manga mode: use the cleaned original filename
                filename_to_save_in_main_path = cleaned_original_api_filename
                was_original_name_kept_flag = False

            if self.remove_from_filename_words_list and filename_to_save_in_main_path:
                base_name_for_removal, ext_for_removal = os.path.splitext(filename_to_save_in_main_path)
                modified_base_name = base_name_for_removal
                for word_to_remove in self.remove_from_filename_words_list:
                    if not word_to_remove:
                        continue
                    pattern = re.compile(re.escape(word_to_remove), re.IGNORECASE)
                    modified_base_name = pattern.sub("", modified_base_name)
                modified_base_name = re.sub(r'[_.\s-]+', ' ', modified_base_name)  # Normalize separators to spaces
                modified_base_name = re.sub(r'\s+', ' ', modified_base_name).strip()  # Condense and trim spaces
                if modified_base_name and modified_base_name != ext_for_removal.lstrip('.'):
                    filename_to_save_in_main_path = modified_base_name + ext_for_removal
                else:
                    filename_to_save_in_main_path = base_name_for_removal + ext_for_removal

        if not self.download_thumbnails:
            is_img_type = is_image(api_original_filename)
            is_vid_type = is_video(api_original_filename)
            is_archive_type = is_archive(api_original_filename)
            is_audio_type = is_audio(api_original_filename)
            if self.filter_mode == 'archive':
                if not is_archive_type:
                    self.logger(f"   -> Filter Skip (Archive Mode): '{api_original_filename}' (Not an Archive).")
                    return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
            elif self.filter_mode == 'image':
                if not is_img_type:
                    self.logger(f"   -> Filter Skip: '{api_original_filename}' (Not Image).")
                    return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
            elif self.filter_mode == 'video':
                if not is_vid_type:
                    self.logger(f"   -> Filter Skip: '{api_original_filename}' (Not Video).")
                    return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
            elif self.filter_mode == 'audio':  # Audio filter mode
                if not is_audio_type:
                    self.logger(f"   -> Filter Skip: '{api_original_filename}' (Not Audio).")
                    return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None

        if self.skip_zip and is_zip(api_original_filename):
            self.logger(f"   -> Pref Skip: '{api_original_filename}' (ZIP).")
            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
        if self.skip_rar and is_rar(api_original_filename):
            self.logger(f"   -> Pref Skip: '{api_original_filename}' (RAR).")
            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None

        try:
            os.makedirs(target_folder_path, exist_ok=True)  # Needed for the .part file
        except OSError as e:
            self.logger(f"   ❌ Critical error creating directory '{target_folder_path}': {e}. Skipping file '{api_original_filename}'.")
            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None  # Treat as skip

        max_retries = 3
        retry_delay = 5
        downloaded_size_bytes = 0
        calculated_file_hash = None       # Will hold the MD5 hash
        downloaded_part_file_path = None  # Path to the .part file on disk
        was_multipart_download = False    # Which download method produced the file
        total_size_bytes = 0
        download_successful_flag = False
        last_exception_for_retry_later = None

        for attempt_num_single_stream in range(max_retries + 1):
            if self._check_pause(f"File download attempt for '{api_original_filename}'"):
                break
            if self.check_cancel() or (skip_event and skip_event.is_set()):
                break
            try:
                if attempt_num_single_stream > 0:
                    self.logger(f"   Retrying download for '{api_original_filename}' (Overall Attempt {attempt_num_single_stream + 1}/{max_retries + 1})...")
                    time.sleep(retry_delay * (2 ** (attempt_num_single_stream - 1)))
                self._emit_signal('file_download_status', True)
                response = requests.get(file_url, headers=headers, timeout=(15, 300), stream=True, cookies=cookies_to_use_for_file)
                response.raise_for_status()
                total_size_bytes = int(response.headers.get('Content-Length', 0))

                num_parts_for_file = min(self.num_file_threads, MAX_PARTS_FOR_MULTIPART_DOWNLOAD)
                attempt_multipart = (self.allow_multipart_download and MULTIPART_DOWNLOADER_AVAILABLE and
                                     num_parts_for_file > 1 and
                                     total_size_bytes > MIN_SIZE_FOR_MULTIPART_DOWNLOAD and
                                     'bytes' in response.headers.get('Accept-Ranges', '').lower())

                if self._check_pause(f"Multipart decision for '{api_original_filename}'"):
                    break  # Check pause before a potentially long operation
                if attempt_multipart:
                    response.close()
                    self._emit_signal('file_download_status', False)
                    mp_save_path_base_for_part = os.path.join(target_folder_path, filename_to_save_in_main_path)
                    mp_success, mp_bytes, mp_hash, mp_file_handle = download_file_in_parts(
                        file_url, mp_save_path_base_for_part, total_size_bytes, num_parts_for_file,
                        headers, api_original_filename,
                        emitter_for_multipart=self.emitter,
                        cookies_for_chunk_session=cookies_to_use_for_file,
                        cancellation_event=self.cancellation_event,
                        skip_event=skip_event, logger_func=self.logger,
                        pause_event=self.pause_event)
                    if mp_success:
                        download_successful_flag = True
                        downloaded_size_bytes = mp_bytes
                        calculated_file_hash = mp_hash
                        # download_file_in_parts writes to save_path + ".part";
                        # mp_save_path_base_for_part is the path *without* .part.
                        downloaded_part_file_path = mp_save_path_base_for_part + ".part"
                        was_multipart_download = True
                        if mp_file_handle:
                            mp_file_handle.close()  # Close the handle from multipart_downloader
                        break
                    else:
                        if attempt_num_single_stream < max_retries:
                            self.logger(f"   Multi-part download attempt failed for '{api_original_filename}'. Retrying with single stream.")
                        else:
                            download_successful_flag = False
                            break
Retrying with single stream.") else: download_successful_flag = False; break else: # Single stream download self.logger(f"âŦ‡ī¸ Downloading (Single Stream): '{api_original_filename}' (Size: {total_size_bytes / (1024*1024):.2f} MB if known) [Base Name: '{filename_to_save_in_main_path}']") current_single_stream_part_path = os.path.join(target_folder_path, filename_to_save_in_main_path + ".part") current_attempt_downloaded_bytes = 0 md5_hasher = hashlib.md5() last_progress_time = time.time() try: with open(current_single_stream_part_path, 'wb') as f_part: for chunk in response.iter_content(chunk_size=1 * 1024 * 1024): # Read 1MB chunks if self._check_pause(f"Chunk download for '{api_original_filename}'"): break if self.check_cancel() or (skip_event and skip_event.is_set()): break if chunk: f_part.write(chunk) md5_hasher.update(chunk) current_attempt_downloaded_bytes += len(chunk) if time.time() - last_progress_time > 1 and total_size_bytes > 0: self._emit_signal('file_progress', api_original_filename, (current_attempt_downloaded_bytes, total_size_bytes)) last_progress_time = time.time() if self.check_cancel() or (skip_event and skip_event.is_set()) or \ (self.pause_event and self.pause_event.is_set() and not (current_attempt_downloaded_bytes > 0 or (total_size_bytes == 0 and response.status_code == 200))): if os.path.exists(current_single_stream_part_path): os.remove(current_single_stream_part_path) break # Break from retry loop if current_attempt_downloaded_bytes > 0 or (total_size_bytes == 0 and response.status_code == 200): calculated_file_hash = md5_hasher.hexdigest() downloaded_size_bytes = current_attempt_downloaded_bytes downloaded_part_file_path = current_single_stream_part_path was_multipart_download = False download_successful_flag = True break # Break from retry loop else: # No bytes downloaded or error before finishing if os.path.exists(current_single_stream_part_path): os.remove(current_single_stream_part_path) except Exception as e_write: self.logger(f" ❌ Error writing single-stream to disk for '{api_original_filename}': {e_write}") if os.path.exists(current_single_stream_part_path): os.remove(current_single_stream_part_path) # This exception will be caught by the outer try-except for retries raise except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, http.client.IncompleteRead) as e: self.logger(f" ❌ Download Error (Retryable): {api_original_filename}. Error: {e}") last_exception_for_retry_later = e # Store this specific exception if isinstance(e, requests.exceptions.ConnectionError) and \ ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)): self.logger(" 💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN.") except requests.exceptions.RequestException as e: self.logger(f" ❌ Download Error (Non-Retryable): {api_original_filename}. Error: {e}") last_exception_for_retry_later = e # Store this too if ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)): # More general check self.logger(" 💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN.") # For most RequestExceptions (like 404, 403), we break the retry loop as they are unlikely to be fixed by immediate retries. 
break # Break from retry loop for non-retryable RequestExceptions except Exception as e: self.logger(f" ❌ Unexpected Download Error: {api_original_filename}: {e}\n{traceback.format_exc(limit=2)}") last_exception_for_retry_later = e # Capture unexpected errors too for later analysis break # Break from retry loop for other unexpected errors finally: self._emit_signal('file_download_status', False) final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes self._emit_signal('file_progress', api_original_filename, (downloaded_size_bytes, final_total_for_progress)) if self.check_cancel() or (skip_event and skip_event.is_set()) or (self.pause_event and self.pause_event.is_set() and not download_successful_flag): self.logger(f" âš ī¸ Download process interrupted for {api_original_filename}.") if downloaded_part_file_path and os.path.exists(downloaded_part_file_path): # Cleanup .part file try: os.remove(downloaded_part_file_path) except OSError: pass return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None if not download_successful_flag: self.logger(f"❌ Download failed for '{api_original_filename}' after {max_retries + 1} attempts.") # .part file should have been cleaned up by the download attempt if it failed to produce a final .part file is_actually_incomplete_read = False if isinstance(last_exception_for_retry_later, http.client.IncompleteRead): is_actually_incomplete_read = True elif hasattr(last_exception_for_retry_later, '__cause__') and isinstance(last_exception_for_retry_later.__cause__, http.client.IncompleteRead): is_actually_incomplete_read = True # Check if the exception or its arguments contain IncompleteRead information elif last_exception_for_retry_later is not None: # Ensure it's not None str_exc = str(last_exception_for_retry_later).lower() # Check the string representation of the exception and, if it's a tuple, check its elements. 
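        # The retry waits above follow plain exponential backoff: with retry_delay = 5
        # and max_retries = 3, attempts 2..4 wait 5s, 10s, and 20s respectively
        # (retry_delay * 2 ** (attempt - 1)).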
if "incompleteread" in str_exc or \ (isinstance(last_exception_for_retry_later, tuple) and \ any("incompleteread" in str(arg).lower() for arg in last_exception_for_retry_later if isinstance(arg, (str, Exception)))): # type: ignore is_actually_incomplete_read = True if is_actually_incomplete_read: self.logger(f" Marking '{api_original_filename}' for potential retry later due to IncompleteRead.") retry_later_details = { 'file_info': file_info, 'target_folder_path': target_folder_path, 'headers': headers, # Original headers 'original_post_id_for_log': original_post_id_for_log, 'post_title': post_title, 'file_index_in_post': file_index_in_post, 'num_files_in_this_post': num_files_in_this_post, 'forced_filename_override': filename_to_save_in_main_path, # The name it was trying to save as 'manga_mode_active_for_file': self.manga_mode_active, # Store context 'manga_filename_style_for_file': self.manga_filename_style, # Store context } return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER, retry_later_details # type: ignore else: # Other non-successful download attempts self.logger(f" Marking '{api_original_filename}' as permanently failed for this session.") permanent_failure_details = { 'file_info': file_info, 'target_folder_path': target_folder_path, 'headers': headers, # Original headers 'original_post_id_for_log': original_post_id_for_log, 'post_title': post_title, 'file_index_in_post': file_index_in_post, 'num_files_in_this_post': num_files_in_this_post, 'forced_filename_override': filename_to_save_in_main_path, # The name it was trying to save as } return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION, permanent_failure_details if self._check_pause(f"Post-download hash check for '{api_original_filename}'"): return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None with self.downloaded_file_hashes_lock: if calculated_file_hash in self.downloaded_file_hashes: self.logger(f" -> Skip Saving Duplicate (Hash Match): '{api_original_filename}' (Hash: {calculated_file_hash[:8]}...).") with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path) # Mark logical name if downloaded_part_file_path and os.path.exists(downloaded_part_file_path): # Cleanup .part file try: os.remove(downloaded_part_file_path) except OSError as e_rem: self.logger(f" -> Failed to remove .part file for hash duplicate: {e_rem}") return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None effective_save_folder = target_folder_path # Default: main character/post folder filename_after_styling_and_word_removal = filename_to_save_in_main_path try: # Ensure the chosen save folder (main or Duplicate) exists os.makedirs(effective_save_folder, exist_ok=True) except OSError as e: self.logger(f" ❌ Critical error creating directory '{effective_save_folder}': {e}. 
Skipping file '{api_original_filename}'.") if downloaded_part_file_path and os.path.exists(downloaded_part_file_path): # Cleanup .part file try: os.remove(downloaded_part_file_path) except OSError: pass return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None data_to_write_io = None # This will be BytesIO if compression happens and is used filename_after_compression = filename_after_styling_and_word_removal is_img_for_compress_check = is_image(api_original_filename) if is_img_for_compress_check and self.compress_images and Image and downloaded_size_bytes > (1.5 * 1024 * 1024): self.logger(f" Compressing '{api_original_filename}' ({downloaded_size_bytes / (1024*1024):.2f} MB)...") if self._check_pause(f"Image compression for '{api_original_filename}'"): return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None # Allow pause before compression img_content_for_pillow = None try: with open(downloaded_part_file_path, 'rb') as f_img_in: # Open the .part file img_content_for_pillow = BytesIO(f_img_in.read()) # Read its content for Pillow with Image.open(img_content_for_pillow) as img_obj: if img_obj.mode == 'P': img_obj = img_obj.convert('RGBA') # type: ignore elif img_obj.mode not in ['RGB', 'RGBA', 'L']: img_obj = img_obj.convert('RGB') # type: ignore compressed_output_io = BytesIO() img_obj.save(compressed_output_io, format='WebP', quality=80, method=4) compressed_size = compressed_output_io.getbuffer().nbytes if compressed_size < downloaded_size_bytes * 0.9: # If significantly smaller self.logger(f" Compression success: {compressed_size / (1024*1024):.2f} MB.") data_to_write_io = compressed_output_io data_to_write_io.seek(0) base_name_orig, _ = os.path.splitext(filename_after_compression) filename_after_compression = base_name_orig + '.webp' self.logger(f" Updated filename (compressed): {filename_after_compression}") else: self.logger(f" Compression skipped: WebP not significantly smaller.") # data_to_write_io remains None, original .part file will be used if compressed_output_io: compressed_output_io.close() # Close unused compressed buffer except Exception as comp_e: self.logger(f"❌ Compression failed for '{api_original_filename}': {comp_e}. 
Saving original.") # data_to_write_io remains None finally: if img_content_for_pillow: img_content_for_pillow.close() final_filename_on_disk = filename_after_compression # This is the name after potential compression if not (self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED): temp_base, temp_ext = os.path.splitext(final_filename_on_disk) suffix_counter = 1 while os.path.exists(os.path.join(effective_save_folder, final_filename_on_disk)): final_filename_on_disk = f"{temp_base}_{suffix_counter}{temp_ext}" suffix_counter += 1 if final_filename_on_disk != filename_after_compression: # Log if a suffix was applied self.logger(f" Applied numeric suffix in '{os.path.basename(effective_save_folder)}': '{final_filename_on_disk}' (was '{filename_after_compression}')") if self._check_pause(f"File saving for '{final_filename_on_disk}'"): return 0, 1, final_filename_on_disk, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None final_save_path = os.path.join(effective_save_folder, final_filename_on_disk) try: if data_to_write_io: # If compression produced data_to_write_io (BytesIO) with open(final_save_path, 'wb') as f_out: f_out.write(data_to_write_io.getvalue()) # Successfully wrote compressed data, now remove the original .part file if downloaded_part_file_path and os.path.exists(downloaded_part_file_path): try: os.remove(downloaded_part_file_path) except OSError as e_rem: self.logger(f" -> Failed to remove .part after compression: {e_rem}") else: # No compression or compression not used, rename the .part file if downloaded_part_file_path and os.path.exists(downloaded_part_file_path): os.rename(downloaded_part_file_path, final_save_path) else: # Should not happen if download was successful raise FileNotFoundError(f"Original .part file not found for saving: {downloaded_part_file_path}") with self.downloaded_file_hashes_lock: self.downloaded_file_hashes.add(calculated_file_hash) with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path) # Track by logical name final_filename_saved_for_return = final_filename_on_disk self.logger(f"✅ Saved: '{final_filename_saved_for_return}' (from '{api_original_filename}', {downloaded_size_bytes / (1024*1024):.2f} MB) in '{os.path.basename(effective_save_folder)}'") time.sleep(0.05) # Brief pause after successful save return 1, 0, final_filename_saved_for_return, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SUCCESS, None except Exception as save_err: self.logger(f"❌ Save Fail for '{final_filename_on_disk}': {save_err}") if os.path.exists(final_save_path): try: os.remove(final_save_path); except OSError: self.logger(f" -> Failed to remove partially saved file: {final_save_path}") # If rename failed, downloaded_part_file_path might still exist. # If writing compressed failed, final_save_path (partial compressed) was removed. Original .part might still exist. return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None # Treat save fail as skip finally: if data_to_write_io and hasattr(data_to_write_io, 'close'): data_to_write_io.close() # downloaded_part_file_path is just a path, no open handle from this function's direct scope to close here. # Handles from multipart or single stream write loop are closed within their respective sections or by 'with open'. 
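
    # Return contract of _download_single_file, inferred from its return statements
    # and the caller in process() below (a descriptive note, not an authoritative
    # spec): every return path yields a 6-tuple
    #   (downloaded_count, skipped_count, filename, original_name_kept_flag,
    #    status, details_or_None)
    # where status is one of the FILE_DOWNLOAD_STATUS_* constants and details is a
    # dict only for the two failure statuses. A hypothetical caller sketch:
    #
    #   dl, skipped, name, kept, status, details = worker._download_single_file(...)
    #   if status == FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER:
    #       retry_queue.append(details)  # 'retry_queue' is hypothetical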
    def process(self):
        if self._check_pause(f"Post processing for ID {self.post.get('id', 'N/A')}"):
            return 0, 0, [], [], []
        if self.check_cancel():
            return 0, 0, [], [], []

        current_character_filters = self._get_current_character_filters()
        kept_original_filenames_for_log = []
        retryable_failures_this_post = []  # Retryable failure details for this post
        permanent_failures_this_post = []  # Permanent failure details for this post
        total_downloaded_this_post = 0
        total_skipped_this_post = 0

        parsed_api_url = urlparse(self.api_url_input)
        referer_url = f"https://{parsed_api_url.netloc}/"
        headers = {'User-Agent': 'Mozilla/5.0', 'Referer': referer_url, 'Accept': '*/*'}
        # Matches <a href="...">text</a>; group(1) is the URL, group(2) the inner text.
        link_pattern = re.compile(r"""<a\s+[^>]*?href=["']([^"']+)["'][^>]*>(.*?)</a>""",
                                  re.IGNORECASE | re.DOTALL)

        post_data = self.post
        post_title = post_data.get('title', '') or 'untitled_post'
        post_id = post_data.get('id', 'unknown_id')
        post_main_file_info = post_data.get('file')
        post_attachments = post_data.get('attachments', [])

        effective_unwanted_keywords_for_folder_naming = self.unwanted_keywords.copy()
        is_full_creator_download_no_char_filter = not self.target_post_id_from_initial_url and not current_character_filters
        if is_full_creator_download_no_char_filter and self.creator_download_folder_ignore_words:
            self.logger(f"   Applying creator download specific folder ignore words "
                        f"({len(self.creator_download_folder_ignore_words)} words).")
            effective_unwanted_keywords_for_folder_naming.update(self.creator_download_folder_ignore_words)

        post_content_html = post_data.get('content', '')
        self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---")
        num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0)

        post_is_candidate_by_title_char_match = False
        char_filter_that_matched_title = None
        post_is_candidate_by_comment_char_match = False
        post_is_candidate_by_file_char_match_in_comment_scope = False
        char_filter_that_matched_file_in_comment_scope = None
        char_filter_that_matched_comment = None

        if current_character_filters and \
                (self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH):
            if self._check_pause(f"Character title filter for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            for idx, filter_item_obj in enumerate(current_character_filters):
                if self.check_cancel():
                    break
                terms_to_check_for_title = list(filter_item_obj["aliases"])
                if filter_item_obj["is_group"]:
                    if filter_item_obj["name"] not in terms_to_check_for_title:
                        terms_to_check_for_title.append(filter_item_obj["name"])
                unique_terms_for_title_check = list(set(terms_to_check_for_title))
                for term_to_match in unique_terms_for_title_check:
                    match_found_for_term = is_title_match_for_character(post_title, term_to_match)
                    if match_found_for_term:
                        post_is_candidate_by_title_char_match = True
                        char_filter_that_matched_title = filter_item_obj
                        self.logger(f"   Post title matches char filter term '{term_to_match}' "
                                    f"(from group/name '{filter_item_obj['name']}', Scope: {self.char_filter_scope}). Post is candidate.")
                        break
                if post_is_candidate_by_title_char_match:
                    break

        all_files_from_post_api_for_char_check = []
        api_file_domain_for_char_check = urlparse(self.api_url_input).netloc
        if not api_file_domain_for_char_check or \
                not any(d in api_file_domain_for_char_check.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
            api_file_domain_for_char_check = "kemono.su" if "kemono" in self.service.lower() else "coomer.party"
        if post_main_file_info and isinstance(post_main_file_info, dict) and post_main_file_info.get('path'):
            original_api_name = post_main_file_info.get('name') or os.path.basename(post_main_file_info['path'].lstrip('/'))
            if original_api_name:
                all_files_from_post_api_for_char_check.append({'_original_name_for_log': original_api_name})
        for att_info in post_attachments:
            if isinstance(att_info, dict) and att_info.get('path'):
                original_api_att_name = att_info.get('name') or os.path.basename(att_info['path'].lstrip('/'))
                if original_api_att_name:
                    all_files_from_post_api_for_char_check.append({'_original_name_for_log': original_api_att_name})

        if current_character_filters and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
            self.logger(f"   [Char Scope: Comments] Phase 1: Checking post files for matches before comments for post ID '{post_id}'.")
            if self._check_pause(f"File check (comments scope) for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            for file_info_item in all_files_from_post_api_for_char_check:  # Use the pre-populated list of file names
                if self.check_cancel():
                    break
                current_api_original_filename_for_check = file_info_item.get('_original_name_for_log')
                if not current_api_original_filename_for_check:
                    continue
                for filter_item_obj in current_character_filters:
                    terms_to_check = list(filter_item_obj["aliases"])
                    if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check:
                        terms_to_check.append(filter_item_obj["name"])
                    for term_to_match in terms_to_check:
                        if is_filename_match_for_character(current_api_original_filename_for_check, term_to_match):
                            post_is_candidate_by_file_char_match_in_comment_scope = True
                            char_filter_that_matched_file_in_comment_scope = filter_item_obj
                            self.logger(f"   Match Found (File in Comments Scope): File '{current_api_original_filename_for_check}' "
                                        f"matches char filter term '{term_to_match}' (from group/name '{filter_item_obj['name']}'). Post is candidate.")
                            break
                    if post_is_candidate_by_file_char_match_in_comment_scope:
                        break
                if post_is_candidate_by_file_char_match_in_comment_scope:
                    break
            self.logger(f"   [Char Scope: Comments] Phase 1 Result: post_is_candidate_by_file_char_match_in_comment_scope = "
                        f"{post_is_candidate_by_file_char_match_in_comment_scope}")

        if current_character_filters and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
            if not post_is_candidate_by_file_char_match_in_comment_scope:
                if self._check_pause(f"Comment check for post {post_id}"):
                    return 0, num_potential_files_in_post, [], [], []
                self.logger(f"   [Char Scope: Comments] Phase 2: No file match found. Checking post comments for post ID '{post_id}'.")
                try:
                    parsed_input_url_for_comments = urlparse(self.api_url_input)
                    api_domain_for_comments = parsed_input_url_for_comments.netloc
                    if not any(d in api_domain_for_comments.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
                        self.logger(f"âš ī¸ Unrecognized domain '{api_domain_for_comments}' for comment API. Defaulting based on service.")
                        api_domain_for_comments = "kemono.su" if "kemono" in self.service.lower() else "coomer.party"
                    comments_data = fetch_post_comments(
                        api_domain_for_comments, self.service, self.user_id, post_id, headers,
                        self.logger, self.cancellation_event,
                        self.pause_event,  # Pass pause_event
                        cookies_dict=prepare_cookies_for_request(  # Prepare cookies for this API call
                            self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger
                        )
                    )
                    if comments_data:
                        self.logger(f"   Fetched {len(comments_data)} comments for post {post_id}.")
                        for comment_item_idx, comment_item in enumerate(comments_data):
                            if self.check_cancel():
                                break
                            raw_comment_content = comment_item.get('content', '')
                            if not raw_comment_content:
                                continue
                            cleaned_comment_text = strip_html_tags(raw_comment_content)
                            if not cleaned_comment_text.strip():
                                continue
                            for filter_item_obj in current_character_filters:
                                terms_to_check_comment = list(filter_item_obj["aliases"])
                                if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_comment:
                                    terms_to_check_comment.append(filter_item_obj["name"])
                                for term_to_match_comment in terms_to_check_comment:
                                    if is_title_match_for_character(cleaned_comment_text, term_to_match_comment):  # Re-use title matcher
                                        post_is_candidate_by_comment_char_match = True
                                        char_filter_that_matched_comment = filter_item_obj
                                        self.logger(f"   Match Found (Comment in Comments Scope): Comment in post {post_id} matches char filter "
                                                    f"term '{term_to_match_comment}' (from group/name '{filter_item_obj['name']}'). Post is candidate.")
                                        self.logger(f"     Matching comment (first 100 chars): '{cleaned_comment_text[:100]}...'")
                                        break
                                if post_is_candidate_by_comment_char_match:
                                    break
                            if post_is_candidate_by_comment_char_match:
                                break
                    else:
                        self.logger(f"   No comments found or fetched for post {post_id} to check against character filters.")
                except RuntimeError as e_fetch_comment:
                    self.logger(f"   âš ī¸ Error fetching or processing comments for post {post_id}: {e_fetch_comment}")
                except Exception as e_generic_comment:
                    self.logger(f"   ❌ Unexpected error during comment processing for post {post_id}: "
                                f"{e_generic_comment}\n{traceback.format_exc(limit=2)}")
                self.logger(f"   [Char Scope: Comments] Phase 2 Result: post_is_candidate_by_comment_char_match = "
                            f"{post_is_candidate_by_comment_char_match}")
            else:  # post_is_candidate_by_file_char_match_in_comment_scope was True
                self.logger(f"   [Char Scope: Comments] Phase 2: Skipped comment check for post ID '{post_id}' "
                            f"because a file match already made it a candidate.")

        if current_character_filters:  # Any filters defined?
            if self.char_filter_scope == CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match:
                self.logger(f"   -> Skip Post (Scope: Title - No Char Match): Title '{post_title[:50]}' does not match character filters.")
                self._emit_signal('missed_character_post', post_title, "No title match for character filter")  # type: ignore
                return 0, num_potential_files_in_post, [], [], []
            if self.char_filter_scope == CHAR_SCOPE_COMMENTS and \
                    not post_is_candidate_by_file_char_match_in_comment_scope and \
                    not post_is_candidate_by_comment_char_match:  # Check both the file and comment match flags
                self.logger(f"   -> Skip Post (Scope: Comments - No Char Match in Comments): Post ID '{post_id}', Title '{post_title[:50]}...'")
                if self.emitter and hasattr(self.emitter, 'missed_character_post_signal'):  # Check emitter
                    self._emit_signal('missed_character_post', post_title, "No character match in files or comments (Comments scope)")  # type: ignore
                return 0, num_potential_files_in_post, [], [], []
        if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_POSTS or self.skip_words_scope == SKIP_SCOPE_BOTH):
            if self._check_pause(f"Skip words (post title) for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            post_title_lower = post_title.lower()
            for skip_word in self.skip_words_list:
                if skip_word.lower() in post_title_lower:
                    self.logger(f"   -> Skip Post (Keyword in Title '{skip_word}'): '{post_title[:50]}...'. Scope: {self.skip_words_scope}")
                    return 0, num_potential_files_in_post, [], [], []

        if not self.extract_links_only and self.manga_mode_active and current_character_filters and \
                (self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH) and \
                not post_is_candidate_by_title_char_match:
            self.logger(f"   -> Skip Post (Manga Mode with Title/Both Scope - No Title Char Match): "
                        f"Title '{post_title[:50]}' doesn't match filters.")
            self._emit_signal('missed_character_post', post_title, "Manga Mode: No title match for character filter (Title/Both scope)")  # type: ignore
            return 0, num_potential_files_in_post, [], [], []

        if not isinstance(post_attachments, list):
            self.logger(f"âš ī¸ Corrupt attachment data for post {post_id} (expected list, got {type(post_attachments)}). Skipping attachments.")
            post_attachments = []

        base_folder_names_for_post_content = []
        if not self.extract_links_only and self.use_subfolders:
            if self._check_pause(f"Subfolder determination for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            primary_char_filter_for_folder = None  # type: ignore
            log_reason_for_folder = ""
            if self.char_filter_scope == CHAR_SCOPE_COMMENTS and \
                    (char_filter_that_matched_file_in_comment_scope or char_filter_that_matched_comment):
                if post_is_candidate_by_file_char_match_in_comment_scope and char_filter_that_matched_file_in_comment_scope:
                    primary_char_filter_for_folder = char_filter_that_matched_file_in_comment_scope
                    log_reason_for_folder = "Matched char filter in filename (Comments scope)"
                elif post_is_candidate_by_comment_char_match and char_filter_that_matched_comment:  # Fall back to the comment match
                    primary_char_filter_for_folder = char_filter_that_matched_comment
                    log_reason_for_folder = "Matched char filter in comments (Comments scope, no file match)"
            elif (self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH) and char_filter_that_matched_title:
                # Existing logic for the other scopes
                primary_char_filter_for_folder = char_filter_that_matched_title
                log_reason_for_folder = "Matched char filter in title"

            if primary_char_filter_for_folder:
                cleaned_primary_folder_name = clean_folder_name(primary_char_filter_for_folder["name"])
                if cleaned_primary_folder_name.lower() in effective_unwanted_keywords_for_folder_naming and \
                        cleaned_primary_folder_name.lower() != "untitled_folder":
                    self.logger(f"   âš ī¸ Primary char filter folder name '{cleaned_primary_folder_name}' is in ignore list. Using generic name.")
                    base_folder_names_for_post_content = ["Generic Post Content"]
                else:
                    base_folder_names_for_post_content = [cleaned_primary_folder_name]
                self.logger(f"   Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
            elif not current_character_filters:  # No char filters defined; use generic logic
                # Stage 1: Try to match folder names from Known.txt using the post title
                derived_folders_from_title_via_known_txt = match_folders_from_title(
                    post_title, self.known_names,
                    effective_unwanted_keywords_for_folder_naming  # Use the full ignore list for Known.txt matching
                )
                valid_derived_folders_from_title_known_txt = [
                    name for name in derived_folders_from_title_via_known_txt
                    if name and name.strip() and name.lower() != "untitled_folder"
                ]
                if valid_derived_folders_from_title_known_txt:
                    base_folder_names_for_post_content.extend(valid_derived_folders_from_title_known_txt)
                    self.logger(f"   Base folder name(s) for post content (Derived from Known.txt & Post Title): "
                                f"{', '.join(base_folder_names_for_post_content)}")
                else:
                    # Stage 2: No Known.txt match from the title.
                    # Determine whether the title consists primarily of creator-specific ignore words.
                    # Get a candidate name from the title using only the generic FOLDER_NAME_STOP_WORDS.
                    candidate_name_from_title_basic_clean = extract_folder_name_from_title(
                        post_title, FOLDER_NAME_STOP_WORDS  # Only generic stop words
                    )
                    title_is_only_creator_ignored_words = False
                    if candidate_name_from_title_basic_clean and \
                            candidate_name_from_title_basic_clean.lower() != "untitled_folder" and \
                            self.creator_download_folder_ignore_words:  # Only when the creator-specific ignore list is active
                        candidate_title_words = {word.lower() for word in candidate_name_from_title_basic_clean.split()}
                        if candidate_title_words and candidate_title_words.issubset(self.creator_download_folder_ignore_words):
                            title_is_only_creator_ignored_words = True
                            self.logger(f"   Title-derived name '{candidate_name_from_title_basic_clean}' "
                                        f"consists only of creator-specific ignore words.")
                    if title_is_only_creator_ignored_words:
                        # Stage 3: The title is "bad". Try a Known.txt match on the filenames.
                        self.logger(f"   Attempting Known.txt match on filenames as title was poor ('{candidate_name_from_title_basic_clean}').")
                        filenames_to_check = [
                            f_info['_original_name_for_log']
                            for f_info in all_files_from_post_api_for_char_check  # Defined earlier in process()
                            if f_info.get('_original_name_for_log')
                        ]
                        derived_folders_from_filenames_known_txt = set()
                        if filenames_to_check:
                            for fname in filenames_to_check:
                                matches = match_folders_from_title(
                                    fname, self.known_names,
                                    effective_unwanted_keywords_for_folder_naming  # Use the full ignore list for matching
                                )
                                for m in matches:
                                    if m and m.strip() and m.lower() != "untitled_folder":
                                        derived_folders_from_filenames_known_txt.add(m)
                        if derived_folders_from_filenames_known_txt:
                            base_folder_names_for_post_content.extend(list(derived_folders_from_filenames_known_txt))
                            self.logger(f"   Base folder name(s) for post content (Derived from Known.txt & Filenames): "
                                        f"{', '.join(base_folder_names_for_post_content)}")
                        else:
                            final_title_extract = extract_folder_name_from_title(
                                post_title, effective_unwanted_keywords_for_folder_naming
                            )
                            base_folder_names_for_post_content.append(final_title_extract)
                            self.logger(f"   No Known.txt match from filenames. Using title-derived name (with full ignore list): "
                                        f"'{final_title_extract}'")
                    else:
                        extracted_name_from_title_full_ignore = extract_folder_name_from_title(
                            post_title, effective_unwanted_keywords_for_folder_naming
                        )
                        base_folder_names_for_post_content.append(extracted_name_from_title_full_ignore)
                        self.logger(f"   Base folder name(s) for post content (Generic title parsing - title not solely "
                                    f"creator-ignored words): {', '.join(base_folder_names_for_post_content)}")

            base_folder_names_for_post_content = [name for name in base_folder_names_for_post_content if name and name.strip()]
            if not base_folder_names_for_post_content:
                final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content")
                base_folder_names_for_post_content = [final_fallback_name]
                self.logger(f"   Ultimate fallback folder name: {final_fallback_name}")

        if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
            if self._check_pause(f"Folder keyword skip check for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            for folder_name_to_check in base_folder_names_for_post_content:  # type: ignore
                if not folder_name_to_check:
                    continue
                if any(skip_word.lower() in folder_name_to_check.lower() for skip_word in self.skip_words_list):
                    matched_skip = next((sw for sw in self.skip_words_list if sw.lower() in folder_name_to_check.lower()),
                                        "unknown_skip_word")  # type: ignore
                    self.logger(f"   -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check}' contains '{matched_skip}'.")
                    return 0, num_potential_files_in_post, [], [], []

        if (self.show_external_links or self.extract_links_only) and post_content_html:  # type: ignore
            if self._check_pause(f"External link extraction for post {post_id}"):
                return 0, num_potential_files_in_post, [], [], []
            try:
                mega_key_pattern = re.compile(r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')  # type: ignore
                unique_links_data = {}
                for match in link_pattern.finditer(post_content_html):
                    link_url = match.group(1).strip()
                    link_url = html.unescape(link_url)  # Decode HTML entities in the URL
                    link_inner_text = match.group(2)
                    if not any(ext in link_url.lower() for ext in ['.css', '.js', '.ico', '.xml', '.svg']) \
                            and not link_url.startswith('javascript:') \
                            and link_url not in unique_links_data:
                        clean_link_text = re.sub(r'<.*?>', '', link_inner_text)
                        clean_link_text = html.unescape(clean_link_text).strip()
                        display_text = clean_link_text if clean_link_text else "[Link]"
                        unique_links_data[link_url] = display_text
                links_emitted_count = 0
                scraped_platforms = {'kemono', 'coomer', 'patreon'}
                for link_url, link_text in unique_links_data.items():
                    platform = get_link_platform(link_url)
                    decryption_key_found = ""
                    if platform == 'mega':
                        parsed_mega_url = urlparse(link_url)
                        if parsed_mega_url.fragment:  # type: ignore
                            potential_key_from_fragment = parsed_mega_url.fragment.split('!')[-1]  # Handle cases like #!key or #key
                            if mega_key_pattern.fullmatch(potential_key_from_fragment):
                                decryption_key_found = potential_key_from_fragment
                        if not decryption_key_found and link_text:
                            key_match_in_text = mega_key_pattern.search(link_text)
                            if key_match_in_text:  # type: ignore
                                decryption_key_found = key_match_in_text.group(1)
                        if not decryption_key_found and self.extract_links_only and post_content_html:
                            key_match_in_content = mega_key_pattern.search(strip_html_tags(post_content_html))  # Search the cleaned content
                            if key_match_in_content:
                                decryption_key_found = key_match_in_content.group(1)
                    if platform not in scraped_platforms:
                        self._emit_signal('external_link', post_title, link_text, link_url, platform, decryption_key_found or "")
                        links_emitted_count += 1  # type: ignore
                if links_emitted_count > 0:
                    self.logger(f"   🔗 Found {links_emitted_count} potential external link(s) in post content.")
            except Exception as e:
                self.logger(f"âš ī¸ Error parsing post content for links: {e}\n{traceback.format_exc(limit=2)}")

        if self.extract_links_only:
            self.logger(f"   Extract Links Only mode: Finished processing post {post_id} for links.")
            return 0, 0, [], [], []

        all_files_from_post_api = []
        api_file_domain = urlparse(self.api_url_input).netloc
        if not api_file_domain or not any(d in api_file_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
            api_file_domain = "kemono.su" if "kemono" in self.service.lower() else "coomer.party"

        if post_main_file_info and isinstance(post_main_file_info, dict) and post_main_file_info.get('path'):
            file_path = post_main_file_info['path'].lstrip('/')
            original_api_name = post_main_file_info.get('name') or os.path.basename(file_path)
            if original_api_name:
                all_files_from_post_api.append({
                    'url': f"https://{api_file_domain}{file_path}" if file_path.startswith('/') else f"https://{api_file_domain}/data/{file_path}",
                    'name': original_api_name,  # The cleaned/API-provided name
                    '_original_name_for_log': original_api_name,
                    '_is_thumbnail': is_image(original_api_name)  # Mark if it's an image from the API
                })
            else:
                self.logger(f"   âš ī¸ Skipping main file for post {post_id}: Missing name (Path: {file_path})")

        for idx, att_info in enumerate(post_attachments):
            if isinstance(att_info, dict) and att_info.get('path'):  # Ensure att_info is a dict
                att_path = att_info['path'].lstrip('/')
                original_api_att_name = att_info.get('name') or os.path.basename(att_path)
                if original_api_att_name:
                    all_files_from_post_api.append({
                        'url': f"https://{api_file_domain}{att_path}" if att_path.startswith('/') else f"https://{api_file_domain}/data/{att_path}",
                        'name': original_api_att_name,
                        '_original_name_for_log': original_api_att_name,
                        '_is_thumbnail': is_image(original_api_att_name)  # Mark if it's an image from the API
                    })
                else:
                    self.logger(f"   âš ī¸ Skipping attachment {idx + 1} for post {post_id}: Missing name (Path: {att_path})")
            else:
                self.logger(f"   âš ī¸ Skipping invalid attachment {idx + 1} for post {post_id}: {str(att_info)[:100]}")

        if self.scan_content_for_images and post_content_html and not self.extract_links_only:
            self.logger(f"   Scanning post content for additional image URLs (Post ID: {post_id})...")
            parsed_input_url = urlparse(self.api_url_input)
            base_url_for_relative_paths = f"{parsed_input_url.scheme}://{parsed_input_url.netloc}"
            img_ext_pattern = "|".join(ext.lstrip('.') for ext in IMAGE_EXTENSIONS)
            # Direct image URLs in the text, and src attributes of <img> tags.
            direct_url_pattern_str = r"""(?i)\b(https?://[^\s"'<>\[\]\{\}\|\\\^~`]+\.(?:""" + img_ext_pattern + r"""))\b"""
            img_tag_src_pattern_str = r"""<img\s+[^>]*?src\s*=\s*["']([^"']+)["']"""
            found_image_sources = set()
            for direct_url_match in re.finditer(direct_url_pattern_str, post_content_html):
                found_image_sources.add(direct_url_match.group(1))
            for img_tag_match in re.finditer(img_tag_src_pattern_str, post_content_html, re.IGNORECASE):
                src_attr = img_tag_match.group(1).strip()
                src_attr = html.unescape(src_attr)
                if not src_attr:
                    continue
                resolved_src_url = ""
                if src_attr.startswith(('http://', 'https://')):
                    resolved_src_url = src_attr
                elif src_attr.startswith('//'):
                    resolved_src_url = f"{parsed_input_url.scheme}:{src_attr}"
                elif src_attr.startswith('/'):
                    resolved_src_url = f"{base_url_for_relative_paths}{src_attr}"
                if resolved_src_url:
                    parsed_resolved_url = urlparse(resolved_src_url)
                    if any(parsed_resolved_url.path.lower().endswith(ext) for ext in IMAGE_EXTENSIONS):
                        found_image_sources.add(resolved_src_url)
            if found_image_sources:
                self.logger(f"   Found {len(found_image_sources)} potential image URLs/sources in content.")
                existing_urls_in_api_list = {f_info['url'] for f_info in all_files_from_post_api}
                for found_url in found_image_sources:  # Iterate over the unique, resolved URLs
                    if self.check_cancel():
                        break
                    if found_url in existing_urls_in_api_list:
                        self.logger(f"     Skipping URL from content (already in API list or previously added from content): {found_url[:70]}...")
                        continue
                    try:
                        parsed_found_url = urlparse(found_url)
                        url_filename = os.path.basename(parsed_found_url.path)
                        if not url_filename or not is_image(url_filename):
                            self.logger(f"     Skipping URL from content (no filename part or not an image extension): {found_url[:70]}...")
                            continue
                        self.logger(f"     Adding image from content: {url_filename} (URL: {found_url[:70]}...)")
                        all_files_from_post_api.append({
                            'url': found_url,
                            'name': url_filename,
                            '_original_name_for_log': url_filename,
                            '_is_thumbnail': False,  # Images from content are not API thumbnails
                            '_from_content_scan': True
                        })
                        existing_urls_in_api_list.add(found_url)
                    except Exception as e_url_parse:
                        self.logger(f"     Error processing URL from content '{found_url[:70]}...': {e_url_parse}")
            else:
                self.logger(f"   No additional image URLs found in post content scan for post {post_id}.")

        if self.download_thumbnails:
            if self.scan_content_for_images:
                self.logger(f"   Mode: 'Download Thumbnails Only' + 'Scan Content for Images' active. "
                            f"Prioritizing images from content scan for post {post_id}.")
                all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo.get('_from_content_scan')]
                if not all_files_from_post_api:
                    self.logger(f"   -> No images found via content scan for post {post_id} in this combined mode.")
                    return 0, 0, [], [], []  # No files to download for this post
            else:
                self.logger(f"   Mode: 'Download Thumbnails Only' active. Filtering for API thumbnails for post {post_id}.")
                all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo.get('_is_thumbnail')]
                if not all_files_from_post_api:
                    self.logger(f"   -> No API image thumbnails found for post {post_id} in thumbnail-only mode.")
                    return 0, 0, [], [], []  # No files to download for this post

        if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED:
            def natural_sort_key_for_files(file_api_info):
                name = file_api_info.get('_original_name_for_log', '').lower()
                return [int(text) if text.isdigit() else text for text in re.split('([0-9]+)', name)]
            all_files_from_post_api.sort(key=natural_sort_key_for_files)
            self.logger(f"   Manga Date Mode: Sorted {len(all_files_from_post_api)} files within post {post_id} "
                        f"by original name for sequential numbering.")

        if not all_files_from_post_api:
            self.logger(f"   No files found to download for post {post_id}.")
            return 0, 0, [], [], []

        files_to_download_info_list = []
        processed_original_filenames_in_this_post = set()
        for file_info in all_files_from_post_api:
            current_api_original_filename = file_info.get('_original_name_for_log')
            if current_api_original_filename in processed_original_filenames_in_this_post:
                self.logger(f"   -> Skip Duplicate Original Name (within post {post_id}): "
                            f"'{current_api_original_filename}' already processed/listed for this post.")
                total_skipped_this_post += 1
            else:
                files_to_download_info_list.append(file_info)
                if current_api_original_filename:
                    processed_original_filenames_in_this_post.add(current_api_original_filename)

        if not files_to_download_info_list:
            self.logger(f"   All files for post {post_id} were duplicate original names or skipped earlier.")
            return 0, total_skipped_this_post, [], [], []

        self.logger(f"   Identified {len(files_to_download_info_list)} unique original file(s) for potential download from post {post_id}.")

        with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
            futures_list = []  # type: list[Future]
            for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
                if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"):
                    break
                if self.check_cancel():
                    break
                current_api_original_filename = file_info_to_dl.get('_original_name_for_log')

                file_is_candidate_by_char_filter_scope = False
                char_filter_info_that_matched_file = None
                if not current_character_filters:
                    file_is_candidate_by_char_filter_scope = True
                else:
                    if self.char_filter_scope == CHAR_SCOPE_FILES:
                        for filter_item_obj in current_character_filters:
                            terms_to_check_for_file = list(filter_item_obj["aliases"])
                            if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_for_file:
                                terms_to_check_for_file.append(filter_item_obj["name"])
                            unique_terms_for_file_check = list(set(terms_to_check_for_file))
                            for term_to_match in unique_terms_for_file_check:
                                if is_filename_match_for_character(current_api_original_filename, term_to_match):
                                    file_is_candidate_by_char_filter_scope = True
                                    char_filter_info_that_matched_file = filter_item_obj
                                    self.logger(f"     File '{current_api_original_filename}' matches char filter term "
                                                f"'{term_to_match}' (from '{filter_item_obj['name']}'). Scope: Files.")
                                    break
                            if file_is_candidate_by_char_filter_scope:
                                break
                    elif self.char_filter_scope == CHAR_SCOPE_TITLE:
                        if post_is_candidate_by_title_char_match:
                            file_is_candidate_by_char_filter_scope = True
                            char_filter_info_that_matched_file = char_filter_that_matched_title
                            self.logger(f"     File '{current_api_original_filename}' is candidate because post title matched. Scope: Title.")
                    elif self.char_filter_scope == CHAR_SCOPE_BOTH:
                        if post_is_candidate_by_title_char_match:
                            file_is_candidate_by_char_filter_scope = True
                            char_filter_info_that_matched_file = char_filter_that_matched_title
                            self.logger(f"     File '{current_api_original_filename}' is candidate because post title matched. "
                                        f"Scope: Both (Title part).")
                        else:
                            for filter_item_obj_both_file in current_character_filters:
                                terms_to_check_for_file_both = list(filter_item_obj_both_file["aliases"])
                                if filter_item_obj_both_file["is_group"] and \
                                        filter_item_obj_both_file["name"] not in terms_to_check_for_file_both:
                                    terms_to_check_for_file_both.append(filter_item_obj_both_file["name"])
                                unique_terms_for_file_both_check = list(set(terms_to_check_for_file_both))
                                for term_to_match in unique_terms_for_file_both_check:
                                    if is_filename_match_for_character(current_api_original_filename, term_to_match):
                                        file_is_candidate_by_char_filter_scope = True
                                        char_filter_info_that_matched_file = filter_item_obj_both_file
                                        self.logger(f"     File '{current_api_original_filename}' matches char filter term "
                                                    f"'{term_to_match}' (from '{filter_item_obj_both_file['name']}'). Scope: Both (File part).")
                                        break
                                if file_is_candidate_by_char_filter_scope:
                                    break
                    elif self.char_filter_scope == CHAR_SCOPE_COMMENTS:
                        if post_is_candidate_by_file_char_match_in_comment_scope:  # Post was a candidate due to a file match
                            file_is_candidate_by_char_filter_scope = True
                            char_filter_info_that_matched_file = char_filter_that_matched_file_in_comment_scope  # Filter that matched a file in the post
                            self.logger(f"     File '{current_api_original_filename}' is candidate because a file in this post "
                                        f"matched char filter (Overall Scope: Comments).")
                        elif post_is_candidate_by_comment_char_match:  # Post was a candidate due to a comment match (no file match)
                            file_is_candidate_by_char_filter_scope = True
                            char_filter_info_that_matched_file = char_filter_that_matched_comment  # Filter that matched comments
                            self.logger(f"     File '{current_api_original_filename}' is candidate because post comments "
                                        f"matched char filter (Overall Scope: Comments).")

                if not file_is_candidate_by_char_filter_scope:
                    self.logger(f"   -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.")
                    total_skipped_this_post += 1
                    continue

                # Determine the set of target base folder names for THIS file
                target_base_folders_for_this_file_iteration = []  # type: list[str]
                if current_character_filters:  # Active character filter from the UI
                    char_title_subfolder_name = None
                    if self.target_post_id_from_initial_url and self.custom_folder_name:
                        char_title_subfolder_name = self.custom_folder_name
                    elif char_filter_info_that_matched_file:
                        char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"])
                    elif char_filter_that_matched_title:
                        char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"])
                    elif char_filter_that_matched_comment:
                        char_title_subfolder_name = clean_folder_name(char_filter_that_matched_comment["name"])
                    if char_title_subfolder_name:
                        target_base_folders_for_this_file_iteration.append(char_title_subfolder_name)
                    else:
                        self.logger(f"âš ī¸ File '{current_api_original_filename}' candidate by char filter, "
                                    f"but no folder name derived. Using post title.")
                        target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
                else:
                    # No active character filter; use base_folder_names_for_post_content (possibly multiple from Known.txt)
                    if base_folder_names_for_post_content:
                        target_base_folders_for_this_file_iteration.extend(base_folder_names_for_post_content)
                    else:  # Fallback if base_folder_names_for_post_content was somehow empty
                        target_base_folders_for_this_file_iteration.append(clean_folder_name(post_title))
                if not target_base_folders_for_this_file_iteration:  # Ultimate fallback
                    target_base_folders_for_this_file_iteration.append(
                        clean_folder_name(post_title if post_title else "Uncategorized_Post_Content"))

                for target_base_folder_name_for_instance in target_base_folders_for_this_file_iteration:
                    current_path_for_file_instance = self.override_output_dir if self.override_output_dir else self.download_root
                    if self.use_subfolders and target_base_folder_name_for_instance:
                        current_path_for_file_instance = os.path.join(current_path_for_file_instance, target_base_folder_name_for_instance)
                    if self.use_post_subfolders:
                        cleaned_title_for_subfolder_instance = clean_folder_name(post_title)
                        current_path_for_file_instance = os.path.join(current_path_for_file_instance, cleaned_title_for_subfolder_instance)
                    manga_date_counter_to_pass = self.manga_date_file_counter_ref \
                        if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED else None
                    manga_global_counter_to_pass = self.manga_global_file_counter_ref \
                        if self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING else None
                    futures_list.append(file_pool.submit(
                        self._download_single_file,
                        file_info=file_info_to_dl,  # Pass the original file_info
                        target_folder_path=current_path_for_file_instance,
                        headers=headers,
                        original_post_id_for_log=post_id,
                        skip_event=self.skip_current_file_flag,
                        post_title=post_title,
                        manga_date_file_counter_ref=manga_date_counter_to_pass,
                        manga_global_file_counter_ref=manga_global_counter_to_pass,
                        file_index_in_post=file_idx,
                        num_files_in_this_post=len(files_to_download_info_list)
                    ))

            for future in as_completed(futures_list):
                if self.check_cancel():
                    for f_to_cancel in futures_list:
                        if not f_to_cancel.done():
                            f_to_cancel.cancel()
                    break
                try:
                    dl_count, skip_count, actual_filename_saved, original_kept_flag, status, details_for_dialog_or_retry = future.result()
                    total_downloaded_this_post += dl_count
                    total_skipped_this_post += skip_count
                    if original_kept_flag and dl_count > 0 and actual_filename_saved:
                        kept_original_filenames_for_log.append(actual_filename_saved)
                    if status == FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER and details_for_dialog_or_retry:
                        retryable_failures_this_post.append(details_for_dialog_or_retry)
                    elif status == FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION and details_for_dialog_or_retry:
                        permanent_failures_this_post.append(details_for_dialog_or_retry)
                except CancelledError:
                    self.logger(f"   File download task for post {post_id} was cancelled.")
                    total_skipped_this_post += 1
                except Exception as exc_f:
                    self.logger(f"❌ File download task for post {post_id} resulted in error: {exc_f}")
                    total_skipped_this_post += 1

        self._emit_signal('file_progress', "", None)
        if self.check_cancel():
            self.logger(f"   Post {post_id} processing interrupted/cancelled.")
        else:
            self.logger(f"   Post {post_id} Summary: Downloaded={total_downloaded_this_post}, Skipped Files={total_skipped_this_post}")
        return total_downloaded_this_post, total_skipped_this_post, kept_original_filenames_for_log, \
            retryable_failures_this_post, permanent_failures_this_post
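
# The natural sort used in manga date mode above compares digit runs numerically,
# so "page2" sorts before "page10". A standalone illustration of the same key
# (the helper name 'natural_key' is only for this sketch):
#
#   import re
#   def natural_key(name: str):
#       return [int(tok) if tok.isdigit() else tok
#               for tok in re.split(r'([0-9]+)', name.lower())]
#   sorted(["p10.png", "p2.png"], key=natural_key)  # -> ['p2.png', 'p10.png']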
class DownloadThread(QThread):
    progress_signal = pyqtSignal(str)
    add_character_prompt_signal = pyqtSignal(str)
    file_download_status_signal = pyqtSignal(bool)
    finished_signal = pyqtSignal(int, int, bool, list)  # total_downloaded, total_skipped, cancelled_by_user, kept_original_names_list  # type: ignore
    external_link_signal = pyqtSignal(str, str, str, str, str)  # post_title, link_text, link_url, platform, decryption_key
    file_progress_signal = pyqtSignal(str, object)
    retryable_file_failed_signal = pyqtSignal(list)  # New: list of retry_details dicts
    missed_character_post_signal = pyqtSignal(str, str)  # New: post_title, reason
    permanent_file_failed_signal = pyqtSignal(list)  # New: list of permanent failure details

    def __init__(self, api_url_input, output_dir, known_names_copy, cancellation_event, pause_event,
                 filter_character_list=None, dynamic_character_filter_holder=None,  # Added pause_event and holder
                 filter_mode='all', skip_zip=True, skip_rar=True,
                 use_subfolders=True, use_post_subfolders=False, custom_folder_name=None,
                 compress_images=False, download_thumbnails=False, service=None, user_id=None,
                 downloaded_files=None, downloaded_file_hashes=None,
                 downloaded_files_lock=None, downloaded_file_hashes_lock=None,
                 skip_words_list=None, skip_words_scope=SKIP_SCOPE_FILES,
                 show_external_links=False, extract_links_only=False,
                 num_file_threads_for_worker=1, skip_current_file_flag=None,
                 start_page=None, end_page=None, target_post_id_from_initial_url=None,
                 manga_mode_active=False, unwanted_keywords=None,
                 manga_filename_style=STYLE_POST_TITLE, char_filter_scope=CHAR_SCOPE_FILES,
                 remove_from_filename_words_list=None,
                 manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT,  # New parameter
                 allow_multipart_download=True,
                 selected_cookie_file=None,  # New parameter for selected cookie file
                 override_output_dir=None,  # New parameter
                 app_base_dir=None,  # New parameter
                 manga_date_file_counter_ref=None,  # New parameter
                 manga_global_file_counter_ref=None,  # New parameter for global numbering
                 use_cookie=False,  # Added: Expected by main.py
                 scan_content_for_images=False,  # Added new flag
                 creator_download_folder_ignore_words=None,  # Added for DownloadThread
                 cookie_text="",  # Added: Expected by main.py
                 ):
        super().__init__()
        self.api_url_input = api_url_input
        self.output_dir = output_dir
        self.known_names = list(known_names_copy)  # type: ignore
        self.cancellation_event = cancellation_event
        self.pause_event = pause_event  # Store pause_event
        self.skip_current_file_flag = skip_current_file_flag
        self.initial_target_post_id = target_post_id_from_initial_url
        self.filter_character_list_objects_initial = filter_character_list if filter_character_list else []  # Store initial
        self.dynamic_filter_holder = dynamic_character_filter_holder  # Store the holder
        self.filter_mode = filter_mode
        self.skip_zip = skip_zip
        self.skip_rar = skip_rar
        self.use_subfolders = use_subfolders
        self.use_post_subfolders = use_post_subfolders
        self.custom_folder_name = custom_folder_name
        self.compress_images = compress_images
        self.download_thumbnails = download_thumbnails
        self.service = service
        self.user_id = user_id
        self.skip_words_list = skip_words_list if skip_words_list is not None else []
        self.skip_words_scope = skip_words_scope
        self.downloaded_files = downloaded_files
        self.downloaded_files_lock = downloaded_files_lock
self.downloaded_file_hashes = downloaded_file_hashes self.downloaded_file_hashes_lock = downloaded_file_hashes_lock self._add_character_response = None self.prompt_mutex = QMutex() self.show_external_links = show_external_links self.extract_links_only = extract_links_only self.num_file_threads_for_worker = num_file_threads_for_worker self.start_page = start_page self.end_page = end_page self.manga_mode_active = manga_mode_active self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else \ {'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'} self.manga_filename_style = manga_filename_style self.char_filter_scope = char_filter_scope self.remove_from_filename_words_list = remove_from_filename_words_list self.manga_date_prefix = manga_date_prefix # Store the prefix self.allow_multipart_download = allow_multipart_download self.selected_cookie_file = selected_cookie_file # Store selected cookie file self.app_base_dir = app_base_dir # Store app base dir self.cookie_text = cookie_text # Store cookie text self.use_cookie = use_cookie # Store cookie setting self.override_output_dir = override_output_dir # Store override dir self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread self.scan_content_for_images = scan_content_for_images # Store new flag self.creator_download_folder_ignore_words = creator_download_folder_ignore_words # Store new ignore words self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store for global numbering if self.compress_images and Image is None: self.logger("âš ī¸ Image compression disabled: Pillow library not found (DownloadThread).") self.compress_images = False def logger(self, message): self.progress_signal.emit(str(message)) def isInterruptionRequested(self): return self.cancellation_event.is_set() or super().isInterruptionRequested() def _check_pause_self(self, context_message="DownloadThread operation"): if self.pause_event and self.pause_event.is_set(): self.logger(f" {context_message} paused...") while self.pause_event.is_set(): if self.isInterruptionRequested(): self.logger(f" {context_message} cancelled while paused.") return True # Indicates cancellation occurred time.sleep(0.5) if not self.isInterruptionRequested(): self.logger(f" {context_message} resumed.") return False def skip_file(self): if self.isRunning() and self.skip_current_file_flag: self.logger("â­ī¸ Skip requested for current file (single-thread mode).") self.skip_current_file_flag.set() else: self.logger("â„šī¸ Skip file: No download active or skip flag not available for current context.") def run(self): grand_total_downloaded_files = 0 grand_total_skipped_files = 0 grand_list_of_kept_original_filenames = [] was_process_cancelled = False # Initialize manga_date_file_counter_ref if needed if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED and \ not self.extract_links_only and self.manga_date_file_counter_ref is None: series_scan_dir = self.output_dir if self.use_subfolders: if self.filter_character_list_objects_initial and self.filter_character_list_objects_initial[0] and self.filter_character_list_objects_initial[0].get("name"): series_folder_name = clean_folder_name(self.filter_character_list_objects_initial[0]["name"]) series_scan_dir = os.path.join(series_scan_dir, series_folder_name) elif self.service and self.user_id: creator_based_folder_name = clean_folder_name(str(self.user_id)) # Ensure user_id is string for clean_folder_name series_scan_dir = 
os.path.join(series_scan_dir, creator_based_folder_name) highest_num = 0 if os.path.isdir(series_scan_dir): self.logger(f"â„šī¸ [Thread] Manga Date Mode: Scanning for existing files in '{series_scan_dir}'...") for dirpath, _, filenames_in_dir in os.walk(series_scan_dir): for filename_to_check in filenames_in_dir: # Check if filename starts with the manga_date_prefix if it's set prefix_to_check = clean_filename(self.manga_date_prefix.strip()) if self.manga_date_prefix and self.manga_date_prefix.strip() else "" name_part_to_match = filename_to_check if prefix_to_check and name_part_to_match.startswith(prefix_to_check): name_part_to_match = name_part_to_match[len(prefix_to_check):].lstrip() # Remove prefix and leading space base_name_no_ext = os.path.splitext(name_part_to_match)[0] match = re.match(r"(\d+)", base_name_no_ext) # Match any number of digits at the start if match: highest_num = max(highest_num, int(match.group(1))) self.manga_date_file_counter_ref = [highest_num + 1, threading.Lock()] self.logger(f"â„šī¸ [Thread] Manga Date Mode: Initialized date-based counter at {self.manga_date_file_counter_ref[0]}.") # Initialize manga_global_file_counter_ref if needed (separate from date_based) if self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING and \ not self.extract_links_only and self.manga_global_file_counter_ref is None: self.manga_global_file_counter_ref = [1, threading.Lock()] # Assign to the member variable self.logger(f"â„šī¸ [Thread] Manga Title+GlobalNum Mode: Initialized global counter at {self.manga_global_file_counter_ref[0]}.") worker_signals_obj = PostProcessorSignals() try: worker_signals_obj.progress_signal.connect(self.progress_signal) worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal) worker_signals_obj.file_progress_signal.connect(self.file_progress_signal) worker_signals_obj.external_link_signal.connect(self.external_link_signal) worker_signals_obj.missed_character_post_signal.connect(self.missed_character_post_signal) self.logger(" Starting post fetch (single-threaded download process)...") post_generator = download_from_api( self.api_url_input, logger=self.logger, start_page=self.start_page, end_page=self.end_page, manga_mode=self.manga_mode_active, cancellation_event=self.cancellation_event, # type: ignore pause_event=self.pause_event, # Pass pause_event use_cookie=self.use_cookie, # Pass cookie settings for API calls cookie_text=self.cookie_text, selected_cookie_file=self.selected_cookie_file, app_base_dir=self.app_base_dir ) for posts_batch_data in post_generator: if self._check_pause_self("Post batch processing"): was_process_cancelled = True; break if self.isInterruptionRequested(): was_process_cancelled = True; break for individual_post_data in posts_batch_data: if self._check_pause_self(f"Individual post processing for {individual_post_data.get('id', 'N/A')}"): was_process_cancelled = True; break if self.isInterruptionRequested(): was_process_cancelled = True; break post_processing_worker = PostProcessorWorker( post_data=individual_post_data, download_root=self.output_dir, known_names=self.known_names, filter_character_list=self.filter_character_list_objects_initial, # Pass initial dynamic_character_filter_holder=self.dynamic_filter_holder, # Pass the holder unwanted_keywords=self.unwanted_keywords, filter_mode=self.filter_mode, skip_zip=self.skip_zip, skip_rar=self.skip_rar, use_subfolders=self.use_subfolders, use_post_subfolders=self.use_post_subfolders, 
                    post_processing_worker = PostProcessorWorker(
                        post_data=individual_post_data,
                        download_root=self.output_dir,
                        known_names=self.known_names,
                        filter_character_list=self.filter_character_list_objects_initial,  # Initial (static) character filter list
                        dynamic_character_filter_holder=self.dynamic_filter_holder,  # Holder for filters updated mid-run
                        unwanted_keywords=self.unwanted_keywords,
                        filter_mode=self.filter_mode,
                        skip_zip=self.skip_zip,
                        skip_rar=self.skip_rar,
                        use_subfolders=self.use_subfolders,
                        use_post_subfolders=self.use_post_subfolders,
                        target_post_id_from_initial_url=self.initial_target_post_id,
                        custom_folder_name=self.custom_folder_name,
                        compress_images=self.compress_images,
                        download_thumbnails=self.download_thumbnails,
                        service=self.service,
                        user_id=self.user_id,
                        api_url_input=self.api_url_input,
                        pause_event=self.pause_event,
                        cancellation_event=self.cancellation_event,
                        emitter=worker_signals_obj,  # The signals object acts as the emitter in single-thread mode
                        downloaded_files=self.downloaded_files,
                        downloaded_file_hashes=self.downloaded_file_hashes,
                        downloaded_files_lock=self.downloaded_files_lock,
                        downloaded_file_hashes_lock=self.downloaded_file_hashes_lock,
                        skip_words_list=self.skip_words_list,
                        skip_words_scope=self.skip_words_scope,
                        show_external_links=self.show_external_links,
                        extract_links_only=self.extract_links_only,
                        num_file_threads=self.num_file_threads_for_worker,
                        skip_current_file_flag=self.skip_current_file_flag,
                        manga_mode_active=self.manga_mode_active,
                        manga_filename_style=self.manga_filename_style,
                        manga_date_prefix=self.manga_date_prefix,
                        char_filter_scope=self.char_filter_scope,
                        remove_from_filename_words_list=self.remove_from_filename_words_list,
                        allow_multipart_download=self.allow_multipart_download,
                        selected_cookie_file=self.selected_cookie_file,
                        app_base_dir=self.app_base_dir,
                        cookie_text=self.cookie_text,
                        override_output_dir=self.override_output_dir,
                        manga_global_file_counter_ref=self.manga_global_file_counter_ref,
                        use_cookie=self.use_cookie,
                        manga_date_file_counter_ref=self.manga_date_file_counter_ref,
                        scan_content_for_images=self.scan_content_for_images,
                        creator_download_folder_ignore_words=self.creator_download_folder_ignore_words,
                    )
                    try:
                        dl_count, skip_count, kept_originals_this_post, retryable_failures, permanent_failures = post_processing_worker.process()
                        grand_total_downloaded_files += dl_count
                        grand_total_skipped_files += skip_count
                        if kept_originals_this_post:
                            grand_list_of_kept_original_filenames.extend(kept_originals_this_post)
                        if retryable_failures:
                            self.retryable_file_failed_signal.emit(retryable_failures)
                        if permanent_failures:  # New signal for permanently failed files
                            self.permanent_file_failed_signal.emit(permanent_failures)
                    except Exception as proc_err:
                        post_id_for_err = individual_post_data.get('id', 'N/A')
                        self.logger(f"❌ Error processing post {post_id_for_err} in DownloadThread: {proc_err}")
                        traceback.print_exc()
                        # Estimate how many files this post would have yielded so totals stay roughly accurate
                        num_potential_files_est = len(individual_post_data.get('attachments', [])) + \
                            (1 if individual_post_data.get('file') else 0)
                        grand_total_skipped_files += num_potential_files_est

                    if self.skip_current_file_flag and self.skip_current_file_flag.is_set():
                        self.skip_current_file_flag.clear()
                        self.logger(" Skip current file flag was processed and cleared by DownloadThread.")
                    self.msleep(10)
                if was_process_cancelled:
                    break

            if not was_process_cancelled and not self.isInterruptionRequested():
                self.logger("✅ All posts processed or end of content reached by DownloadThread.")
        except Exception as main_thread_err:
            self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
            traceback.print_exc()
            if not self.isInterruptionRequested():
                was_process_cancelled = False
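        # Teardown note: the disconnects below run in `finally` so a cancelled
        # or crashed run cannot leave the shared signals object wired to this
        # finished thread; the except clause absorbs the TypeError/RuntimeError
        # PyQt raises when disconnecting a signal that was never connected.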
        finally:
            try:
                if worker_signals_obj:
                    worker_signals_obj.progress_signal.disconnect(self.progress_signal)
                    worker_signals_obj.file_download_status_signal.disconnect(self.file_download_status_signal)
                    worker_signals_obj.external_link_signal.disconnect(self.external_link_signal)
                    worker_signals_obj.file_progress_signal.disconnect(self.file_progress_signal)
                    worker_signals_obj.missed_character_post_signal.disconnect(self.missed_character_post_signal)
            except (TypeError, RuntimeError) as e:
                self.logger(f"â„šī¸ Note during DownloadThread signal disconnection: {e}")
            self.finished_signal.emit(grand_total_downloaded_files, grand_total_skipped_files,
                                      self.isInterruptionRequested(), grand_list_of_kept_original_filenames)

    def receive_add_character_result(self, result):
        with QMutexLocker(self.prompt_mutex):
            self._add_character_response = result
        self.logger(f" (DownloadThread) Received character prompt response: {'Yes (added/confirmed)' if result else 'No (declined/failed)'}")
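    # Note (hedged): receive_add_character_result is the callback half of a
    # cross-thread prompt. The GUI side is expected to call it with True/False
    # after asking the user; the response is stored under prompt_mutex and
    # presumably read back elsewhere under the same lock. The prompt-display
    # wiring itself lives outside this class.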