# Kemono-Downloader/downloader_utils.py

import os
import time
import requests
import re
import threading
import queue # Not directly used for link queue, but kept for historical reasons
import hashlib
import http.client
import traceback
from concurrent.futures import ThreadPoolExecutor, Future, CancelledError, as_completed
import html
from PyQt5.QtCore import QObject, pyqtSignal, QThread, QMutex, QMutexLocker
from urllib.parse import urlparse
try:
from PIL import Image
except ImportError:
print("ERROR: Pillow library not found. Please install it: pip install Pillow")
Image = None
from io import BytesIO
# Constants for filename styles, mirroring main.py for clarity if used directly here
STYLE_POST_TITLE = "post_title"
STYLE_ORIGINAL_NAME = "original_name"
# Constants for skip_words_scope, mirroring main.py
SKIP_SCOPE_FILES = "files"
SKIP_SCOPE_POSTS = "posts"
SKIP_SCOPE_BOTH = "both"
fastapi_app = None
KNOWN_NAMES = []
IMAGE_EXTENSIONS = {
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp',
'.heic', '.heif', '.svg', '.ico', '.jfif', '.pjpeg', '.pjp', '.avif'
}
VIDEO_EXTENSIONS = {
'.mp4', '.mov', '.mkv', '.webm', '.avi', '.wmv', '.flv', '.mpeg',
'.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob'
}
# ADDED: Archive Extensions
ARCHIVE_EXTENSIONS = {
'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2' # Added more common archive types
}
def is_title_match_for_character(post_title, character_name_filter):
"""Checks if a post title contains a specific character name (case-insensitive, whole word)."""
if not post_title or not character_name_filter:
return False
pattern = r"(?i)\b" + re.escape(character_name_filter) + r"\b"
return bool(re.search(pattern, post_title))
def is_filename_match_for_character(filename, character_name_filter):
"""Checks if a filename contains a specific character name (case-insensitive, substring)."""
if not filename or not character_name_filter:
return False
return character_name_filter.lower() in filename.lower()
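
# Illustrative behaviour of the two matchers above (hypothetical inputs):
#   is_title_match_for_character("Alice beach set", "alice")   -> True   (whole-word, case-insensitive)
#   is_title_match_for_character("Malice arc", "alice")        -> False  (no word-boundary match)
#   is_filename_match_for_character("malice_01.png", "alice")  -> True   (plain substring match)
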
def clean_folder_name(name):
"""Cleans a string to be suitable for a folder name."""
if not isinstance(name, str): name = str(name)
cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
cleaned = cleaned.strip()
cleaned = re.sub(r'\s+', '_', cleaned)
return cleaned if cleaned else "untitled_folder"
def clean_filename(name):
"""Cleans a string to be suitable for a file name."""
if not isinstance(name, str): name = str(name)
cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
cleaned = cleaned.strip()
cleaned = re.sub(r'\s+', '_', cleaned)
return cleaned if cleaned else "untitled_file"
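
# Example outputs of the cleaning helpers above (illustrative values):
#   clean_folder_name("My Title: Part 2!")      -> "My_Title_Part_2"
#   clean_filename("cover image (final).png")   -> "cover_image_(final).png"
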
def extract_folder_name_from_title(title, unwanted_keywords):
"""Extracts a potential folder name from a title, avoiding unwanted keywords."""
if not title: return 'Uncategorized'
title_lower = title.lower()
tokens = re.findall(r'\b[\w\-]+\b', title_lower)
for token in tokens:
clean_token = clean_folder_name(token)
if clean_token and clean_token.lower() not in unwanted_keywords:
return clean_token
cleaned_full_title = clean_folder_name(title)
return cleaned_full_title if cleaned_full_title else 'Uncategorized'
def match_folders_from_title(title, names_to_match, unwanted_keywords):
"""
Matches names from a list against a title to determine potential folder names.
Prioritizes longer matches.
"""
if not title or not names_to_match: return []
title_lower = title.lower()
matched_cleaned_names = set()
sorted_names_to_match = sorted(names_to_match, key=len, reverse=True)
for name in sorted_names_to_match:
name_lower = name.lower()
if not name_lower: continue
pattern = r'\b' + re.escape(name_lower) + r'\b'
if re.search(pattern, title_lower):
cleaned_name_for_folder = clean_folder_name(name)
if cleaned_name_for_folder.lower() not in unwanted_keywords:
matched_cleaned_names.add(cleaned_name_for_folder)
return sorted(list(matched_cleaned_names))
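
# Illustrative example (hypothetical known-names list):
#   match_folders_from_title("Alice and Bob at the beach", ["Alice", "Bob", "Carol"], set())
#   -> ["Alice", "Bob"]   # whole-word matches, cleaned for folder use and sorted
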
def is_image(filename):
"""Checks if the filename has a common image extension."""
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in IMAGE_EXTENSIONS
def is_video(filename):
"""Checks if the filename has a common video extension."""
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in VIDEO_EXTENSIONS
def is_zip(filename):
"""Checks if the filename ends with .zip (case-insensitive)."""
if not filename: return False
return filename.lower().endswith('.zip')
def is_rar(filename):
"""Checks if the filename ends with .rar (case-insensitive)."""
if not filename: return False
return filename.lower().endswith('.rar')
# ADDED: Generic is_archive function
def is_archive(filename):
"""Checks if the filename has a common archive extension."""
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in ARCHIVE_EXTENSIONS
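
# Quick reference for the extension helpers above (illustrative filenames):
#   is_image("page_01.PNG") -> True      is_video("clip.webm") -> True
#   is_zip("pack.zip")      -> True      is_rar("pack.rar")    -> True
#   is_archive("pack.7z")   -> True      (any extension in ARCHIVE_EXTENSIONS)
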
def is_post_url(url):
"""Checks if the URL likely points to a specific post."""
if not isinstance(url, str): return False
return '/post/' in urlparse(url).path
def extract_post_info(url_string):
"""Extracts service, user ID, and post ID from a Kemono/Coomer URL."""
service, user_id, post_id = None, None, None
if not isinstance(url_string, str) or not url_string.strip(): return None, None, None
try:
parsed_url = urlparse(url_string.strip())
domain = parsed_url.netloc.lower()
is_kemono = any(d in domain for d in ['kemono.su', 'kemono.party'])
is_coomer = any(d in domain for d in ['coomer.su', 'coomer.party'])
if not (is_kemono or is_coomer): return None, None, None
path_parts = [part for part in parsed_url.path.strip('/').split('/') if part]
if len(path_parts) >= 3 and path_parts[1].lower() == 'user':
service = path_parts[0]
user_id = path_parts[2]
if len(path_parts) >= 5 and path_parts[3].lower() == 'post':
post_id = path_parts[4]
return service, user_id, post_id
if len(path_parts) >= 5 and path_parts[0].lower() == 'api' and \
path_parts[1].lower() == 'v1' and path_parts[3].lower() == 'user':
service = path_parts[2]
user_id = path_parts[4]
if len(path_parts) >= 7 and path_parts[5].lower() == 'post':
post_id = path_parts[6]
return service, user_id, post_id
except Exception as e:
print(f"Debug: Exception during extract_post_info for URL '{url_string}': {e}")
return None, None, None
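
# Illustrative URL parsing (hypothetical IDs):
#   extract_post_info("https://kemono.su/patreon/user/12345/post/67890") -> ("patreon", "12345", "67890")
#   extract_post_info("https://coomer.su/onlyfans/user/somecreator")     -> ("onlyfans", "somecreator", None)
#   Any non-Kemono/Coomer URL returns (None, None, None).
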
def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None):
"""Fetches a single page of posts from the API."""
if cancellation_event and cancellation_event.is_set():
logger(" Fetch cancelled before request.")
raise RuntimeError("Fetch operation cancelled by user.")
paginated_url = f'{api_url_base}?o={offset}'
logger(f" Fetching: {paginated_url} (Page approx. {offset // 50 + 1})")
try:
response = requests.get(paginated_url, headers=headers, timeout=(10, 60))
response.raise_for_status()
if 'application/json' not in response.headers.get('Content-Type', '').lower():
logger(f"⚠️ Unexpected content type from API: {response.headers.get('Content-Type')}. Body: {response.text[:200]}")
return [] # Return empty list on unexpected content type
return response.json()
except requests.exceptions.Timeout:
raise RuntimeError(f"Timeout fetching offset {offset} from {paginated_url}")
except requests.exceptions.RequestException as e:
err_msg = f"Error fetching offset {offset} from {paginated_url}: {e}"
if e.response is not None:
err_msg += f" (Status: {e.response.status_code}, Body: {e.response.text[:200]})"
raise RuntimeError(err_msg)
except ValueError as e: # JSONDecodeError is a subclass of ValueError
raise RuntimeError(f"Error decoding JSON from offset {offset} ({paginated_url}): {e}. Response text: {response.text[:200]}")
except Exception as e:
raise RuntimeError(f"Unexpected error fetching offset {offset} ({paginated_url}): {e}")

def download_from_api(api_url_input, logger=print, start_page=None, end_page=None, manga_mode=False, cancellation_event=None):
"""
Generator function to fetch post data from Kemono/Coomer API.
Handles pagination and yields batches of posts.
In Manga Mode, fetches all posts first, then yields them in reverse order (oldest first).
    If the URL points to a specific post, pagination continues until that post is found or all pages are exhausted.
"""
headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
service, user_id, target_post_id = extract_post_info(api_url_input)
if cancellation_event and cancellation_event.is_set():
logger(" Download_from_api cancelled at start.")
return
if not service or not user_id:
logger(f"❌ Invalid URL or could not extract service/user: {api_url_input}")
return
if target_post_id and (start_page or end_page):
logger("⚠️ Page range (start/end page) is ignored when a specific post URL is provided (searching all pages for the post).")
start_page = end_page = None # Ensure no page limits when searching for a specific post
is_creator_feed_for_manga = manga_mode and not target_post_id
parsed_input = urlparse(api_url_input)
api_domain = parsed_input.netloc
if not any(d in api_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
logger(f"⚠️ Unrecognized domain '{api_domain}'. Defaulting to kemono.su for API calls.")
api_domain = "kemono.su"
api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}"
page_size = 50 # Kemono API typically returns 50 posts per page
if is_creator_feed_for_manga:
logger(" Manga Mode: Fetching all posts to reverse order (oldest posts processed first)...")
all_posts_for_manga_mode = []
current_offset_manga = 0
while True:
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post fetching cancelled.")
break
try:
posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger, cancellation_event)
if not isinstance(posts_batch_manga, list):
logger(f"❌ API Error (Manga Mode): Expected list of posts, got {type(posts_batch_manga)}.")
break
if not posts_batch_manga:
logger("✅ Reached end of posts (Manga Mode fetch all).")
break
all_posts_for_manga_mode.extend(posts_batch_manga)
current_offset_manga += len(posts_batch_manga) # Use actual length
time.sleep(0.6)
except RuntimeError as e:
if "cancelled by user" in str(e).lower():
logger(f" Manga mode pagination stopped due to cancellation: {e}")
else:
logger(f"{e}\n Aborting manga mode pagination.")
break # Stop on runtime error
except Exception as e: # Catch any other unexpected errors
logger(f"❌ Unexpected error during manga mode fetch: {e}")
traceback.print_exc()
break # Stop on other errors
if cancellation_event and cancellation_event.is_set(): return
if all_posts_for_manga_mode:
logger(f" Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Reversing order...")
all_posts_for_manga_mode.reverse() # Oldest first
for i in range(0, len(all_posts_for_manga_mode), page_size):
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post yielding cancelled.")
break
yield all_posts_for_manga_mode[i:i + page_size]
else:
logger(" Manga Mode: No posts found to process.")
return # End of manga mode logic
# --- Regular pagination (Creator feed or Single Post search) ---
current_page_num = 1
current_offset = 0
processed_target_post_flag = False
if start_page and start_page > 1 and not target_post_id: # Only apply start_page if not searching for a specific post
current_offset = (start_page - 1) * page_size
current_page_num = start_page
logger(f" Starting from page {current_page_num} (calculated offset {current_offset}).")
while True:
if cancellation_event and cancellation_event.is_set():
logger(" Post fetching loop cancelled.")
break
if target_post_id and processed_target_post_flag: # If target post was found and yielded in a previous iteration
# logger(f"✅ Target post {target_post_id} was processed. Stopping pagination.") # Logged when found
break
# For creator feeds (not target_post_id mode), check end_page limit
if not target_post_id and end_page and current_page_num > end_page:
logger(f"✅ Reached specified end page ({end_page}) for creator feed. Stopping.")
break
try:
posts_batch = fetch_posts_paginated(api_base_url, headers, current_offset, logger, cancellation_event)
if not isinstance(posts_batch, list):
logger(f"❌ API Error: Expected list of posts, got {type(posts_batch)} at page {current_page_num} (offset {current_offset}).")
break
except RuntimeError as e:
if "cancelled by user" in str(e).lower():
logger(f" Pagination stopped due to cancellation: {e}")
else:
logger(f"{e}\n Aborting pagination at page {current_page_num} (offset {current_offset}).")
break # Stop on runtime error
except Exception as e: # Catch any other unexpected errors
logger(f"❌ Unexpected error fetching page {current_page_num} (offset {current_offset}): {e}")
traceback.print_exc()
break # Stop on other errors
if not posts_batch: # API returned an empty list, meaning no more posts
if target_post_id and not processed_target_post_flag:
logger(f"❌ Target post {target_post_id} not found after checking all available pages (API returned no more posts at offset {current_offset}).")
elif not target_post_id: # Normal creator feed end
if current_page_num == (start_page or 1): # Check if it was the first page attempted
logger(f"😕 No posts found on the first page checked (page {current_page_num}, offset {current_offset}).")
else:
logger(f"✅ Reached end of posts (no more content from API at offset {current_offset}).")
break # Exit while loop
if target_post_id and not processed_target_post_flag:
matching_post = next((p for p in posts_batch if str(p.get('id')) == str(target_post_id)), None)
if matching_post:
logger(f"🎯 Found target post {target_post_id} on page {current_page_num} (offset {current_offset}).")
yield [matching_post] # Yield only the matching post as a list
processed_target_post_flag = True
# Loop will break at the top in the next iteration due to processed_target_post_flag
# If not found in this batch, the loop continues to the next page.
# Logger message for "not found in batch" is removed here to avoid spam if post is on a later page.
elif not target_post_id: # Processing a creator feed (no specific target post)
yield posts_batch
if processed_target_post_flag: # If we just found and yielded the target post, stop.
break
# Increment page and offset for the next iteration
current_offset += len(posts_batch) # Use actual length of batch for offset
current_page_num += 1
time.sleep(0.6) # Keep the delay
# Final check after the loop, specifically if a target post was being searched for but not found
if target_post_id and not processed_target_post_flag and not (cancellation_event and cancellation_event.is_set()):
# This log might be redundant if the one inside "if not posts_batch:" already covered it,
# but it serves as a final confirmation if the loop exited for other reasons before exhausting pages.
logger(f"❌ Target post {target_post_id} could not be found after checking all relevant pages (final check after loop).")
def get_link_platform(url):
"""Attempts to identify the platform of an external link from its domain."""
try:
domain = urlparse(url).netloc.lower()
if 'drive.google.com' in domain: return 'google drive'
if 'mega.nz' in domain or 'mega.io' in domain: return 'mega'
if 'dropbox.com' in domain: return 'dropbox'
if 'patreon.com' in domain: return 'patreon'
if 'instagram.com' in domain: return 'instagram'
if 'twitter.com' in domain or 'x.com' in domain: return 'twitter/x'
if 'discord.gg' in domain or 'discord.com/invite' in domain: return 'discord invite'
if 'pixiv.net' in domain: return 'pixiv'
if 'kemono.su' in domain or 'kemono.party' in domain: return 'kemono'
if 'coomer.su' in domain or 'coomer.party' in domain: return 'coomer'
parts = domain.split('.')
if len(parts) >= 2:
if parts[-2] not in ['com', 'org', 'net', 'gov', 'edu', 'co'] or len(parts) == 2:
return parts[-2]
elif len(parts) >= 3 and parts[-3] not in ['com', 'org', 'net', 'gov', 'edu', 'co']:
return parts[-3]
else:
return domain
return 'external'
except Exception: return 'unknown'
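
# Illustrative outputs of get_link_platform (hypothetical links):
#   get_link_platform("https://drive.google.com/file/d/abc")  -> 'google drive'
#   get_link_platform("https://mega.nz/folder/xyz")           -> 'mega'
#   get_link_platform("https://someartist.fanbox.cc/posts/1") -> 'fanbox'  (via the domain heuristic)
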
class PostProcessorSignals(QObject):
"""Defines signals used by PostProcessorWorker to communicate with the GUI thread."""
progress_signal = pyqtSignal(str)
file_download_status_signal = pyqtSignal(bool)
external_link_signal = pyqtSignal(str, str, str, str)
file_progress_signal = pyqtSignal(str, int, int)
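
# Sketch of how a GUI thread might consume these signals (assumes Qt slots/handlers on the receiving side):
#   signals = PostProcessorSignals()
#   signals.progress_signal.connect(lambda msg: print(msg))
#   signals.file_progress_signal.connect(lambda name, done, total: print(name, done, total))
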
class PostProcessorWorker:
"""Processes a single post: determines save paths, downloads files, handles compression."""
def __init__(self, post_data, download_root, known_names,
filter_character_list,
unwanted_keywords, filter_mode, skip_zip, skip_rar,
use_subfolders, use_post_subfolders, target_post_id_from_initial_url, custom_folder_name,
compress_images, download_thumbnails, service, user_id,
api_url_input, cancellation_event, signals,
downloaded_files, downloaded_file_hashes, downloaded_files_lock, downloaded_file_hashes_lock,
skip_words_list=None,
skip_words_scope=SKIP_SCOPE_FILES, # New parameter with default
show_external_links=False,
extract_links_only=False,
num_file_threads=4, skip_current_file_flag=None,
manga_mode_active=False,
manga_filename_style=STYLE_POST_TITLE
):
self.post = post_data
self.download_root = download_root
self.known_names = known_names
self.filter_character_list = filter_character_list if filter_character_list else []
self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else set()
self.filter_mode = filter_mode
self.skip_zip = skip_zip
self.skip_rar = skip_rar
self.use_subfolders = use_subfolders
self.use_post_subfolders = use_post_subfolders
self.target_post_id_from_initial_url = target_post_id_from_initial_url
self.custom_folder_name = custom_folder_name
self.compress_images = compress_images
self.download_thumbnails = download_thumbnails
self.service = service
self.user_id = user_id
self.api_url_input = api_url_input
self.cancellation_event = cancellation_event
self.signals = signals
self.skip_current_file_flag = skip_current_file_flag
self.downloaded_files = downloaded_files if downloaded_files is not None else set()
self.downloaded_file_hashes = downloaded_file_hashes if downloaded_file_hashes is not None else set()
self.downloaded_files_lock = downloaded_files_lock if downloaded_files_lock is not None else threading.Lock()
self.downloaded_file_hashes_lock = downloaded_file_hashes_lock if downloaded_file_hashes_lock is not None else threading.Lock()
self.skip_words_list = skip_words_list if skip_words_list is not None else []
self.skip_words_scope = skip_words_scope # Store the new scope
self.show_external_links = show_external_links
self.extract_links_only = extract_links_only
self.num_file_threads = num_file_threads
self.manga_mode_active = manga_mode_active
self.manga_filename_style = manga_filename_style
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found.")
self.compress_images = False
def logger(self, message):
"""Emits a log message via the progress_signal if available."""
if self.signals and hasattr(self.signals, 'progress_signal'):
self.signals.progress_signal.emit(message)
else:
print(f"(Worker Log - No Signal): {message}")
def check_cancel(self):
"""Checks if cancellation has been requested."""
return self.cancellation_event.is_set()
def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event,
post_title="", file_index_in_post=0, num_files_in_this_post=1):
"""
Downloads a single file, handles retries, compression, and hash checking.
Returns:
(int, int, str, bool): (downloaded_count, skipped_count, final_filename_saved, was_original_name_kept_flag)
"""
was_original_name_kept_flag = False
final_filename_saved_for_return = ""
if self.check_cancel() or (skip_event and skip_event.is_set()): return 0, 1, "", False
file_url = file_info.get('url')
api_original_filename = file_info.get('_original_name_for_log', file_info.get('name'))
if not file_url or not api_original_filename:
self.logger(f"⚠️ Skipping file from post {original_post_id_for_log}: Missing URL or original filename. Info: {str(file_info)[:100]}")
return 0, 1, api_original_filename or "", False
final_filename_saved_for_return = api_original_filename
# Apply skip_words_list based on skip_words_scope (for files)
if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_FILES or self.skip_words_scope == SKIP_SCOPE_BOTH):
filename_to_check_for_skip_words = api_original_filename.lower()
for skip_word in self.skip_words_list:
if skip_word.lower() in filename_to_check_for_skip_words:
self.logger(f" -> Skip File (Keyword in Original Name '{skip_word}'): '{api_original_filename}'. Scope: {self.skip_words_scope}")
return 0, 1, api_original_filename, False
if self.filter_character_list:
matches_any_character_filter = False
if self.manga_mode_active: # In manga mode, character filter applies to post title primarily
2025-05-09 19:03:01 +05:30
if any(is_title_match_for_character(post_title, char_filter) for char_filter in self.filter_character_list):
matches_any_character_filter = True
# Fallback: if title doesn't match, but filename does, still consider it a match for manga mode if desired
# For now, let's stick to title match for manga post filtering, file name match for file filtering.
# If you want manga mode character filter to also check filenames, uncomment below:
# if not matches_any_character_filter and any(is_filename_match_for_character(api_original_filename, char_filter) for char_filter in self.filter_character_list):
# matches_any_character_filter = True
else: # Normal mode, character filter applies to filename
if any(is_filename_match_for_character(api_original_filename, char_filter) for char_filter in self.filter_character_list):
matches_any_character_filter = True
if not matches_any_character_filter: # If no character filter matched (based on mode)
self.logger(f" -> Skip File (No Char Match): '{api_original_filename}' (Post: '{post_title[:30]}...') doesn't match character filters.")
return 0, 1, api_original_filename, False
original_filename_cleaned_base, original_ext = os.path.splitext(clean_filename(api_original_filename))
if not original_ext.startswith('.'): original_ext = '.' + original_ext if original_ext else ''
2025-05-08 19:49:50 +05:30
filename_to_save = ""
if self.manga_mode_active:
if self.manga_filename_style == STYLE_ORIGINAL_NAME:
filename_to_save = clean_filename(api_original_filename)
was_original_name_kept_flag = True # Original name is kept by definition here
elif self.manga_filename_style == STYLE_POST_TITLE:
if post_title and post_title.strip():
cleaned_post_title_base = clean_filename(post_title.strip())
if num_files_in_this_post > 1: # Multi-file post
if file_index_in_post == 0: # First file of multi-file post
filename_to_save = f"{cleaned_post_title_base}{original_ext}"
was_original_name_kept_flag = False
else: # Subsequent files of multi-file post
filename_to_save = clean_filename(api_original_filename) # Keep original for subsequent
was_original_name_kept_flag = True
else: # Single file post in manga mode
filename_to_save = f"{cleaned_post_title_base}{original_ext}"
was_original_name_kept_flag = False
else: # Manga mode, post title style, but post_title is missing
filename_to_save = clean_filename(api_original_filename)
was_original_name_kept_flag = False # Not truly "kept original" in the spirit of the style choice
self.logger(f"⚠️ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log}. Using cleaned original filename '{filename_to_save}'.")
else: # Unknown manga style
self.logger(f"⚠️ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
filename_to_save = clean_filename(api_original_filename)
was_original_name_kept_flag = False # Or True, depending on interpretation. Let's say False as it's a fallback.
# Collision handling for manga mode filenames
if filename_to_save:
counter = 1
base_name_coll, ext_coll = os.path.splitext(filename_to_save)
temp_filename_for_collision_check = filename_to_save
# Ensure unique filename in target folder
while os.path.exists(os.path.join(target_folder_path, temp_filename_for_collision_check)):
                    # Both filename styles resolve a collision the same way: append _N to the base name
                    temp_filename_for_collision_check = f"{base_name_coll}_{counter}{ext_coll}"
counter += 1
if temp_filename_for_collision_check != filename_to_save:
filename_to_save = temp_filename_for_collision_check
else: # Fallback if filename_to_save ended up empty
filename_to_save = f"manga_file_{original_post_id_for_log}_{file_index_in_post + 1}{original_ext}"
self.logger(f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save}'.")
was_original_name_kept_flag = False
else: # Not Manga Mode
filename_to_save = clean_filename(api_original_filename)
was_original_name_kept_flag = False # Not manga mode, so this flag isn't relevant in the same way
# Collision handling for non-manga mode
counter = 1
base_name_coll, ext_coll = os.path.splitext(filename_to_save)
temp_filename_for_collision_check = filename_to_save
while os.path.exists(os.path.join(target_folder_path, temp_filename_for_collision_check)):
temp_filename_for_collision_check = f"{base_name_coll}_{counter}{ext_coll}"
counter += 1
if temp_filename_for_collision_check != filename_to_save:
filename_to_save = temp_filename_for_collision_check
final_filename_for_sets_and_saving = filename_to_save
final_filename_saved_for_return = final_filename_for_sets_and_saving
if not self.download_thumbnails:
# Determine file type based on the original API filename
is_img_type = is_image(api_original_filename)
is_vid_type = is_video(api_original_filename)
# Use the generic is_archive function
is_archive_type = is_archive(api_original_filename)
# ===== MODIFICATION START =====
if self.filter_mode == 'archive':
if not is_archive_type: # If in 'archive' mode and the file is NOT an archive
self.logger(f" -> Filter Skip (Archive Mode): '{api_original_filename}' (Not an Archive).")
return 0, 1, api_original_filename, False
# If it IS an archive, it will proceed.
# self.skip_zip and self.skip_rar are False in this mode (set in main.py), so they won't cause a skip.
elif self.filter_mode == 'image':
if not is_img_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Image).")
return 0, 1, api_original_filename, False
elif self.filter_mode == 'video':
if not is_vid_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Video).")
return 0, 1, api_original_filename, False
# No specific 'elif self.filter_mode == 'all':' is needed here, as 'all' implies no primary type filtering.
# The self.skip_zip / self.skip_rar checks below will handle user preference for skipping archives in 'all' mode.
# These skip checks are now primarily for 'all' mode or if filter_mode is something else.
# In 'archive' mode, self.skip_zip and self.skip_rar will be False.
if self.skip_zip and is_zip(api_original_filename): # Use specific is_zip for the skip_zip flag
self.logger(f" -> Pref Skip: '{api_original_filename}' (ZIP).")
return 0, 1, api_original_filename, False
if self.skip_rar and is_rar(api_original_filename): # Use specific is_rar for the skip_rar flag
self.logger(f" -> Pref Skip: '{api_original_filename}' (RAR).")
return 0, 1, api_original_filename, False
# ===== MODIFICATION END =====
target_folder_basename = os.path.basename(target_folder_path)
current_save_path = os.path.join(target_folder_path, final_filename_for_sets_and_saving)
if os.path.exists(current_save_path) and os.path.getsize(current_save_path) > 0:
self.logger(f" -> Exists (Path): '{final_filename_for_sets_and_saving}' in '{target_folder_basename}'.")
with self.downloaded_files_lock: self.downloaded_files.add(final_filename_for_sets_and_saving) # Add final name
return 0, 1, final_filename_for_sets_and_saving, was_original_name_kept_flag
with self.downloaded_files_lock:
if final_filename_for_sets_and_saving in self.downloaded_files:
2025-05-10 11:07:27 +05:30
self.logger(f" -> Global Skip (Filename): '{final_filename_for_sets_and_saving}' already recorded this session.")
return 0, 1, final_filename_for_sets_and_saving, was_original_name_kept_flag
max_retries = 3
retry_delay = 5
downloaded_size_bytes = 0
calculated_file_hash = None
file_content_bytes = None
total_size_bytes = 0 # Initialize total_size_bytes for this download attempt
download_successful_flag = False
for attempt_num in range(max_retries + 1):
if self.check_cancel() or (skip_event and skip_event.is_set()):
break
try:
if attempt_num > 0:
self.logger(f" Retrying '{api_original_filename}' (Attempt {attempt_num}/{max_retries})...")
time.sleep(retry_delay * (2**(attempt_num - 1))) # Exponential backoff
if self.signals and hasattr(self.signals, 'file_download_status_signal'):
2025-05-10 11:07:27 +05:30
self.signals.file_download_status_signal.emit(True) # Indicate download attempt start
response = requests.get(file_url, headers=headers, timeout=(15, 300), stream=True) # Generous timeout
response.raise_for_status() # Check for HTTP errors
current_total_size_bytes_from_headers = int(response.headers.get('Content-Length', 0))
if attempt_num == 0: # Only set total_size_bytes on the first attempt from headers
total_size_bytes = current_total_size_bytes_from_headers
size_str = f"{total_size_bytes / (1024 * 1024):.2f} MB" if total_size_bytes > 0 else "unknown size"
self.logger(f"⬇️ Downloading: '{api_original_filename}' (Size: {size_str}) [Saving as: '{final_filename_for_sets_and_saving}']")
current_attempt_total_size = total_size_bytes # Use the initial total_size for progress calculation
file_content_buffer = BytesIO()
current_attempt_downloaded_bytes = 0
md5_hasher = hashlib.md5()
last_progress_time = time.time()
for chunk in response.iter_content(chunk_size=1 * 1024 * 1024): # 1MB chunks
if self.check_cancel() or (skip_event and skip_event.is_set()):
2025-05-10 11:07:27 +05:30
break
if chunk:
file_content_buffer.write(chunk)
md5_hasher.update(chunk)
current_attempt_downloaded_bytes += len(chunk)
if time.time() - last_progress_time > 1 and current_attempt_total_size > 0 and \
self.signals and hasattr(self.signals, 'file_progress_signal'):
self.signals.file_progress_signal.emit(
api_original_filename, # Show original name in progress
current_attempt_downloaded_bytes,
current_attempt_total_size
)
last_progress_time = time.time()
if self.check_cancel() or (skip_event and skip_event.is_set()):
if file_content_buffer: file_content_buffer.close()
break # Exit retry loop if cancelled
# After loop, check if download was successful for this attempt
if current_attempt_downloaded_bytes > 0 or (current_attempt_total_size == 0 and response.status_code == 200): # Successfully downloaded something or it's a valid 0-byte file
2025-05-08 19:49:50 +05:30
calculated_file_hash = md5_hasher.hexdigest()
downloaded_size_bytes = current_attempt_downloaded_bytes
if file_content_bytes: file_content_bytes.close() # Close previous buffer if any
file_content_bytes = file_content_buffer # Assign the new buffer
file_content_bytes.seek(0) # Rewind for reading
download_successful_flag = True
break # Successful download, exit retry loop
else: # No bytes downloaded, and not a 0-byte file case
if file_content_buffer: file_content_buffer.close()
# Continue to next retry if not max retries
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, http.client.IncompleteRead) as e:
self.logger(f" ❌ Download Error (Retryable): {api_original_filename}. Error: {e}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close()
# Continue to next retry if not max retries
except requests.exceptions.RequestException as e: # Non-retryable HTTP errors
self.logger(f" ❌ Download Error (Non-Retryable): {api_original_filename}. Error: {e}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close()
break # Exit retry loop
except Exception as e: # Other unexpected errors
self.logger(f" ❌ Unexpected Download Error: {api_original_filename}: {e}\n{traceback.format_exc(limit=2)}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close()
break # Exit retry loop
finally:
if self.signals and hasattr(self.signals, 'file_download_status_signal'):
self.signals.file_download_status_signal.emit(False) # Indicate download attempt end
# Final progress update after all retries or success
if self.signals and hasattr(self.signals, 'file_progress_signal'):
final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
self.signals.file_progress_signal.emit(api_original_filename, downloaded_size_bytes, final_total_for_progress)
if self.check_cancel() or (skip_event and skip_event.is_set()):
self.logger(f" ⚠️ Download interrupted for {api_original_filename}.")
if file_content_bytes: file_content_bytes.close()
return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag
if not download_successful_flag:
self.logger(f"❌ Download failed for '{api_original_filename}' after {max_retries + 1} attempts.")
if file_content_bytes: file_content_bytes.close()
return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag
# Check hash against already downloaded files (session-based)
with self.downloaded_file_hashes_lock:
if calculated_file_hash in self.downloaded_file_hashes:
self.logger(f" -> Content Skip (Hash): '{api_original_filename}' (Hash: {calculated_file_hash[:8]}...) already downloaded this session.")
with self.downloaded_files_lock: self.downloaded_files.add(final_filename_for_sets_and_saving) # Add final name
if file_content_bytes: file_content_bytes.close()
return 0, 1, final_filename_for_sets_and_saving, was_original_name_kept_flag
bytes_to_write = file_content_bytes # This is the BytesIO object with downloaded content
final_filename_after_processing = final_filename_for_sets_and_saving
current_save_path_final = current_save_path # Path with potentially collided name
is_img_for_compress_check = is_image(api_original_filename) # Check original name for image type
if is_img_for_compress_check and self.compress_images and Image and downloaded_size_bytes > (1.5 * 1024 * 1024): # 1.5MB threshold
self.logger(f" Compressing '{api_original_filename}' ({downloaded_size_bytes / (1024*1024):.2f} MB)...")
try:
2025-05-10 11:07:27 +05:30
bytes_to_write.seek(0) # Ensure buffer is at the beginning
with Image.open(bytes_to_write) as img_obj:
# Handle palette mode images by converting to RGBA/RGB
if img_obj.mode == 'P': img_obj = img_obj.convert('RGBA')
elif img_obj.mode not in ['RGB', 'RGBA', 'L']: img_obj = img_obj.convert('RGB')
compressed_bytes_io = BytesIO()
img_obj.save(compressed_bytes_io, format='WebP', quality=80, method=4) # method=4 is a good balance
compressed_size = compressed_bytes_io.getbuffer().nbytes
if compressed_size < downloaded_size_bytes * 0.9: # Only save if significantly smaller (e.g., 10% reduction)
self.logger(f" Compression success: {compressed_size / (1024*1024):.2f} MB.")
bytes_to_write.close() # Close original downloaded buffer
bytes_to_write = compressed_bytes_io # Switch to compressed buffer
bytes_to_write.seek(0) # Rewind compressed buffer
base_name_orig, _ = os.path.splitext(final_filename_for_sets_and_saving)
final_filename_after_processing = base_name_orig + '.webp'
current_save_path_final = os.path.join(target_folder_path, final_filename_after_processing) # Update save path
self.logger(f" Updated filename (compressed): {final_filename_after_processing}")
else:
self.logger(f" Compression skipped: WebP not significantly smaller."); bytes_to_write.seek(0) # Rewind original if not using compressed
except Exception as comp_e:
self.logger(f"❌ Compression failed for '{api_original_filename}': {comp_e}. Saving original."); bytes_to_write.seek(0) # Rewind original
final_filename_saved_for_return = final_filename_after_processing # This is the name that will be saved
# Final check if the (potentially new, e.g. .webp) filename already exists
if final_filename_after_processing != final_filename_for_sets_and_saving and \
os.path.exists(current_save_path_final) and os.path.getsize(current_save_path_final) > 0:
self.logger(f" -> Exists (Path - Post-Compress): '{final_filename_after_processing}' in '{target_folder_basename}'.")
with self.downloaded_files_lock: self.downloaded_files.add(final_filename_after_processing)
bytes_to_write.close()
return 0, 1, final_filename_after_processing, was_original_name_kept_flag
try:
os.makedirs(os.path.dirname(current_save_path_final), exist_ok=True)
with open(current_save_path_final, 'wb') as f_out:
f_out.write(bytes_to_write.getvalue())
with self.downloaded_file_hashes_lock: self.downloaded_file_hashes.add(calculated_file_hash)
with self.downloaded_files_lock: self.downloaded_files.add(final_filename_after_processing) # Add final name
self.logger(f"✅ Saved: '{final_filename_after_processing}' (from '{api_original_filename}', {downloaded_size_bytes / (1024*1024):.2f} MB) in '{target_folder_basename}'")
time.sleep(0.05) # Small delay
return 1, 0, final_filename_after_processing, was_original_name_kept_flag
except Exception as save_err:
self.logger(f"❌ Save Fail for '{final_filename_after_processing}': {save_err}")
if os.path.exists(current_save_path_final): # Attempt to clean up partial file
try: os.remove(current_save_path_final);
except OSError: self.logger(f" -> Failed to remove partially saved file: {current_save_path_final}")
return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag # Return the name it attempted to save as
finally:
if bytes_to_write: bytes_to_write.close()
def process(self):
"""Main processing logic for a single post."""
if self.check_cancel(): return 0, 0, []
kept_original_filenames_for_log = []
total_downloaded_this_post = 0
total_skipped_this_post = 0
parsed_api_url = urlparse(self.api_url_input)
referer_url = f"https://{parsed_api_url.netloc}/"
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': referer_url, 'Accept': '*/*'}
link_pattern = re.compile(r"""<a\s+.*?href=["'](https?://[^"']+)["'][^>]*>(.*?)</a>""",
re.IGNORECASE | re.DOTALL)
post_data = self.post
post_title = post_data.get('title', '') or 'untitled_post'
post_id = post_data.get('id', 'unknown_id')
post_main_file_info = post_data.get('file') # This is a dict if present
post_attachments = post_data.get('attachments', []) # This is a list of dicts
post_content_html = post_data.get('content', '')
self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---")
num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0)
# Apply skip_words_list based on skip_words_scope (for posts)
if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_POSTS or self.skip_words_scope == SKIP_SCOPE_BOTH):
post_title_lower = post_title.lower()
for skip_word in self.skip_words_list:
if skip_word.lower() in post_title_lower:
self.logger(f" -> Skip Post (Keyword in Title '{skip_word}'): '{post_title[:50]}...'. Scope: {self.skip_words_scope}")
return 0, num_potential_files_in_post, [] # Skip all files in this post
# Character filter for Manga Mode (applies to post title)
if not self.extract_links_only and self.manga_mode_active and self.filter_character_list:
if not any(is_title_match_for_character(post_title, char_name) for char_name in self.filter_character_list):
self.logger(f" -> Skip Post (Manga Mode - Title No Char Match): Title '{post_title[:50]}' doesn't match active character filters.")
2025-05-10 11:07:27 +05:30
return 0, num_potential_files_in_post, []
if not isinstance(post_attachments, list): # Basic sanity check
2025-05-08 19:49:50 +05:30
self.logger(f"⚠️ Corrupt attachment data for post {post_id} (expected list, got {type(post_attachments)}). Skipping attachments.")
post_attachments = []
potential_base_save_folders = []
if not self.extract_links_only:
if self.use_subfolders:
# If character filters are active and it's manga mode, folder name comes from character filter matching post title
if self.filter_character_list and self.manga_mode_active:
for char_filter_name in self.filter_character_list:
if is_title_match_for_character(post_title, char_filter_name):
2025-05-09 19:03:01 +05:30
cleaned_folder = clean_folder_name(char_filter_name)
if cleaned_folder: potential_base_save_folders.append(cleaned_folder)
# If not manga mode with character filter, or if manga mode didn't find a match, try known names / title
if not potential_base_save_folders:
derived_folders = match_folders_from_title(post_title, self.known_names, self.unwanted_keywords)
if derived_folders:
potential_base_save_folders.extend(derived_folders)
self.logger(f" Folder Target(s) (Derived from Title & Known Names): {', '.join(derived_folders)}")
else:
fallback_folder = extract_folder_name_from_title(post_title, self.unwanted_keywords)
potential_base_save_folders.append(fallback_folder)
self.logger(f" Folder Target (Fallback from Title): {fallback_folder}")
if not potential_base_save_folders: # Absolute fallback
potential_base_save_folders.append(clean_folder_name(post_title if post_title else "untitled_creator_content"))
self.logger(f" Folder Target (Final Fallback): {potential_base_save_folders[0]}")
else: # Not using subfolders, save to root
potential_base_save_folders = [""]
# Skip post if folder name contains skip words (only if subfolders are used)
if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
for folder_name_to_check in potential_base_save_folders:
if not folder_name_to_check: continue # Skip if base folder is root
if any(skip_word.lower() in folder_name_to_check.lower() for skip_word in self.skip_words_list):
matched_skip = next((sw for sw in self.skip_words_list if sw.lower() in folder_name_to_check.lower()), "unknown_skip_word")
self.logger(f" -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check}' contains '{matched_skip}'.")
return 0, num_potential_files_in_post, []
# External Link Extraction
if (self.show_external_links or self.extract_links_only) and post_content_html:
try:
unique_links_data = {}
for match in link_pattern.finditer(post_content_html):
link_url = match.group(1).strip()
link_inner_text = match.group(2)
if not any(ext in link_url.lower() for ext in ['.css', '.js', '.ico', '.xml', '.svg']) \
and not link_url.startswith('javascript:') \
and link_url not in unique_links_data:
clean_link_text = re.sub(r'<.*?>', '', link_inner_text)
clean_link_text = html.unescape(clean_link_text).strip()
display_text = clean_link_text if clean_link_text else "[Link]"
unique_links_data[link_url] = display_text
links_emitted_count = 0
scraped_platforms = {'kemono', 'coomer', 'patreon'}
for link_url, link_text in unique_links_data.items():
platform = get_link_platform(link_url)
if platform not in scraped_platforms:
if self.signals and hasattr(self.signals, 'external_link_signal'):
self.signals.external_link_signal.emit(post_title, link_text, link_url, platform)
links_emitted_count +=1
if links_emitted_count > 0: self.logger(f" 🔗 Found {links_emitted_count} potential external link(s) in post content.")
except Exception as e: self.logger(f"⚠️ Error parsing post content for links: {e}\n{traceback.format_exc(limit=2)}")
if self.extract_links_only:
self.logger(f" Extract Links Only mode: Finished processing post {post_id} for links.")
return 0, 0, [] # No files downloaded or skipped in this mode for this counter
# --- Prepare list of files to download from this post ---
all_files_from_post_api = []
api_file_domain = urlparse(self.api_url_input).netloc # Use domain from input URL
if not api_file_domain or not any(d in api_file_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
# Fallback if input URL isn't a direct site URL (e.g. API URL was passed, though less common for user input)
api_file_domain = "kemono.su" if "kemono" in self.service.lower() else "coomer.party"
if post_main_file_info and isinstance(post_main_file_info, dict) and post_main_file_info.get('path'):
file_path = post_main_file_info['path'].lstrip('/')
original_api_name = post_main_file_info.get('name') or os.path.basename(file_path)
if original_api_name:
all_files_from_post_api.append({
'url': f"https://{api_file_domain}{file_path}" if file_path.startswith('/') else f"https://{api_file_domain}/data/{file_path}",
'name': original_api_name, # This 'name' might be used for initial filtering if _original_name_for_log isn't set
'_original_name_for_log': original_api_name, # Store the true original for logging/type checks
'_is_thumbnail': self.download_thumbnails and is_image(original_api_name)
})
else: self.logger(f" ⚠️ Skipping main file for post {post_id}: Missing name (Path: {file_path})")
for idx, att_info in enumerate(post_attachments):
if isinstance(att_info, dict) and att_info.get('path'):
att_path = att_info['path'].lstrip('/')
original_api_att_name = att_info.get('name') or os.path.basename(att_path)
if original_api_att_name:
all_files_from_post_api.append({
'url': f"https://{api_file_domain}{att_path}" if att_path.startswith('/') else f"https://{api_file_domain}/data/{att_path}",
'name': original_api_att_name,
'_original_name_for_log': original_api_att_name,
'_is_thumbnail': self.download_thumbnails and is_image(original_api_att_name)
})
else: self.logger(f" ⚠️ Skipping attachment {idx+1} for post {post_id}: Missing name (Path: {att_path})")
else: self.logger(f" ⚠️ Skipping invalid attachment {idx+1} for post {post_id}: {str(att_info)[:100]}")
if self.download_thumbnails: # Filter non-images if in thumbnail mode
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo['_is_thumbnail']]
if not all_files_from_post_api:
self.logger(f" -> No image thumbnails found for post {post_id} in thumbnail-only mode.")
return 0, 0, []
if not all_files_from_post_api:
self.logger(f" No files found to download for post {post_id}.")
return 0, 0, []
# --- Filter out duplicates based on original API filename WITHIN THIS POST ---
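# e.g. if the post's main file and one of its attachments are both named 'image.png',
# only the first occurrence is queued; the later one is counted as skipped.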
files_to_download_info_list = []
processed_original_filenames_in_this_post = set()
for file_info in all_files_from_post_api:
current_api_original_filename = file_info.get('_original_name_for_log')
if current_api_original_filename in processed_original_filenames_in_this_post:
self.logger(f" -> Skip Duplicate Original Name (within post {post_id}): '{current_api_original_filename}' already processed/listed for this post.")
total_skipped_this_post += 1
else:
files_to_download_info_list.append(file_info)
if current_api_original_filename:
processed_original_filenames_in_this_post.add(current_api_original_filename)
if not files_to_download_info_list:
self.logger(f" All files for post {post_id} were duplicate original names or skipped earlier.")
return 0, total_skipped_this_post, []
num_files_in_this_post_for_naming = len(files_to_download_info_list)
self.logger(f" Identified {num_files_in_this_post_for_naming} unique original file(s) for potential download from post {post_id}.")
with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
futures_list = []
for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
if self.check_cancel(): break
actual_target_full_paths_for_this_file = []
if self.use_subfolders:
# If character filters are active and manga mode is NOT active, the folder name is taken from whichever character filter matches the filename
if self.filter_character_list and not self.manga_mode_active:
for char_name_from_filter_list in self.filter_character_list:
if is_filename_match_for_character(file_info_to_dl.get('_original_name_for_log'), char_name_from_filter_list):
base_char_folder_path = os.path.join(self.download_root, clean_folder_name(char_name_from_filter_list))
if self.use_post_subfolders:
cleaned_title_for_subfolder = clean_folder_name(post_title)
post_specific_subfolder_name = f"{post_id}_{cleaned_title_for_subfolder}" if cleaned_title_for_subfolder else f"{post_id}_untitled"
actual_target_full_paths_for_this_file.append(os.path.join(base_char_folder_path, post_specific_subfolder_name))
else:
actual_target_full_paths_for_this_file.append(base_char_folder_path)
else: # Manga mode with char filter (already handled for potential_base_save_folders) OR no char filter OR char filter didn't match filename in normal mode
for base_folder_name in potential_base_save_folders: # These were determined earlier
base_folder_path = os.path.join(self.download_root, base_folder_name)
if self.use_post_subfolders:
cleaned_title_for_subfolder = clean_folder_name(post_title)
post_specific_subfolder_name = f"{post_id}_{cleaned_title_for_subfolder}" if cleaned_title_for_subfolder else f"{post_id}_untitled"
actual_target_full_paths_for_this_file.append(os.path.join(base_folder_path, post_specific_subfolder_name))
else:
actual_target_full_paths_for_this_file.append(base_folder_path)
else: # Not using subfolders at all
actual_target_full_paths_for_this_file = [self.download_root]
# Override with custom folder name if it's a single post download and custom name is provided
if self.target_post_id_from_initial_url and self.custom_folder_name: # custom_folder_name is already cleaned
custom_full_path = os.path.join(self.download_root, self.custom_folder_name)
actual_target_full_paths_for_this_file = [custom_full_path]
# Fallback if no specific target paths were determined (e.g. a character filter in normal mode with no filename match)
if not actual_target_full_paths_for_this_file:
default_target_for_non_match = self.download_root
if self.use_subfolders: # Should use one of the potential_base_save_folders if subfolders enabled
gen_folder_name = potential_base_save_folders[0] if potential_base_save_folders and potential_base_save_folders[0] else clean_folder_name(post_title)
default_target_for_non_match = os.path.join(self.download_root, gen_folder_name)
if self.use_post_subfolders:
cleaned_title_for_subfolder = clean_folder_name(post_title)
post_specific_subfolder_name = f"{post_id}_{cleaned_title_for_subfolder}" if cleaned_title_for_subfolder else f"{post_id}_untitled"
default_target_for_non_match = os.path.join(default_target_for_non_match, post_specific_subfolder_name)
actual_target_full_paths_for_this_file = [default_target_for_non_match]
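# Illustrative resulting targets (hypothetical names, not from a real run):
#   subfolders on, character filter 'Alice' matches, post subfolders on:
#       <download_root>/Alice/12345_Some_Post_Title/
#   subfolders off (and no custom folder): <download_root>/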
for target_path in set(actual_target_full_paths_for_this_file): # Use set to avoid duplicate downloads to same path
if self.check_cancel(): break
futures_list.append(file_pool.submit(
self._download_single_file,
file_info_to_dl,
target_path,
headers,
post_id,
self.skip_current_file_flag,
post_title, # Pass post_title for manga naming
file_idx,
num_files_in_this_post_for_naming
))
if self.check_cancel(): break
for future in as_completed(futures_list):
if self.check_cancel():
for f_to_cancel in futures_list: # Attempt to cancel pending futures
if not f_to_cancel.done():
f_to_cancel.cancel()
break
try:
dl_count, skip_count, actual_filename_saved, original_kept_flag = future.result()
total_downloaded_this_post += dl_count
total_skipped_this_post += skip_count
if original_kept_flag and dl_count > 0 and actual_filename_saved: # Ensure filename is not empty
kept_original_filenames_for_log.append(actual_filename_saved)
except CancelledError:
self.logger(f" File download task for post {post_id} was cancelled.")
total_skipped_this_post += 1 # Assume one file per cancelled future
except Exception as exc_f:
self.logger(f"❌ File download task for post {post_id} resulted in error: {exc_f}")
total_skipped_this_post += 1 # Assume one file failed
# Clear file progress after all files for this post are done or cancelled
if self.signals and hasattr(self.signals, 'file_progress_signal'):
self.signals.file_progress_signal.emit("", 0, 0)
if self.check_cancel(): self.logger(f" Post {post_id} processing interrupted/cancelled.");
else: self.logger(f" Post {post_id} Summary: Downloaded={total_downloaded_this_post}, Skipped Files={total_skipped_this_post}")
return total_downloaded_this_post, total_skipped_this_post, kept_original_filenames_for_log
class DownloadThread(QThread):
"""
Manages the overall download process.
Fetches posts using download_from_api and then processes each post using PostProcessorWorker.
"""
progress_signal = pyqtSignal(str)
add_character_prompt_signal = pyqtSignal(str) # For main app to show prompt
file_download_status_signal = pyqtSignal(bool) # True when a file dl starts, False when ends/fails
finished_signal = pyqtSignal(int, int, bool, list) # dl_count, skip_count, was_cancelled, kept_original_names
external_link_signal = pyqtSignal(str, str, str, str) # post_title, link_text, link_url, platform
file_progress_signal = pyqtSignal(str, int, int) # filename, downloaded_bytes, total_bytes
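# Illustrative wiring from the GUI side (a sketch with hypothetical slot names;
# the actual connections live in main.py and may differ):
#   thread = DownloadThread(api_url, out_dir, list(KNOWN_NAMES), cancel_event, ...)
#   thread.progress_signal.connect(log_view.append)
#   thread.file_progress_signal.connect(update_file_progress)
#   thread.finished_signal.connect(on_all_done)
#   thread.start()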
def __init__(self, api_url_input, output_dir, known_names_copy,
cancellation_event, # This is a threading.Event from the main app
filter_character_list=None,
filter_mode='all', skip_zip=True, skip_rar=True,
use_subfolders=True, use_post_subfolders=False, custom_folder_name=None, compress_images=False,
download_thumbnails=False, service=None, user_id=None,
downloaded_files=None, downloaded_file_hashes=None, downloaded_files_lock=None, downloaded_file_hashes_lock=None,
skip_words_list=None,
skip_words_scope=SKIP_SCOPE_FILES,
show_external_links=False,
extract_links_only=False,
num_file_threads_for_worker=1, # For PostProcessorWorker's internal pool
skip_current_file_flag=None, # This is a threading.Event
start_page=None, end_page=None,
target_post_id_from_initial_url=None, # The specific post ID if single post URL
manga_mode_active=False,
unwanted_keywords=None,
manga_filename_style=STYLE_POST_TITLE
):
super().__init__()
self.api_url_input = api_url_input
self.output_dir = output_dir
self.known_names = list(known_names_copy) # Make a copy
self.cancellation_event = cancellation_event # Use the shared event
self.skip_current_file_flag = skip_current_file_flag # Use the shared event
self.initial_target_post_id = target_post_id_from_initial_url # Store the original target
self.filter_character_list = filter_character_list if filter_character_list else []
self.filter_mode = filter_mode
self.skip_zip = skip_zip
self.skip_rar = skip_rar
self.use_subfolders = use_subfolders
2025-05-08 19:49:50 +05:30
self.use_post_subfolders = use_post_subfolders
2025-05-07 07:20:40 +05:30
self.custom_folder_name = custom_folder_name
self.compress_images = compress_images
self.download_thumbnails = download_thumbnails
self.service = service
self.user_id = user_id
self.skip_words_list = skip_words_list if skip_words_list is not None else []
self.skip_words_scope = skip_words_scope
self.downloaded_files = downloaded_files # Should be the shared set from main app
self.downloaded_files_lock = downloaded_files_lock # Shared lock
self.downloaded_file_hashes = downloaded_file_hashes # Shared set
self.downloaded_file_hashes_lock = downloaded_file_hashes_lock # Shared lock
self._add_character_response = None # For sync prompt result
self.prompt_mutex = QMutex() # For sync prompt result
self.show_external_links = show_external_links
self.extract_links_only = extract_links_only
self.num_file_threads_for_worker = num_file_threads_for_worker
self.start_page = start_page
self.end_page = end_page
self.manga_mode_active = manga_mode_active
self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else \
{'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'}
self.manga_filename_style = manga_filename_style
if self.compress_images and Image is None: # Check Pillow again
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
self.compress_images = False
def logger(self, message):
"""Emits a log message via the progress_signal."""
self.progress_signal.emit(str(message))
def isInterruptionRequested(self):
"""Checks if Qt interruption or manual cancellation event is set."""
# QThread's interruption is different from threading.Event
# We primarily use the threading.Event (self.cancellation_event)
return self.cancellation_event.is_set() or super().isInterruptionRequested()
def skip_file(self):
"""Sets the flag to skip the currently processing file (if any)."""
# Called from the main thread via the GUI button. The DownloadThread does not manage
# per-file skip events itself; it simply sets skip_current_file_flag, the shared
# threading.Event that is passed on to each PostProcessorWorker.
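# Illustrative flow (assumed GUI hookup): a "Skip File" button click calls
# DownloadThread.skip_file() -> skip_current_file_flag.set(); the worker's
# _download_single_file is expected to poll this event and abort the in-progress file.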
if self.isRunning() and self.skip_current_file_flag:
self.logger("⏭️ Skip requested for current file (single-thread mode).")
self.skip_current_file_flag.set() # Signal the event
else: self.logger(" Skip file: No download active or skip flag not available for current context.")
def run(self):
"""Main execution logic for the download thread."""
grand_total_downloaded_files = 0
grand_total_skipped_files = 0
grand_list_of_kept_original_filenames = []
was_process_cancelled = False
# Create a PostProcessorSignals instance for this thread's workers
worker_signals_obj = PostProcessorSignals()
try:
# Connect signals from this worker_signals_obj to the DownloadThread's own signals
worker_signals_obj.progress_signal.connect(self.progress_signal)
worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal)
worker_signals_obj.file_progress_signal.connect(self.file_progress_signal)
worker_signals_obj.external_link_signal.connect(self.external_link_signal)
self.logger(" Starting post fetch (single-threaded download process)...")
post_generator = download_from_api(
self.api_url_input,
logger=self.logger,
start_page=self.start_page,
end_page=self.end_page,
manga_mode=self.manga_mode_active,
cancellation_event=self.cancellation_event # Pass the shared event
)
for posts_batch_data in post_generator: # download_from_api yields batches
if self.isInterruptionRequested(): was_process_cancelled = True; break
for individual_post_data in posts_batch_data: # Iterate through posts in the batch
if self.isInterruptionRequested(): was_process_cancelled = True; break
# Create and run PostProcessorWorker for each post
# The PostProcessorWorker will use its own ThreadPoolExecutor for files if num_file_threads_for_worker > 1
post_processing_worker = PostProcessorWorker(
post_data=individual_post_data,
download_root=self.output_dir,
known_names=self.known_names, # Pass the copy
filter_character_list=self.filter_character_list,
unwanted_keywords=self.unwanted_keywords,
filter_mode=self.filter_mode,
skip_zip=self.skip_zip, skip_rar=self.skip_rar,
use_subfolders=self.use_subfolders, use_post_subfolders=self.use_post_subfolders,
target_post_id_from_initial_url=self.initial_target_post_id, # Pass the original target
custom_folder_name=self.custom_folder_name,
compress_images=self.compress_images, download_thumbnails=self.download_thumbnails,
service=self.service, user_id=self.user_id,
api_url_input=self.api_url_input, # Pass the original input URL
cancellation_event=self.cancellation_event, # Pass the shared event
signals=worker_signals_obj, # Pass the signals object for this thread
downloaded_files=self.downloaded_files, # Pass shared set
downloaded_file_hashes=self.downloaded_file_hashes, # Pass shared set
downloaded_files_lock=self.downloaded_files_lock, # Pass shared lock
downloaded_file_hashes_lock=self.downloaded_file_hashes_lock, # Pass shared lock
skip_words_list=self.skip_words_list,
skip_words_scope=self.skip_words_scope,
show_external_links=self.show_external_links,
extract_links_only=self.extract_links_only,
num_file_threads=self.num_file_threads_for_worker, # Threads for files within this post
skip_current_file_flag=self.skip_current_file_flag, # Pass the shared event
manga_mode_active=self.manga_mode_active,
manga_filename_style=self.manga_filename_style
)
try:
# The process method of PostProcessorWorker handles its internal file downloads
dl_count, skip_count, kept_originals_this_post = post_processing_worker.process()
grand_total_downloaded_files += dl_count
grand_total_skipped_files += skip_count
if kept_originals_this_post: # This is a list
grand_list_of_kept_original_filenames.extend(kept_originals_this_post)
except Exception as proc_err:
post_id_for_err = individual_post_data.get('id', 'N/A')
self.logger(f"❌ Error processing post {post_id_for_err} in DownloadThread: {proc_err}")
traceback.print_exc()
# Estimate skipped files for this post if worker crashes
num_potential_files_est = len(individual_post_data.get('attachments', [])) + \
(1 if individual_post_data.get('file') else 0)
grand_total_skipped_files += num_potential_files_est
if self.skip_current_file_flag and self.skip_current_file_flag.is_set():
self.skip_current_file_flag.clear() # Reset for the next file/post
self.logger(" Skip current file flag was processed and cleared by DownloadThread.")
self.msleep(10) # Small delay between processing posts in single-thread mode
if was_process_cancelled: break # Break from outer loop (batches)
if not was_process_cancelled and not self.isInterruptionRequested(): # Check again after loops
self.logger("✅ All posts processed or end of content reached by DownloadThread.")
except Exception as main_thread_err:
self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
traceback.print_exc()
# Don't mark the run as cancelled just because an unexpected error occurred; let was_process_cancelled reflect an actual interruption request
if not self.isInterruptionRequested(): was_process_cancelled = False
finally:
# Disconnect signals
try:
if worker_signals_obj: # Check if it was initialized
worker_signals_obj.progress_signal.disconnect(self.progress_signal)
worker_signals_obj.file_download_status_signal.disconnect(self.file_download_status_signal)
worker_signals_obj.external_link_signal.disconnect(self.external_link_signal)
worker_signals_obj.file_progress_signal.disconnect(self.file_progress_signal)
except (TypeError, RuntimeError) as e: #TypeError if not connected, RuntimeError if object deleted
self.logger(f" Note during DownloadThread signal disconnection: {e}")
# Emit finished signal with final counts and status
self.finished_signal.emit(grand_total_downloaded_files, grand_total_skipped_files, self.isInterruptionRequested(), grand_list_of_kept_original_filenames)
def receive_add_character_result(self, result):
"""Slot to receive the result from a character add prompt shown in the main thread."""
# This is called by a signal from the main thread
with QMutexLocker(self.prompt_mutex):
self._add_character_response = result
self.logger(f" (DownloadThread) Received character prompt response: {'Yes (added/confirmed)' if result else 'No (declined/failed)'}")
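# Expected round-trip for the character prompt (illustrative; the dialog itself
# lives in the GUI layer):
#   1. a worker emits add_character_prompt_signal(name)
#   2. the main thread shows the prompt and calls receive_add_character_result(result)
#   3. the waiting code reads self._add_character_response under self.prompt_mutex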