# Kemono-Downloader/downloader_utils.py
import os
import time
import requests
import re
import threading
import queue
import hashlib
import http.client
import traceback
from concurrent.futures import ThreadPoolExecutor, Future, CancelledError, as_completed
import html
from PyQt5.QtCore import QObject, pyqtSignal, QThread, QMutex, QMutexLocker
from urllib.parse import urlparse
try:
from PIL import Image
except ImportError:
print("ERROR: Pillow library not found. Please install it: pip install Pillow")
Image = None
try:
from multipart_downloader import download_file_in_parts
MULTIPART_DOWNLOADER_AVAILABLE = True
except ImportError as e:
print(f"Warning: multipart_downloader.py not found or import error: {e}. Multi-part downloads will be disabled.")
MULTIPART_DOWNLOADER_AVAILABLE = False
def download_file_in_parts(*args, **kwargs): return False, 0, None, None # Dummy function
from io import BytesIO
STYLE_POST_TITLE = "post_title"
STYLE_ORIGINAL_NAME = "original_name"
SKIP_SCOPE_FILES = "files"
SKIP_SCOPE_POSTS = "posts"
SKIP_SCOPE_BOTH = "both"
CHAR_SCOPE_TITLE = "title"
CHAR_SCOPE_FILES = "files"
CHAR_SCOPE_BOTH = "both"
# DUPLICATE_MODE_RENAME is removed. Renaming only happens within a target folder if needed.
DUPLICATE_MODE_DELETE = "delete"
DUPLICATE_MODE_MOVE_TO_SUBFOLDER = "move"
fastapi_app = None
KNOWN_NAMES = []
MIN_SIZE_FOR_MULTIPART_DOWNLOAD = 10 * 1024 * 1024 # 10 MB
MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 8 # Max concurrent connections for a single file
IMAGE_EXTENSIONS = {
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp',
'.heic', '.heif', '.svg', '.ico', '.jfif', '.pjpeg', '.pjp', '.avif'
}
VIDEO_EXTENSIONS = {
'.mp4', '.mov', '.mkv', '.webm', '.avi', '.wmv', '.flv', '.mpeg',
'.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob'
}
ARCHIVE_EXTENSIONS = {
'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'
}
def is_title_match_for_character(post_title, character_name_filter):
if not post_title or not character_name_filter:
return False
safe_filter = str(character_name_filter).strip()
if not safe_filter:
return False
pattern = r"(?i)\b" + re.escape(safe_filter) + r"\b"
match_result = bool(re.search(pattern, post_title))
return match_result
def is_filename_match_for_character(filename, character_name_filter):
if not filename or not character_name_filter:
return False
safe_filter = str(character_name_filter).strip().lower()
if not safe_filter:
return False
match_result = safe_filter in filename.lower()
return match_result
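# Illustrative contrast between the two matchers above (names and filenames are hypothetical):
#   is_title_match_for_character("Rem fanart pack", "rem")      -> True  (whole-word, case-insensitive)
#   is_title_match_for_character("Remix album", "rem")          -> False (no word boundary after "Rem")
#   is_filename_match_for_character("rem_sketch_01.png", "REM") -> True  (plain case-insensitive substring)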
def clean_folder_name(name):
if not isinstance(name, str): name = str(name)
cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
cleaned = cleaned.strip()
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned if cleaned else "untitled_folder"
def clean_filename(name):
if not isinstance(name, str): name = str(name)
cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
cleaned = cleaned.strip()
cleaned = re.sub(r'\s+', '_', cleaned)
return cleaned if cleaned else "untitled_file"
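# Example of the two cleaners (hypothetical input): for "My Title: Part 2!",
#   clean_folder_name -> "My Title Part 2"  (disallowed characters dropped, whitespace collapsed to single spaces)
#   clean_filename    -> "My_Title_Part_2"  (whitespace collapsed to underscores instead)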
def extract_folder_name_from_title(title, unwanted_keywords):
if not title: return 'Uncategorized'
title_lower = title.lower()
tokens = re.findall(r'\b[\w\-]+\b', title_lower)
for token in tokens:
clean_token = clean_folder_name(token)
if clean_token and clean_token.lower() not in unwanted_keywords:
return clean_token
cleaned_full_title = clean_folder_name(title)
return cleaned_full_title if cleaned_full_title else 'Uncategorized'
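# Illustrative behaviour (hypothetical values): with unwanted_keywords={"hd", "preview"},
# extract_folder_name_from_title("HD Preview Alice", ...) returns "alice" — the first title token
# that survives cleaning and is not an unwanted keyword; if no token qualifies, the cleaned full
# title (or 'Uncategorized') is returned instead.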
def match_folders_from_title(title, names_to_match, unwanted_keywords):
if not title or not names_to_match: return []
title_lower = title.lower()
matched_cleaned_names = set()
sorted_names_to_match = sorted(names_to_match, key=len, reverse=True)
for name in sorted_names_to_match:
name_lower = name.lower()
if not name_lower: continue
pattern = r'\b' + re.escape(name_lower) + r'\b'
if re.search(pattern, title_lower):
cleaned_name_for_folder = clean_folder_name(name)
if cleaned_name_for_folder.lower() not in unwanted_keywords:
matched_cleaned_names.add(cleaned_name_for_folder)
return sorted(list(matched_cleaned_names))
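# Illustrative behaviour (hypothetical names):
#   match_folders_from_title("Alice and Bob beach set", ["Alice", "Bob", "Carol"], set()) -> ["Alice", "Bob"]
# Only whole-word matches count; matched names are cleaned for use as folder names and returned sorted.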
def is_image(filename):
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in IMAGE_EXTENSIONS
def is_video(filename):
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in VIDEO_EXTENSIONS
def is_zip(filename):
if not filename: return False
return filename.lower().endswith('.zip')
def is_rar(filename):
if not filename: return False
return filename.lower().endswith('.rar')
def is_archive(filename):
if not filename: return False
_, ext = os.path.splitext(filename)
return ext.lower() in ARCHIVE_EXTENSIONS
def is_post_url(url):
if not isinstance(url, str): return False
return '/post/' in urlparse(url).path
def extract_post_info(url_string):
service, user_id, post_id = None, None, None
if not isinstance(url_string, str) or not url_string.strip(): return None, None, None
try:
parsed_url = urlparse(url_string.strip())
domain = parsed_url.netloc.lower()
is_kemono = any(d in domain for d in ['kemono.su', 'kemono.party'])
is_coomer = any(d in domain for d in ['coomer.su', 'coomer.party'])
if not (is_kemono or is_coomer): return None, None, None
path_parts = [part for part in parsed_url.path.strip('/').split('/') if part]
if len(path_parts) >= 3 and path_parts[1].lower() == 'user':
service = path_parts[0]
user_id = path_parts[2]
if len(path_parts) >= 5 and path_parts[3].lower() == 'post':
post_id = path_parts[4]
return service, user_id, post_id
if len(path_parts) >= 5 and path_parts[0].lower() == 'api' and \
path_parts[1].lower() == 'v1' and path_parts[3].lower() == 'user':
service = path_parts[2]
user_id = path_parts[4]
if len(path_parts) >= 7 and path_parts[5].lower() == 'post':
post_id = path_parts[6]
return service, user_id, post_id
except Exception as e:
print(f"Debug: Exception during extract_post_info for URL '{url_string}': {e}")
return None, None, None
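# Illustrative parses (the IDs are hypothetical):
#   extract_post_info("https://kemono.su/patreon/user/12345/post/67890") -> ("patreon", "12345", "67890")
#   extract_post_info("https://kemono.su/api/v1/patreon/user/12345")     -> ("patreon", "12345", None)
# Unrecognised domains or malformed paths yield (None, None, None).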
def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None):
if cancellation_event and cancellation_event.is_set():
logger(" Fetch cancelled before request.")
raise RuntimeError("Fetch operation cancelled by user.")
paginated_url = f'{api_url_base}?o={offset}'
logger(f" Fetching: {paginated_url} (Page approx. {offset // 50 + 1})")
try:
response = requests.get(paginated_url, headers=headers, timeout=(10, 60))
response.raise_for_status()
if 'application/json' not in response.headers.get('Content-Type', '').lower():
logger(f"⚠️ Unexpected content type from API: {response.headers.get('Content-Type')}. Body: {response.text[:200]}")
return []
return response.json()
except requests.exceptions.Timeout:
raise RuntimeError(f"Timeout fetching offset {offset} from {paginated_url}")
except requests.exceptions.RequestException as e:
err_msg = f"Error fetching offset {offset} from {paginated_url}: {e}"
if e.response is not None:
err_msg += f" (Status: {e.response.status_code}, Body: {e.response.text[:200]})"
raise RuntimeError(err_msg)
except ValueError as e:
raise RuntimeError(f"Error decoding JSON from offset {offset} ({paginated_url}): {e}. Response text: {response.text[:200]}")
except Exception as e:
raise RuntimeError(f"Unexpected error fetching offset {offset} ({paginated_url}): {e}")
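# Illustrative pagination (a sketch; the URL is hypothetical and the 50-post page size is an assumption
# inferred from the "Page approx." estimate above and page_size in download_from_api below; headers as
# constructed in download_from_api):
#
#   first_page = fetch_posts_paginated("https://kemono.su/api/v1/patreon/user/12345", headers, 0, print)
#   # The next call would pass offset=len(first_page) (typically 50), then 100, and so on,
#   # until the API returns an empty list.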
def download_from_api(api_url_input, logger=print, start_page=None, end_page=None, manga_mode=False, cancellation_event=None):
headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
service, user_id, target_post_id = extract_post_info(api_url_input)
if cancellation_event and cancellation_event.is_set():
logger(" Download_from_api cancelled at start.")
return
if not service or not user_id:
logger(f"❌ Invalid URL or could not extract service/user: {api_url_input}")
return
if target_post_id and (start_page or end_page):
logger("⚠️ Page range (start/end page) is ignored when a specific post URL is provided (searching all pages for the post).")
start_page = end_page = None
is_creator_feed_for_manga = manga_mode and not target_post_id
parsed_input = urlparse(api_url_input)
api_domain = parsed_input.netloc
if not any(d in api_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
logger(f"⚠️ Unrecognized domain '{api_domain}'. Defaulting to kemono.su for API calls.")
api_domain = "kemono.su"
api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}"
page_size = 50
if is_creator_feed_for_manga:
logger(" Manga Mode: Fetching all posts so they can be processed in reverse order (oldest first)...")
all_posts_for_manga_mode = []
current_offset_manga = 0
while True:
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post fetching cancelled.")
break
try:
posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger, cancellation_event)
if not isinstance(posts_batch_manga, list):
logger(f"❌ API Error (Manga Mode): Expected list of posts, got {type(posts_batch_manga)}.")
break
if not posts_batch_manga:
logger("✅ Reached end of posts (Manga Mode fetch all).")
break
all_posts_for_manga_mode.extend(posts_batch_manga)
current_offset_manga += len(posts_batch_manga)
time.sleep(0.6)
except RuntimeError as e:
if "cancelled by user" in str(e).lower():
logger(f" Manga mode pagination stopped due to cancellation: {e}")
else:
logger(f"{e}\n Aborting manga mode pagination.")
break
except Exception as e:
logger(f"❌ Unexpected error during manga mode fetch: {e}")
traceback.print_exc()
break
if cancellation_event and cancellation_event.is_set(): return
if all_posts_for_manga_mode:
logger(f" Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Reversing order...")
all_posts_for_manga_mode.reverse()
for i in range(0, len(all_posts_for_manga_mode), page_size):
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post yielding cancelled.")
break
yield all_posts_for_manga_mode[i:i + page_size]
else:
logger(" Manga Mode: No posts found to process.")
return
current_page_num = 1
current_offset = 0
processed_target_post_flag = False
if start_page and start_page > 1 and not target_post_id:
current_offset = (start_page - 1) * page_size
current_page_num = start_page
logger(f" Starting from page {current_page_num} (calculated offset {current_offset}).")
while True:
if cancellation_event and cancellation_event.is_set():
logger(" Post fetching loop cancelled.")
break
if target_post_id and processed_target_post_flag:
break
if not target_post_id and end_page and current_page_num > end_page:
logger(f"✅ Reached specified end page ({end_page}) for creator feed. Stopping.")
break
try:
posts_batch = fetch_posts_paginated(api_base_url, headers, current_offset, logger, cancellation_event)
if not isinstance(posts_batch, list):
logger(f"❌ API Error: Expected list of posts, got {type(posts_batch)} at page {current_page_num} (offset {current_offset}).")
break
except RuntimeError as e:
if "cancelled by user" in str(e).lower():
logger(f" Pagination stopped due to cancellation: {e}")
else:
logger(f"{e}\n Aborting pagination at page {current_page_num} (offset {current_offset}).")
break
except Exception as e:
logger(f"❌ Unexpected error fetching page {current_page_num} (offset {current_offset}): {e}")
traceback.print_exc()
break
if not posts_batch:
if target_post_id and not processed_target_post_flag:
logger(f"❌ Target post {target_post_id} not found after checking all available pages (API returned no more posts at offset {current_offset}).")
elif not target_post_id:
if current_page_num == (start_page or 1):
logger(f"😕 No posts found on the first page checked (page {current_page_num}, offset {current_offset}).")
else:
logger(f"✅ Reached end of posts (no more content from API at offset {current_offset}).")
break
if target_post_id and not processed_target_post_flag:
matching_post = next((p for p in posts_batch if str(p.get('id')) == str(target_post_id)), None)
if matching_post:
logger(f"🎯 Found target post {target_post_id} on page {current_page_num} (offset {current_offset}).")
yield [matching_post]
processed_target_post_flag = True
elif not target_post_id:
yield posts_batch
if processed_target_post_flag:
break
current_offset += len(posts_batch)
current_page_num += 1
time.sleep(0.6)
if target_post_id and not processed_target_post_flag and not (cancellation_event and cancellation_event.is_set()):
logger(f"❌ Target post {target_post_id} could not be found after checking all relevant pages (final check after loop).")
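# Illustrative usage of the generator above (a sketch; the URL is hypothetical):
#
#   for posts_batch in download_from_api("https://kemono.su/patreon/user/12345", logger=print):
#       for post in posts_batch:
#           print(post.get("id"), post.get("title"))
#
# Each yielded batch is one API page of post dicts (up to 50). In manga mode every page is fetched
# first, then the posts are re-yielded oldest-first in chunks of the same size.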
def get_link_platform(url):
try:
domain = urlparse(url).netloc.lower()
if 'drive.google.com' in domain: return 'google drive'
if 'mega.nz' in domain or 'mega.io' in domain: return 'mega'
if 'dropbox.com' in domain: return 'dropbox'
if 'patreon.com' in domain: return 'patreon'
if 'instagram.com' in domain: return 'instagram'
if 'twitter.com' in domain or 'x.com' in domain: return 'twitter/x'
if 'discord.gg' in domain or 'discord.com/invite' in domain: return 'discord invite'
if 'pixiv.net' in domain: return 'pixiv'
if 'kemono.su' in domain or 'kemono.party' in domain: return 'kemono'
if 'coomer.su' in domain or 'coomer.party' in domain: return 'coomer'
parts = domain.split('.')
if len(parts) >= 2:
if parts[-2] not in ['com', 'org', 'net', 'gov', 'edu', 'co'] or len(parts) == 2:
return parts[-2]
elif len(parts) >= 3 and parts[-3] not in ['com', 'org', 'net', 'gov', 'edu', 'co']:
return parts[-3]
else:
return domain
return 'external'
except Exception: return 'unknown'
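# Illustrative results (URLs are hypothetical): "https://mega.nz/file/abc" -> "mega",
# "https://www.dropbox.com/s/x" -> "dropbox", and an unrecognised host such as
# "https://example.com/page" falls back to its second-level domain, here "example".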
class PostProcessorSignals(QObject):
progress_signal = pyqtSignal(str)
file_download_status_signal = pyqtSignal(bool)
external_link_signal = pyqtSignal(str, str, str, str)
file_progress_signal = pyqtSignal(str, object)
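# Signal payloads, as emitted by PostProcessorWorker below:
#   progress_signal(str)                      - one log line for the GUI console
#   file_download_status_signal(bool)         - True while a file transfer is active, False when it ends
#   external_link_signal(str, str, str, str)  - post title, link text, link URL, detected platform
#   file_progress_signal(str, object)         - filename plus a (downloaded_bytes, total_bytes) tuple, or None to clear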
class PostProcessorWorker:
def __init__(self, post_data, download_root, known_names,
filter_character_list,
unwanted_keywords, filter_mode, skip_zip, skip_rar,
use_subfolders, use_post_subfolders, target_post_id_from_initial_url, custom_folder_name,
compress_images, download_thumbnails, service, user_id,
api_url_input, cancellation_event, signals,
downloaded_files, downloaded_file_hashes, downloaded_files_lock, downloaded_file_hashes_lock,
skip_words_list=None,
skip_words_scope=SKIP_SCOPE_FILES,
show_external_links=False,
extract_links_only=False,
num_file_threads=4, skip_current_file_flag=None,
manga_mode_active=False,
manga_filename_style=STYLE_POST_TITLE,
char_filter_scope=CHAR_SCOPE_FILES,
remove_from_filename_words_list=None,
allow_multipart_download=True,
duplicate_file_mode=DUPLICATE_MODE_DELETE):
self.post = post_data
self.download_root = download_root
self.known_names = known_names
self.filter_character_list_objects = filter_character_list if filter_character_list else []
self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else set()
self.filter_mode = filter_mode
self.skip_zip = skip_zip
self.skip_rar = skip_rar
self.use_subfolders = use_subfolders
self.use_post_subfolders = use_post_subfolders
self.target_post_id_from_initial_url = target_post_id_from_initial_url
self.custom_folder_name = custom_folder_name
self.compress_images = compress_images
self.download_thumbnails = download_thumbnails
self.service = service
self.user_id = user_id
self.api_url_input = api_url_input
self.cancellation_event = cancellation_event
self.signals = signals
self.skip_current_file_flag = skip_current_file_flag
self.downloaded_files = downloaded_files if downloaded_files is not None else set()
self.downloaded_file_hashes = downloaded_file_hashes if downloaded_file_hashes is not None else set()
self.downloaded_files_lock = downloaded_files_lock if downloaded_files_lock is not None else threading.Lock()
self.downloaded_file_hashes_lock = downloaded_file_hashes_lock if downloaded_file_hashes_lock is not None else threading.Lock()
self.skip_words_list = skip_words_list if skip_words_list is not None else []
self.skip_words_scope = skip_words_scope
self.show_external_links = show_external_links
self.extract_links_only = extract_links_only
self.num_file_threads = num_file_threads
self.manga_mode_active = manga_mode_active
self.manga_filename_style = manga_filename_style
self.char_filter_scope = char_filter_scope
self.remove_from_filename_words_list = remove_from_filename_words_list if remove_from_filename_words_list is not None else []
self.allow_multipart_download = allow_multipart_download
self.duplicate_file_mode = duplicate_file_mode # This will be the effective mode (possibly overridden by main.py for manga)
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found.")
self.compress_images = False
def logger(self, message):
if self.signals and hasattr(self.signals, 'progress_signal'):
self.signals.progress_signal.emit(message)
else:
print(f"(Worker Log - No Signal): {message}")
def check_cancel(self):
return self.cancellation_event.is_set()
def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event,
post_title="", file_index_in_post=0, num_files_in_this_post=1):
was_original_name_kept_flag = False
final_filename_saved_for_return = ""
# current_target_folder_path is the actual folder where the file will be saved.
# It starts as the main character/post folder (target_folder_path) by default.
current_target_folder_path = target_folder_path
if self.check_cancel() or (skip_event and skip_event.is_set()): return 0, 1, "", False
file_url = file_info.get('url')
api_original_filename = file_info.get('_original_name_for_log', file_info.get('name'))
# This is the ideal name for the file if it were to be saved in the main target_folder_path.
filename_to_save_in_main_path = ""
if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_FILES or self.skip_words_scope == SKIP_SCOPE_BOTH):
filename_to_check_for_skip_words = api_original_filename.lower()
for skip_word in self.skip_words_list:
if skip_word.lower() in filename_to_check_for_skip_words:
self.logger(f" -> Skip File (Keyword in Original Name '{skip_word}'): '{api_original_filename}'. Scope: {self.skip_words_scope}")
return 0, 1, api_original_filename, False
original_filename_cleaned_base, original_ext = os.path.splitext(clean_filename(api_original_filename))
if not original_ext.startswith('.'): original_ext = ('.' + original_ext) if original_ext else ''
if self.manga_mode_active: # Note: duplicate_file_mode is overridden to "Delete" in main.py if manga_mode is on
if self.manga_filename_style == STYLE_ORIGINAL_NAME:
filename_to_save_in_main_path = clean_filename(api_original_filename)
was_original_name_kept_flag = True
elif self.manga_filename_style == STYLE_POST_TITLE:
if post_title and post_title.strip():
cleaned_post_title_base = clean_filename(post_title.strip())
if num_files_in_this_post > 1:
if file_index_in_post == 0:
filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
else:
filename_to_save_in_main_path = clean_filename(api_original_filename)
was_original_name_kept_flag = True
else:
filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
else:
filename_to_save_in_main_path = clean_filename(api_original_filename)
self.logger(f"⚠️ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log}. Using cleaned original filename '{filename_to_save_in_main_path}'.")
else:
self.logger(f"⚠️ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
filename_to_save_in_main_path = clean_filename(api_original_filename)
if not filename_to_save_in_main_path:
filename_to_save_in_main_path = f"manga_file_{original_post_id_for_log}_{file_index_in_post + 1}{original_ext}"
self.logger(f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path}'.")
was_original_name_kept_flag = False
else:
filename_to_save_in_main_path = clean_filename(api_original_filename)
was_original_name_kept_flag = False
if self.remove_from_filename_words_list and filename_to_save_in_main_path:
base_name_for_removal, ext_for_removal = os.path.splitext(filename_to_save_in_main_path)
modified_base_name = base_name_for_removal
for word_to_remove in self.remove_from_filename_words_list:
if not word_to_remove: continue
pattern = re.compile(re.escape(word_to_remove), re.IGNORECASE)
modified_base_name = pattern.sub("", modified_base_name)
modified_base_name = re.sub(r'[_.\s-]+', '_', modified_base_name)
modified_base_name = modified_base_name.strip('_')
if modified_base_name and modified_base_name != ext_for_removal.lstrip('.'):
filename_to_save_in_main_path = modified_base_name + ext_for_removal
else:
filename_to_save_in_main_path = base_name_for_removal + ext_for_removal
if not self.download_thumbnails:
is_img_type = is_image(api_original_filename)
is_vid_type = is_video(api_original_filename)
is_archive_type = is_archive(api_original_filename)
if self.filter_mode == 'archive':
if not is_archive_type:
self.logger(f" -> Filter Skip (Archive Mode): '{api_original_filename}' (Not an Archive).")
return 0, 1, api_original_filename, False
elif self.filter_mode == 'image':
if not is_img_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Image).")
return 0, 1, api_original_filename, False
elif self.filter_mode == 'video':
if not is_vid_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Video).")
return 0, 1, api_original_filename, False
if self.skip_zip and is_zip(api_original_filename):
self.logger(f" -> Pref Skip: '{api_original_filename}' (ZIP).")
return 0, 1, api_original_filename, False
if self.skip_rar and is_rar(api_original_filename):
self.logger(f" -> Pref Skip: '{api_original_filename}' (RAR).")
return 0, 1, api_original_filename, False
if not self.manga_mode_active:
# --- Pre-Download Duplicate Handling (Standard Mode Only) ---
is_duplicate_for_main_folder_by_path = os.path.exists(os.path.join(target_folder_path, filename_to_save_in_main_path)) and \
os.path.getsize(os.path.join(target_folder_path, filename_to_save_in_main_path)) > 0
is_duplicate_for_main_folder_by_session_name = False
with self.downloaded_files_lock:
if filename_to_save_in_main_path in self.downloaded_files:
is_duplicate_for_main_folder_by_session_name = True
if is_duplicate_for_main_folder_by_path or is_duplicate_for_main_folder_by_session_name:
if self.duplicate_file_mode == DUPLICATE_MODE_DELETE:
reason = "Path Exists" if is_duplicate_for_main_folder_by_path else "Session Name"
self.logger(f" -> Delete Duplicate ({reason}): '{filename_to_save_in_main_path}'. Skipping download.")
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
elif self.duplicate_file_mode == DUPLICATE_MODE_MOVE_TO_SUBFOLDER:
reason = "Path Exists" if is_duplicate_for_main_folder_by_path else "Session Name"
self.logger(f" -> Pre-DL Move ({reason}): '{filename_to_save_in_main_path}'. Will target 'Duplicate' subfolder.")
current_target_folder_path = os.path.join(target_folder_path, "Duplicate")
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
try:
os.makedirs(current_target_folder_path, exist_ok=True)
except OSError as e:
self.logger(f" ❌ Critical error creating directory '{current_target_folder_path}': {e}. Skipping file '{api_original_filename}'.")
return 0, 1, api_original_filename, False
# If mode is MOVE (and not manga mode), and current_target_folder_path is now "Duplicate",
# check if the file *already* exists by its base name in this "Duplicate" folder. (Standard Mode Only)
if not self.manga_mode_active and \
self.duplicate_file_mode == DUPLICATE_MODE_MOVE_TO_SUBFOLDER and \
"Duplicate" in current_target_folder_path.split(os.sep) and \
os.path.exists(os.path.join(current_target_folder_path, filename_to_save_in_main_path)):
self.logger(f" -> File '{filename_to_save_in_main_path}' already exists in '{os.path.basename(current_target_folder_path)}' subfolder. Skipping download.")
# The name was already added to downloaded_files if it was a pre-DL move.
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
# --- Download Attempt ---
max_retries = 3
retry_delay = 5
downloaded_size_bytes = 0
calculated_file_hash = None
file_content_bytes = None
total_size_bytes = 0
download_successful_flag = False
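# Download strategy, as implemented below: up to max_retries + 1 attempts with exponential backoff
# (retry_delay * 2**(attempt - 1) seconds). Files larger than MIN_SIZE_FOR_MULTIPART_DOWNLOAD may be
# handed to download_file_in_parts when multipart downloads are enabled and the server advertises
# byte-range support; otherwise, or if the multipart attempt fails, the file is streamed single-threaded
# into a BytesIO buffer while an MD5 hash is computed on the fly.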
for attempt_num_single_stream in range(max_retries + 1):
if self.check_cancel() or (skip_event and skip_event.is_set()): break
try:
if attempt_num_single_stream > 0:
self.logger(f" Retrying download for '{api_original_filename}' (Overall Attempt {attempt_num_single_stream + 1}/{max_retries + 1})...")
time.sleep(retry_delay * (2**(attempt_num_single_stream - 1)))
if self.signals and hasattr(self.signals, 'file_download_status_signal'):
self.signals.file_download_status_signal.emit(True)
response = requests.get(file_url, headers=headers, timeout=(15, 300), stream=True)
response.raise_for_status()
total_size_bytes = int(response.headers.get('Content-Length', 0))
num_parts_for_file = min(self.num_file_threads, MAX_PARTS_FOR_MULTIPART_DOWNLOAD)
attempt_multipart = (self.allow_multipart_download and MULTIPART_DOWNLOADER_AVAILABLE and
num_parts_for_file > 1 and total_size_bytes > MIN_SIZE_FOR_MULTIPART_DOWNLOAD and
'bytes' in response.headers.get('Accept-Ranges', '').lower())
if attempt_multipart:
response.close()
if self.signals and hasattr(self.signals, 'file_download_status_signal'):
self.signals.file_download_status_signal.emit(False)
mp_save_path_base = os.path.join(current_target_folder_path, filename_to_save_in_main_path)
mp_success, mp_bytes, mp_hash, mp_file_handle = download_file_in_parts(
file_url, mp_save_path_base, total_size_bytes, num_parts_for_file, headers,
api_original_filename, self.signals, self.cancellation_event, skip_event, self.logger
)
if mp_success:
download_successful_flag = True
downloaded_size_bytes = mp_bytes
calculated_file_hash = mp_hash
file_content_bytes = mp_file_handle
break
else:
if attempt_num_single_stream < max_retries:
self.logger(f" Multi-part download attempt failed for '{api_original_filename}'. Retrying with single stream.")
else:
download_successful_flag = False; break
self.logger(f"⬇️ Downloading (Single Stream): '{api_original_filename}' (Size: {f'{total_size_bytes / (1024*1024):.2f} MB' if total_size_bytes > 0 else 'unknown'}) [Base Name: '{filename_to_save_in_main_path}']")
file_content_buffer = BytesIO()
current_attempt_downloaded_bytes = 0
md5_hasher = hashlib.md5()
last_progress_time = time.time()
for chunk in response.iter_content(chunk_size=1 * 1024 * 1024):
if self.check_cancel() or (skip_event and skip_event.is_set()): break
if chunk:
file_content_buffer.write(chunk); md5_hasher.update(chunk)
current_attempt_downloaded_bytes += len(chunk)
if time.time() - last_progress_time > 1 and total_size_bytes > 0 and \
self.signals and hasattr(self.signals, 'file_progress_signal'):
self.signals.file_progress_signal.emit(api_original_filename, (current_attempt_downloaded_bytes, total_size_bytes))
last_progress_time = time.time()
if self.check_cancel() or (skip_event and skip_event.is_set()):
if file_content_buffer: file_content_buffer.close(); break
if current_attempt_downloaded_bytes > 0 or (total_size_bytes == 0 and response.status_code == 200):
calculated_file_hash = md5_hasher.hexdigest()
downloaded_size_bytes = current_attempt_downloaded_bytes
if file_content_bytes: file_content_bytes.close()
file_content_bytes = file_content_buffer; file_content_bytes.seek(0)
download_successful_flag = True; break
else:
if file_content_buffer: file_content_buffer.close()
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, http.client.IncompleteRead) as e:
self.logger(f" ❌ Download Error (Retryable): {api_original_filename}. Error: {e}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close()
except requests.exceptions.RequestException as e:
self.logger(f" ❌ Download Error (Non-Retryable): {api_original_filename}. Error: {e}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close(); break
except Exception as e:
self.logger(f" ❌ Unexpected Download Error: {api_original_filename}: {e}\n{traceback.format_exc(limit=2)}")
if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close(); break
finally:
if self.signals and hasattr(self.signals, 'file_download_status_signal'):
self.signals.file_download_status_signal.emit(False)
if self.signals and hasattr(self.signals, 'file_progress_signal'):
final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
self.signals.file_progress_signal.emit(api_original_filename, (downloaded_size_bytes, final_total_for_progress))
if self.check_cancel() or (skip_event and skip_event.is_set()):
self.logger(f" ⚠️ Download process interrupted for {api_original_filename}.")
if file_content_bytes: file_content_bytes.close()
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
if not download_successful_flag:
self.logger(f"❌ Download failed for '{api_original_filename}' after {max_retries + 1} attempts.")
if file_content_bytes: file_content_bytes.close()
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
if not self.manga_mode_active:
# --- Post-Download Hash Check (Standard Mode Only) ---
with self.downloaded_file_hashes_lock:
if calculated_file_hash in self.downloaded_file_hashes:
if self.duplicate_file_mode == DUPLICATE_MODE_DELETE:
self.logger(f" -> Delete Duplicate (Hash): '{api_original_filename}' (Hash: {calculated_file_hash[:8]}...). Skipping save.")
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
if file_content_bytes: file_content_bytes.close()
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
elif self.duplicate_file_mode == DUPLICATE_MODE_MOVE_TO_SUBFOLDER:
self.logger(f" -> Post-DL Move (Hash): '{api_original_filename}' (Hash: {calculated_file_hash[:8]}...). Content already downloaded.")
if "Duplicate" not in current_target_folder_path.split(os.sep):
current_target_folder_path = os.path.join(target_folder_path, "Duplicate")
self.logger(f" Redirecting to 'Duplicate' subfolder: '{current_target_folder_path}'")
# Ensure "Duplicate" folder exists if this is a new redirection due to hash
try: os.makedirs(current_target_folder_path, exist_ok=True)
except OSError as e_mkdir_hash: self.logger(f" Error creating Duplicate folder for hash collision: {e_mkdir_hash}")
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
# --- Final Filename Determination for Saving ---
filename_for_actual_save = filename_to_save_in_main_path
# If mode is MOVE (and not manga mode) and the file is destined for the main folder,
# but a file with that name *now* exists (e.g. race condition, or different file with same name not caught by hash),
# reroute it to the "Duplicate" folder.
if not self.manga_mode_active and \
self.duplicate_file_mode == DUPLICATE_MODE_MOVE_TO_SUBFOLDER and \
current_target_folder_path == target_folder_path and \
os.path.exists(os.path.join(current_target_folder_path, filename_for_actual_save)):
self.logger(f" -> Post-DL Move (Late Name Collision in Main): '{filename_for_actual_save}'. Moving to 'Duplicate'.")
current_target_folder_path = os.path.join(target_folder_path, "Duplicate")
try: # Ensure "Duplicate" folder exists if this is a new redirection
os.makedirs(current_target_folder_path, exist_ok=True)
except OSError as e_mkdir: self.logger(f" Error creating Duplicate folder during late move: {e_mkdir}")
# The name filename_to_save_in_main_path was already added to downloaded_files if it was a pre-DL name collision.
# If it was a hash collision that got rerouted, it was also added.
# If this is a new reroute due to late name collision, ensure it's marked.
# Apply numeric suffix renaming (_1, _2) *only if needed within the current_target_folder_path*
# This means:
# - If current_target_folder_path is the main folder (and not MOVE mode, or MOVE mode but file was unique):
# Renaming happens if a file with filename_for_actual_save exists there.
# - If current_target_folder_path is "Duplicate" (because of MOVE mode):
# Renaming happens if filename_for_actual_save exists *within "Duplicate"*.
counter = 1
base_name_final_coll, ext_final_coll = os.path.splitext(filename_for_actual_save)
temp_filename_final_check = filename_for_actual_save
while os.path.exists(os.path.join(current_target_folder_path, temp_filename_final_check)):
temp_filename_final_check = f"{base_name_final_coll}_{counter}{ext_final_coll}"
counter += 1
if temp_filename_final_check != filename_for_actual_save:
self.logger(f" Final rename for target folder '{os.path.basename(current_target_folder_path)}': '{temp_filename_final_check}' (was '{filename_for_actual_save}')")
filename_for_actual_save = temp_filename_final_check
bytes_to_write = file_content_bytes
final_filename_after_processing = filename_for_actual_save
current_save_path_final = os.path.join(current_target_folder_path, final_filename_after_processing)
is_img_for_compress_check = is_image(api_original_filename)
if is_img_for_compress_check and self.compress_images and Image and downloaded_size_bytes > (1.5 * 1024 * 1024):
self.logger(f" Compressing '{api_original_filename}' ({downloaded_size_bytes / (1024*1024):.2f} MB)...")
try:
bytes_to_write.seek(0)
with Image.open(bytes_to_write) as img_obj:
if img_obj.mode == 'P': img_obj = img_obj.convert('RGBA')
elif img_obj.mode not in ['RGB', 'RGBA', 'L']: img_obj = img_obj.convert('RGB')
compressed_bytes_io = BytesIO()
img_obj.save(compressed_bytes_io, format='WebP', quality=80, method=4)
compressed_size = compressed_bytes_io.getbuffer().nbytes
if compressed_size < downloaded_size_bytes * 0.9:
self.logger(f" Compression success: {compressed_size / (1024*1024):.2f} MB.")
if hasattr(bytes_to_write, 'close'): bytes_to_write.close()
original_part_file_path = os.path.join(current_target_folder_path, filename_to_save_in_main_path) + ".part" # Use original base for .part
if os.path.exists(original_part_file_path):
os.remove(original_part_file_path)
bytes_to_write = compressed_bytes_io; bytes_to_write.seek(0)
base_name_orig, _ = os.path.splitext(filename_for_actual_save)
final_filename_after_processing = base_name_orig + '.webp'
current_save_path_final = os.path.join(current_target_folder_path, final_filename_after_processing)
self.logger(f" Updated filename (compressed): {final_filename_after_processing}")
else:
self.logger(f" Compression skipped: WebP not significantly smaller."); bytes_to_write.seek(0)
except Exception as comp_e:
self.logger(f"❌ Compression failed for '{api_original_filename}': {comp_e}. Saving original."); bytes_to_write.seek(0)
if final_filename_after_processing != filename_for_actual_save and \
os.path.exists(current_save_path_final) and os.path.getsize(current_save_path_final) > 0:
self.logger(f" -> Exists (Path - Post-Compress): '{final_filename_after_processing}' in '{os.path.basename(current_target_folder_path)}'.")
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
if bytes_to_write and hasattr(bytes_to_write, 'close'): bytes_to_write.close()
return 0, 1, final_filename_after_processing, was_original_name_kept_flag
try:
os.makedirs(current_target_folder_path, exist_ok=True)
if isinstance(bytes_to_write, BytesIO):
with open(current_save_path_final, 'wb') as f_out:
f_out.write(bytes_to_write.getvalue())
else:
if hasattr(bytes_to_write, 'close'): bytes_to_write.close()
source_part_file = os.path.join(current_target_folder_path, filename_to_save_in_main_path) + ".part" # Use original base for .part
os.rename(source_part_file, current_save_path_final)
with self.downloaded_file_hashes_lock: self.downloaded_file_hashes.add(calculated_file_hash)
with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path)
final_filename_saved_for_return = final_filename_after_processing
self.logger(f"✅ Saved: '{final_filename_saved_for_return}' (from '{api_original_filename}', {downloaded_size_bytes / (1024*1024):.2f} MB) in '{os.path.basename(current_target_folder_path)}'")
time.sleep(0.05)
return 1, 0, final_filename_saved_for_return, was_original_name_kept_flag
except Exception as save_err:
self.logger(f"❌ Save Fail for '{final_filename_after_processing}': {save_err}")
if os.path.exists(current_save_path_final):
try: os.remove(current_save_path_final);
except OSError: self.logger(f" -> Failed to remove partially saved file: {current_save_path_final}")
return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag
finally:
if bytes_to_write and hasattr(bytes_to_write, 'close'):
bytes_to_write.close()
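# _download_single_file returns a 4-tuple: (downloaded_count, skipped_count, filename_as_saved,
# original_name_kept_flag); downloaded_count and skipped_count are each 0 or 1 for the single file handled.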
def process(self):
if self.check_cancel(): return 0, 0, []
kept_original_filenames_for_log = []
total_downloaded_this_post = 0
total_skipped_this_post = 0
parsed_api_url = urlparse(self.api_url_input)
referer_url = f"https://{parsed_api_url.netloc}/"
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': referer_url, 'Accept': '*/*'}
link_pattern = re.compile(r"""<a\s+.*?href=["'](https?://[^"']+)["'][^>]*>(.*?)</a>""",
re.IGNORECASE | re.DOTALL)
post_data = self.post
post_title = post_data.get('title', '') or 'untitled_post'
post_id = post_data.get('id', 'unknown_id')
post_main_file_info = post_data.get('file')
post_attachments = post_data.get('attachments', [])
post_content_html = post_data.get('content', '')
self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---")
num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0)
post_is_candidate_by_title_char_match = False
char_filter_that_matched_title = None
if self.filter_character_list_objects and \
(self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH):
self.logger(f" [Debug Title Match] Checking post title '{post_title}' against {len(self.filter_character_list_objects)} filter objects. Scope: {self.char_filter_scope}")
for idx, filter_item_obj in enumerate(self.filter_character_list_objects):
self.logger(f" [Debug Title Match] Filter obj #{idx}: {filter_item_obj}")
terms_to_check_for_title = list(filter_item_obj["aliases"])
if filter_item_obj["is_group"]:
if filter_item_obj["name"] not in terms_to_check_for_title:
terms_to_check_for_title.append(filter_item_obj["name"])
unique_terms_for_title_check = list(set(terms_to_check_for_title))
self.logger(f" [Debug Title Match] Unique terms for this filter obj: {unique_terms_for_title_check}")
for term_to_match in unique_terms_for_title_check:
self.logger(f" [Debug Title Match] Checking term: '{term_to_match}'")
match_found_for_term = is_title_match_for_character(post_title, term_to_match)
self.logger(f" [Debug Title Match] Result for '{term_to_match}': {match_found_for_term}")
if match_found_for_term:
post_is_candidate_by_title_char_match = True
char_filter_that_matched_title = filter_item_obj
self.logger(f" Post title matches char filter term '{term_to_match}' (from group/name '{filter_item_obj['name']}', Scope: {self.char_filter_scope}). Post is candidate.")
break
if post_is_candidate_by_title_char_match: break
self.logger(f" [Debug Title Match] Final post_is_candidate_by_title_char_match: {post_is_candidate_by_title_char_match}")
if self.filter_character_list_objects and self.char_filter_scope == CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match:
self.logger(f" -> Skip Post (Scope: Title - No Char Match): Title '{post_title[:50]}' does not match character filters.")
return 0, num_potential_files_in_post, []
if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_POSTS or self.skip_words_scope == SKIP_SCOPE_BOTH):
post_title_lower = post_title.lower()
for skip_word in self.skip_words_list:
if skip_word.lower() in post_title_lower:
self.logger(f" -> Skip Post (Keyword in Title '{skip_word}'): '{post_title[:50]}...'. Scope: {self.skip_words_scope}")
return 0, num_potential_files_in_post, []
if not self.extract_links_only and self.manga_mode_active and self.filter_character_list_objects and \
(self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH) and \
not post_is_candidate_by_title_char_match:
self.logger(f" -> Skip Post (Manga Mode with Title/Both Scope - No Title Char Match): Title '{post_title[:50]}' doesn't match filters.")
return 0, num_potential_files_in_post, []
if not isinstance(post_attachments, list):
self.logger(f"⚠️ Corrupt attachment data for post {post_id} (expected list, got {type(post_attachments)}). Skipping attachments.")
post_attachments = []
base_folder_names_for_post_content = []
if not self.extract_links_only and self.use_subfolders:
if post_is_candidate_by_title_char_match and char_filter_that_matched_title:
base_folder_names_for_post_content = [clean_folder_name(char_filter_that_matched_title["name"])]
elif not self.filter_character_list_objects:
derived_folders = match_folders_from_title(post_title, self.known_names, self.unwanted_keywords)
if derived_folders:
base_folder_names_for_post_content.extend(derived_folders)
else:
base_folder_names_for_post_content.append(extract_folder_name_from_title(post_title, self.unwanted_keywords))
if not base_folder_names_for_post_content or not base_folder_names_for_post_content[0]:
base_folder_names_for_post_content = [clean_folder_name(post_title if post_title else "untitled_creator_content")]
if base_folder_names_for_post_content:
log_reason = "Matched char filter" if (post_is_candidate_by_title_char_match and char_filter_that_matched_title) else "Generic title parsing (no char filters)"
self.logger(f" Base folder name(s) for post content ({log_reason}): {', '.join(base_folder_names_for_post_content)}")
if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
for folder_name_to_check in base_folder_names_for_post_content:
if not folder_name_to_check: continue
if any(skip_word.lower() in folder_name_to_check.lower() for skip_word in self.skip_words_list):
matched_skip = next((sw for sw in self.skip_words_list if sw.lower() in folder_name_to_check.lower()), "unknown_skip_word")
self.logger(f" -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check}' contains '{matched_skip}'.")
return 0, num_potential_files_in_post, []
if (self.show_external_links or self.extract_links_only) and post_content_html:
try:
unique_links_data = {}
for match in link_pattern.finditer(post_content_html):
link_url = match.group(1).strip()
link_inner_text = match.group(2)
if not any(ext in link_url.lower() for ext in ['.css', '.js', '.ico', '.xml', '.svg']) \
and not link_url.startswith('javascript:') \
and link_url not in unique_links_data:
clean_link_text = re.sub(r'<.*?>', '', link_inner_text)
clean_link_text = html.unescape(clean_link_text).strip()
display_text = clean_link_text if clean_link_text else "[Link]"
unique_links_data[link_url] = display_text
links_emitted_count = 0
scraped_platforms = {'kemono', 'coomer', 'patreon'}
for link_url, link_text in unique_links_data.items():
platform = get_link_platform(link_url)
if platform not in scraped_platforms:
if self.signals and hasattr(self.signals, 'external_link_signal'):
self.signals.external_link_signal.emit(post_title, link_text, link_url, platform)
links_emitted_count +=1
if links_emitted_count > 0: self.logger(f" 🔗 Found {links_emitted_count} potential external link(s) in post content.")
except Exception as e: self.logger(f"⚠️ Error parsing post content for links: {e}\n{traceback.format_exc(limit=2)}")
if self.extract_links_only:
self.logger(f" Extract Links Only mode: Finished processing post {post_id} for links.")
return 0, 0, []
all_files_from_post_api = []
api_file_domain = urlparse(self.api_url_input).netloc
if not api_file_domain or not any(d in api_file_domain.lower() for d in ['kemono.su', 'kemono.party', 'coomer.su', 'coomer.party']):
api_file_domain = "kemono.su" if "kemono" in self.service.lower() else "coomer.party"
if post_main_file_info and isinstance(post_main_file_info, dict) and post_main_file_info.get('path'):
file_path = post_main_file_info['path'].lstrip('/')
original_api_name = post_main_file_info.get('name') or os.path.basename(file_path)
if original_api_name:
all_files_from_post_api.append({
'url': f"https://{api_file_domain}{file_path}" if file_path.startswith('/') else f"https://{api_file_domain}/data/{file_path}",
'name': original_api_name,
'_original_name_for_log': original_api_name,
'_is_thumbnail': self.download_thumbnails and is_image(original_api_name)
})
else: self.logger(f" ⚠️ Skipping main file for post {post_id}: Missing name (Path: {file_path})")
for idx, att_info in enumerate(post_attachments):
if isinstance(att_info, dict) and att_info.get('path'):
att_path = att_info['path'].lstrip('/')
original_api_att_name = att_info.get('name') or os.path.basename(att_path)
if original_api_att_name:
all_files_from_post_api.append({
'url': f"https://{api_file_domain}{att_path}" if att_path.startswith('/') else f"https://{api_file_domain}/data/{att_path}",
'name': original_api_att_name,
'_original_name_for_log': original_api_att_name,
'_is_thumbnail': self.download_thumbnails and is_image(original_api_att_name)
})
else: self.logger(f" ⚠️ Skipping attachment {idx+1} for post {post_id}: Missing name (Path: {att_path})")
else: self.logger(f" ⚠️ Skipping invalid attachment {idx+1} for post {post_id}: {str(att_info)[:100]}")
if self.download_thumbnails:
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo['_is_thumbnail']]
if not all_files_from_post_api:
self.logger(f" -> No image thumbnails found for post {post_id} in thumbnail-only mode.")
return 0, 0, []
if not all_files_from_post_api:
self.logger(f" No files found to download for post {post_id}.")
return 0, 0, []
files_to_download_info_list = []
processed_original_filenames_in_this_post = set()
for file_info in all_files_from_post_api:
current_api_original_filename = file_info.get('_original_name_for_log')
if current_api_original_filename in processed_original_filenames_in_this_post:
self.logger(f" -> Skip Duplicate Original Name (within post {post_id}): '{current_api_original_filename}' already processed/listed for this post.")
total_skipped_this_post += 1
else:
files_to_download_info_list.append(file_info)
if current_api_original_filename:
processed_original_filenames_in_this_post.add(current_api_original_filename)
if not files_to_download_info_list:
self.logger(f" All files for post {post_id} were duplicate original names or skipped earlier.")
return 0, total_skipped_this_post, []
num_files_in_this_post_for_naming = len(files_to_download_info_list)
self.logger(f" Identified {num_files_in_this_post_for_naming} unique original file(s) for potential download from post {post_id}.")
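# The post's files are downloaded concurrently below, one task per file, capped at num_file_threads
# workers; character-filter and subfolder decisions are made per file before each task is submitted.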
with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
futures_list = []
for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
if self.check_cancel(): break
current_api_original_filename = file_info_to_dl.get('_original_name_for_log')
file_is_candidate_by_char_filter_scope = False
char_filter_info_that_matched_file = None
if not self.filter_character_list_objects:
file_is_candidate_by_char_filter_scope = True
else:
if self.char_filter_scope == CHAR_SCOPE_FILES:
for filter_item_obj in self.filter_character_list_objects:
terms_to_check_for_file = list(filter_item_obj["aliases"])
if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_for_file:
terms_to_check_for_file.append(filter_item_obj["name"])
unique_terms_for_file_check = list(set(terms_to_check_for_file))
for term_to_match in unique_terms_for_file_check:
if is_filename_match_for_character(current_api_original_filename, term_to_match):
file_is_candidate_by_char_filter_scope = True
char_filter_info_that_matched_file = filter_item_obj
self.logger(f" File '{current_api_original_filename}' matches char filter term '{term_to_match}' (from '{filter_item_obj['name']}'). Scope: Files.")
break
if file_is_candidate_by_char_filter_scope: break
elif self.char_filter_scope == CHAR_SCOPE_TITLE:
if post_is_candidate_by_title_char_match:
file_is_candidate_by_char_filter_scope = True
char_filter_info_that_matched_file = char_filter_that_matched_title
self.logger(f" File '{current_api_original_filename}' is candidate because post title matched. Scope: Title.")
elif self.char_filter_scope == CHAR_SCOPE_BOTH:
if post_is_candidate_by_title_char_match:
file_is_candidate_by_char_filter_scope = True
char_filter_info_that_matched_file = char_filter_that_matched_title
self.logger(f" File '{current_api_original_filename}' is candidate because post title matched. Scope: Both (Title part).")
else:
for filter_item_obj in self.filter_character_list_objects:
terms_to_check_for_file_both = list(filter_item_obj["aliases"])
if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_for_file_both:
terms_to_check_for_file_both.append(filter_item_obj["name"])
unique_terms_for_file_both_check = list(set(terms_to_check_for_file_both))
for term_to_match in unique_terms_for_file_both_check:
if is_filename_match_for_character(current_api_original_filename, term_to_match):
file_is_candidate_by_char_filter_scope = True
char_filter_info_that_matched_file = filter_item_obj
self.logger(f" File '{current_api_original_filename}' matches char filter term '{term_to_match}' (from '{filter_item_obj['name']}'). Scope: Both (File part).")
break
if file_is_candidate_by_char_filter_scope: break
if not file_is_candidate_by_char_filter_scope:
self.logger(f" -> Skip File (Char Filter Scope '{self.char_filter_scope}'): '{current_api_original_filename}' no match.")
total_skipped_this_post += 1
continue
current_path_for_file = self.download_root
if self.use_subfolders:
char_title_subfolder_name = None
if self.target_post_id_from_initial_url and self.custom_folder_name:
char_title_subfolder_name = self.custom_folder_name
elif char_filter_info_that_matched_file:
char_title_subfolder_name = clean_folder_name(char_filter_info_that_matched_file["name"])
elif char_filter_that_matched_title:
char_title_subfolder_name = clean_folder_name(char_filter_that_matched_title["name"])
elif base_folder_names_for_post_content:
char_title_subfolder_name = base_folder_names_for_post_content[0]
if char_title_subfolder_name:
current_path_for_file = os.path.join(current_path_for_file, char_title_subfolder_name)
if self.use_post_subfolders:
cleaned_title_for_subfolder = clean_folder_name(post_title)
post_specific_subfolder_name = cleaned_title_for_subfolder # Use only the cleaned title
current_path_for_file = os.path.join(current_path_for_file, post_specific_subfolder_name)
target_folder_path_for_this_file = current_path_for_file
futures_list.append(file_pool.submit(
self._download_single_file,
file_info_to_dl,
target_folder_path_for_this_file,
headers,
post_id,
self.skip_current_file_flag,
post_title,
file_idx,
num_files_in_this_post_for_naming
))
for future in as_completed(futures_list):
if self.check_cancel():
for f_to_cancel in futures_list:
if not f_to_cancel.done():
f_to_cancel.cancel()
break
try:
dl_count, skip_count, actual_filename_saved, original_kept_flag = future.result()
total_downloaded_this_post += dl_count
total_skipped_this_post += skip_count
if original_kept_flag and dl_count > 0 and actual_filename_saved:
kept_original_filenames_for_log.append(actual_filename_saved)
except CancelledError:
self.logger(f" File download task for post {post_id} was cancelled.")
total_skipped_this_post += 1
except Exception as exc_f:
self.logger(f"❌ File download task for post {post_id} resulted in error: {exc_f}")
total_skipped_this_post += 1
if self.signals and hasattr(self.signals, 'file_progress_signal'):
self.signals.file_progress_signal.emit("", None)
if self.check_cancel(): self.logger(f" Post {post_id} processing interrupted/cancelled.");
else: self.logger(f" Post {post_id} Summary: Downloaded={total_downloaded_this_post}, Skipped Files={total_skipped_this_post}")
return total_downloaded_this_post, total_skipped_this_post, kept_original_filenames_for_log
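# process() returns (files_downloaded, files_skipped, kept_original_filenames) for the single post it handles.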
class DownloadThread(QThread):
progress_signal = pyqtSignal(str)
add_character_prompt_signal = pyqtSignal(str)
file_download_status_signal = pyqtSignal(bool)
finished_signal = pyqtSignal(int, int, bool, list)
external_link_signal = pyqtSignal(str, str, str, str)
file_progress_signal = pyqtSignal(str, object)
def __init__(self, api_url_input, output_dir, known_names_copy,
cancellation_event,
filter_character_list=None,
filter_mode='all', skip_zip=True, skip_rar=True,
use_subfolders=True, use_post_subfolders=False, custom_folder_name=None, compress_images=False,
download_thumbnails=False, service=None, user_id=None,
downloaded_files=None, downloaded_file_hashes=None, downloaded_files_lock=None, downloaded_file_hashes_lock=None,
skip_words_list=None,
skip_words_scope=SKIP_SCOPE_FILES,
show_external_links=False,
extract_links_only=False,
num_file_threads_for_worker=1,
skip_current_file_flag=None,
start_page=None, end_page=None,
target_post_id_from_initial_url=None,
manga_mode_active=False,
unwanted_keywords=None,
manga_filename_style=STYLE_POST_TITLE,
char_filter_scope=CHAR_SCOPE_FILES,
remove_from_filename_words_list=None,
allow_multipart_download=True,
duplicate_file_mode=DUPLICATE_MODE_DELETE): # Default to DELETE
super().__init__()
self.api_url_input = api_url_input
self.output_dir = output_dir
self.known_names = list(known_names_copy)
self.cancellation_event = cancellation_event
self.skip_current_file_flag = skip_current_file_flag
self.initial_target_post_id = target_post_id_from_initial_url
self.filter_character_list_objects = filter_character_list if filter_character_list else []
self.filter_mode = filter_mode
self.skip_zip = skip_zip
self.skip_rar = skip_rar
self.use_subfolders = use_subfolders
self.use_post_subfolders = use_post_subfolders
self.custom_folder_name = custom_folder_name
self.compress_images = compress_images
self.download_thumbnails = download_thumbnails
self.service = service
self.user_id = user_id
self.skip_words_list = skip_words_list if skip_words_list is not None else []
self.skip_words_scope = skip_words_scope
self.downloaded_files = downloaded_files
self.downloaded_files_lock = downloaded_files_lock
self.downloaded_file_hashes = downloaded_file_hashes
self.downloaded_file_hashes_lock = downloaded_file_hashes_lock
self._add_character_response = None
self.prompt_mutex = QMutex()
self.show_external_links = show_external_links
self.extract_links_only = extract_links_only
self.num_file_threads_for_worker = num_file_threads_for_worker
self.start_page = start_page
self.end_page = end_page
self.manga_mode_active = manga_mode_active
self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else \
{'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'}
self.manga_filename_style = manga_filename_style
self.char_filter_scope = char_filter_scope
self.remove_from_filename_words_list = remove_from_filename_words_list
self.allow_multipart_download = allow_multipart_download
self.duplicate_file_mode = duplicate_file_mode
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
self.compress_images = False
def logger(self, message):
self.progress_signal.emit(str(message))
def isInterruptionRequested(self):
return self.cancellation_event.is_set() or super().isInterruptionRequested()
def skip_file(self):
if self.isRunning() and self.skip_current_file_flag:
self.logger("⏭️ Skip requested for current file (single-thread mode).")
self.skip_current_file_flag.set()
else: self.logger(" Skip file: No download active or skip flag not available for current context.")
def run(self):
grand_total_downloaded_files = 0
grand_total_skipped_files = 0
grand_list_of_kept_original_filenames = []
was_process_cancelled = False
worker_signals_obj = PostProcessorSignals()
try:
worker_signals_obj.progress_signal.connect(self.progress_signal)
worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal)
worker_signals_obj.file_progress_signal.connect(self.file_progress_signal)
worker_signals_obj.external_link_signal.connect(self.external_link_signal)
self.logger(" Starting post fetch (single-threaded download process)...")
post_generator = download_from_api(
self.api_url_input,
logger=self.logger,
start_page=self.start_page,
end_page=self.end_page,
manga_mode=self.manga_mode_active,
cancellation_event=self.cancellation_event
)
for posts_batch_data in post_generator:
if self.isInterruptionRequested(): was_process_cancelled = True; break
for individual_post_data in posts_batch_data:
if self.isInterruptionRequested(): was_process_cancelled = True; break
post_processing_worker = PostProcessorWorker(
post_data=individual_post_data,
download_root=self.output_dir,
known_names=self.known_names,
filter_character_list=self.filter_character_list_objects,
unwanted_keywords=self.unwanted_keywords,
filter_mode=self.filter_mode,
skip_zip=self.skip_zip, skip_rar=self.skip_rar,
use_subfolders=self.use_subfolders, use_post_subfolders=self.use_post_subfolders,
target_post_id_from_initial_url=self.initial_target_post_id,
custom_folder_name=self.custom_folder_name,
compress_images=self.compress_images, download_thumbnails=self.download_thumbnails,
service=self.service, user_id=self.user_id,
api_url_input=self.api_url_input,
cancellation_event=self.cancellation_event,
signals=worker_signals_obj,
downloaded_files=self.downloaded_files,
downloaded_file_hashes=self.downloaded_file_hashes,
downloaded_files_lock=self.downloaded_files_lock,
downloaded_file_hashes_lock=self.downloaded_file_hashes_lock,
skip_words_list=self.skip_words_list,
skip_words_scope=self.skip_words_scope,
show_external_links=self.show_external_links,
extract_links_only=self.extract_links_only,
num_file_threads=self.num_file_threads_for_worker,
skip_current_file_flag=self.skip_current_file_flag,
manga_mode_active=self.manga_mode_active,
manga_filename_style=self.manga_filename_style,
char_filter_scope=self.char_filter_scope,
remove_from_filename_words_list=self.remove_from_filename_words_list,
allow_multipart_download=self.allow_multipart_download,
duplicate_file_mode=self.duplicate_file_mode)
try:
dl_count, skip_count, kept_originals_this_post = post_processing_worker.process()
grand_total_downloaded_files += dl_count
grand_total_skipped_files += skip_count
if kept_originals_this_post:
grand_list_of_kept_original_filenames.extend(kept_originals_this_post)
except Exception as proc_err:
post_id_for_err = individual_post_data.get('id', 'N/A')
self.logger(f"❌ Error processing post {post_id_for_err} in DownloadThread: {proc_err}")
traceback.print_exc()
num_potential_files_est = len(individual_post_data.get('attachments', [])) + \
(1 if individual_post_data.get('file') else 0)
grand_total_skipped_files += num_potential_files_est
if self.skip_current_file_flag and self.skip_current_file_flag.is_set():
self.skip_current_file_flag.clear()
self.logger(" Skip current file flag was processed and cleared by DownloadThread.")
self.msleep(10)
if was_process_cancelled: break
if not was_process_cancelled and not self.isInterruptionRequested():
self.logger("✅ All posts processed or end of content reached by DownloadThread.")
except Exception as main_thread_err:
self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
traceback.print_exc()
if not self.isInterruptionRequested(): was_process_cancelled = False
finally:
try:
if worker_signals_obj:
worker_signals_obj.progress_signal.disconnect(self.progress_signal)
worker_signals_obj.file_download_status_signal.disconnect(self.file_download_status_signal)
worker_signals_obj.external_link_signal.disconnect(self.external_link_signal)
worker_signals_obj.file_progress_signal.disconnect(self.file_progress_signal)
except (TypeError, RuntimeError) as e:
self.logger(f" Note during DownloadThread signal disconnection: {e}")
self.finished_signal.emit(grand_total_downloaded_files, grand_total_skipped_files, self.isInterruptionRequested(), grand_list_of_kept_original_filenames)
def receive_add_character_result(self, result):
with QMutexLocker(self.prompt_mutex):
self._add_character_response = result
self.logger(f" (DownloadThread) Received character prompt response: {'Yes (added/confirmed)' if result else 'No (declined/failed)'}")
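# Minimal wiring sketch (illustrative only; the real GUI in main.py is assumed to do something like this,
# and the URL, output directory and connected callbacks below are hypothetical):
#
#   service, user_id, _ = extract_post_info("https://kemono.su/patreon/user/12345")
#   thread = DownloadThread(
#       "https://kemono.su/patreon/user/12345", "downloads", KNOWN_NAMES, threading.Event(),
#       service=service, user_id=user_id,
#       downloaded_files=set(), downloaded_file_hashes=set(),
#       downloaded_files_lock=threading.Lock(), downloaded_file_hashes_lock=threading.Lock(),
#   )
#   thread.progress_signal.connect(print)
#   thread.finished_signal.connect(lambda dl, skipped, cancelled, kept: print("Done:", dl, "downloaded"))
#   thread.start()   # queued signal delivery assumes a running QApplication event loop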