commit a6383b20a4 (parent 651f9d9f8d)
Author: Yuvi9587
Date: 2025-05-21 17:20:16 +05:30
5 changed files with 1157 additions and 415 deletions

BIN  Read.png — new file, 141 KiB (binary file not shown)

@@ -41,8 +41,12 @@ CHAR_SCOPE_FILES = "files"
 CHAR_SCOPE_BOTH = "both"
 CHAR_SCOPE_COMMENTS = "comments"
+FILE_DOWNLOAD_STATUS_SUCCESS = "success"
+FILE_DOWNLOAD_STATUS_SKIPPED = "skipped"
+FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER = "failed_retry_later"
 fastapi_app = None
-KNOWN_NAMES = []
+KNOWN_NAMES = [] # This will now store dicts: {'name': str, 'is_group': bool, 'aliases': list[str]}
 MIN_SIZE_FOR_MULTIPART_DOWNLOAD = 10 * 1024 * 1024 # 10 MB - Stays the same
 MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 15 # Max concurrent connections for a single file
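Note: the comment on KNOWN_NAMES defines the new entry shape. For reference, a single entry under the new format might look like this (values hypothetical):

    KNOWN_NAMES.append({
        'name': 'Samus Aran',               # primary name; used as the folder name
        'is_group': False,                  # True when the entry represents a group of characters
        'aliases': ['samus', 'samus aran'], # any alias can match a post title
    })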
@@ -87,7 +91,19 @@ def clean_folder_name(name):
     cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
     cleaned = cleaned.strip()
     cleaned = re.sub(r'\s+', ' ', cleaned)
-    return cleaned if cleaned else "untitled_folder"
+    if not cleaned: # If empty after initial cleaning
+        return "untitled_folder"
+    # Strip all trailing dots and spaces.
+    # This handles cases like "folder...", "folder. .", "folder . ." -> "folder"
+    temp_name = cleaned
+    while len(temp_name) > 0 and (temp_name.endswith('.') or temp_name.endswith(' ')):
+        temp_name = temp_name[:-1]
+    # If stripping all trailing dots/spaces made it empty (e.g., original was "."), use default.
+    # Also handles if the original name was just spaces and became empty.
+    return temp_name if temp_name else "untitled_folder"
 def clean_filename(name):
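Note: the trailing-dot/space loop above is equivalent to str.rstrip('. '); a standalone sketch of the new behavior, assuming only the stdlib re module:

    import re

    def clean_folder_name_sketch(name):
        cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name)
        cleaned = re.sub(r'\s+', ' ', cleaned.strip())
        if not cleaned:
            return "untitled_folder"
        stripped = cleaned.rstrip('. ')  # same effect as the while loop
        return stripped if stripped else "untitled_folder"

    assert clean_folder_name_sketch("folder . .") == "folder"
    assert clean_folder_name_sketch("...") == "untitled_folder"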
@@ -120,20 +136,33 @@ def extract_folder_name_from_title(title, unwanted_keywords):
 def match_folders_from_title(title, names_to_match, unwanted_keywords):
+    """
+    Matches folder names from a title based on a list of known name objects.
+    Each name object in names_to_match is expected to be a dict:
+    {'name': 'PrimaryFolderName', 'aliases': ['alias1', 'alias2', ...]}
+    """
     if not title or not names_to_match: return []
     title_lower = title.lower()
     matched_cleaned_names = set()
-    sorted_names_to_match = sorted(names_to_match, key=len, reverse=True)
-    for name in sorted_names_to_match:
-        name_lower = name.lower()
-        if not name_lower: continue
-        pattern = r'\b' + re.escape(name_lower) + r'\b'
-        if re.search(pattern, title_lower):
-            cleaned_name_for_folder = clean_folder_name(name)
-            if cleaned_name_for_folder.lower() not in unwanted_keywords:
-                matched_cleaned_names.add(cleaned_name_for_folder)
+    # Sort by the length of the primary name so longer, more specific names match first.
+    # This is a heuristic; alias length might also be a factor, but primary name length is simpler.
+    sorted_name_objects = sorted(names_to_match, key=lambda x: len(x.get("name", "")), reverse=True)
+    for name_obj in sorted_name_objects:
+        primary_folder_name = name_obj.get("name")
+        aliases = name_obj.get("aliases", [])
+        if not primary_folder_name or not aliases:
+            continue
+        for alias in aliases:
+            alias_lower = alias.lower()
+            if not alias_lower: continue
+            pattern = r'\b' + re.escape(alias_lower) + r'\b'
+            if re.search(pattern, title_lower):
+                cleaned_primary_name = clean_folder_name(primary_folder_name)
+                if cleaned_primary_name.lower() not in unwanted_keywords:
+                    matched_cleaned_names.add(cleaned_primary_name)
+                break # Found a match for this primary name via one of its aliases
     return sorted(list(matched_cleaned_names))
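Note: a hypothetical call showing how the dict-based matching behaves — an alias triggers the match, but the returned folder name is always the cleaned primary name:

    names = [
        {'name': 'Samus Aran', 'is_group': False, 'aliases': ['samus', 'samus aran']},
        {'name': 'Metroid Villains', 'is_group': True, 'aliases': ['ridley', 'kraid']},
    ]
    print(match_folders_from_title("Ridley sketch dump", names, set()))
    # -> ['Metroid Villains']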
@@ -202,11 +231,20 @@ def extract_post_info(url_string):
     return None, None, None
-def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None):
+def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None, pause_event=None):
     if cancellation_event and cancellation_event.is_set():
         logger("   Fetch cancelled before request.")
         raise RuntimeError("Fetch operation cancelled by user.")
+    if pause_event and pause_event.is_set():
+        logger("   Post fetching paused...")
+        while pause_event.is_set():
+            if cancellation_event and cancellation_event.is_set():
+                logger("   Post fetching cancelled while paused.")
+                raise RuntimeError("Fetch operation cancelled by user.")
+            time.sleep(0.5)
+        logger("   Post fetching resumed.")
     paginated_url = f'{api_url_base}?o={offset}'
     logger(f"   Fetching: {paginated_url} (Page approx. {offset // 50 + 1})")
     try:
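Note: this paused-wait loop is the pattern the commit repeats in every fetch and download path — poll the shared event at a coarse interval and honor cancellation even while paused. A minimal standalone sketch (helper name hypothetical):

    import threading, time

    def wait_if_paused(pause_event, cancellation_event, logger=print, context="Operation"):
        # Block while pause_event is set; raise if cancelled in the meantime.
        if pause_event and pause_event.is_set():
            logger(f"   {context} paused...")
            while pause_event.is_set():
                if cancellation_event and cancellation_event.is_set():
                    raise RuntimeError(f"{context} cancelled by user.")
                time.sleep(0.5)  # coarse polling keeps workers responsive without busy-waiting
            logger(f"   {context} resumed.")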
@@ -228,11 +266,20 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
     except Exception as e:
         raise RuntimeError(f"Unexpected error fetching offset {offset} ({paginated_url}): {e}")
-def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger, cancellation_event=None):
+def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger, cancellation_event=None, pause_event=None):
     if cancellation_event and cancellation_event.is_set():
         logger("   Comment fetch cancelled before request.")
         raise RuntimeError("Comment fetch operation cancelled by user.")
+    if pause_event and pause_event.is_set():
+        logger("   Comment fetching paused...")
+        while pause_event.is_set():
+            if cancellation_event and cancellation_event.is_set():
+                logger("   Comment fetching cancelled while paused.")
+                raise RuntimeError("Comment fetch operation cancelled by user.")
+            time.sleep(0.5)
+        logger("   Comment fetching resumed.")
     comments_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{post_id}/comments"
     logger(f"   Fetching comments: {comments_api_url}")
     try:
@@ -254,7 +301,7 @@ def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger,
     except Exception as e:
         raise RuntimeError(f"Unexpected error fetching comments for post {post_id} ({comments_api_url}): {e}")
-def download_from_api(api_url_input, logger=print, start_page=None, end_page=None, manga_mode=False, cancellation_event=None):
+def download_from_api(api_url_input, logger=print, start_page=None, end_page=None, manga_mode=False, cancellation_event=None, pause_event=None):
     headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
     service, user_id, target_post_id = extract_post_info(api_url_input)
@@ -286,11 +333,19 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
         all_posts_for_manga_mode = []
         current_offset_manga = 0
         while True:
+            if pause_event and pause_event.is_set():
+                logger("   Manga mode post fetching paused...")
+                while pause_event.is_set():
+                    if cancellation_event and cancellation_event.is_set():
+                        logger("   Manga mode post fetching cancelled while paused.")
+                        break
+                    time.sleep(0.5)
+                if not (cancellation_event and cancellation_event.is_set()): logger("   Manga mode post fetching resumed.")
             if cancellation_event and cancellation_event.is_set():
                 logger("   Manga mode post fetching cancelled.")
                 break
             try:
-                posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger, cancellation_event)
+                posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger, cancellation_event, pause_event)
                 if not isinstance(posts_batch_manga, list):
                     logger(f"❌ API Error (Manga Mode): Expected list of posts, got {type(posts_batch_manga)}.")
                     break
@@ -357,6 +412,14 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
     logger(f"   Starting from page {current_page_num} (calculated offset {current_offset}).")
     while True:
+        if pause_event and pause_event.is_set():
+            logger("   Post fetching loop paused...")
+            while pause_event.is_set():
+                if cancellation_event and cancellation_event.is_set():
+                    logger("   Post fetching loop cancelled while paused.")
+                    break
+                time.sleep(0.5)
+            if not (cancellation_event and cancellation_event.is_set()): logger("   Post fetching loop resumed.")
         if cancellation_event and cancellation_event.is_set():
             logger("   Post fetching loop cancelled.")
             break
@@ -369,7 +432,7 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
             break
         try:
-            posts_batch = fetch_posts_paginated(api_base_url, headers, current_offset, logger, cancellation_event)
+            posts_batch = fetch_posts_paginated(api_base_url, headers, current_offset, logger, cancellation_event, pause_event)
             if not isinstance(posts_batch, list):
                 logger(f"❌ API Error: Expected list of posts, got {type(posts_batch)} at page {current_page_num} (offset {current_offset}).")
                 break
@@ -453,10 +516,10 @@ class PostProcessorWorker:
                  filter_character_list, emitter, # Changed signals to emitter
                  unwanted_keywords, filter_mode, skip_zip, skip_rar,
                  use_subfolders, use_post_subfolders, target_post_id_from_initial_url, custom_folder_name,
-                 compress_images, download_thumbnails, service, user_id,
+                 compress_images, download_thumbnails, service, user_id, pause_event, # Added pause_event
                  api_url_input, cancellation_event,
                  downloaded_files, downloaded_file_hashes, downloaded_files_lock, downloaded_file_hashes_lock,
-                 skip_words_list=None,
+                 skip_words_list=None, dynamic_character_filter_holder=None, # Added dynamic_character_filter_holder
                  skip_words_scope=SKIP_SCOPE_FILES,
                  show_external_links=False,
                  extract_links_only=False,
@@ -471,7 +534,8 @@ class PostProcessorWorker:
         self.post = post_data
         self.download_root = download_root
         self.known_names = known_names
-        self.filter_character_list_objects = filter_character_list if filter_character_list else []
+        self.filter_character_list_objects_initial = filter_character_list if filter_character_list else [] # Store initial
+        self.dynamic_filter_holder = dynamic_character_filter_holder # Store the holder
         self.unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else set()
         self.filter_mode = filter_mode
         self.skip_zip = skip_zip
@@ -486,6 +550,7 @@ class PostProcessorWorker:
         self.user_id = user_id
         self.api_url_input = api_url_input
         self.cancellation_event = cancellation_event
+        self.pause_event = pause_event # Store pause_event
         self.emitter = emitter # Store the emitter
         if not self.emitter:
             # This case should ideally be prevented by the caller
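Note: the two events are presumably plain threading.Event objects shared between the UI and all workers; a sketch of the wiring under that assumption:

    import threading

    pause_event = threading.Event()         # UI: set() to pause all workers, clear() to resume
    cancellation_event = threading.Event()  # UI: set() once to stop everything; workers only poll it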
@@ -533,7 +598,18 @@ class PostProcessorWorker:
     def check_cancel(self):
         return self.cancellation_event.is_set()
+    def _check_pause(self, context_message="Operation"):
+        if self.pause_event and self.pause_event.is_set():
+            self.logger(f"   {context_message} paused...")
+            while self.pause_event.is_set(): # Loop while pause_event is set
+                if self.check_cancel():
+                    self.logger(f"   {context_message} cancelled while paused.")
+                    return True # Indicates cancellation occurred
+                time.sleep(0.5)
+            if not self.check_cancel(): self.logger(f"   {context_message} resumed.")
+        return False # Not cancelled during pause
+    def _get_current_character_filters(self):
+        if self.dynamic_filter_holder:
+            return self.dynamic_filter_holder.get_filters()
+        return self.filter_character_list_objects_initial
-    def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event,
+    def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event, # skip_event is threading.Event
                               # emitter_for_file_ops, # This will be self.emitter
                               post_title="", file_index_in_post=0, num_files_in_this_post=1,
@@ -541,6 +617,22 @@ class PostProcessorWorker:
-                              manga_date_file_counter_ref=None): # Added manga_date_file_counter_ref
+                              manga_date_file_counter_ref=None,
+                              forced_filename_override=None): # New for retries
         was_original_name_kept_flag = False
         final_filename_saved_for_return = ""
+        retry_later_details = None # For storing info if retryable failure
         # target_folder_path is the base character/post folder.
+        if self._check_pause(f"File download prep for '{file_info.get('name', 'unknown file')}'"): return 0, 1, "", False, FILE_DOWNLOAD_STATUS_SKIPPED, None
-        if self.check_cancel() or (skip_event and skip_event.is_set()): return 0, 1, "", False
+        if self.check_cancel() or (skip_event and skip_event.is_set()): return 0, 1, "", False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         file_url = file_info.get('url')
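Note: _get_current_character_filters only requires the holder to expose get_filters(), so any thread-safe container works. A hypothetical minimal holder (not part of this commit):

    import threading

    class DynamicFilterHolder:
        # Lets the UI swap character filters while worker threads are running.
        def __init__(self, initial=None):
            self._lock = threading.Lock()
            self._filters = list(initial or [])

        def get_filters(self):
            with self._lock:
                return list(self._filters)  # copy: workers never see a mid-update list

        def set_filters(self, new_filters):
            with self._lock:
                self._filters = list(new_filters)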
@@ -549,84 +641,85 @@ class PostProcessorWorker:
         # This is the ideal name for the file if it were to be saved in the main target_folder_path.
         filename_to_save_in_main_path = ""
-        if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_FILES or self.skip_words_scope == SKIP_SCOPE_BOTH):
-            filename_to_check_for_skip_words = api_original_filename.lower()
-            for skip_word in self.skip_words_list:
-                if skip_word.lower() in filename_to_check_for_skip_words:
-                    self.logger(f"   -> Skip File (Keyword in Original Name '{skip_word}'): '{api_original_filename}'. Scope: {self.skip_words_scope}")
-                    return 0, 1, api_original_filename, False
-        original_filename_cleaned_base, original_ext = os.path.splitext(clean_filename(api_original_filename))
-        if not original_ext.startswith('.'): original_ext = '.' + original_ext if original_ext else ''
-        if self.manga_mode_active: # Note: duplicate_file_mode is overridden to "Delete" in main.py if manga_mode is on
-            if self.manga_filename_style == STYLE_ORIGINAL_NAME:
-                filename_to_save_in_main_path = clean_filename(api_original_filename)
-                was_original_name_kept_flag = True
-            elif self.manga_filename_style == STYLE_POST_TITLE:
-                if post_title and post_title.strip():
-                    cleaned_post_title_base = clean_filename(post_title.strip())
-                    if num_files_in_this_post > 1:
-                        if file_index_in_post == 0:
-                            filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
-                        else:
-                            filename_to_save_in_main_path = clean_filename(api_original_filename)
-                            was_original_name_kept_flag = True
-                    else:
-                        filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
-                else:
-                    filename_to_save_in_main_path = clean_filename(api_original_filename) # Fallback to original if no title
-                    self.logger(f"⚠️ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log}. Using cleaned original filename '{filename_to_save_in_main_path}'.")
-            elif self.manga_filename_style == STYLE_DATE_BASED:
-                current_thread_name = threading.current_thread().name
-                self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Manga Date Mode. Counter Ref ID: {id(manga_date_file_counter_ref)}, Value before access: {manga_date_file_counter_ref}")
-                if manga_date_file_counter_ref is not None and len(manga_date_file_counter_ref) == 2:
-                    counter_val_for_filename = -1
-                    counter_lock = manga_date_file_counter_ref[1]
-                    self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Attempting to acquire lock. Counter value before lock: {manga_date_file_counter_ref[0]}")
-                    with counter_lock:
-                        self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Lock acquired. Counter value at lock acquisition: {manga_date_file_counter_ref[0]}")
-                        counter_val_for_filename = manga_date_file_counter_ref[0]
-                        # Increment is done here, under lock, before this number is used by another thread.
-                        # This number is now "reserved" for this file.
-                        # If this file download fails, this number is "lost" (sequence will have a gap). This is acceptable.
-                        manga_date_file_counter_ref[0] += 1
-                        self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Incremented counter. New counter value: {manga_date_file_counter_ref[0]}. Filename will use: {counter_val_for_filename}")
-                    filename_to_save_in_main_path = f"{counter_val_for_filename:03d}{original_ext}"
-                    self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Lock released. Generated filename: {filename_to_save_in_main_path}")
-                else:
-                    self.logger(f"⚠️ Manga Date Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_date_file_counter_ref}")
-                    # This log line below had a typo, fixed to reflect Date Mode context
-                    filename_to_save_in_main_path = clean_filename(api_original_filename)
-                    self.logger(f"⚠️ Manga mode (Date Based Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path}' for post {original_post_id_for_log}.")
-            else:
-                self.logger(f"⚠️ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
-                filename_to_save_in_main_path = clean_filename(api_original_filename)
-            if not filename_to_save_in_main_path:
-                filename_to_save_in_main_path = f"manga_file_{original_post_id_for_log}_{file_index_in_post + 1}{original_ext}"
-                self.logger(f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path}'.")
-                was_original_name_kept_flag = False
-        else:
-            filename_to_save_in_main_path = clean_filename(api_original_filename)
-            was_original_name_kept_flag = False
-        if self.remove_from_filename_words_list and filename_to_save_in_main_path:
-            base_name_for_removal, ext_for_removal = os.path.splitext(filename_to_save_in_main_path)
-            modified_base_name = base_name_for_removal
-            for word_to_remove in self.remove_from_filename_words_list:
-                if not word_to_remove: continue
-                pattern = re.compile(re.escape(word_to_remove), re.IGNORECASE)
-                modified_base_name = pattern.sub("", modified_base_name)
-            modified_base_name = re.sub(r'[_.\s-]+', '_', modified_base_name)
-            modified_base_name = modified_base_name.strip('_')
-            if modified_base_name and modified_base_name != ext_for_removal.lstrip('.'):
-                filename_to_save_in_main_path = modified_base_name + ext_for_removal
-            else:
-                filename_to_save_in_main_path = base_name_for_removal + ext_for_removal
+        if forced_filename_override:
+            filename_to_save_in_main_path = forced_filename_override
+            self.logger(f"   Retrying with forced filename: '{filename_to_save_in_main_path}'")
+            # was_original_name_kept_flag might need to be determined based on how forced_filename_override was created
+        else:
+            if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_FILES or self.skip_words_scope == SKIP_SCOPE_BOTH):
+                filename_to_check_for_skip_words = api_original_filename.lower()
+                for skip_word in self.skip_words_list:
+                    if skip_word.lower() in filename_to_check_for_skip_words:
+                        self.logger(f"   -> Skip File (Keyword in Original Name '{skip_word}'): '{api_original_filename}'. Scope: {self.skip_words_scope}")
+                        return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
+            original_filename_cleaned_base, original_ext = os.path.splitext(clean_filename(api_original_filename))
+            if not original_ext.startswith('.'): original_ext = '.' + original_ext if original_ext else ''
+            if self.manga_mode_active: # Note: duplicate_file_mode is overridden to "Delete" in main.py if manga_mode is on
+                if self.manga_filename_style == STYLE_ORIGINAL_NAME:
+                    filename_to_save_in_main_path = clean_filename(api_original_filename)
+                    was_original_name_kept_flag = True
+                elif self.manga_filename_style == STYLE_POST_TITLE:
+                    if post_title and post_title.strip():
+                        cleaned_post_title_base = clean_filename(post_title.strip())
+                        if num_files_in_this_post > 1:
+                            if file_index_in_post == 0:
+                                filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
+                            else:
+                                filename_to_save_in_main_path = clean_filename(api_original_filename)
+                                was_original_name_kept_flag = True
+                        else:
+                            filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
+                    else:
+                        filename_to_save_in_main_path = clean_filename(api_original_filename) # Fallback to original if no title
+                        self.logger(f"⚠️ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log}. Using cleaned original filename '{filename_to_save_in_main_path}'.")
+                elif self.manga_filename_style == STYLE_DATE_BASED:
+                    current_thread_name = threading.current_thread().name
+                    # self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Manga Date Mode. Counter Ref ID: {id(manga_date_file_counter_ref)}, Value before access: {manga_date_file_counter_ref}")
+                    if manga_date_file_counter_ref is not None and len(manga_date_file_counter_ref) == 2:
+                        counter_val_for_filename = -1
+                        counter_lock = manga_date_file_counter_ref[1]
+                        # self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Attempting to acquire lock. Counter value before lock: {manga_date_file_counter_ref[0]}")
+                        with counter_lock:
+                            # self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Lock acquired. Counter value at lock acquisition: {manga_date_file_counter_ref[0]}")
+                            counter_val_for_filename = manga_date_file_counter_ref[0]
+                            manga_date_file_counter_ref[0] += 1
+                            # self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Incremented counter. New counter value: {manga_date_file_counter_ref[0]}. Filename will use: {counter_val_for_filename}")
+                        filename_to_save_in_main_path = f"{counter_val_for_filename:03d}{original_ext}"
+                        # self.logger(f"DEBUG_COUNTER [{current_thread_name}, PostID: {original_post_id_for_log}]: File '{api_original_filename}'. Lock released. Generated filename: {filename_to_save_in_main_path}")
+                    else:
+                        self.logger(f"⚠️ Manga Date Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_date_file_counter_ref}")
+                        filename_to_save_in_main_path = clean_filename(api_original_filename)
+                        self.logger(f"⚠️ Manga mode (Date Based Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path}' for post {original_post_id_for_log}.")
+                else:
+                    self.logger(f"⚠️ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
+                    filename_to_save_in_main_path = clean_filename(api_original_filename)
+                if not filename_to_save_in_main_path:
+                    filename_to_save_in_main_path = f"manga_file_{original_post_id_for_log}_{file_index_in_post + 1}{original_ext}"
+                    self.logger(f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path}'.")
+                    was_original_name_kept_flag = False
+            else:
+                filename_to_save_in_main_path = clean_filename(api_original_filename)
+                was_original_name_kept_flag = False
+            if self.remove_from_filename_words_list and filename_to_save_in_main_path:
+                base_name_for_removal, ext_for_removal = os.path.splitext(filename_to_save_in_main_path)
+                modified_base_name = base_name_for_removal
+                for word_to_remove in self.remove_from_filename_words_list:
+                    if not word_to_remove: continue
+                    pattern = re.compile(re.escape(word_to_remove), re.IGNORECASE)
+                    modified_base_name = pattern.sub("", modified_base_name)
+                modified_base_name = re.sub(r'[_.\s-]+', '_', modified_base_name)
+                modified_base_name = modified_base_name.strip('_')
+                if modified_base_name and modified_base_name != ext_for_removal.lstrip('.'):
+                    filename_to_save_in_main_path = modified_base_name + ext_for_removal
+                else:
+                    filename_to_save_in_main_path = base_name_for_removal + ext_for_removal
         if not self.download_thumbnails:
             is_img_type = is_image(api_original_filename)
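Note: the date-based counter ref is a two-element list, [next_value, lock], so a number reserved under the lock survives even when the download fails (leaving an acceptable gap). A standalone sketch with an illustrative starting value:

    import threading

    manga_date_file_counter_ref = [1, threading.Lock()]  # starting value illustrative

    def reserve_sequence_number(counter_ref):
        with counter_ref[1]:
            value = counter_ref[0]
            counter_ref[0] += 1  # reserved under the lock; a failed download simply leaves a gap
        return value

    print(f"{reserve_sequence_number(manga_date_file_counter_ref):03d}.jpg")  # -> 001.jpg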
@@ -636,46 +729,33 @@ class PostProcessorWorker:
         if self.filter_mode == 'archive':
             if not is_archive_type:
                 self.logger(f"   -> Filter Skip (Archive Mode): '{api_original_filename}' (Not an Archive).")
-                return 0, 1, api_original_filename, False
+                return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         elif self.filter_mode == 'image':
             if not is_img_type:
                 self.logger(f"   -> Filter Skip: '{api_original_filename}' (Not Image).")
-                return 0, 1, api_original_filename, False
+                return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         elif self.filter_mode == 'video':
             if not is_vid_type:
                 self.logger(f"   -> Filter Skip: '{api_original_filename}' (Not Video).")
-                return 0, 1, api_original_filename, False
+                return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         if self.skip_zip and is_zip(api_original_filename):
             self.logger(f"   -> Pref Skip: '{api_original_filename}' (ZIP).")
-            return 0, 1, api_original_filename, False
+            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         if self.skip_rar and is_rar(api_original_filename):
             self.logger(f"   -> Pref Skip: '{api_original_filename}' (RAR).")
-            return 0, 1, api_original_filename, False
+            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
-        # --- Pre-Download Duplicate Handling (Standard Mode Only - Manga mode has its own suffixing) ---
-        if not self.manga_mode_active:
-            path_in_main_folder_check = os.path.join(target_folder_path, filename_to_save_in_main_path)
-            is_duplicate_by_path = os.path.exists(path_in_main_folder_check) and \
-                                   os.path.getsize(path_in_main_folder_check) > 0
-            is_duplicate_by_session_name = False
-            with self.downloaded_files_lock:
-                if filename_to_save_in_main_path in self.downloaded_files:
-                    is_duplicate_by_session_name = True
-            if is_duplicate_by_path or is_duplicate_by_session_name:
-                reason = "Path Exists" if is_duplicate_by_path else "Session Name"
-                self.logger(f"   -> Skip Duplicate ({reason}, Pre-DL): '{filename_to_save_in_main_path}'. Skipping download.")
-                with self.downloaded_files_lock: self.downloaded_files.add(filename_to_save_in_main_path) # Mark as processed
-                return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
+        # --- Pre-Download Duplicate Handling ---
+        # Skipping based on filename before download is removed to allow suffixing for files from different posts.
+        # Hash-based skipping occurs after download.
+        # Physical path existence is handled by suffixing logic later.
         # Ensure base target folder exists (used for .part file with multipart)
         try:
             os.makedirs(target_folder_path, exist_ok=True) # For .part file
         except OSError as e:
             self.logger(f"   ❌ Critical error creating directory '{target_folder_path}': {e}. Skipping file '{api_original_filename}'.")
-            return 0, 1, api_original_filename, False
+            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None # Treat as skip
         # --- Download Attempt ---
         max_retries = 3
@@ -685,8 +765,10 @@ class PostProcessorWorker:
         file_content_bytes = None
         total_size_bytes = 0
         download_successful_flag = False
+        last_exception_for_retry_later = None
         for attempt_num_single_stream in range(max_retries + 1):
+            if self._check_pause(f"File download attempt for '{api_original_filename}'"): break
             if self.check_cancel() or (skip_event and skip_event.is_set()): break
             try:
                 if attempt_num_single_stream > 0:
@@ -704,6 +786,7 @@ class PostProcessorWorker:
                         num_parts_for_file > 1 and total_size_bytes > MIN_SIZE_FOR_MULTIPART_DOWNLOAD and
                         'bytes' in response.headers.get('Accept-Ranges', '').lower())
+                    if self._check_pause(f"Multipart decision for '{api_original_filename}'"): break # Check pause before potentially long operation
                     if attempt_multipart:
                         response.close()
                         self._emit_signal('file_download_status', False)
@@ -713,7 +796,8 @@ class PostProcessorWorker:
                         mp_success, mp_bytes, mp_hash, mp_file_handle = download_file_in_parts(
                             file_url, mp_save_path_base_for_part, total_size_bytes, num_parts_for_file, headers, api_original_filename,
                             emitter_for_multipart=self.emitter, # Pass the worker's emitter
-                            cancellation_event=self.cancellation_event, skip_event=skip_event, logger_func=self.logger
+                            cancellation_event=self.cancellation_event, skip_event=skip_event, logger_func=self.logger,
+                            pause_event=self.pause_event # Pass pause_event
                         )
                         if mp_success:
                             download_successful_flag = True
@@ -734,6 +818,7 @@ class PostProcessorWorker:
                         last_progress_time = time.time()
                         for chunk in response.iter_content(chunk_size=1 * 1024 * 1024):
+                            if self._check_pause(f"Chunk download for '{api_original_filename}'"): break
                             if self.check_cancel() or (skip_event and skip_event.is_set()): break
                             if chunk:
                                 file_content_buffer.write(chunk); md5_hasher.update(chunk)
@@ -742,7 +827,7 @@ class PostProcessorWorker:
                                 self._emit_signal('file_progress', api_original_filename, (current_attempt_downloaded_bytes, total_size_bytes))
                                 last_progress_time = time.time()
-                        if self.check_cancel() or (skip_event and skip_event.is_set()):
+                        if self.check_cancel() or (skip_event and skip_event.is_set()) or (self.pause_event and self.pause_event.is_set()):
                             if file_content_buffer: file_content_buffer.close(); break
                         if current_attempt_downloaded_bytes > 0 or (total_size_bytes == 0 and response.status_code == 200):
@@ -756,9 +841,11 @@ class PostProcessorWorker:
             except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, http.client.IncompleteRead) as e:
                 self.logger(f"   ❌ Download Error (Retryable): {api_original_filename}. Error: {e}")
+                last_exception_for_retry_later = e # Store this specific exception
                 if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close()
             except requests.exceptions.RequestException as e:
                 self.logger(f"   ❌ Download Error (Non-Retryable): {api_original_filename}. Error: {e}")
+                last_exception_for_retry_later = e # Store this too
                 if 'file_content_buffer' in locals() and file_content_buffer: file_content_buffer.close(); break
             except Exception as e:
                 self.logger(f"   ❌ Unexpected Download Error: {api_original_filename}: {e}\n{traceback.format_exc(limit=2)}")
@@ -770,16 +857,34 @@ class PostProcessorWorker:
         final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
         self._emit_signal('file_progress', api_original_filename, (downloaded_size_bytes, final_total_for_progress))
-        if self.check_cancel() or (skip_event and skip_event.is_set()):
+        if self.check_cancel() or (skip_event and skip_event.is_set()) or (self.pause_event and self.pause_event.is_set() and not download_successful_flag):
             self.logger(f"   ⚠️ Download process interrupted for {api_original_filename}.")
             if file_content_bytes: file_content_bytes.close()
-            return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
+            return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None
         if not download_successful_flag:
             self.logger(f"❌ Download failed for '{api_original_filename}' after {max_retries + 1} attempts.")
             if file_content_bytes: file_content_bytes.close()
-            return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
+            # Check if this failure is one we want to mark for later retry
+            if isinstance(last_exception_for_retry_later, http.client.IncompleteRead):
+                self.logger(f"   Marking '{api_original_filename}' for potential retry later due to IncompleteRead.")
+                retry_later_details = {
+                    'file_info': file_info,
+                    'target_folder_path': target_folder_path, # This is the base character/post folder
+                    'headers': headers, # Original headers
+                    'original_post_id_for_log': original_post_id_for_log,
+                    'post_title': post_title,
+                    'file_index_in_post': file_index_in_post,
+                    'num_files_in_this_post': num_files_in_this_post,
+                    'forced_filename_override': filename_to_save_in_main_path, # The name it was trying to save as
+                    'manga_mode_active_for_file': self.manga_mode_active, # Store context
+                    'manga_filename_style_for_file': self.manga_filename_style, # Store context
+                }
+                return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER, retry_later_details
+            return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None # Generic failure
+        if self._check_pause(f"Post-download hash check for '{api_original_filename}'"): return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None
         # --- Universal Post-Download Hash Check ---
         with self.downloaded_file_hashes_lock:
             if calculated_file_hash in self.downloaded_file_hashes:
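Note: _download_single_file now returns a 6-tuple ending in a status constant and an optional retry payload. A hypothetical caller-side consumption:

    dl, skipped, saved_name, kept_original, status, retry_details = self._download_single_file(
        file_info, target_folder_path, headers, post_id, skip_event)
    if status == FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER:
        retryable_failures_this_post.append(retry_details)  # retried later; forced_filename_override is preserved
    elif status == FILE_DOWNLOAD_STATUS_SUCCESS:
        total_downloaded_this_post += dl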
@@ -791,8 +896,8 @@ class PostProcessorWorker:
                 part_file_to_remove = os.path.join(target_folder_path, filename_to_save_in_main_path + ".part")
                 if os.path.exists(part_file_to_remove):
                     try: os.remove(part_file_to_remove);
-                    except OSError: self.logger(f"   -> Failed to remove .part file for hash duplicate: {part_file_to_remove}")
+                    except OSError: self.logger(f"   -> Failed to remove .part file for hash duplicate: {part_file_to_remove}") # type: ignore
-                return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag
+                return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None
         # --- Determine Save Location and Final Filename ---
         effective_save_folder = target_folder_path # Default: main character/post folder
@@ -811,7 +916,7 @@ class PostProcessorWorker:
             if not isinstance(file_content_bytes, BytesIO):
                 part_file_to_remove = os.path.join(target_folder_path, filename_to_save_in_main_path + ".part")
                 if os.path.exists(part_file_to_remove): os.remove(part_file_to_remove)
-            return 0, 1, api_original_filename, False
+            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
         # --- Image Compression ---
         # This operates on file_content_bytes (which is BytesIO or a file handle from multipart)
@@ -823,6 +928,7 @@ class PostProcessorWorker:
         is_img_for_compress_check = is_image(api_original_filename)
         if is_img_for_compress_check and self.compress_images and Image and downloaded_size_bytes > (1.5 * 1024 * 1024):
             self.logger(f"   Compressing '{api_original_filename}' ({downloaded_size_bytes / (1024*1024):.2f} MB)...")
+            if self._check_pause(f"Image compression for '{api_original_filename}'"): return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None # Allow pause before compression
             try:
                 file_content_bytes.seek(0)
                 with Image.open(file_content_bytes) as img_obj:
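Note: the compression branch re-encodes oversized images in memory via Pillow. A self-contained sketch of the idea (output format and quality are assumptions, not the commit's settings):

    from io import BytesIO
    from PIL import Image

    def recompress_image(buf: BytesIO, quality: int = 80) -> BytesIO:
        buf.seek(0)
        out = BytesIO()
        with Image.open(buf) as img:
            img.save(out, format="WebP", quality=quality)  # illustrative choice
        out.seek(0)
        return out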
@@ -860,7 +966,7 @@ class PostProcessorWorker:
             if final_filename_on_disk != filename_after_compression: # Log if a suffix was applied
                 self.logger(f"   Applied numeric suffix in '{os.path.basename(effective_save_folder)}': '{final_filename_on_disk}' (was '{filename_after_compression}')")
             # else: for STYLE_DATE_BASED, final_filename_on_disk remains filename_after_compression.
+        if self._check_pause(f"File saving for '{final_filename_on_disk}'"): return 0, 1, final_filename_on_disk, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None
         # --- Save File ---
         final_save_path = os.path.join(effective_save_folder, final_filename_on_disk)
@@ -893,14 +999,14 @@ class PostProcessorWorker:
                 final_filename_saved_for_return = final_filename_on_disk
                 self.logger(f"✅ Saved: '{final_filename_saved_for_return}' (from '{api_original_filename}', {downloaded_size_bytes / (1024*1024):.2f} MB) in '{os.path.basename(effective_save_folder)}'")
                 # Session-wide base name tracking removed.
-                time.sleep(0.05)
+                time.sleep(0.05) # Brief pause after successful save
-                return 1, 0, final_filename_saved_for_return, was_original_name_kept_flag
+                return 1, 0, final_filename_saved_for_return, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SUCCESS, None
             except Exception as save_err:
                 self.logger(f"❌ Save Fail for '{final_filename_on_disk}': {save_err}")
                 if os.path.exists(final_save_path):
                     try: os.remove(final_save_path);
                     except OSError: self.logger(f"   -> Failed to remove partially saved file: {final_save_path}")
-                return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag
+                return 0, 1, final_filename_saved_for_return, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_SKIPPED, None # Treat save fail as skip
         finally:
             # Ensure all handles are closed
             if data_to_write_after_compression and hasattr(data_to_write_after_compression, 'close'):
@@ -914,9 +1020,15 @@ class PostProcessorWorker:
     def process(self):
-        if self.check_cancel(): return 0, 0, []
+        if self._check_pause(f"Post processing for ID {self.post.get('id', 'N/A')}"): return 0, 0, [], []
+        if self.check_cancel(): return 0, 0, [], []
+        # Get the potentially updated character filters at the start of processing this post
+        current_character_filters = self._get_current_character_filters()
+        # self.logger(f"DEBUG: Post {post_id}, Worker using filters: {[(f['name'], f['aliases']) for f in current_character_filters]}")
         kept_original_filenames_for_log = []
+        retryable_failures_this_post = [] # New list to store retryable failure details
         total_downloaded_this_post = 0
         total_skipped_this_post = 0
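Note: process() correspondingly grows from a 3-tuple to a 4-tuple; a hypothetical caller that drains the new retry list:

    downloaded, skipped, kept_names, retryable = worker.process()
    for details in retryable:
        retry_queue.append(details)  # each dict carries the context _download_single_file needs to retry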
@@ -946,10 +1058,11 @@ class PostProcessorWorker:
         char_filter_that_matched_file_in_comment_scope = None
         char_filter_that_matched_comment = None
-        if self.filter_character_list_objects and \
+        if current_character_filters and \
            (self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH):
             # self.logger(f"   [Debug Title Match] Checking post title '{post_title}' against {len(self.filter_character_list_objects)} filter objects. Scope: {self.char_filter_scope}")
+            if self._check_pause(f"Character title filter for post {post_id}"): return 0, num_potential_files_in_post, [], []
-            for idx, filter_item_obj in enumerate(self.filter_character_list_objects):
+            for idx, filter_item_obj in enumerate(current_character_filters):
                 if self.check_cancel(): break
                 # self.logger(f"   [Debug Title Match] Filter obj #{idx}: {filter_item_obj}")
                 terms_to_check_for_title = list(filter_item_obj["aliases"])
@@ -992,14 +1105,14 @@ class PostProcessorWorker:
         # --- End population of all_files_from_post_api_for_char_check ---
-        if self.filter_character_list_objects and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
+        if current_character_filters and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
             self.logger(f"   [Char Scope: Comments] Phase 1: Checking post files for matches before comments for post ID '{post_id}'.")
+            if self._check_pause(f"File check (comments scope) for post {post_id}"): return 0, num_potential_files_in_post, [], []
             for file_info_item in all_files_from_post_api_for_char_check: # Use the pre-populated list of file names
                 if self.check_cancel(): break
                 current_api_original_filename_for_check = file_info_item.get('_original_name_for_log')
                 if not current_api_original_filename_for_check: continue
-                for filter_item_obj in self.filter_character_list_objects:
+                for filter_item_obj in current_character_filters:
                     terms_to_check = list(filter_item_obj["aliases"])
                     if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check:
                         terms_to_check.append(filter_item_obj["name"])
@@ -1014,8 +1127,9 @@ class PostProcessorWorker:
                     if post_is_candidate_by_file_char_match_in_comment_scope: break
             self.logger(f"   [Char Scope: Comments] Phase 1 Result: post_is_candidate_by_file_char_match_in_comment_scope = {post_is_candidate_by_file_char_match_in_comment_scope}")
-        if self.filter_character_list_objects and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
+        if current_character_filters and self.char_filter_scope == CHAR_SCOPE_COMMENTS:
             if not post_is_candidate_by_file_char_match_in_comment_scope:
+                if self._check_pause(f"Comment check for post {post_id}"): return 0, num_potential_files_in_post, [], []
                 self.logger(f"   [Char Scope: Comments] Phase 2: No file match found. Checking post comments for post ID '{post_id}'.")
                 try:
                     parsed_input_url_for_comments = urlparse(self.api_url_input)
@@ -1026,7 +1140,7 @@ class PostProcessorWorker:
                     comments_data = fetch_post_comments(
                         api_domain_for_comments, self.service, self.user_id, post_id,
-                        headers, self.logger, self.cancellation_event
+                        headers, self.logger, self.cancellation_event, self.pause_event # Pass pause_event
                     )
                     if comments_data:
                         self.logger(f"   Fetched {len(comments_data)} comments for post {post_id}.")
@@ -1038,7 +1152,7 @@ class PostProcessorWorker:
                             cleaned_comment_text = strip_html_tags(raw_comment_content)
                             if not cleaned_comment_text.strip(): continue
-                            for filter_item_obj in self.filter_character_list_objects:
+                            for filter_item_obj in current_character_filters:
                                 terms_to_check_comment = list(filter_item_obj["aliases"])
                                 if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_comment:
                                     terms_to_check_comment.append(filter_item_obj["name"])
@@ -1064,20 +1178,21 @@ class PostProcessorWorker:
                 self.logger(f"   [Char Scope: Comments] Phase 2: Skipped comment check for post ID '{post_id}' because a file match already made it a candidate.")
         # --- Skip Post Logic based on Title or Comment Scope (if filters are active) ---
-        if self.filter_character_list_objects:
+        if current_character_filters: # Check if any filters are defined
             if self.char_filter_scope == CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match:
                 self.logger(f"   -> Skip Post (Scope: Title - No Char Match): Title '{post_title[:50]}' does not match character filters.")
                 self._emit_signal('missed_character_post', post_title, "No title match for character filter")
-                return 0, num_potential_files_in_post, []
+                return 0, num_potential_files_in_post, [], []
             if self.char_filter_scope == CHAR_SCOPE_COMMENTS and \
                not post_is_candidate_by_file_char_match_in_comment_scope and \
                not post_is_candidate_by_comment_char_match: # MODIFIED: Check both file and comment match flags
                 self.logger(f"   -> Skip Post (Scope: Comments - No Char Match in Comments): Post ID '{post_id}', Title '{post_title[:50]}...'")
-                if self.signals and hasattr(self.signals, 'missed_character_post_signal'):
+                if self.emitter and hasattr(self.emitter, 'missed_character_post_signal'): # Check emitter
                     self._emit_signal('missed_character_post', post_title, "No character match in files or comments (Comments scope)")
-                return 0, num_potential_files_in_post, []
+                return 0, num_potential_files_in_post, [], []
         if self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_POSTS or self.skip_words_scope == SKIP_SCOPE_BOTH):
+            if self._check_pause(f"Skip words (post title) for post {post_id}"): return 0, num_potential_files_in_post, [], []
             post_title_lower = post_title.lower()
             for skip_word in self.skip_words_list:
                 if skip_word.lower() in post_title_lower:
@@ -1085,14 +1200,14 @@ class PostProcessorWorker:
                     # If you want these in the "Missed Character Log" too, you'd add a signal emit here.
                     # For now, sticking to the request for character filter misses.
                     self.logger(f"   -> Skip Post (Keyword in Title '{skip_word}'): '{post_title[:50]}...'. Scope: {self.skip_words_scope}")
-                    return 0, num_potential_files_in_post, []
+                    return 0, num_potential_files_in_post, [], []
-        if not self.extract_links_only and self.manga_mode_active and self.filter_character_list_objects and \
+        if not self.extract_links_only and self.manga_mode_active and current_character_filters and \
            (self.char_filter_scope == CHAR_SCOPE_TITLE or self.char_filter_scope == CHAR_SCOPE_BOTH) and \
            not post_is_candidate_by_title_char_match:
             self.logger(f"   -> Skip Post (Manga Mode with Title/Both Scope - No Title Char Match): Title '{post_title[:50]}' doesn't match filters.")
             self._emit_signal('missed_character_post', post_title, "Manga Mode: No title match for character filter (Title/Both scope)")
-            return 0, num_potential_files_in_post, []
+            return 0, num_potential_files_in_post, [], []
         if not isinstance(post_attachments, list):
             self.logger(f"⚠️ Corrupt attachment data for post {post_id} (expected list, got {type(post_attachments)}). Skipping attachments.")
@@ -1100,7 +1215,8 @@ class PostProcessorWorker:
         base_folder_names_for_post_content = []
         if not self.extract_links_only and self.use_subfolders:
+            if self._check_pause(f"Subfolder determination for post {post_id}"): return 0, num_potential_files_in_post, [], []
-            primary_char_filter_for_folder = None
+            primary_char_filter_for_folder = None # type: ignore
             log_reason_for_folder = ""
             if self.char_filter_scope == CHAR_SCOPE_COMMENTS and char_filter_that_matched_comment:
@@ -1116,13 +1232,16 @@ class PostProcessorWorker:
log_reason_for_folder = "Matched char filter in title" log_reason_for_folder = "Matched char filter in title"
# If scope is FILES, primary_char_filter_for_folder will be None here. Folder determined per file. # If scope is FILES, primary_char_filter_for_folder will be None here. Folder determined per file.
# When determining base_folder_names_for_post_content without a direct character filter match:
if primary_char_filter_for_folder:
base_folder_names_for_post_content = [clean_folder_name(primary_char_filter_for_folder["name"])]
self.logger(f"   Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
elif not current_character_filters: # No char filters defined, use generic logic
derived_folders = match_folders_from_title(post_title, self.known_names, self.unwanted_keywords)
if derived_folders:
# Use the live KNOWN_NAMES from downloader_utils for generic title parsing
# self.known_names is a snapshot from when the worker was created.
base_folder_names_for_post_content.extend(match_folders_from_title(post_title, KNOWN_NAMES, self.unwanted_keywords))
else:
base_folder_names_for_post_content.append(extract_folder_name_from_title(post_title, self.unwanted_keywords))
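# Folder-name precedence (sketch): explicit character-filter match, then a title
# match against the live KNOWN_NAMES list, then a generic name extracted from the
# post title. KNOWN_NAMES is read live here, presumably so names added while a
# run is in progress affect folder derivation immediately, unlike the snapshot.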
if not base_folder_names_for_post_content or not base_folder_names_for_post_content[0]:
@@ -1132,14 +1251,16 @@ class PostProcessorWorker:
# The folder will be determined by char_filter_info_that_matched_file later.
if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, [], []
for folder_name_to_check in base_folder_names_for_post_content: # type: ignore
if not folder_name_to_check: continue
if any(skip_word.lower() in folder_name_to_check.lower() for skip_word in self.skip_words_list):
matched_skip = next((sw for sw in self.skip_words_list if sw.lower() in folder_name_to_check.lower()), "unknown_skip_word")
self.logger(f"   -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check}' contains '{matched_skip}'.")
return 0, num_potential_files_in_post, [], []
if (self.show_external_links or self.extract_links_only) and post_content_html:
if self._check_pause(f"External link extraction for post {post_id}"): return 0, num_potential_files_in_post, [], []
try:
unique_links_data = {}
for match in link_pattern.finditer(post_content_html):
@@ -1170,7 +1291,7 @@ class PostProcessorWorker:
if self.extract_links_only:
self.logger(f"   Extract Links Only mode: Finished processing post {post_id} for links.")
return 0, 0, [], []
all_files_from_post_api = []
api_file_domain = urlparse(self.api_url_input).netloc
@@ -1208,7 +1329,7 @@ class PostProcessorWorker:
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo['_is_thumbnail']]
if not all_files_from_post_api:
self.logger(f"   -> No image thumbnails found for post {post_id} in thumbnail-only mode.")
return 0, 0, [], []
# Sort files within the post by original name if in Date Based manga mode
if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED:
@@ -1223,7 +1344,7 @@ class PostProcessorWorker:
if not all_files_from_post_api:
self.logger(f"   No files found to download for post {post_id}.")
return 0, 0, [], []
files_to_download_info_list = []
processed_original_filenames_in_this_post = set()
@@ -1239,7 +1360,7 @@ class PostProcessorWorker:
if not files_to_download_info_list:
self.logger(f"   All files for post {post_id} were duplicate original names or skipped earlier.")
return 0, total_skipped_this_post, [], []
num_files_in_this_post_for_naming = len(files_to_download_info_list)
@@ -1249,6 +1370,7 @@ class PostProcessorWorker:
with ThreadPoolExecutor(max_workers=self.num_file_threads, thread_name_prefix=f'P{post_id}File_') as file_pool:
futures_list = []
for file_idx, file_info_to_dl in enumerate(files_to_download_info_list):
if self._check_pause(f"File processing loop for post {post_id}, file {file_idx}"): break
if self.check_cancel(): break
current_api_original_filename = file_info_to_dl.get('_original_name_for_log')
@@ -1256,11 +1378,11 @@ class PostProcessorWorker:
file_is_candidate_by_char_filter_scope = False
char_filter_info_that_matched_file = None
if not current_character_filters:
file_is_candidate_by_char_filter_scope = True
else:
if self.char_filter_scope == CHAR_SCOPE_FILES:
for filter_item_obj in current_character_filters:
terms_to_check_for_file = list(filter_item_obj["aliases"])
if filter_item_obj["is_group"] and filter_item_obj["name"] not in terms_to_check_for_file:
terms_to_check_for_file.append(filter_item_obj["name"])
@@ -1285,7 +1407,7 @@ class PostProcessorWorker:
self.logger(f" File '{current_api_original_filename}' is candidate because post title matched. Scope: Both (Title part).") self.logger(f" File '{current_api_original_filename}' is candidate because post title matched. Scope: Both (Title part).")
else: else:
# This part is for the "File" part of "Both" scope # This part is for the "File" part of "Both" scope
for filter_item_obj_both_file in self.filter_character_list_objects: for filter_item_obj_both_file in current_character_filters:
terms_to_check_for_file_both = list(filter_item_obj_both_file["aliases"]) terms_to_check_for_file_both = list(filter_item_obj_both_file["aliases"])
if filter_item_obj_both_file["is_group"] and filter_item_obj_both_file["name"] not in terms_to_check_for_file_both: if filter_item_obj_both_file["is_group"] and filter_item_obj_both_file["name"] not in terms_to_check_for_file_both:
terms_to_check_for_file_both.append(filter_item_obj_both_file["name"]) terms_to_check_for_file_both.append(filter_item_obj_both_file["name"])
@@ -1295,7 +1417,7 @@ class PostProcessorWorker:
for term_to_match in unique_terms_for_file_both_check:
if is_filename_match_for_character(current_api_original_filename, term_to_match):
file_is_candidate_by_char_filter_scope = True
char_filter_info_that_matched_file = filter_item_obj_both_file
self.logger(f"   File '{current_api_original_filename}' matches char filter term '{term_to_match}' (from '{filter_item_obj_both_file['name']}'). Scope: Both (File part).")
break
if file_is_candidate_by_char_filter_scope: break
@@ -1359,11 +1481,13 @@ class PostProcessorWorker:
f_to_cancel.cancel()
break
try:
dl_count, skip_count, actual_filename_saved, original_kept_flag, status, retry_details = future.result()
total_downloaded_this_post += dl_count
total_skipped_this_post += skip_count
if original_kept_flag and dl_count > 0 and actual_filename_saved:
kept_original_filenames_for_log.append(actual_filename_saved)
if status == FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER and retry_details:
retryable_failures_this_post.append(retry_details)
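# Note: the exact shape of each retry_details dict is defined by the file-download
# helper; it is assumed to carry enough context (file info, target folder, etc.)
# for the UI to re-queue the download when retryable_file_failed_signal fires.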
except CancelledError:
self.logger(f"   File download task for post {post_id} was cancelled.")
total_skipped_this_post += 1
@@ -1377,22 +1501,23 @@ class PostProcessorWorker:
if self.check_cancel(): self.logger(f"   Post {post_id} processing interrupted/cancelled.");
else: self.logger(f"   Post {post_id} Summary: Downloaded={total_downloaded_this_post}, Skipped Files={total_skipped_this_post}")
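# Return contract (assumed naming): (downloaded_count, skipped_count,
# kept_original_filenames, retryable_failures); DownloadThread.run() unpacks
# exactly these four values from process().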
return total_downloaded_this_post, total_skipped_this_post, kept_original_filenames_for_log, retryable_failures_this_post
class DownloadThread(QThread):
progress_signal = pyqtSignal(str)
add_character_prompt_signal = pyqtSignal(str)
file_download_status_signal = pyqtSignal(bool)
finished_signal = pyqtSignal(int, int, bool, list)
external_link_signal = pyqtSignal(str, str, str, str)
file_progress_signal = pyqtSignal(str, object)
retryable_file_failed_signal = pyqtSignal(list) # New: list of retry_details dicts
missed_character_post_signal = pyqtSignal(str, str) # New: post_title, reason
def __init__(self, api_url_input, output_dir, known_names_copy,
cancellation_event,
pause_event, filter_character_list=None, dynamic_character_filter_holder=None, # Added pause_event and holder
filter_mode='all', skip_zip=True, skip_rar=True,
use_subfolders=True, use_post_subfolders=False, custom_folder_name=None, compress_images=False,
download_thumbnails=False, service=None, user_id=None,
@@ -1408,7 +1533,7 @@ class DownloadThread(QThread):
manga_mode_active=False,
unwanted_keywords=None,
manga_filename_style=STYLE_POST_TITLE,
char_filter_scope=CHAR_SCOPE_FILES,
remove_from_filename_words_list=None,
allow_multipart_download=True,
manga_date_file_counter_ref=None, # New parameter
@@ -1418,9 +1543,11 @@ class DownloadThread(QThread):
self.output_dir = output_dir
self.known_names = list(known_names_copy)
self.cancellation_event = cancellation_event
self.pause_event = pause_event # Store pause_event
self.skip_current_file_flag = skip_current_file_flag
self.initial_target_post_id = target_post_id_from_initial_url
self.filter_character_list_objects_initial = filter_character_list if filter_character_list else [] # Store initial
self.dynamic_filter_holder = dynamic_character_filter_holder # Store the holder
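# Sketch of the intended contract: the holder is assumed to expose the
# currently-active character filters at process time, so filters edited in the
# UI mid-run reach workers without recreating this thread.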
self.filter_mode = filter_mode
self.skip_zip = skip_zip
self.skip_rar = skip_rar
@@ -1453,7 +1580,8 @@ class DownloadThread(QThread):
self.char_filter_scope = char_filter_scope
self.remove_from_filename_words_list = remove_from_filename_words_list
self.allow_multipart_download = allow_multipart_download
self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread
# self.manga_date_scan_dir = manga_date_scan_dir # Store scan directory
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
self.compress_images = False
@@ -1464,6 +1592,16 @@ class DownloadThread(QThread):
def isInterruptionRequested(self):
return self.cancellation_event.is_set() or super().isInterruptionRequested()
def _check_pause_self(self, context_message="DownloadThread operation"):
if self.pause_event and self.pause_event.is_set():
self.logger(f" {context_message} paused...")
while self.pause_event.is_set():
if self.isInterruptionRequested():
self.logger(f" {context_message} cancelled while paused.")
return True # Indicates cancellation occurred
time.sleep(0.5)
if not self.isInterruptionRequested(): self.logger(f" {context_message} resumed.")
return False
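# Usage sketch: call sites treat a True result as "cancelled while paused", e.g.
# if self._check_pause_self("Post batch processing"): was_process_cancelled = True; break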
def skip_file(self):
if self.isRunning() and self.skip_current_file_flag:
@@ -1478,6 +1616,33 @@ class DownloadThread(QThread):
grand_list_of_kept_original_filenames = []
was_process_cancelled = False
# Initialize manga_date_file_counter_ref if needed (moved from main.py)
# This is now done within the DownloadThread's run method.
current_manga_date_file_counter_ref = self.manga_date_file_counter_ref
if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED and \
not self.extract_links_only and current_manga_date_file_counter_ref is None: # Check if it needs calculation
# series_scan_directory calculation logic (simplified for direct use here)
series_scan_dir = self.output_dir
if self.use_subfolders:
if self.filter_character_list_objects_initial and self.filter_character_list_objects_initial[0] and self.filter_character_list_objects_initial[0].get("name"):
series_folder_name = clean_folder_name(self.filter_character_list_objects_initial[0]["name"])
series_scan_dir = os.path.join(series_scan_dir, series_folder_name)
elif self.service and self.user_id:
creator_based_folder_name = clean_folder_name(self.user_id)
series_scan_dir = os.path.join(series_scan_dir, creator_based_folder_name)
highest_num = 0
if os.path.isdir(series_scan_dir):
self.logger(f" [Thread] Manga Date Mode: Scanning for existing files in '{series_scan_dir}'...")
for dirpath, _, filenames_in_dir in os.walk(series_scan_dir):
for filename_to_check in filenames_in_dir:
base_name_no_ext = os.path.splitext(filename_to_check)[0]
match = re.match(r"(\d{3,})", base_name_no_ext)
if match: highest_num = max(highest_num, int(match.group(1)))
current_manga_date_file_counter_ref = [highest_num + 1, threading.Lock()]
self.logger(f" [Thread] Manga Date Mode: Initialized counter at {current_manga_date_file_counter_ref[0]}.")
# This DownloadThread (being a QThread) will use its own signals object
# to communicate with PostProcessorWorker if needed.
worker_signals_obj = PostProcessorSignals()
@@ -1495,19 +1660,23 @@ class DownloadThread(QThread):
start_page=self.start_page,
end_page=self.end_page,
manga_mode=self.manga_mode_active,
cancellation_event=self.cancellation_event,
pause_event=self.pause_event # Pass pause_event
)
for posts_batch_data in post_generator:
if self._check_pause_self("Post batch processing"): was_process_cancelled = True; break
if self.isInterruptionRequested(): was_process_cancelled = True; break
for individual_post_data in posts_batch_data:
if self._check_pause_self(f"Individual post processing for {individual_post_data.get('id', 'N/A')}"): was_process_cancelled = True; break
if self.isInterruptionRequested(): was_process_cancelled = True; break
post_processing_worker = PostProcessorWorker(
post_data=individual_post_data,
download_root=self.output_dir,
known_names=self.known_names,
filter_character_list=self.filter_character_list_objects_initial, # Pass initial
dynamic_character_filter_holder=self.dynamic_filter_holder, # Pass the holder
unwanted_keywords=self.unwanted_keywords,
filter_mode=self.filter_mode,
skip_zip=self.skip_zip, skip_rar=self.skip_rar,
@@ -1517,6 +1686,7 @@ class DownloadThread(QThread):
compress_images=self.compress_images, download_thumbnails=self.download_thumbnails,
service=self.service, user_id=self.user_id,
api_url_input=self.api_url_input,
pause_event=self.pause_event, # Pass pause_event to worker
cancellation_event=self.cancellation_event, # emitter is PostProcessorSignals for single-thread
emitter=worker_signals_obj, # Pass the signals object as the emitter
downloaded_files=self.downloaded_files,
@@ -1534,14 +1704,16 @@ class DownloadThread(QThread):
char_filter_scope=self.char_filter_scope,
remove_from_filename_words_list=self.remove_from_filename_words_list,
allow_multipart_download=self.allow_multipart_download,
manga_date_file_counter_ref=current_manga_date_file_counter_ref, # Pass the calculated or passed-in ref
)
try:
dl_count, skip_count, kept_originals_this_post, retryable_failures = post_processing_worker.process()
grand_total_downloaded_files += dl_count
grand_total_skipped_files += skip_count
if kept_originals_this_post:
grand_list_of_kept_original_filenames.extend(kept_originals_this_post)
if retryable_failures:
self.retryable_file_failed_signal.emit(retryable_failures)
except Exception as proc_err:
post_id_for_err = individual_post_data.get('id', 'N/A')
self.logger(f"❌ Error processing post {post_id_for_err} in DownloadThread: {proc_err}")
@@ -1572,6 +1744,7 @@ class DownloadThread(QThread):
worker_signals_obj.external_link_signal.disconnect(self.external_link_signal)
worker_signals_obj.file_progress_signal.disconnect(self.file_progress_signal)
worker_signals_obj.missed_character_post_signal.disconnect(self.missed_character_post_signal)
# No need to disconnect retryable_file_failed_signal from worker_signals_obj as it's not on it
except (TypeError, RuntimeError) as e:
self.logger(f"   Note during DownloadThread signal disconnection: {e}")

main.py

File diff suppressed because it is too large

View File

@@ -13,7 +13,7 @@ DOWNLOAD_CHUNK_SIZE_ITER = 1024 * 256 # 256KB for iter_content within a chunk d
def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte, headers,
part_num, total_parts, progress_data, cancellation_event, skip_event, pause_event, global_emit_time_ref, # Added pause_event and global_emit_time_ref
logger_func, emitter=None, api_original_filename=None): # Renamed logger, signals to emitter
"""Downloads a single chunk of a file and writes it to the temp file."""
if cancellation_event and cancellation_event.is_set():
@@ -23,6 +23,15 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Skip event triggered before start.") logger_func(f" [Chunk {part_num + 1}/{total_parts}] Skip event triggered before start.")
return 0, False return 0, False
if pause_event and pause_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Download paused before start...")
while pause_event.is_set():
if cancellation_event and cancellation_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Download cancelled while paused.")
return 0, False
time.sleep(0.2) # Shorter sleep for responsive resume
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Download resumed.")
chunk_headers = headers.copy()
# end_byte can be -1 for 0-byte files, meaning download from start_byte to end of file (which is start_byte itself)
if end_byte != -1 : # For 0-byte files, end_byte might be -1, Range header should not be set or be 0-0
@@ -38,7 +47,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
bytes_this_chunk = 0
# last_progress_emit_time_for_chunk = time.time() # Replaced by global_emit_time_ref logic
last_speed_calc_time = time.time()
bytes_at_last_speed_calc = 0
@@ -49,6 +58,14 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
if skip_event and skip_event.is_set():
logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Skip event during retry loop.")
return bytes_this_chunk, False
if pause_event and pause_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Paused during retry loop...")
while pause_event.is_set():
if cancellation_event and cancellation_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Cancelled while paused in retry loop.")
return bytes_this_chunk, False
time.sleep(0.2)
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Resumed from retry loop pause.")
try:
if attempt > 0:
@@ -82,6 +99,14 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
if skip_event and skip_event.is_set():
logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Skip event during data iteration.")
return bytes_this_chunk, False
if pause_event and pause_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Paused during data iteration...")
while pause_event.is_set():
if cancellation_event and cancellation_event.is_set():
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Cancelled while paused in data iteration.")
return bytes_this_chunk, False
time.sleep(0.2)
logger_func(f" [Chunk {part_num + 1}/{total_parts}] Resumed from data iteration pause.")
if data_segment:
f.write(data_segment)
bytes_this_chunk += len(data_segment)
@@ -101,17 +126,17 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
last_speed_calc_time = current_time
bytes_at_last_speed_calc = bytes_this_chunk
# Throttle emissions globally for this file download
if emitter and (current_time - global_emit_time_ref[0] > 0.25): # Max ~4Hz for the whole file
global_emit_time_ref[0] = current_time # Update shared last emit time
# Prepare and emit the status_list_copy
status_list_copy = [dict(s) for s in progress_data['chunks_status']] # Make a deep enough copy
if isinstance(emitter, queue.Queue):
emitter.put({'type': 'file_progress', 'payload': (api_original_filename, status_list_copy)})
elif hasattr(emitter, 'file_progress_signal'): # PostProcessorSignals-like
# Ensure we read the latest total downloaded from progress_data
emitter.file_progress_signal.emit(api_original_filename, status_list_copy)
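# Net effect: progress emissions are capped at roughly 4 per second for the
# whole file, shared across all chunk threads, rather than scaling with the
# number of active chunks and flooding the UI event loop.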
return bytes_this_chunk, True
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, http.client.IncompleteRead) as e:
@@ -134,7 +159,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
def download_file_in_parts(file_url, save_path, total_size, num_parts, headers, api_original_filename,
emitter_for_multipart, cancellation_event, skip_event, logger_func, pause_event): # Added pause_event
"""
Downloads a file in multiple parts concurrently.
Returns: (download_successful_flag, downloaded_bytes, calculated_file_hash, temp_file_handle_or_None)
@@ -181,7 +206,8 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
{'id': i, 'downloaded': 0, 'total': chunk_actual_sizes[i] if i < len(chunk_actual_sizes) else 0, 'active': False, 'speed_bps': 0.0}
for i in range(num_parts)
],
'lock': threading.Lock(),
'last_global_emit_time': [time.time()] # Shared mutable for global throttling timestamp
}
chunk_futures = []
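# A one-element list is used for 'last_global_emit_time' (rather than a bare
# float) so every chunk thread mutates the same timestamp object in place; a
# plain float would be rebound locally in each thread and defeat the throttle.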
@@ -194,8 +220,8 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
chunk_futures.append(chunk_pool.submit(
_download_individual_chunk, chunk_url=file_url, temp_file_path=temp_file_path,
start_byte=start, end_byte=end, headers=headers, part_num=i, total_parts=num_parts,
progress_data=progress_data, cancellation_event=cancellation_event, skip_event=skip_event, global_emit_time_ref=progress_data['last_global_emit_time'],
pause_event=pause_event, logger_func=logger_func, emitter=emitter_for_multipart, # Pass pause_event and emitter
api_original_filename=api_original_filename
))

View File

@@ -1,4 +1,10 @@
<h1 align="center">Kemono Downloader v3.4.0</h1>
<div align="center">
<img src="https://github.com/Yuvi9587/Kemono-Downloader/blob/main/Read.png" alt="Kemono Downloader"/>
</div>
---
A powerful, feature-rich GUI application for downloading content from **[Kemono.su](https://kemono.su)** and **[Coomer.party](https://coomer.party)**.

Built with **PyQt5**, this tool is ideal for users who want deep filtering, customizable folder structures, efficient downloads, and intelligent automation — all within a modern, user-friendly graphical interface.
@@ -9,8 +15,6 @@ Built with **PyQt5**, this tool is ideal for users who want deep filtering, cust
This version brings significant enhancements to manga/comic downloading, filtering capabilities, and user experience:
### 📖 Enhanced Manga/Comic Mode
- **New "Date Based" Filename Style:**
@@ -29,7 +33,7 @@ This version brings significant enhancements to manga/comic downloading, filteri
- Specify comma-separated words or phrases (case-insensitive) that will be automatically removed from filenames.
- Example: with `patreon, [HD], _final`, `AwesomeArt_patreon.jpg` becomes `AwesomeArt.jpg` and `Hinata_[HD].jpg` becomes `Hinata.jpg` (see the sketch below).
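
A minimal sketch of the removal logic (illustrative only; the app's actual implementation and separator handling may differ):

```python
import re

def remove_words_from_filename(filename: str, words: list[str]) -> str:
    """Case-insensitively remove each word/phrase, then tidy leftover separators."""
    base, dot, ext = filename.rpartition('.')
    for word in words:
        base = re.sub(re.escape(word), '', base, flags=re.IGNORECASE)
    base = re.sub(r'[_\s]+$', '', base)  # strip trailing underscores/spaces left behind
    return f"{base}{dot}{ext}"

print(remove_words_from_filename("AwesomeArt_patreon.jpg", ["patreon", "[HD]", "_final"]))
# -> AwesomeArt.jpg
```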
---