This commit is contained in:
Yuvi9587
2025-06-04 03:36:39 +01:00
parent cdf4e9bdfb
commit 5a6474cb8a
2 changed files with 107 additions and 16 deletions

View File

@@ -67,6 +67,24 @@ FOLDER_NAME_STOP_WORDS = {
"right", "s", "she", "so", "technically", "tell", "the", "their", "they", "this", "right", "s", "she", "so", "technically", "tell", "the", "their", "they", "this",
"to", "ve", "was", "we", "well", "were", "with", "www", "year", "you", "your", "to", "ve", "was", "we", "well", "were", "with", "www", "year", "you", "your",
} }
# Extra words merged into the unwanted-keyword set used for folder naming when
# a creator's posts are downloaded without a target post or character filter.
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = set().union(
    # General post-label words
    ("poll", "cover", "fan-art", "fanart", "requests", "request", "holiday"),
    # Numerals 1 through 20 as digit strings
    (str(numeral) for numeral in range(1, 21)),
    # Numerals 1 through 20 spelled out
    (
        "one", "two", "three", "four", "five",
        "six", "seven", "eight", "nine", "ten",
        "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    ),
    # Month names, abbreviated and full ("may" is its own abbreviation)
    (
        "jan", "january", "feb", "february", "mar", "march",
        "apr", "april", "may", "jun", "june", "jul", "july",
        "aug", "august", "sep", "september", "oct", "october",
        "nov", "november", "dec", "december",
    ),
    # Weekday names, abbreviated and full
    (
        "mon", "monday", "tue", "tuesday", "wed", "wednesday",
        "thu", "thursday", "fri", "friday", "sat", "saturday",
        "sun", "sunday",
    ),
)
def parse_cookie_string(cookie_string): def parse_cookie_string(cookie_string):
"""Parses a 'name=value; name2=value2' cookie string into a dict.""" """Parses a 'name=value; name2=value2' cookie string into a dict."""
cookies = {} cookies = {}
@@ -588,6 +606,7 @@ class PostProcessorWorker:
manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT, # New parameter for date-based prefix manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT, # New parameter for date-based prefix
manga_date_file_counter_ref=None, # New parameter for date-based manga naming manga_date_file_counter_ref=None, # New parameter for date-based manga naming
scan_content_for_images=False, # New flag for scanning HTML content scan_content_for_images=False, # New flag for scanning HTML content
creator_download_folder_ignore_words=None, # New: For ignoring specific words for folder names
manga_global_file_counter_ref=None, # New parameter for global numbering manga_global_file_counter_ref=None, # New parameter for global numbering
): # type: ignore ): # type: ignore
self.post = post_data # type: ignore self.post = post_data # type: ignore
@@ -637,7 +656,9 @@ class PostProcessorWorker:
self.use_cookie = use_cookie # Store cookie setting self.use_cookie = use_cookie # Store cookie setting
self.override_output_dir = override_output_dir # Store the override directory self.override_output_dir = override_output_dir # Store the override directory
self.scan_content_for_images = scan_content_for_images # Store new flag self.scan_content_for_images = scan_content_for_images # Store new flag
self.creator_download_folder_ignore_words = creator_download_folder_ignore_words # Store new ignore words
if self.compress_images and Image is None: if self.compress_images and Image is None:
# self.logger is not available yet, PostProcessorSignals.progress_signal.emit can be used or print
self.logger("⚠️ Image compression disabled: Pillow library not found.") self.logger("⚠️ Image compression disabled: Pillow library not found.")
self.compress_images = False self.compress_images = False
def _emit_signal(self, signal_type_str, *payload_args): def _emit_signal(self, signal_type_str, *payload_args):
@@ -1052,6 +1073,13 @@ class PostProcessorWorker:
post_id = post_data.get('id', 'unknown_id') post_id = post_data.get('id', 'unknown_id')
post_main_file_info = post_data.get('file') post_main_file_info = post_data.get('file')
post_attachments = post_data.get('attachments', []) post_attachments = post_data.get('attachments', [])
effective_unwanted_keywords_for_folder_naming = self.unwanted_keywords.copy()
is_full_creator_download_no_char_filter = not self.target_post_id_from_initial_url and not current_character_filters
if is_full_creator_download_no_char_filter and self.creator_download_folder_ignore_words:
self.logger(f" Applying creator download specific folder ignore words ({len(self.creator_download_folder_ignore_words)} words).")
effective_unwanted_keywords_for_folder_naming.update(self.creator_download_folder_ignore_words)
post_content_html = post_data.get('content', '') post_content_html = post_data.get('content', '')
self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---") self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---")
num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0) num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0)
@@ -1204,16 +1232,48 @@ class PostProcessorWorker:
log_reason_for_folder = "Matched char filter in title" log_reason_for_folder = "Matched char filter in title"
if primary_char_filter_for_folder: if primary_char_filter_for_folder:
base_folder_names_for_post_content = [clean_folder_name(primary_char_filter_for_folder["name"])] base_folder_names_for_post_content = [clean_folder_name(primary_char_filter_for_folder["name"])]
cleaned_primary_folder_name = clean_folder_name(primary_char_filter_for_folder["name"])
if cleaned_primary_folder_name.lower() in effective_unwanted_keywords_for_folder_naming and cleaned_primary_folder_name.lower() != "untitled_folder":
self.logger(f" ⚠️ Primary char filter folder name '{cleaned_primary_folder_name}' is in ignore list. Using generic name.")
base_folder_names_for_post_content = ["Generic Post Content"]
else:
base_folder_names_for_post_content = [cleaned_primary_folder_name]
self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}") self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
elif not current_character_filters: # No char filters defined, use generic logic elif not current_character_filters: # No char filters defined, use generic logic
derived_folders = match_folders_from_title(post_title, self.known_names, self.unwanted_keywords) # 1. Try to match folder names from Known.txt using the post title
if derived_folders: derived_folders_from_known_txt = match_folders_from_title(
base_folder_names_for_post_content.extend(match_folders_from_title(post_title, KNOWN_NAMES, self.unwanted_keywords)) post_title,
self.known_names,
effective_unwanted_keywords_for_folder_naming
)
# Filter out any "untitled_folder" that might come from Known.txt if the primary name was problematic,
# and also filter empty strings.
valid_derived_folders = [
name for name in derived_folders_from_known_txt
if name and name.strip() and name.lower() != "untitled_folder"
]
if valid_derived_folders:
base_folder_names_for_post_content.extend(valid_derived_folders)
self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}")
else: else:
base_folder_names_for_post_content.append(extract_folder_name_from_title(post_title, self.unwanted_keywords)) # 2. If no valid folders from Known.txt, fall back to extracting from title directly.
if not base_folder_names_for_post_content or not base_folder_names_for_post_content[0]: extracted_folder_name = extract_folder_name_from_title(
base_folder_names_for_post_content = [clean_folder_name(post_title if post_title else "untitled_creator_content")] post_title,
self.logger(f" Base folder name(s) for post content (Generic title parsing - no char filters): {', '.join(base_folder_names_for_post_content)}") effective_unwanted_keywords_for_folder_naming
)
base_folder_names_for_post_content.append(extracted_folder_name)
self.logger(f" Base folder name(s) for post content (Generic title parsing - no valid Known.txt match): {', '.join(base_folder_names_for_post_content)}")
# 3. Final cleanup: Ensure list is not empty and contains valid, non-empty strings.
base_folder_names_for_post_content = [
name for name in base_folder_names_for_post_content if name and name.strip()
]
if not base_folder_names_for_post_content:
final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content")
base_folder_names_for_post_content = [final_fallback_name]
self.logger(f" Fallback folder name due to all derivations failing: {final_fallback_name}")
if not self.extract_links_only and self.use_subfolders and self.skip_words_list: if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, [] if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, []
for folder_name_to_check in base_folder_names_for_post_content: # type: ignore for folder_name_to_check in base_folder_names_for_post_content: # type: ignore
@@ -1547,6 +1607,7 @@ class DownloadThread(QThread):
manga_global_file_counter_ref=None, # New parameter for global numbering manga_global_file_counter_ref=None, # New parameter for global numbering
use_cookie=False, # Added: Expected by main.py use_cookie=False, # Added: Expected by main.py
scan_content_for_images=False, # Added new flag scan_content_for_images=False, # Added new flag
creator_download_folder_ignore_words=None, # Added for DownloadThread
cookie_text="", # Added: Expected by main.py cookie_text="", # Added: Expected by main.py
): ):
super().__init__() super().__init__()
@@ -1597,6 +1658,7 @@ class DownloadThread(QThread):
self.override_output_dir = override_output_dir # Store override dir self.override_output_dir = override_output_dir # Store override dir
self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread
self.scan_content_for_images = scan_content_for_images # Store new flag self.scan_content_for_images = scan_content_for_images # Store new flag
self.creator_download_folder_ignore_words = creator_download_folder_ignore_words # Store new ignore words
self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store for global numbering self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store for global numbering
if self.compress_images and Image is None: if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).") self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
@@ -1718,6 +1780,7 @@ class DownloadThread(QThread):
use_cookie=self.use_cookie, # Pass cookie setting to worker use_cookie=self.use_cookie, # Pass cookie setting to worker
manga_date_file_counter_ref=current_manga_date_file_counter_ref, # Pass the calculated or passed-in ref manga_date_file_counter_ref=current_manga_date_file_counter_ref, # Pass the calculated or passed-in ref
scan_content_for_images=self.scan_content_for_images, # Pass new flag scan_content_for_images=self.scan_content_for_images, # Pass new flag
creator_download_folder_ignore_words=self.creator_download_folder_ignore_words, # Pass new ignore words
) )
try: try:
dl_count, skip_count, kept_originals_this_post, retryable_failures, permanent_failures = post_processing_worker.process() dl_count, skip_count, kept_originals_this_post, retryable_failures, permanent_failures = post_processing_worker.process()

46
main.py
View File

@@ -59,7 +59,8 @@ try:
CHAR_SCOPE_COMMENTS, CHAR_SCOPE_COMMENTS,
FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER, FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER,
STYLE_DATE_BASED, STYLE_DATE_BASED,
STYLE_POST_TITLE_GLOBAL_NUMBERING STYLE_POST_TITLE_GLOBAL_NUMBERING,
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS # Added import
) )
print("Successfully imported names from downloader_utils.") print("Successfully imported names from downloader_utils.")
@@ -93,6 +94,7 @@ except ImportError as e:
FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER = "failed_retry_later" FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER = "failed_retry_later"
STYLE_DATE_BASED = "date_based" STYLE_DATE_BASED = "date_based"
STYLE_POST_TITLE_GLOBAL_NUMBERING = "post_title_global_numbering" STYLE_POST_TITLE_GLOBAL_NUMBERING = "post_title_global_numbering"
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = set() # Mock for import error
except Exception as e: except Exception as e:
print(f"--- UNEXPECTED IMPORT ERROR ---") print(f"--- UNEXPECTED IMPORT ERROR ---")
@@ -3077,8 +3079,12 @@ class DownloaderApp(QWidget):
self.main_splitter.addWidget(left_panel_widget) self.main_splitter.addWidget(left_panel_widget)
self.main_splitter.addWidget(right_panel_widget) self.main_splitter.addWidget(right_panel_widget) # type: ignore
initial_width = self.width() # Ensure the window has a size before calculating splitter sizes
if self.width() == 0 or self.height() == 0: # Default size if not shown yet
initial_width = 1024 # A reasonable default
else:
initial_width = self.width()
left_width = int(initial_width * 0.35) left_width = int(initial_width * 0.35)
right_width = initial_width - left_width right_width = initial_width - left_width
self.main_splitter.setSizes([left_width, right_width]) self.main_splitter.setSizes([left_width, right_width])
@@ -4476,6 +4482,16 @@ class DownloaderApp(QWidget):
QMessageBox.critical(self, "Input Error", "Invalid or unsupported URL format.") QMessageBox.critical(self, "Input Error", "Invalid or unsupported URL format.")
return False # Indicate failure to start return False # Indicate failure to start
creator_folder_ignore_words_for_run = None
is_full_creator_download = not post_id_from_url
# The ignore-word list depends on actual_filters_to_use_for_run, which is only
# populated after character_input has been parsed, so the real assignment is
# made further down. The original placement is kept here, commented out:
# if is_full_creator_download and character_filters_are_empty:
# creator_folder_ignore_words_for_run = CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS
# log_messages.append(f" Creator Download (No Char Filter): Applying default folder name ignore list ({len(creator_folder_ignore_words_for_run)} words).")
if compress_images and Image is None: if compress_images and Image is None:
QMessageBox.warning(self, "Missing Dependency", "Pillow library (for image compression) not found. Compression will be disabled.") QMessageBox.warning(self, "Missing Dependency", "Pillow library (for image compression) not found. Compression will be disabled.")
@@ -4635,6 +4651,13 @@ class DownloaderApp(QWidget):
else: else:
self.log_signal.emit("⚠️ Proceeding with Manga Mode without a specific title filter.") self.log_signal.emit("⚠️ Proceeding with Manga Mode without a specific title filter.")
self.dynamic_character_filter_holder.set_filters(actual_filters_to_use_for_run) self.dynamic_character_filter_holder.set_filters(actual_filters_to_use_for_run)
# Determine creator_folder_ignore_words_for_run *after* actual_filters_to_use_for_run is set
creator_folder_ignore_words_for_run = None
character_filters_are_empty = not actual_filters_to_use_for_run # Now this is accurate
if is_full_creator_download and character_filters_are_empty: # is_full_creator_download defined earlier
creator_folder_ignore_words_for_run = CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS
log_messages.append(f" Creator Download (No Char Filter): Applying default folder name ignore list ({len(creator_folder_ignore_words_for_run)} words).")
custom_folder_name_cleaned = None custom_folder_name_cleaned = None
if use_subfolders and post_id_from_url and self.custom_folder_widget and self.custom_folder_widget.isVisible() and not extract_links_only: if use_subfolders and post_id_from_url and self.custom_folder_widget and self.custom_folder_widget.isVisible() and not extract_links_only:
@@ -4747,8 +4770,8 @@ class DownloaderApp(QWidget):
should_use_multithreading_for_posts = use_multithreading_enabled_by_checkbox and not post_id_from_url should_use_multithreading_for_posts = use_multithreading_enabled_by_checkbox and not post_id_from_url
if manga_mode and (self.manga_filename_style == STYLE_DATE_BASED or self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING) and not post_id_from_url: if manga_mode and (self.manga_filename_style == STYLE_DATE_BASED or self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING) and not post_id_from_url:
enforced_by_style = "Date Mode" if self.manga_filename_style == STYLE_DATE_BASED else "Title+GlobalNum Mode" enforced_by_style = "Date Mode" if self.manga_filename_style == STYLE_DATE_BASED else "Title+GlobalNum Mode"
log_messages.append(f" Threading: Single-threaded (posts) - Enforced by Manga {enforced_by_style}")
should_use_multithreading_for_posts = False # Ensure this reflects the forced state should_use_multithreading_for_posts = False # Ensure this reflects the forced state
log_messages.append(f" Threading: Single-threaded (posts) - Enforced by Manga {enforced_by_style} (Actual workers: {effective_num_post_workers if effective_num_post_workers > 1 else 1})")
else: else:
log_messages.append(f" Threading: {'Multi-threaded (posts)' if should_use_multithreading_for_posts else 'Single-threaded (posts)'}") log_messages.append(f" Threading: {'Multi-threaded (posts)' if should_use_multithreading_for_posts else 'Single-threaded (posts)'}")
if should_use_multithreading_for_posts: if should_use_multithreading_for_posts:
@@ -4757,8 +4780,10 @@ class DownloaderApp(QWidget):
for msg in log_messages: self.log_signal.emit(msg) for msg in log_messages: self.log_signal.emit(msg)
self.set_ui_enabled(False) self.set_ui_enabled(False)
unwanted_keywords_for_folders = {'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'} # Use the global FOLDER_NAME_STOP_WORDS from downloader_utils
from downloader_utils import FOLDER_NAME_STOP_WORDS
# unwanted_keywords_for_folders = {'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'} # Old specific set
args_template = { args_template = {
'api_url_input': api_url, 'api_url_input': api_url,
@@ -4790,8 +4815,8 @@ class DownloaderApp(QWidget):
'end_page': end_page, 'end_page': end_page,
'target_post_id_from_initial_url': post_id_from_url, 'target_post_id_from_initial_url': post_id_from_url,
'custom_folder_name': custom_folder_name_cleaned, 'custom_folder_name': custom_folder_name_cleaned,
'manga_mode_active': manga_mode, 'manga_mode_active': manga_mode, # type: ignore
'unwanted_keywords': unwanted_keywords_for_folders, 'unwanted_keywords': FOLDER_NAME_STOP_WORDS, # Pass the global set
'cancellation_event': self.cancellation_event, 'cancellation_event': self.cancellation_event,
'manga_date_prefix': manga_date_prefix_text, # NEW ARGUMENT 'manga_date_prefix': manga_date_prefix_text, # NEW ARGUMENT
'dynamic_character_filter_holder': self.dynamic_character_filter_holder, # Pass the holder 'dynamic_character_filter_holder': self.dynamic_character_filter_holder, # Pass the holder
@@ -4806,6 +4831,7 @@ class DownloaderApp(QWidget):
'manga_global_file_counter_ref': manga_global_file_counter_ref_for_thread, # Pass new counter 'manga_global_file_counter_ref': manga_global_file_counter_ref_for_thread, # Pass new counter
'app_base_dir': app_base_dir_for_cookies, # Pass app base dir 'app_base_dir': app_base_dir_for_cookies, # Pass app base dir
'use_cookie': use_cookie_for_this_run, # Pass the potentially modified cookie setting 'use_cookie': use_cookie_for_this_run, # Pass the potentially modified cookie setting
'creator_download_folder_ignore_words': creator_folder_ignore_words_for_run, # New
} }
args_template['override_output_dir'] = override_output_dir # Pass override dir in template args_template['override_output_dir'] = override_output_dir # Pass override dir in template
@@ -5044,7 +5070,9 @@ class DownloaderApp(QWidget):
'num_file_threads', 'skip_current_file_flag', 'manga_date_file_counter_ref', 'scan_content_for_images', # Added scan_content_for_images 'num_file_threads', 'skip_current_file_flag', 'manga_date_file_counter_ref', 'scan_content_for_images', # Added scan_content_for_images
'manga_mode_active', 'manga_filename_style', 'manga_date_prefix', # ADD manga_date_prefix 'manga_mode_active', 'manga_filename_style', 'manga_date_prefix', # ADD manga_date_prefix
'manga_global_file_counter_ref' # Add new counter here 'manga_global_file_counter_ref' # Add new counter here
] , 'creator_download_folder_ignore_words' # Add new ignore words list
] # type: ignore
ppw_optional_keys_with_defaults = { ppw_optional_keys_with_defaults = {
'skip_words_list', 'skip_words_scope', 'char_filter_scope', 'remove_from_filename_words_list', 'skip_words_list', 'skip_words_scope', 'char_filter_scope', 'remove_from_filename_words_list',
'show_external_links', 'extract_links_only', 'duplicate_file_mode', # Added duplicate_file_mode here 'show_external_links', 'extract_links_only', 'duplicate_file_mode', # Added duplicate_file_mode here