This commit is contained in:
Yuvi9587
2025-06-04 03:36:39 +01:00
parent cdf4e9bdfb
commit 5a6474cb8a
2 changed files with 107 additions and 16 deletions

View File

@@ -67,6 +67,24 @@ FOLDER_NAME_STOP_WORDS = {
"right", "s", "she", "so", "technically", "tell", "the", "their", "they", "this",
"to", "ve", "was", "we", "well", "were", "with", "www", "year", "you", "your",
}
# Extra lowercase words that should not become folder names. They are merged
# into the regular unwanted-keyword set when a whole creator feed is downloaded
# without any character filter.
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = set().union(
    # Generic post categories
    ("poll", "cover", "fan-art", "fanart", "requests", "request", "holiday"),
    # Numbers 1-20, as digit strings
    (
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
        "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    ),
    # Numbers 1-20, spelled out
    (
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    ),
    # Months, abbreviated and in full ("may" has a single form)
    (
        "jan", "january", "feb", "february", "mar", "march",
        "apr", "april", "may", "jun", "june", "jul", "july",
        "aug", "august", "sep", "september", "oct", "october",
        "nov", "november", "dec", "december",
    ),
    # Weekdays, abbreviated and in full
    (
        "mon", "monday", "tue", "tuesday", "wed", "wednesday",
        "thu", "thursday", "fri", "friday", "sat", "saturday",
        "sun", "sunday",
    ),
)
def parse_cookie_string(cookie_string):
"""Parses a 'name=value; name2=value2' cookie string into a dict."""
cookies = {}
@@ -588,6 +606,7 @@ class PostProcessorWorker:
manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT, # New parameter for date-based prefix
manga_date_file_counter_ref=None, # New parameter for date-based manga naming
scan_content_for_images=False, # New flag for scanning HTML content
creator_download_folder_ignore_words=None, # New: For ignoring specific words for folder names
manga_global_file_counter_ref=None, # New parameter for global numbering
): # type: ignore
self.post = post_data # type: ignore
@@ -637,7 +656,9 @@ class PostProcessorWorker:
self.use_cookie = use_cookie # Store cookie setting
self.override_output_dir = override_output_dir # Store the override directory
self.scan_content_for_images = scan_content_for_images # Store new flag
self.creator_download_folder_ignore_words = creator_download_folder_ignore_words # Store new ignore words
if self.compress_images and Image is None:
# self.logger is not available yet, PostProcessorSignals.progress_signal.emit can be used or print
self.logger("⚠️ Image compression disabled: Pillow library not found.")
self.compress_images = False
def _emit_signal(self, signal_type_str, *payload_args):
@@ -1052,6 +1073,13 @@ class PostProcessorWorker:
post_id = post_data.get('id', 'unknown_id')
post_main_file_info = post_data.get('file')
post_attachments = post_data.get('attachments', [])
effective_unwanted_keywords_for_folder_naming = self.unwanted_keywords.copy()
is_full_creator_download_no_char_filter = not self.target_post_id_from_initial_url and not current_character_filters
if is_full_creator_download_no_char_filter and self.creator_download_folder_ignore_words:
self.logger(f" Applying creator download specific folder ignore words ({len(self.creator_download_folder_ignore_words)} words).")
effective_unwanted_keywords_for_folder_naming.update(self.creator_download_folder_ignore_words)
post_content_html = post_data.get('content', '')
self.logger(f"\n--- Processing Post {post_id} ('{post_title[:50]}...') (Thread: {threading.current_thread().name}) ---")
num_potential_files_in_post = len(post_attachments or []) + (1 if post_main_file_info and post_main_file_info.get('path') else 0)
@@ -1204,16 +1232,48 @@ class PostProcessorWorker:
log_reason_for_folder = "Matched char filter in title"
if primary_char_filter_for_folder:
base_folder_names_for_post_content = [clean_folder_name(primary_char_filter_for_folder["name"])]
cleaned_primary_folder_name = clean_folder_name(primary_char_filter_for_folder["name"])
if cleaned_primary_folder_name.lower() in effective_unwanted_keywords_for_folder_naming and cleaned_primary_folder_name.lower() != "untitled_folder":
self.logger(f" ⚠️ Primary char filter folder name '{cleaned_primary_folder_name}' is in ignore list. Using generic name.")
base_folder_names_for_post_content = ["Generic Post Content"]
else:
base_folder_names_for_post_content = [cleaned_primary_folder_name]
self.logger(f" Base folder name(s) for post content ({log_reason_for_folder}): {', '.join(base_folder_names_for_post_content)}")
elif not current_character_filters: # No char filters defined, use generic logic
derived_folders = match_folders_from_title(post_title, self.known_names, self.unwanted_keywords)
if derived_folders:
base_folder_names_for_post_content.extend(match_folders_from_title(post_title, KNOWN_NAMES, self.unwanted_keywords))
# 1. Try to match folder names from Known.txt using the post title
derived_folders_from_known_txt = match_folders_from_title(
post_title,
self.known_names,
effective_unwanted_keywords_for_folder_naming
)
# Filter out any "untitled_folder" that might come from Known.txt if the primary name was problematic,
# and also filter empty strings.
valid_derived_folders = [
name for name in derived_folders_from_known_txt
if name and name.strip() and name.lower() != "untitled_folder"
]
if valid_derived_folders:
base_folder_names_for_post_content.extend(valid_derived_folders)
self.logger(f" Base folder name(s) for post content (Derived from Known.txt & Post Title): {', '.join(base_folder_names_for_post_content)}")
else:
base_folder_names_for_post_content.append(extract_folder_name_from_title(post_title, self.unwanted_keywords))
if not base_folder_names_for_post_content or not base_folder_names_for_post_content[0]:
base_folder_names_for_post_content = [clean_folder_name(post_title if post_title else "untitled_creator_content")]
self.logger(f" Base folder name(s) for post content (Generic title parsing - no char filters): {', '.join(base_folder_names_for_post_content)}")
# 2. If no valid folders from Known.txt, fall back to extracting from title directly.
extracted_folder_name = extract_folder_name_from_title(
post_title,
effective_unwanted_keywords_for_folder_naming
)
base_folder_names_for_post_content.append(extracted_folder_name)
self.logger(f" Base folder name(s) for post content (Generic title parsing - no valid Known.txt match): {', '.join(base_folder_names_for_post_content)}")
# 3. Final cleanup: Ensure list is not empty and contains valid, non-empty strings.
base_folder_names_for_post_content = [
name for name in base_folder_names_for_post_content if name and name.strip()
]
if not base_folder_names_for_post_content:
final_fallback_name = clean_folder_name(post_title if post_title and post_title.strip() else "Generic Post Content")
base_folder_names_for_post_content = [final_fallback_name]
self.logger(f" Fallback folder name due to all derivations failing: {final_fallback_name}")
if not self.extract_links_only and self.use_subfolders and self.skip_words_list:
if self._check_pause(f"Folder keyword skip check for post {post_id}"): return 0, num_potential_files_in_post, []
for folder_name_to_check in base_folder_names_for_post_content: # type: ignore
@@ -1547,6 +1607,7 @@ class DownloadThread(QThread):
manga_global_file_counter_ref=None, # New parameter for global numbering
use_cookie=False, # Added: Expected by main.py
scan_content_for_images=False, # Added new flag
creator_download_folder_ignore_words=None, # Added for DownloadThread
cookie_text="", # Added: Expected by main.py
):
super().__init__()
@@ -1597,6 +1658,7 @@ class DownloadThread(QThread):
self.override_output_dir = override_output_dir # Store override dir
self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread
self.scan_content_for_images = scan_content_for_images # Store new flag
self.creator_download_folder_ignore_words = creator_download_folder_ignore_words # Store new ignore words
self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store for global numbering
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
@@ -1718,6 +1780,7 @@ class DownloadThread(QThread):
use_cookie=self.use_cookie, # Pass cookie setting to worker
manga_date_file_counter_ref=current_manga_date_file_counter_ref, # Pass the calculated or passed-in ref
scan_content_for_images=self.scan_content_for_images, # Pass new flag
creator_download_folder_ignore_words=self.creator_download_folder_ignore_words, # Pass new ignore words
)
try:
dl_count, skip_count, kept_originals_this_post, retryable_failures, permanent_failures = post_processing_worker.process()

46
main.py
View File

@@ -59,7 +59,8 @@ try:
CHAR_SCOPE_COMMENTS,
FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER,
STYLE_DATE_BASED,
STYLE_POST_TITLE_GLOBAL_NUMBERING
STYLE_POST_TITLE_GLOBAL_NUMBERING,
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS # Added import
)
print("Successfully imported names from downloader_utils.")
@@ -93,6 +94,7 @@ except ImportError as e:
FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER = "failed_retry_later"
STYLE_DATE_BASED = "date_based"
STYLE_POST_TITLE_GLOBAL_NUMBERING = "post_title_global_numbering"
CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = set() # Mock for import error
except Exception as e:
print(f"--- UNEXPECTED IMPORT ERROR ---")
@@ -3077,8 +3079,12 @@ class DownloaderApp(QWidget):
self.main_splitter.addWidget(left_panel_widget)
self.main_splitter.addWidget(right_panel_widget)
initial_width = self.width()
self.main_splitter.addWidget(right_panel_widget) # type: ignore
# Ensure the window has a size before calculating splitter sizes
if self.width() == 0 or self.height() == 0: # Default size if not shown yet
initial_width = 1024 # A reasonable default
else:
initial_width = self.width()
left_width = int(initial_width * 0.35)
right_width = initial_width - left_width
self.main_splitter.setSizes([left_width, right_width])
@@ -4476,6 +4482,16 @@ class DownloaderApp(QWidget):
QMessageBox.critical(self, "Input Error", "Invalid or unsupported URL format.")
return False # Indicate failure to start
creator_folder_ignore_words_for_run = None
is_full_creator_download = not post_id_from_url
# The ignore-word decision also depends on actual_filters_to_use_for_run, which is
# only populated after character_input has been parsed, so it is made further down.
# The check below is left commented out here to show what was moved.
# if is_full_creator_download and character_filters_are_empty:
# creator_folder_ignore_words_for_run = CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS
# log_messages.append(f" Creator Download (No Char Filter): Applying default folder name ignore list ({len(creator_folder_ignore_words_for_run)} words).")
if compress_images and Image is None:
QMessageBox.warning(self, "Missing Dependency", "Pillow library (for image compression) not found. Compression will be disabled.")
@@ -4635,6 +4651,13 @@ class DownloaderApp(QWidget):
else:
self.log_signal.emit("⚠️ Proceeding with Manga Mode without a specific title filter.")
self.dynamic_character_filter_holder.set_filters(actual_filters_to_use_for_run)
# Determine creator_folder_ignore_words_for_run *after* actual_filters_to_use_for_run is set
creator_folder_ignore_words_for_run = None
character_filters_are_empty = not actual_filters_to_use_for_run # Now this is accurate
if is_full_creator_download and character_filters_are_empty: # is_full_creator_download defined earlier
creator_folder_ignore_words_for_run = CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS
log_messages.append(f" Creator Download (No Char Filter): Applying default folder name ignore list ({len(creator_folder_ignore_words_for_run)} words).")
custom_folder_name_cleaned = None
if use_subfolders and post_id_from_url and self.custom_folder_widget and self.custom_folder_widget.isVisible() and not extract_links_only:
@@ -4747,8 +4770,8 @@ class DownloaderApp(QWidget):
should_use_multithreading_for_posts = use_multithreading_enabled_by_checkbox and not post_id_from_url
if manga_mode and (self.manga_filename_style == STYLE_DATE_BASED or self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING) and not post_id_from_url:
enforced_by_style = "Date Mode" if self.manga_filename_style == STYLE_DATE_BASED else "Title+GlobalNum Mode"
log_messages.append(f" Threading: Single-threaded (posts) - Enforced by Manga {enforced_by_style}")
should_use_multithreading_for_posts = False # Ensure this reflects the forced state
log_messages.append(f" Threading: Single-threaded (posts) - Enforced by Manga {enforced_by_style} (Actual workers: {effective_num_post_workers if effective_num_post_workers > 1 else 1})")
else:
log_messages.append(f" Threading: {'Multi-threaded (posts)' if should_use_multithreading_for_posts else 'Single-threaded (posts)'}")
if should_use_multithreading_for_posts:
@@ -4757,8 +4780,10 @@ class DownloaderApp(QWidget):
for msg in log_messages: self.log_signal.emit(msg)
self.set_ui_enabled(False)
unwanted_keywords_for_folders = {'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'}
# Use the global FOLDER_NAME_STOP_WORDS from downloader_utils
from downloader_utils import FOLDER_NAME_STOP_WORDS
# unwanted_keywords_for_folders = {'spicy', 'hd', 'nsfw', '4k', 'preview', 'teaser', 'clip'} # Old specific set
args_template = {
'api_url_input': api_url,
@@ -4790,8 +4815,8 @@ class DownloaderApp(QWidget):
'end_page': end_page,
'target_post_id_from_initial_url': post_id_from_url,
'custom_folder_name': custom_folder_name_cleaned,
'manga_mode_active': manga_mode,
'unwanted_keywords': unwanted_keywords_for_folders,
'manga_mode_active': manga_mode, # type: ignore
'unwanted_keywords': FOLDER_NAME_STOP_WORDS, # Pass the global set
'cancellation_event': self.cancellation_event,
'manga_date_prefix': manga_date_prefix_text, # NEW ARGUMENT
'dynamic_character_filter_holder': self.dynamic_character_filter_holder, # Pass the holder
@@ -4806,6 +4831,7 @@ class DownloaderApp(QWidget):
'manga_global_file_counter_ref': manga_global_file_counter_ref_for_thread, # Pass new counter
'app_base_dir': app_base_dir_for_cookies, # Pass app base dir
'use_cookie': use_cookie_for_this_run, # Pass the potentially modified cookie setting
'creator_download_folder_ignore_words': creator_folder_ignore_words_for_run, # New
}
args_template['override_output_dir'] = override_output_dir # Pass override dir in template
@@ -5044,7 +5070,9 @@ class DownloaderApp(QWidget):
'num_file_threads', 'skip_current_file_flag', 'manga_date_file_counter_ref', 'scan_content_for_images', # Added scan_content_for_images
'manga_mode_active', 'manga_filename_style', 'manga_date_prefix', # ADD manga_date_prefix
'manga_global_file_counter_ref' # Add new counter here
]
, 'creator_download_folder_ignore_words' # Add new ignore words list
] # type: ignore
ppw_optional_keys_with_defaults = {
'skip_words_list', 'skip_words_scope', 'char_filter_scope', 'remove_from_filename_words_list',
'show_external_links', 'extract_links_only', 'duplicate_file_mode', # Added duplicate_file_mode here