This commit is contained in:
Yuvi9587
2025-06-15 09:49:09 +01:00
parent 7fe5f4b83e
commit 3473f6540d
3 changed files with 976 additions and 144 deletions

View File

@@ -8,6 +8,7 @@ import hashlib
import http .client
import traceback
from concurrent .futures import ThreadPoolExecutor ,Future ,CancelledError ,as_completed
from collections import deque # Import deque
import html
from PyQt5 .QtCore import QObject ,pyqtSignal ,QThread ,QMutex ,QMutexLocker
from urllib .parse import urlparse
@@ -41,6 +42,7 @@ from io import BytesIO
STYLE_POST_TITLE ="post_title"
STYLE_ORIGINAL_NAME ="original_name"
STYLE_DATE_BASED ="date_based"
STYLE_DATE_POST_TITLE = "date_post_title" # New style constant
MANGA_DATE_PREFIX_DEFAULT =""
STYLE_POST_TITLE_GLOBAL_NUMBERING ="post_title_global_numbering"
SKIP_SCOPE_FILES ="files"
@@ -509,13 +511,31 @@ def fetch_post_comments (api_domain ,service ,user_id ,post_id ,headers ,logger
raise RuntimeError (f"Error decoding JSON from comments API for post {post_id } ({comments_api_url }): {e }. Response text: {response .text [:200 ]}")
except Exception as e :
raise RuntimeError (f"Unexpected error fetching comments for post {post_id } ({comments_api_url }): {e }")
def download_from_api (api_url_input ,logger =print ,start_page =None ,end_page =None ,manga_mode =False ,
cancellation_event =None ,pause_event =None ,use_cookie =False ,cookie_text ="",selected_cookie_file =None ,app_base_dir =None ):
headers ={'User-Agent':'Mozilla/5.0','Accept':'application/json'}
service ,user_id ,target_post_id =extract_post_info (api_url_input )
if cancellation_event and cancellation_event .is_set ():
logger (" Download_from_api cancelled at start.")
return
def download_from_api(
api_url_input,
logger=print, # type: ignore
start_page=None, # type: ignore
end_page=None, # type: ignore
manga_mode=False, # type: ignore
cancellation_event=None, # type: ignore
pause_event=None, # type: ignore
use_cookie=False, # type: ignore
cookie_text="", # type: ignore
selected_cookie_file=None, # type: ignore
app_base_dir=None, # type: ignore
manga_filename_style_for_sort_check=None # type: ignore # Parameter is correctly defined
):
headers = {
'User-Agent': 'Mozilla/5.0',
'Accept': 'application/json'
}
service, user_id, target_post_id = extract_post_info(api_url_input)
if cancellation_event and cancellation_event.is_set():
logger(" Download_from_api cancelled at start.")
return
parsed_input_url_for_domain =urlparse (api_url_input )
api_domain =parsed_input_url_for_domain .netloc
if not any (d in api_domain .lower ()for d in ['kemono.su','kemono.party','coomer.su','coomer.party']):
@@ -552,11 +572,14 @@ cancellation_event =None ,pause_event =None ,use_cookie =False ,cookie_text ="",
return
if target_post_id and (start_page or end_page ):
logger ("⚠️ Page range (start/end page) is ignored when a specific post URL is provided (searching all pages for the post).")
is_creator_feed_for_manga =manga_mode and not target_post_id
# determine if we should use the "fetch all then sort oldest first" logic for manga mode
is_manga_mode_fetch_all_and_sort_oldest_first = manga_mode and \
(manga_filename_style_for_sort_check != STYLE_DATE_POST_TITLE) and \
not target_post_id
api_base_url =f"https://{api_domain }/api/v1/{service }/user/{user_id }"
page_size =50
if is_creator_feed_for_manga :
logger (" Manga Mode: Fetching posts to sort by date (oldest processed first)...")
if is_manga_mode_fetch_all_and_sort_oldest_first :
logger(f" Manga Mode (Style: {manga_filename_style_for_sort_check if manga_filename_style_for_sort_check else 'Default'} - Oldest First Sort Active): Fetching all posts to sort by date...")
all_posts_for_manga_mode =[]
current_offset_manga =0
if start_page and start_page >1 :
@@ -635,6 +658,12 @@ cancellation_event =None ,pause_event =None ,use_cookie =False ,cookie_text ="",
break
yield all_posts_for_manga_mode [i :i +page_size ]
return
# If manga_mode is true but we didn't enter the block above,
# it means we want newest first for STYLE_DATE_POST_TITLE (or it's a single post URL)
if manga_mode and not target_post_id and (manga_filename_style_for_sort_check == STYLE_DATE_POST_TITLE):
logger(f" Manga Mode (Style: {STYLE_DATE_POST_TITLE}): Processing posts in default API order (newest first).")
current_page_num =1
current_offset =0
processed_target_post_flag =False
@@ -727,8 +756,10 @@ class PostProcessorSignals (QObject ):
file_download_status_signal =pyqtSignal (bool )
external_link_signal =pyqtSignal (str ,str ,str ,str ,str )
file_progress_signal =pyqtSignal (str ,object )
file_successfully_downloaded_signal = pyqtSignal(dict) # New signal for successfully downloaded files
missed_character_post_signal =pyqtSignal (str ,str )
class PostProcessorWorker :
# ... (other __init__ arguments)
def __init__ (self ,post_data ,download_root ,known_names ,
filter_character_list ,emitter ,
unwanted_keywords ,filter_mode ,skip_zip ,skip_rar ,
@@ -836,7 +867,7 @@ class PostProcessorWorker :
post_title ="",file_index_in_post =0 ,num_files_in_this_post =1 ,
manga_date_file_counter_ref =None ):
was_original_name_kept_flag =False
manga_global_file_counter_ref =None
# manga_global_file_counter_ref =None # This was a duplicate definition, removed
final_filename_saved_for_return =""
def _get_current_character_filters (self ):
if self .dynamic_filter_holder :
@@ -846,7 +877,7 @@ class PostProcessorWorker :
post_title ="",file_index_in_post =0 ,num_files_in_this_post =1 ,
manga_date_file_counter_ref =None ,
forced_filename_override =None ,
manga_global_file_counter_ref =None ):
manga_global_file_counter_ref =None, folder_context_name_for_history=None ): # Added folder_context_name_for_history
was_original_name_kept_flag =False
final_filename_saved_for_return =""
retry_later_details =None
@@ -948,6 +979,48 @@ class PostProcessorWorker :
self .logger (f"⚠️ Manga Title+GlobalNum Mode: Counter ref not provided or malformed for '{api_original_filename }'. Using original. Ref: {manga_global_file_counter_ref }")
filename_to_save_in_main_path =cleaned_original_api_filename
self .logger (f"⚠️ Manga mode (Title+GlobalNum Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path }' for post {original_post_id_for_log }.")
elif self.manga_filename_style == STYLE_DATE_POST_TITLE:
published_date_str = self.post.get('published')
added_date_str = self.post.get('added')
formatted_date_str = "nodate" # Default if no date found
if published_date_str:
try:
formatted_date_str = published_date_str.split('T')[0]
except Exception: # pylint: disable=bare-except
self.logger(f" ⚠️ Could not parse 'published' date '{published_date_str}' for STYLE_DATE_POST_TITLE. Using 'nodate'.")
elif added_date_str:
try:
formatted_date_str = added_date_str.split('T')[0]
self.logger(f" ⚠️ Post ID {original_post_id_for_log} missing 'published' date, using 'added' date '{added_date_str}' for STYLE_DATE_POST_TITLE naming.")
except Exception: # pylint: disable=bare-except
self.logger(f" ⚠️ Could not parse 'added' date '{added_date_str}' for STYLE_DATE_POST_TITLE. Using 'nodate'.")
else:
self.logger(f" ⚠️ Post ID {original_post_id_for_log} missing both 'published' and 'added' dates for STYLE_DATE_POST_TITLE. Using 'nodate'.")
if post_title and post_title.strip():
temp_cleaned_title = clean_filename(post_title.strip())
if not temp_cleaned_title or temp_cleaned_title.startswith("untitled_file"):
self.logger(f"⚠️ Manga mode (Date+PostTitle Style): Post title for post {original_post_id_for_log} ('{post_title}') was empty or generic after cleaning. Using 'post' as title part.")
cleaned_post_title_for_filename = "post"
else:
cleaned_post_title_for_filename = temp_cleaned_title
base_name_for_style = f"{formatted_date_str}_{cleaned_post_title_for_filename}"
if num_files_in_this_post > 1:
filename_to_save_in_main_path = f"{base_name_for_style}_{file_index_in_post}{original_ext}" if file_index_in_post > 0 else f"{base_name_for_style}{original_ext}"
else: # Single file post
filename_to_save_in_main_path = f"{base_name_for_style}{original_ext}"
else:
self.logger(f"⚠️ Manga mode (Date+PostTitle Style): Post title missing for post {original_post_id_for_log}. Using 'post' as title part with date prefix.")
cleaned_post_title_for_filename = "post" # Fallback title part
base_name_for_style = f"{formatted_date_str}_{cleaned_post_title_for_filename}"
if num_files_in_this_post > 1:
filename_to_save_in_main_path = f"{base_name_for_style}_{file_index_in_post}{original_ext}" if file_index_in_post > 0 else f"{base_name_for_style}{original_ext}"
else: # Single file post
filename_to_save_in_main_path = f"{base_name_for_style}{original_ext}"
self .logger (f"⚠️ Manga mode (Title+GlobalNum Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path }' for post {original_post_id_for_log }.")
else :
self .logger (f"⚠️ Manga mode: Unknown filename style '{self .manga_filename_style }'. Defaulting to original filename for '{api_original_filename }'.")
filename_to_save_in_main_path =cleaned_original_api_filename
@@ -1320,7 +1393,23 @@ class PostProcessorWorker :
with self .downloaded_files_lock :self .downloaded_files .add (filename_to_save_in_main_path )
final_filename_saved_for_return =final_filename_on_disk
self .logger (f"✅ Saved: '{final_filename_saved_for_return }' (from '{api_original_filename }', {downloaded_size_bytes /(1024 *1024 ):.2f} MB) in '{os .path .basename (effective_save_folder )}'")
# Emit signal for successfully downloaded file
downloaded_file_details = {
'disk_filename': final_filename_saved_for_return,
'post_title': post_title,
'post_id': original_post_id_for_log,
'upload_date_str': self.post.get('published') or self.post.get('added') or "N/A",
'download_timestamp': time.time(), # Will be recorded by main app
'download_path': effective_save_folder, # The folder it was saved into
'service': self.service,
'user_id': self.user_id,
'api_original_filename': api_original_filename,
'folder_context_name': folder_context_name_for_history or os.path.basename(effective_save_folder) # Best effort context name
}
self._emit_signal('file_successfully_downloaded', downloaded_file_details)
time .sleep (0.05 )
return 1 ,0 ,final_filename_saved_for_return ,was_original_name_kept_flag ,FILE_DOWNLOAD_STATUS_SUCCESS ,None
except Exception as save_err :
self .logger (f"->>Save Fail for '{final_filename_on_disk }': {save_err }")
@@ -1336,18 +1425,20 @@ class PostProcessorWorker :
def process (self ):
if self ._check_pause (f"Post processing for ID {self .post .get ('id','N/A')}"):return 0 ,0 ,[],[],[]
if self .check_cancel ():return 0 ,0 ,[],[],[]
if self ._check_pause (f"Post processing for ID {self .post .get ('id','N/A')}"):return 0 ,0 ,[],[],[], None
if self .check_cancel ():return 0 ,0 ,[],[],[], None
current_character_filters =self ._get_current_character_filters ()
kept_original_filenames_for_log =[]
retryable_failures_this_post =[]
permanent_failures_this_post =[]
total_downloaded_this_post =0
total_skipped_this_post =0
history_data_for_this_post = None
parsed_api_url =urlparse (self .api_url_input )
referer_url =f"https://{parsed_api_url .netloc }/"
headers ={'User-Agent':'Mozilla/5.0','Referer':referer_url ,'Accept':'*/*'}
link_pattern =re .compile (r"""<a\s+.*?href=["'](https?://[^"']+)["'][^>]*>(.*?)</a>""",
link_pattern =re .compile (r"""<a\s+.*?href=["'](https?://[^"']+)["'][^>]*>(.*?)</a>""", # type: ignore
re .IGNORECASE |re .DOTALL )
post_data =self .post
post_title =post_data .get ('title','')or 'untitled_post'
@@ -1370,17 +1461,17 @@ class PostProcessorWorker :
post_is_candidate_by_file_char_match_in_comment_scope =False
char_filter_that_matched_file_in_comment_scope =None
char_filter_that_matched_comment =None
if current_character_filters and (self .char_filter_scope ==CHAR_SCOPE_TITLE or self .char_filter_scope ==CHAR_SCOPE_BOTH ):
if self ._check_pause (f"Character title filter for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[]
if current_character_filters and (self .char_filter_scope ==CHAR_SCOPE_TITLE or self .char_filter_scope ==CHAR_SCOPE_BOTH ): # type: ignore
if self ._check_pause (f"Character title filter for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
for idx ,filter_item_obj in enumerate (current_character_filters ):
if self .check_cancel ():break
terms_to_check_for_title =list (filter_item_obj ["aliases"])
terms_to_check_for_title =list (filter_item_obj ["aliases"]) # type: ignore
if filter_item_obj ["is_group"]:
if filter_item_obj ["name"]not in terms_to_check_for_title :
terms_to_check_for_title .append (filter_item_obj ["name"])
if filter_item_obj ["name"]not in terms_to_check_for_title : # type: ignore
terms_to_check_for_title .append (filter_item_obj ["name"]) # type: ignore
unique_terms_for_title_check =list (set (terms_to_check_for_title ))
for term_to_match in unique_terms_for_title_check :
match_found_for_term =is_title_match_for_character (post_title ,term_to_match )
match_found_for_term =is_title_match_for_character (post_title ,term_to_match ) # type: ignore
if match_found_for_term :
post_is_candidate_by_title_char_match =True
char_filter_that_matched_title =filter_item_obj
@@ -1402,18 +1493,18 @@ class PostProcessorWorker :
all_files_from_post_api_for_char_check .append ({'_original_name_for_log':original_api_att_name })
if current_character_filters and self .char_filter_scope ==CHAR_SCOPE_COMMENTS :
self .logger (f" [Char Scope: Comments] Phase 1: Checking post files for matches before comments for post ID '{post_id }'.")
if self ._check_pause (f"File check (comments scope) for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[]
if self ._check_pause (f"File check (comments scope) for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
for file_info_item in all_files_from_post_api_for_char_check :
if self .check_cancel ():break
current_api_original_filename_for_check =file_info_item .get ('_original_name_for_log')
if not current_api_original_filename_for_check :continue
for filter_item_obj in current_character_filters :
terms_to_check =list (filter_item_obj ["aliases"])
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check :
terms_to_check .append (filter_item_obj ["name"])
terms_to_check =list (filter_item_obj ["aliases"]) # type: ignore
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check : # type: ignore
terms_to_check .append (filter_item_obj ["name"]) # type: ignore
for term_to_match in terms_to_check :
if is_filename_match_for_character (current_api_original_filename_for_check ,term_to_match ):
post_is_candidate_by_file_char_match_in_comment_scope =True
post_is_candidate_by_file_char_match_in_comment_scope =True # type: ignore
char_filter_that_matched_file_in_comment_scope =filter_item_obj
self .logger (f" Match Found (File in Comments Scope): File '{current_api_original_filename_for_check }' matches char filter term '{term_to_match }' (from group/name '{filter_item_obj ['name']}'). Post is candidate.")
break
@@ -1422,7 +1513,7 @@ class PostProcessorWorker :
self .logger (f" [Char Scope: Comments] Phase 1 Result: post_is_candidate_by_file_char_match_in_comment_scope = {post_is_candidate_by_file_char_match_in_comment_scope }")
if current_character_filters and self .char_filter_scope ==CHAR_SCOPE_COMMENTS :
if not post_is_candidate_by_file_char_match_in_comment_scope :
if self ._check_pause (f"Comment check for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[]
if self ._check_pause (f"Comment check for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
self .logger (f" [Char Scope: Comments] Phase 2: No file match found. Checking post comments for post ID '{post_id }'.")
try :
parsed_input_url_for_comments =urlparse (self .api_url_input )
@@ -1444,11 +1535,11 @@ class PostProcessorWorker :
raw_comment_content =comment_item .get ('content','')
if not raw_comment_content :continue
cleaned_comment_text =strip_html_tags (raw_comment_content )
if not cleaned_comment_text .strip ():continue
if not cleaned_comment_text .strip ():continue # type: ignore
for filter_item_obj in current_character_filters :
terms_to_check_comment =list (filter_item_obj ["aliases"])
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check_comment :
terms_to_check_comment .append (filter_item_obj ["name"])
terms_to_check_comment =list (filter_item_obj ["aliases"]) # type: ignore
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check_comment : # type: ignore
terms_to_check_comment .append (filter_item_obj ["name"]) # type: ignore
for term_to_match_comment in terms_to_check_comment :
if is_title_match_for_character (cleaned_comment_text ,term_to_match_comment ):
post_is_candidate_by_comment_char_match =True
@@ -1470,32 +1561,33 @@ class PostProcessorWorker :
if current_character_filters :
if self .char_filter_scope ==CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match :
self .logger (f" -> Skip Post (Scope: Title - No Char Match): Title '{post_title [:50 ]}' does not match character filters.")
self ._emit_signal ('missed_character_post',post_title ,"No title match for character filter")
return 0 ,num_potential_files_in_post ,[],[],[]
self ._emit_signal ('missed_character_post',post_title ,"No title match for character filter") # type: ignore
return 0 ,num_potential_files_in_post ,[],[],[], None
if self .char_filter_scope ==CHAR_SCOPE_COMMENTS and not post_is_candidate_by_file_char_match_in_comment_scope and not post_is_candidate_by_comment_char_match :
self .logger (f" -> Skip Post (Scope: Comments - No Char Match in Comments): Post ID '{post_id }', Title '{post_title [:50 ]}...'")
if self .emitter and hasattr (self .emitter ,'missed_character_post_signal'):
self ._emit_signal ('missed_character_post',post_title ,"No character match in files or comments (Comments scope)")
return 0 ,num_potential_files_in_post ,[],[],[]
self ._emit_signal ('missed_character_post',post_title ,"No character match in files or comments (Comments scope)") # type: ignore
return 0 ,num_potential_files_in_post ,[],[],[], None
if self .skip_words_list and (self .skip_words_scope ==SKIP_SCOPE_POSTS or self .skip_words_scope ==SKIP_SCOPE_BOTH ):
if self ._check_pause (f"Skip words (post title) for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[]
if self ._check_pause (f"Skip words (post title) for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
post_title_lower =post_title .lower ()
for skip_word in self .skip_words_list :
if skip_word .lower ()in post_title_lower :
self .logger (f" -> Skip Post (Keyword in Title '{skip_word }'): '{post_title [:50 ]}...'. Scope: {self .skip_words_scope }")
return 0 ,num_potential_files_in_post ,[],[],[]
return 0 ,num_potential_files_in_post ,[],[],[], None
if not self .extract_links_only and self .manga_mode_active and current_character_filters and (self .char_filter_scope ==CHAR_SCOPE_TITLE or self .char_filter_scope ==CHAR_SCOPE_BOTH )and not post_is_candidate_by_title_char_match :
self .logger (f" -> Skip Post (Manga Mode with Title/Both Scope - No Title Char Match): Title '{post_title [:50 ]}' doesn't match filters.")
self ._emit_signal ('missed_character_post',post_title ,"Manga Mode: No title match for character filter (Title/Both scope)")
return 0 ,num_potential_files_in_post ,[],[],[]
self ._emit_signal ('missed_character_post',post_title ,"Manga Mode: No title match for character filter (Title/Both scope)") # type: ignore
return 0 ,num_potential_files_in_post ,[],[],[], None
if not isinstance (post_attachments ,list ):
self .logger (f"⚠️ Corrupt attachment data for post {post_id } (expected list, got {type (post_attachments )}). Skipping attachments.")
post_attachments =[]
base_folder_names_for_post_content =[]
determined_post_save_path_for_history = self.override_output_dir if self.override_output_dir else self.download_root
if not self .extract_links_only and self .use_subfolders :
if self ._check_pause (f"Subfolder determination for post {post_id }"):return 0 ,num_potential_files_in_post ,[]
if self ._check_pause (f"Subfolder determination for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
primary_char_filter_for_folder =None
log_reason_for_folder =""
log_reason_for_folder ="" # type: ignore
if self .char_filter_scope ==CHAR_SCOPE_COMMENTS and char_filter_that_matched_comment :
if post_is_candidate_by_file_char_match_in_comment_scope and char_filter_that_matched_file_in_comment_scope :
primary_char_filter_for_folder =char_filter_that_matched_file_in_comment_scope
@@ -1506,10 +1598,10 @@ class PostProcessorWorker :
elif (self .char_filter_scope ==CHAR_SCOPE_TITLE or self .char_filter_scope ==CHAR_SCOPE_BOTH )and char_filter_that_matched_title :
primary_char_filter_for_folder =char_filter_that_matched_title
log_reason_for_folder ="Matched char filter in title"
if primary_char_filter_for_folder :
base_folder_names_for_post_content =[clean_folder_name (primary_char_filter_for_folder ["name"])]
cleaned_primary_folder_name =clean_folder_name (primary_char_filter_for_folder ["name"])
if cleaned_primary_folder_name .lower ()in effective_unwanted_keywords_for_folder_naming and cleaned_primary_folder_name .lower ()!="untitled_folder":
if primary_char_filter_for_folder : # type: ignore
base_folder_names_for_post_content =[clean_folder_name (primary_char_filter_for_folder ["name"])] # type: ignore
cleaned_primary_folder_name =clean_folder_name (primary_char_filter_for_folder ["name"]) # type: ignore
if cleaned_primary_folder_name .lower ()in effective_unwanted_keywords_for_folder_naming and cleaned_primary_folder_name .lower ()!="untitled_folder": # type: ignore
self .logger (f" ⚠️ Primary char filter folder name '{cleaned_primary_folder_name }' is in ignore list. Using generic name.")
base_folder_names_for_post_content =["Generic Post Content"]
else :
@@ -1524,7 +1616,7 @@ class PostProcessorWorker :
)
valid_derived_folders_from_title_known_txt =[
name for name in derived_folders_from_title_via_known_txt
name for name in derived_folders_from_title_via_known_txt # type: ignore
if name and name .strip ()and name .lower ()!="untitled_folder"
]
@@ -1541,7 +1633,7 @@ class PostProcessorWorker :
FOLDER_NAME_STOP_WORDS
)
title_is_only_creator_ignored_words =False
title_is_only_creator_ignored_words =False # type: ignore
if candidate_name_from_title_basic_clean and candidate_name_from_title_basic_clean .lower ()!="untitled_folder"and self .creator_download_folder_ignore_words :
candidate_title_words ={word .lower ()for word in candidate_name_from_title_basic_clean .split ()}
@@ -1592,23 +1684,31 @@ class PostProcessorWorker :
if not base_folder_names_for_post_content :
final_fallback_name =clean_folder_name (post_title if post_title and post_title .strip ()else "Generic Post Content")
base_folder_names_for_post_content =[final_fallback_name ]
self .logger (f" Ultimate fallback folder name: {final_fallback_name }")
self .logger (f" Ultimate fallback folder name: {final_fallback_name }") # type: ignore
if base_folder_names_for_post_content:
determined_post_save_path_for_history = os.path.join(determined_post_save_path_for_history, base_folder_names_for_post_content[0])
if not self.extract_links_only and self.use_post_subfolders:
cleaned_post_title_for_sub = clean_folder_name(post_title)
determined_post_save_path_for_history = os.path.join(determined_post_save_path_for_history, cleaned_post_title_for_sub)
if not self .extract_links_only and self .use_subfolders and self .skip_words_list :
if self ._check_pause (f"Folder keyword skip check for post {post_id }"):return 0 ,num_potential_files_in_post ,[]
if self ._check_pause (f"Folder keyword skip check for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
for folder_name_to_check in base_folder_names_for_post_content :
if not folder_name_to_check :continue
if any (skip_word .lower ()in folder_name_to_check .lower ()for skip_word in self .skip_words_list ):
matched_skip =next ((sw for sw in self .skip_words_list if sw .lower ()in folder_name_to_check .lower ()),"unknown_skip_word")
self .logger (f" -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check }' contains '{matched_skip }'.")
return 0 ,num_potential_files_in_post ,[],[],[]
if (self .show_external_links or self .extract_links_only )and post_content_html :
if self ._check_pause (f"External link extraction for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[]
matched_skip =next ((sw for sw in self .skip_words_list if sw .lower ()in folder_name_to_check .lower ()),"unknown_skip_word") # type: ignore
self .logger (f" -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check }' contains '{matched_skip }'.") # type: ignore
return 0 ,num_potential_files_in_post ,[],[],[], None
if (self .show_external_links or self .extract_links_only )and post_content_html : # type: ignore
if self ._check_pause (f"External link extraction for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[], None
try :
mega_key_pattern =re .compile (r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')
unique_links_data ={}
for match in link_pattern .finditer (post_content_html ):
link_url =match .group (1 ).strip ()
link_url =html .unescape (link_url )
link_url =html .unescape (link_url ) # type: ignore
link_inner_text =match .group (2 )
if not any (ext in link_url .lower ()for ext in ['.css','.js','.ico','.xml','.svg'])and not link_url .startswith ('javascript:')and link_url not in unique_links_data :
clean_link_text =re .sub (r'<.*?>','',link_inner_text )
@@ -1637,12 +1737,12 @@ class PostProcessorWorker :
decryption_key_found =key_match_in_content .group (1 )
if platform not in scraped_platforms :
self ._emit_signal ('external_link',post_title ,link_text ,link_url ,platform ,decryption_key_found or "")
links_emitted_count +=1
links_emitted_count +=1 # type: ignore
if links_emitted_count >0 :self .logger (f" 🔗 Found {links_emitted_count } potential external link(s) in post content.")
except Exception as e :self .logger (f"⚠️ Error parsing post content for links: {e }\n{traceback .format_exc (limit =2 )}")
if self .extract_links_only :
self .logger (f" Extract Links Only mode: Finished processing post {post_id } for links.")
return 0 ,0 ,[],[],[]
return 0 ,0 ,[],[],[], None
all_files_from_post_api =[]
api_file_domain =urlparse (self .api_url_input ).netloc
if not api_file_domain or not any (d in api_file_domain .lower ()for d in ['kemono.su','kemono.party','coomer.su','coomer.party']):
@@ -1729,22 +1829,22 @@ class PostProcessorWorker :
all_files_from_post_api =[finfo for finfo in all_files_from_post_api if finfo .get ('_from_content_scan')]
if not all_files_from_post_api :
self .logger (f" -> No images found via content scan for post {post_id } in this combined mode.")
return 0 ,0 ,[],[],[]
return 0 ,0 ,[],[],[], None
else :
self .logger (f" Mode: 'Download Thumbnails Only' active. Filtering for API thumbnails for post {post_id }.")
all_files_from_post_api =[finfo for finfo in all_files_from_post_api if finfo .get ('_is_thumbnail')]
if not all_files_from_post_api :
self .logger (f" -> No API image thumbnails found for post {post_id } in thumbnail-only mode.")
return 0 ,0 ,[],[],[]
return 0 ,0 ,[],[],[], None
if self .manga_mode_active and self .manga_filename_style ==STYLE_DATE_BASED :
def natural_sort_key_for_files(file_api_info):
    """Build a natural-order sort key from a file's original API name.

    Splits the lowercased name on digit runs so numeric chunks compare as
    integers and the rest compares lexically (e.g. 'page2' < 'page10').
    """
    lowered_name = file_api_info.get('_original_name_for_log', '').lower()
    key_parts = []
    for chunk in re.split('([0-9]+)', lowered_name):
        # Digit runs become ints so '10' sorts after '2', not before.
        key_parts.append(int(chunk) if chunk.isdigit() else chunk)
    return key_parts
all_files_from_post_api .sort (key =natural_sort_key_for_files )
self .logger (f" Manga Date Mode: Sorted {len (all_files_from_post_api )} files within post {post_id } by original name for sequential numbering.")
self .logger (f" Manga Date Mode: Sorted {len (all_files_from_post_api )} files within post {post_id } by original name for sequential numbering.") # type: ignore
if not all_files_from_post_api :
self .logger (f" No files found to download for post {post_id }.")
return 0 ,0 ,[],[],[]
self .logger (f" No files found to download for post {post_id }.") # type: ignore
return 0 ,0 ,[],[],[], None
files_to_download_info_list =[]
processed_original_filenames_in_this_post =set ()
for file_info in all_files_from_post_api :
@@ -1758,7 +1858,7 @@ class PostProcessorWorker :
processed_original_filenames_in_this_post .add (current_api_original_filename )
if not files_to_download_info_list :
self .logger (f" All files for post {post_id } were duplicate original names or skipped earlier.")
return 0 ,total_skipped_this_post ,[],[],[]
return 0 ,total_skipped_this_post ,[],[],[], None
self .logger (f" Identified {len (files_to_download_info_list )} unique original file(s) for potential download from post {post_id }.")
with ThreadPoolExecutor (max_workers =self .num_file_threads ,thread_name_prefix =f'P{post_id }File_')as file_pool :
@@ -1773,10 +1873,10 @@ class PostProcessorWorker :
file_is_candidate_by_char_filter_scope =True
else :
if self .char_filter_scope ==CHAR_SCOPE_FILES :
for filter_item_obj in current_character_filters :
terms_to_check_for_file =list (filter_item_obj ["aliases"])
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check_for_file :
terms_to_check_for_file .append (filter_item_obj ["name"])
for filter_item_obj in current_character_filters : # type: ignore
terms_to_check_for_file =list (filter_item_obj ["aliases"]) # type: ignore
if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check_for_file : # type: ignore
terms_to_check_for_file .append (filter_item_obj ["name"]) # type: ignore
unique_terms_for_file_check =list (set (terms_to_check_for_file ))
for term_to_match in unique_terms_for_file_check :
if is_filename_match_for_character (current_api_original_filename ,term_to_match ):
@@ -1796,10 +1896,10 @@ class PostProcessorWorker :
char_filter_info_that_matched_file =char_filter_that_matched_title
self .logger (f" File '{current_api_original_filename }' is candidate because post title matched. Scope: Both (Title part).")
else :
for filter_item_obj_both_file in current_character_filters :
terms_to_check_for_file_both =list (filter_item_obj_both_file ["aliases"])
if filter_item_obj_both_file ["is_group"]and filter_item_obj_both_file ["name"]not in terms_to_check_for_file_both :
terms_to_check_for_file_both .append (filter_item_obj_both_file ["name"])
for filter_item_obj_both_file in current_character_filters : # type: ignore
terms_to_check_for_file_both =list (filter_item_obj_both_file ["aliases"]) # type: ignore
if filter_item_obj_both_file ["is_group"]and filter_item_obj_both_file ["name"]not in terms_to_check_for_file_both : # type: ignore
terms_to_check_for_file_both .append (filter_item_obj_both_file ["name"]) # type: ignore
unique_terms_for_file_both_check =list (set (terms_to_check_for_file_both ))
for term_to_match in unique_terms_for_file_both_check :
if is_filename_match_for_character (current_api_original_filename ,term_to_match ):
@@ -1829,13 +1929,13 @@ class PostProcessorWorker :
char_title_subfolder_name =None
if self .target_post_id_from_initial_url and self .custom_folder_name :
char_title_subfolder_name =self .custom_folder_name
elif char_filter_info_that_matched_file :
char_title_subfolder_name =clean_folder_name (char_filter_info_that_matched_file ["name"])
elif char_filter_that_matched_title :
char_title_subfolder_name =clean_folder_name (char_filter_that_matched_title ["name"])
elif char_filter_that_matched_comment :
char_title_subfolder_name =clean_folder_name (char_filter_that_matched_comment ["name"])
if char_title_subfolder_name :
elif char_filter_info_that_matched_file : # type: ignore
char_title_subfolder_name =clean_folder_name (char_filter_info_that_matched_file ["name"]) # type: ignore
elif char_filter_that_matched_title : # type: ignore
char_title_subfolder_name =clean_folder_name (char_filter_that_matched_title ["name"]) # type: ignore
elif char_filter_that_matched_comment : # type: ignore
char_title_subfolder_name =clean_folder_name (char_filter_that_matched_comment ["name"]) # type: ignore
if char_title_subfolder_name : # type: ignore
target_base_folders_for_this_file_iteration .append (char_title_subfolder_name )
else :
self .logger (f"⚠️ File '{current_api_original_filename }' candidate by char filter, but no folder name derived. Using post title.")
@@ -1860,13 +1960,16 @@ class PostProcessorWorker :
manga_date_counter_to_pass =self .manga_date_file_counter_ref if self .manga_mode_active and self .manga_filename_style ==STYLE_DATE_BASED else None
manga_global_counter_to_pass =self .manga_global_file_counter_ref if self .manga_mode_active and self .manga_filename_style ==STYLE_POST_TITLE_GLOBAL_NUMBERING else None
# Pass the determined folder name for history context
folder_context_for_file = target_base_folder_name_for_instance if self.use_subfolders and target_base_folder_name_for_instance else clean_folder_name(post_title)
futures_list .append (file_pool .submit (
self ._download_single_file ,
file_info =file_info_to_dl ,
target_folder_path =current_path_for_file_instance ,
headers =headers ,original_post_id_for_log =post_id ,skip_event =self .skip_current_file_flag ,
post_title =post_title ,manga_date_file_counter_ref =manga_date_counter_to_pass ,
manga_global_file_counter_ref =manga_global_counter_to_pass ,
manga_global_file_counter_ref =manga_global_counter_to_pass, folder_context_name_for_history=folder_context_for_file,
file_index_in_post =file_idx ,num_files_in_this_post =len (files_to_download_info_list )
))
@@ -1893,18 +1996,46 @@ class PostProcessorWorker :
self .logger (f"❌ File download task for post {post_id } resulted in error: {exc_f }")
total_skipped_this_post +=1
self ._emit_signal ('file_progress',"",None )
# --- History Data Collection ---
# This part is added to collect data for the history feature.
# It's placed after the file processing loop for the post.
if not self.extract_links_only and (total_downloaded_this_post > 0 or not ( # Condition: if not extract_links_only AND (files were downloaded OR post wasn't skipped at very start by title/char filter)
(current_character_filters and (
(self.char_filter_scope == CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match) or
(self.char_filter_scope == CHAR_SCOPE_COMMENTS and not post_is_candidate_by_file_char_match_in_comment_scope and not post_is_candidate_by_comment_char_match)
)) or
(self.skip_words_list and (self.skip_words_scope == SKIP_SCOPE_POSTS or self.skip_words_scope == SKIP_SCOPE_BOTH) and any(sw.lower() in post_title.lower() for sw in self.skip_words_list))
)):
top_file_name_for_history = "N/A"
if post_main_file_info and post_main_file_info.get('name'):
top_file_name_for_history = post_main_file_info['name']
elif post_attachments and post_attachments[0].get('name'):
top_file_name_for_history = post_attachments[0]['name']
history_data_for_this_post = {
'post_title': post_title, 'post_id': post_id,
'top_file_name': top_file_name_for_history,
'num_files': num_potential_files_in_post, # Already calculated
'upload_date_str': post_data.get('published') or post_data.get('added') or "Unknown",
'download_location': determined_post_save_path_for_history, # Calculated earlier
'service': self.service, 'user_id': self.user_id,
}
if self .check_cancel ():self .logger (f" Post {post_id } processing interrupted/cancelled.");
else :self .logger (f" Post {post_id } Summary: Downloaded={total_downloaded_this_post }, Skipped Files={total_skipped_this_post }")
return total_downloaded_this_post ,total_skipped_this_post ,kept_original_filenames_for_log ,retryable_failures_this_post ,permanent_failures_this_post
return total_downloaded_this_post ,total_skipped_this_post ,kept_original_filenames_for_log ,retryable_failures_this_post ,permanent_failures_this_post, history_data_for_this_post
class DownloadThread (QThread ):
progress_signal =pyqtSignal (str )
add_character_prompt_signal =pyqtSignal (str )
file_download_status_signal =pyqtSignal (bool )
finished_signal =pyqtSignal (int ,int ,bool ,list )
external_link_signal =pyqtSignal (str ,str ,str ,str ,str )
file_successfully_downloaded_signal = pyqtSignal(dict) # Relay from worker
file_progress_signal =pyqtSignal (str ,object )
retryable_file_failed_signal =pyqtSignal (list )
missed_character_post_signal =pyqtSignal (str ,str )
post_processed_for_history_signal = pyqtSignal(dict) # New signal for history data
final_history_entries_signal = pyqtSignal(list) # New signal for the final 3 history entries
permanent_file_failed_signal =pyqtSignal (list )
def __init__ (self ,api_url_input ,output_dir ,known_names_copy ,
cancellation_event ,
@@ -1987,6 +2118,7 @@ class DownloadThread (QThread ):
self .scan_content_for_images =scan_content_for_images
self .creator_download_folder_ignore_words =creator_download_folder_ignore_words
self .manga_global_file_counter_ref =manga_global_file_counter_ref
self.history_candidates_buffer = deque(maxlen=8) # Buffer for the first 8 posts
if self .compress_images and Image is None :
self .logger ("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
self .compress_images =False
@@ -2052,9 +2184,10 @@ class DownloadThread (QThread ):
worker_signals_obj .file_progress_signal .connect (self .file_progress_signal )
worker_signals_obj .external_link_signal .connect (self .external_link_signal )
worker_signals_obj .missed_character_post_signal .connect (self .missed_character_post_signal )
worker_signals_obj.file_successfully_downloaded_signal.connect(self.file_successfully_downloaded_signal) # Connect new signal
self .logger (" Starting post fetch (single-threaded download process)...")
post_generator =download_from_api (
self .api_url_input ,
self .api_url_input , # type: ignore
logger =self .logger ,
start_page =self .start_page ,
end_page =self .end_page ,
@@ -2064,7 +2197,8 @@ class DownloadThread (QThread ):
use_cookie =self .use_cookie ,
cookie_text =self .cookie_text ,
selected_cookie_file =self .selected_cookie_file ,
app_base_dir =self .app_base_dir
app_base_dir =self .app_base_dir ,
manga_filename_style_for_sort_check =self .manga_filename_style if self .manga_mode_active else None
)
for posts_batch_data in post_generator :
if self ._check_pause_self ("Post batch processing"):was_process_cancelled =True ;break
@@ -2116,13 +2250,16 @@ class DownloadThread (QThread ):
creator_download_folder_ignore_words =self .creator_download_folder_ignore_words ,
)
try :
dl_count ,skip_count ,kept_originals_this_post ,retryable_failures ,permanent_failures =post_processing_worker .process ()
dl_count ,skip_count ,kept_originals_this_post ,retryable_failures ,permanent_failures, history_data =post_processing_worker .process ()
grand_total_downloaded_files +=dl_count
grand_total_skipped_files +=skip_count
if kept_originals_this_post :
grand_list_of_kept_original_filenames .extend (kept_originals_this_post )
if retryable_failures :
self .retryable_file_failed_signal .emit (retryable_failures )
if history_data: # New: Handle history data from worker
if len(self.history_candidates_buffer) < 8:
self.post_processed_for_history_signal.emit(history_data) # Emit for App to handle
if permanent_failures :
self .permanent_file_failed_signal .emit (permanent_failures )
except Exception as proc_err :
@@ -2138,6 +2275,10 @@ class DownloadThread (QThread ):
if was_process_cancelled :break
if not was_process_cancelled and not self .isInterruptionRequested ():
self .logger ("✅ All posts processed or end of content reached by DownloadThread.")
# Process history candidates at the end of the thread's run
# This part is now handled by DownloaderApp for both single and multi-thread
except Exception as main_thread_err :
self .logger (f"\n❌ Critical error within DownloadThread run loop: {main_thread_err }")
traceback .print_exc ()
@@ -2150,6 +2291,7 @@ class DownloadThread (QThread ):
worker_signals_obj .external_link_signal .disconnect (self .external_link_signal )
worker_signals_obj .file_progress_signal .disconnect (self .file_progress_signal )
worker_signals_obj .missed_character_post_signal .disconnect (self .missed_character_post_signal )
worker_signals_obj.file_successfully_downloaded_signal.disconnect(self.file_successfully_downloaded_signal) # Disconnect new signal
except (TypeError ,RuntimeError )as e :
self .logger (f" Note during DownloadThread signal disconnection: {e }")