From 539e76aa9efa3928fbf863165630f372c5653512 Mon Sep 17 00:00:00 2001
From: Yuvi9587 <114073886+Yuvi9587@users.noreply.github.com>
Date: Tue, 15 Jul 2025 21:09:16 -0700
Subject: [PATCH] Delete workers.py
---
workers.py | 2063 ----------------------------------------------------
1 file changed, 2063 deletions(-)
delete mode 100644 workers.py
diff --git a/workers.py b/workers.py
deleted file mode 100644
index d541cb3..0000000
--- a/workers.py
+++ /dev/null
@@ -1,2063 +0,0 @@
-# --- Standard Library Imports ---
-import os
-import queue
-import re
-import threading
-import time
-import traceback
-import uuid
-import http
-import html
-import json
-from collections import deque
-import hashlib
-from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError, Future
-from io import BytesIO
-from urllib .parse import urlparse
-import requests
-# --- Third-Party Library Imports ---
-try:
- from PIL import Image
-except ImportError:
- Image = None
-#
-try:
- from fpdf import FPDF
- # Add a simple class to handle the header/footer for stories
- class PDF(FPDF):
- def header(self):
- pass # No header
- def footer(self):
- self.set_y(-15)
- self.set_font('Arial', 'I', 8)
- self.cell(0, 10, 'Page %s' % self.page_no(), 0, 0, 'C')
-
-except ImportError:
- FPDF = None
-
-try:
- from docx import Document
-except ImportError:
- Document = None
-
-# --- PyQt5 Imports ---
-from PyQt5 .QtCore import Qt ,QThread ,pyqtSignal ,QMutex ,QMutexLocker ,QObject ,QTimer ,QSettings ,QStandardPaths ,QCoreApplication ,QUrl ,QSize ,QProcess
-# --- Local Application Imports ---
-from .api_client import download_from_api, fetch_post_comments
-from ..services.multipart_downloader import download_file_in_parts, MULTIPART_DOWNLOADER_AVAILABLE
-from ..services.drive_downloader import (
- download_mega_file, download_gdrive_file, download_dropbox_file
-)
-# Corrected Imports:
-from ..utils.file_utils import (
- is_image, is_video, is_zip, is_rar, is_archive, is_audio, KNOWN_NAMES,
- clean_filename, clean_folder_name
-)
-from ..utils.network_utils import prepare_cookies_for_request, get_link_platform
-from ..utils.text_utils import (
- is_title_match_for_character, is_filename_match_for_character, strip_html_tags,
- extract_folder_name_from_title, # This was the function causing the error
- match_folders_from_title, match_folders_from_filename_enhanced
-)
-from ..config.constants import *
-
-class PostProcessorSignals (QObject ):
- progress_signal =pyqtSignal (str )
- file_download_status_signal =pyqtSignal (bool )
- external_link_signal =pyqtSignal (str ,str ,str ,str ,str )
- file_progress_signal =pyqtSignal (str ,object )
- file_successfully_downloaded_signal =pyqtSignal (dict )
- missed_character_post_signal =pyqtSignal (str ,str )
- worker_finished_signal = pyqtSignal(tuple)
-
-class PostProcessorWorker:
- def __init__ (self ,post_data ,download_root ,known_names ,
- filter_character_list ,emitter ,
- unwanted_keywords ,filter_mode ,skip_zip ,skip_rar ,
- use_subfolders ,use_post_subfolders ,target_post_id_from_initial_url ,custom_folder_name ,
- compress_images ,download_thumbnails ,service ,user_id ,pause_event ,
- api_url_input ,cancellation_event ,
- downloaded_files ,downloaded_file_hashes ,downloaded_files_lock ,downloaded_file_hashes_lock ,
- dynamic_character_filter_holder =None ,skip_words_list =None ,
- skip_words_scope =SKIP_SCOPE_FILES ,
- show_external_links =False ,
- extract_links_only =False ,
- num_file_threads =4 ,skip_current_file_flag =None ,
- manga_mode_active =False ,
- manga_filename_style =STYLE_POST_TITLE ,
- char_filter_scope =CHAR_SCOPE_FILES ,
- remove_from_filename_words_list =None ,
- allow_multipart_download =True ,
- cookie_text ="",
- use_cookie =False ,
- override_output_dir =None ,
- selected_cookie_file =None ,
- app_base_dir =None ,
- manga_date_prefix =MANGA_DATE_PREFIX_DEFAULT ,
- manga_date_file_counter_ref =None ,
- scan_content_for_images =False ,
- creator_download_folder_ignore_words =None ,
- manga_global_file_counter_ref =None ,
- use_date_prefix_for_subfolder=False,
- keep_in_post_duplicates=False,
- session_file_path=None,
- session_lock=None,
- text_only_scope=None,
- text_export_format='txt',
- single_pdf_mode=False,
- project_root_dir=None,
- ):
- self .post =post_data
- self .download_root =download_root
- self .known_names =known_names
- self .filter_character_list_objects_initial =filter_character_list if filter_character_list else []
- self .dynamic_filter_holder =dynamic_character_filter_holder
- self .unwanted_keywords =unwanted_keywords if unwanted_keywords is not None else set ()
- self .filter_mode =filter_mode
- self .skip_zip =skip_zip
- self .skip_rar =skip_rar
- self .use_subfolders =use_subfolders
- self .use_post_subfolders =use_post_subfolders
- self .target_post_id_from_initial_url =target_post_id_from_initial_url
- self .custom_folder_name =custom_folder_name
- self .compress_images =compress_images
- self .download_thumbnails =download_thumbnails
- self .service =service
- self .user_id =user_id
- self .api_url_input =api_url_input
- self .cancellation_event =cancellation_event
- self .pause_event =pause_event
- self .emitter =emitter
- if not self .emitter :
- raise ValueError ("PostProcessorWorker requires an emitter (signals object or queue).")
- self .skip_current_file_flag =skip_current_file_flag
- self .downloaded_files =downloaded_files if downloaded_files is not None else set ()
- self .downloaded_file_hashes =downloaded_file_hashes if downloaded_file_hashes is not None else set ()
- self .downloaded_files_lock =downloaded_files_lock if downloaded_files_lock is not None else threading .Lock ()
- self .downloaded_file_hashes_lock =downloaded_file_hashes_lock if downloaded_file_hashes_lock is not None else threading .Lock ()
- self .skip_words_list =skip_words_list if skip_words_list is not None else []
- self .skip_words_scope =skip_words_scope
- self .show_external_links =show_external_links
- self .extract_links_only =extract_links_only
- self .num_file_threads =num_file_threads
- self .manga_mode_active =manga_mode_active
- self .manga_filename_style =manga_filename_style
- self .char_filter_scope =char_filter_scope
- self .remove_from_filename_words_list =remove_from_filename_words_list if remove_from_filename_words_list is not None else []
- self .allow_multipart_download =allow_multipart_download
- self .manga_date_file_counter_ref =manga_date_file_counter_ref
- self .selected_cookie_file =selected_cookie_file
- self .app_base_dir =app_base_dir
- self .cookie_text =cookie_text
- self .manga_date_prefix =manga_date_prefix
- self .manga_global_file_counter_ref =manga_global_file_counter_ref
- self .use_cookie =use_cookie
- self .override_output_dir =override_output_dir
- self .scan_content_for_images =scan_content_for_images
- self .creator_download_folder_ignore_words =creator_download_folder_ignore_words
- self.use_date_prefix_for_subfolder = use_date_prefix_for_subfolder
- self.keep_in_post_duplicates = keep_in_post_duplicates
- self.session_file_path = session_file_path
- self.session_lock = session_lock
- self.text_only_scope = text_only_scope
- self.text_export_format = text_export_format
- self.single_pdf_mode = single_pdf_mode # <-- ADD THIS LINE
- self.project_root_dir = project_root_dir
- if self .compress_images and Image is None :
-
- self .logger ("⚠️ Image compression disabled: Pillow library not found.")
- self .compress_images =False
- def _emit_signal (self ,signal_type_str ,*payload_args ):
- """Helper to emit signal either directly or via queue."""
- if isinstance (self .emitter ,queue .Queue ):
- self .emitter .put ({'type':signal_type_str ,'payload':payload_args })
- elif self .emitter and hasattr (self .emitter ,f"{signal_type_str }_signal"):
- signal_attr =getattr (self .emitter ,f"{signal_type_str }_signal")
- signal_attr .emit (*payload_args )
- else :
- print (f"(Worker Log - Unrecognized Emitter for {signal_type_str }): {payload_args [0 ]if payload_args else ''}")
- def logger (self ,message ):
- self ._emit_signal ('progress',message )
- def check_cancel (self ):
- return self .cancellation_event .is_set ()
- def _check_pause (self ,context_message ="Operation"):
- if self .pause_event and self .pause_event .is_set ():
- self .logger (f" {context_message } paused...")
- while self .pause_event .is_set ():
- if self .check_cancel ():
- self .logger (f" {context_message } cancelled while paused.")
- return True
- time .sleep (0.5 )
- if not self .check_cancel ():self .logger (f" {context_message } resumed.")
- return False
- def _download_single_file (self ,file_info ,target_folder_path ,headers ,original_post_id_for_log ,skip_event ,
- post_title ="",file_index_in_post =0 ,num_files_in_this_post =1 ,
- manga_date_file_counter_ref =None ):
- was_original_name_kept_flag =False
-
- final_filename_saved_for_return =""
- def _get_current_character_filters (self ):
- if self .dynamic_filter_holder :
- return self .dynamic_filter_holder .get_filters ()
- return self .filter_character_list_objects_initial
-
- def _download_single_file (self ,file_info ,target_folder_path ,headers ,original_post_id_for_log ,skip_event ,
- post_title ="",file_index_in_post =0 ,num_files_in_this_post =1 ,
- manga_date_file_counter_ref =None ,
- forced_filename_override =None ,
- manga_global_file_counter_ref =None ,folder_context_name_for_history =None ):
- was_original_name_kept_flag =False
- final_filename_saved_for_return =""
- retry_later_details =None
-
-
-
- if self ._check_pause (f"File download prep for '{file_info .get ('name','unknown file')}'"):return 0 ,1 ,"",False
- if self .check_cancel ()or (skip_event and skip_event .is_set ()):return 0 ,1 ,"",False
-
-
-
- file_url =file_info .get ('url')
- cookies_to_use_for_file =None
- if self .use_cookie :
-
- cookies_to_use_for_file =prepare_cookies_for_request (self .use_cookie ,self .cookie_text ,self .selected_cookie_file ,self .app_base_dir ,self .logger )
-
-
- api_original_filename =file_info .get ('_original_name_for_log',file_info .get ('name'))
-
-
- filename_to_save_in_main_path =""
- if forced_filename_override :
- filename_to_save_in_main_path =forced_filename_override
- self .logger (f" Retrying with forced filename: '{filename_to_save_in_main_path }'")
- else :
-
- if self .skip_words_list and (self .skip_words_scope ==SKIP_SCOPE_FILES or self .skip_words_scope ==SKIP_SCOPE_BOTH ):
- filename_to_check_for_skip_words =api_original_filename .lower ()
- for skip_word in self .skip_words_list :
- if skip_word .lower ()in filename_to_check_for_skip_words :
- self .logger (f" -> Skip File (Keyword in Original Name '{skip_word }'): '{api_original_filename }'. Scope: {self .skip_words_scope }")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
-
- cleaned_original_api_filename =clean_filename (api_original_filename )
-
- original_filename_cleaned_base ,original_ext =os .path .splitext (cleaned_original_api_filename )
-
- if not original_ext .startswith ('.'):original_ext ='.'+original_ext if original_ext else ''
- if self .manga_mode_active :
-
- if self .manga_filename_style ==STYLE_ORIGINAL_NAME :
- filename_to_save_in_main_path =cleaned_original_api_filename
- if self .manga_date_prefix and self .manga_date_prefix .strip ():
- cleaned_prefix =clean_filename (self .manga_date_prefix .strip ())
- if cleaned_prefix :
- filename_to_save_in_main_path =f"{cleaned_prefix } {filename_to_save_in_main_path }"
- else :
- self .logger (f"⚠️ Manga Original Name Mode: Provided prefix '{self .manga_date_prefix }' was empty after cleaning. Using original name only.")
- was_original_name_kept_flag =True
- elif self .manga_filename_style ==STYLE_POST_TITLE :
- if post_title and post_title .strip ():
- cleaned_post_title_base =clean_filename (post_title .strip ())
- if num_files_in_this_post >1 :
- if file_index_in_post ==0 :
- filename_to_save_in_main_path =f"{cleaned_post_title_base }{original_ext }"
- else :
- filename_to_save_in_main_path =f"{cleaned_post_title_base }_{file_index_in_post }{original_ext }"
- was_original_name_kept_flag =False
- else :
- filename_to_save_in_main_path =f"{cleaned_post_title_base }{original_ext }"
- else :
- filename_to_save_in_main_path =cleaned_original_api_filename
- self .logger (f"⚠️ Manga mode (Post Title Style): Post title missing for post {original_post_id_for_log }. Using cleaned original filename '{filename_to_save_in_main_path }'.")
- elif self .manga_filename_style ==STYLE_DATE_BASED :
- current_thread_name =threading .current_thread ().name
- if manga_date_file_counter_ref is not None and len (manga_date_file_counter_ref )==2 :
- counter_val_for_filename =-1
- counter_lock =manga_date_file_counter_ref [1 ]
-
- with counter_lock :
- counter_val_for_filename =manga_date_file_counter_ref [0 ]
- manga_date_file_counter_ref [0 ]+=1
-
- base_numbered_name =f"{counter_val_for_filename :03d}"
- if self .manga_date_prefix and self .manga_date_prefix .strip ():
- cleaned_prefix =clean_filename (self .manga_date_prefix .strip ())
- if cleaned_prefix :
- filename_to_save_in_main_path =f"{cleaned_prefix } {base_numbered_name }{original_ext }"
- else :
- filename_to_save_in_main_path =f"{base_numbered_name }{original_ext }";self .logger (f"⚠️ Manga Date Mode: Provided prefix '{self .manga_date_prefix }' was empty after cleaning. Using number only.")
- else :
- filename_to_save_in_main_path =f"{base_numbered_name }{original_ext }"
- else :
- self .logger (f"⚠️ Manga Date Mode: Counter ref not provided or malformed for '{api_original_filename }'. Using original. Ref: {manga_date_file_counter_ref }")
- filename_to_save_in_main_path =cleaned_original_api_filename
- elif self .manga_filename_style ==STYLE_POST_TITLE_GLOBAL_NUMBERING :
- if manga_global_file_counter_ref is not None and len (manga_global_file_counter_ref )==2 :
- counter_val_for_filename =-1
- counter_lock =manga_global_file_counter_ref [1 ]
-
- with counter_lock :
- counter_val_for_filename =manga_global_file_counter_ref [0 ]
- manga_global_file_counter_ref [0 ]+=1
-
- cleaned_post_title_base_for_global =clean_filename (post_title .strip ()if post_title and post_title .strip ()else "post")
- filename_to_save_in_main_path =f"{cleaned_post_title_base_for_global }_{counter_val_for_filename :03d}{original_ext }"
- else :
- self .logger (f"⚠️ Manga Title+GlobalNum Mode: Counter ref not provided or malformed for '{api_original_filename }'. Using original. Ref: {manga_global_file_counter_ref }")
- filename_to_save_in_main_path =cleaned_original_api_filename
- self .logger (f"⚠️ Manga mode (Title+GlobalNum Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path }' for post {original_post_id_for_log }.")
- elif self.manga_filename_style == STYLE_POST_ID:
- if original_post_id_for_log and original_post_id_for_log != 'unknown_id':
- base_name = str(original_post_id_for_log)
- # Always append the file index for consistency (e.g., xxxxxx_0, xxxxxx_1)
- filename_to_save_in_main_path = f"{base_name}_{file_index_in_post}{original_ext}"
- else:
- # Fallback if post_id is somehow not available
- self.logger(f"⚠️ Manga mode (Post ID Style): Post ID missing. Using cleaned original filename '{cleaned_original_api_filename}'.")
- filename_to_save_in_main_path = cleaned_original_api_filename
- elif self .manga_filename_style ==STYLE_DATE_POST_TITLE :
- published_date_str =self .post .get ('published')
- added_date_str =self .post .get ('added')
- formatted_date_str ="nodate"
-
- if published_date_str :
- try :
- formatted_date_str =published_date_str .split ('T')[0 ]
- except Exception :
- self .logger (f" ⚠️ Could not parse 'published' date '{published_date_str }' for STYLE_DATE_POST_TITLE. Using 'nodate'.")
- elif added_date_str :
- try :
- formatted_date_str =added_date_str .split ('T')[0 ]
- self .logger (f" ⚠️ Post ID {original_post_id_for_log } missing 'published' date, using 'added' date '{added_date_str }' for STYLE_DATE_POST_TITLE naming.")
- except Exception :
- self .logger (f" ⚠️ Could not parse 'added' date '{added_date_str }' for STYLE_DATE_POST_TITLE. Using 'nodate'.")
- else :
- self .logger (f" ⚠️ Post ID {original_post_id_for_log } missing both 'published' and 'added' dates for STYLE_DATE_POST_TITLE. Using 'nodate'.")
-
- if post_title and post_title .strip ():
- temp_cleaned_title =clean_filename (post_title .strip ())
- if not temp_cleaned_title or temp_cleaned_title .startswith ("untitled_file"):
- self .logger (f"⚠️ Manga mode (Date+PostTitle Style): Post title for post {original_post_id_for_log } ('{post_title }') was empty or generic after cleaning. Using 'post' as title part.")
- cleaned_post_title_for_filename ="post"
- else :
- cleaned_post_title_for_filename =temp_cleaned_title
-
- base_name_for_style =f"{formatted_date_str }_{cleaned_post_title_for_filename }"
-
- if num_files_in_this_post >1 :
- filename_to_save_in_main_path =f"{base_name_for_style }_{file_index_in_post }{original_ext }"if file_index_in_post >0 else f"{base_name_for_style }{original_ext }"
- else :
- filename_to_save_in_main_path =f"{base_name_for_style }{original_ext }"
- else :
- self .logger (f"⚠️ Manga mode (Date+PostTitle Style): Post title missing for post {original_post_id_for_log }. Using 'post' as title part with date prefix.")
- cleaned_post_title_for_filename ="post"
- base_name_for_style =f"{formatted_date_str }_{cleaned_post_title_for_filename }"
- if num_files_in_this_post >1 :
- filename_to_save_in_main_path =f"{base_name_for_style }_{file_index_in_post }{original_ext }"if file_index_in_post >0 else f"{base_name_for_style }{original_ext }"
- else :
- filename_to_save_in_main_path =f"{base_name_for_style }{original_ext }"
- self .logger (f"⚠️ Manga mode (Title+GlobalNum Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path }' for post {original_post_id_for_log }.")
- else :
- self .logger (f"⚠️ Manga mode: Unknown filename style '{self .manga_filename_style }'. Defaulting to original filename for '{api_original_filename }'.")
- filename_to_save_in_main_path =cleaned_original_api_filename
- if not filename_to_save_in_main_path :
- filename_to_save_in_main_path =f"manga_file_{original_post_id_for_log }_{file_index_in_post +1 }{original_ext }"
- self .logger (f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path }'.")
- was_original_name_kept_flag =False
- else :
-
- filename_to_save_in_main_path =cleaned_original_api_filename
- was_original_name_kept_flag =False
-
-
-
- if self .remove_from_filename_words_list and filename_to_save_in_main_path :
-
- base_name_for_removal ,ext_for_removal =os .path .splitext (filename_to_save_in_main_path )
- modified_base_name =base_name_for_removal
- for word_to_remove in self .remove_from_filename_words_list :
- if not word_to_remove :continue
- pattern =re .compile (re .escape (word_to_remove ),re .IGNORECASE )
- modified_base_name =pattern .sub ("",modified_base_name )
- modified_base_name =re .sub (r'[_.\s-]+',' ',modified_base_name )
- modified_base_name =re .sub (r'\s+',' ',modified_base_name )
- modified_base_name =modified_base_name .strip ()
- if modified_base_name and modified_base_name !=ext_for_removal .lstrip ('.'):
- filename_to_save_in_main_path =modified_base_name +ext_for_removal
- else :
- filename_to_save_in_main_path =base_name_for_removal +ext_for_removal
-
-
-
- if not self .download_thumbnails :
-
- is_img_type =is_image (api_original_filename )
- is_vid_type =is_video (api_original_filename )
- is_archive_type =is_archive (api_original_filename )
- is_audio_type =is_audio (api_original_filename )
- if self .filter_mode =='archive':
- if not is_archive_type :
- self .logger (f" -> Filter Skip (Archive Mode): '{api_original_filename }' (Not an Archive).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
- elif self .filter_mode =='image':
- if not is_img_type :
- self .logger (f" -> Filter Skip: '{api_original_filename }' (Not Image).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
- elif self .filter_mode =='video':
- if not is_vid_type :
- self .logger (f" -> Filter Skip: '{api_original_filename }' (Not Video).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
- elif self .filter_mode =='audio':
- if not is_audio_type :
- self .logger (f" -> Filter Skip: '{api_original_filename }' (Not Audio).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
- if self .skip_zip and is_zip (api_original_filename ):
- self .logger (f" -> Pref Skip: '{api_original_filename }' (ZIP).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
- if self .skip_rar and is_rar (api_original_filename ):
- self .logger (f" -> Pref Skip: '{api_original_filename }' (RAR).")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
-
-
-
- try :
- os .makedirs (target_folder_path ,exist_ok =True )
-
- except OSError as e :
- self .logger (f" ❌ Critical error creating directory '{target_folder_path }': {e }. Skipping file '{api_original_filename }'.")
- return 0 ,1 ,api_original_filename ,False ,FILE_DOWNLOAD_STATUS_SKIPPED ,None
-
-
-
-
-
- temp_file_base_for_unique_part ,temp_file_ext_for_unique_part =os .path .splitext (filename_to_save_in_main_path if filename_to_save_in_main_path else api_original_filename )
- unique_id_for_part_file =uuid .uuid4 ().hex [:8 ]
- unique_part_file_stem_on_disk =f"{temp_file_base_for_unique_part }_{unique_id_for_part_file }"
- max_retries =3
- retry_delay =5
- downloaded_size_bytes =0
- calculated_file_hash =None
- downloaded_part_file_path =None
- was_multipart_download =False
- total_size_bytes =0
- download_successful_flag =False
- last_exception_for_retry_later =None
-
- response_for_this_attempt =None
- for attempt_num_single_stream in range (max_retries +1 ):
- response_for_this_attempt =None
- if self ._check_pause (f"File download attempt for '{api_original_filename }'"):break
- if self .check_cancel ()or (skip_event and skip_event .is_set ()):break
- try :
- if attempt_num_single_stream >0 :
- self .logger (f" Retrying download for '{api_original_filename }' (Overall Attempt {attempt_num_single_stream +1 }/{max_retries +1 })...")
- time .sleep (retry_delay *(2 **(attempt_num_single_stream -1 )))
- self ._emit_signal ('file_download_status',True )
- response =requests .get (file_url ,headers =headers ,timeout =(15 ,300 ),stream =True ,cookies =cookies_to_use_for_file )
- response .raise_for_status ()
- total_size_bytes =int (response .headers .get ('Content-Length',0 ))
- num_parts_for_file =min (self .num_file_threads ,MAX_PARTS_FOR_MULTIPART_DOWNLOAD )
- attempt_multipart =(self .allow_multipart_download and MULTIPART_DOWNLOADER_AVAILABLE and
- num_parts_for_file >1 and total_size_bytes >MIN_SIZE_FOR_MULTIPART_DOWNLOAD and
- 'bytes'in response .headers .get ('Accept-Ranges','').lower ())
- if self ._check_pause (f"Multipart decision for '{api_original_filename }'"):break
-
- if attempt_multipart :
- if response_for_this_attempt :
- response_for_this_attempt .close ()
- response_for_this_attempt =None
-
-
-
-
-
- mp_save_path_for_unique_part_stem_arg =os .path .join (target_folder_path ,f"{unique_part_file_stem_on_disk }{temp_file_ext_for_unique_part }")
- mp_success ,mp_bytes ,mp_hash ,mp_file_handle =download_file_in_parts (
- file_url ,mp_save_path_for_unique_part_stem_arg ,total_size_bytes ,num_parts_for_file ,headers ,api_original_filename ,
- emitter_for_multipart =self .emitter ,cookies_for_chunk_session =cookies_to_use_for_file ,
- cancellation_event =self .cancellation_event ,skip_event =skip_event ,logger_func =self .logger ,
- pause_event =self .pause_event
- )
- if mp_success :
- download_successful_flag =True
- downloaded_size_bytes =mp_bytes
- calculated_file_hash =mp_hash
-
-
-
- downloaded_part_file_path =mp_save_path_for_unique_part_stem_arg +".part"
- was_multipart_download =True
- if mp_file_handle :mp_file_handle .close ()
- break
- else :
- if attempt_num_single_stream
tags found. Falling back to basic HTML cleaning for the whole block.")
- text_with_br = re.sub(r'
', '\n', raw_text_content, flags=re.IGNORECASE)
- cleaned_text = re.sub(r'<.*?>', '', text_with_br)
- else:
- cleaned_paragraphs_list = []
- for p_content in html_paragraphs:
- p_with_br = re.sub(r'
', '\n', p_content, flags=re.IGNORECASE)
- p_cleaned = re.sub(r'<.*?>', '', p_with_br)
- p_final = html.unescape(p_cleaned).strip()
- if p_final:
- cleaned_paragraphs_list.append(p_final)
- cleaned_text = '\n\n'.join(cleaned_paragraphs_list)
- cleaned_text = cleaned_text.replace('…', '...')
-
- # --- Logic for Single PDF Mode (File-based) ---
- if self.single_pdf_mode:
- if not cleaned_text:
- return 0, 0, [], [], [], None, None
-
- content_data = {
- 'title': post_title,
- 'content': cleaned_text,
- 'published': self.post.get('published') or self.post.get('added')
- }
- temp_dir = os.path.join(self.app_base_dir, "appdata")
- os.makedirs(temp_dir, exist_ok=True)
- temp_filename = f"tmp_{post_id}_{uuid.uuid4().hex[:8]}.json"
- temp_filepath = os.path.join(temp_dir, temp_filename)
-
- try:
- with open(temp_filepath, 'w', encoding='utf-8') as f:
- json.dump(content_data, f, indent=2)
- self.logger(f" Saved temporary text for '{post_title}' for single PDF compilation.")
- return 0, 0, [], [], [], None, temp_filepath
- except Exception as e:
- self.logger(f" ❌ Failed to write temporary file for single PDF: {e}")
- return 0, 0, [], [], [], None, None
-
- # --- Logic for Individual File Saving ---
- else:
- file_extension = self.text_export_format
- txt_filename = clean_filename(post_title) + f".{file_extension}"
- final_save_path = os.path.join(determined_post_save_path_for_history, txt_filename)
-
- try:
- os.makedirs(determined_post_save_path_for_history, exist_ok=True)
- base, ext = os.path.splitext(final_save_path)
- counter = 1
- while os.path.exists(final_save_path):
- final_save_path = f"{base}_{counter}{ext}"
- counter += 1
-
- if file_extension == 'pdf':
- if FPDF:
- self.logger(f" Converting to PDF...")
- pdf = PDF()
- font_path = ""
- if self.project_root_dir:
- font_path = os.path.join(self.project_root_dir, 'data', 'dejavu-sans', 'DejaVuSans.ttf')
- try:
- if not os.path.exists(font_path): raise RuntimeError(f"Font file not found: {font_path}")
- pdf.add_font('DejaVu', '', font_path, uni=True)
- pdf.set_font('DejaVu', '', 12)
- except Exception as font_error:
- self.logger(f" ⚠️ Could not load DejaVu font: {font_error}. Falling back to Arial.")
- pdf.set_font('Arial', '', 12)
- pdf.add_page()
- pdf.multi_cell(0, 5, cleaned_text)
- pdf.output(final_save_path)
- else:
- self.logger(f" ⚠️ Cannot create PDF: 'fpdf2' library not installed. Saving as .txt.")
- final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
- with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)
-
- elif file_extension == 'docx':
- if Document:
- self.logger(f" Converting to DOCX...")
- document = Document()
- document.add_paragraph(cleaned_text)
- document.save(final_save_path)
- else:
- self.logger(f" ⚠️ Cannot create DOCX: 'python-docx' library not installed. Saving as .txt.")
- final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
- with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)
-
- else: # Default to TXT
- with open(final_save_path, 'w', encoding='utf-8') as f:
- f.write(cleaned_text)
-
- self.logger(f"✅ Saved Text: '{os.path.basename(final_save_path)}' in '{os.path.basename(determined_post_save_path_for_history)}'")
- return 1, num_potential_files_in_post, [], [], [], history_data_for_this_post, None
- except Exception as e:
- self.logger(f" ❌ Critical error saving text file '{txt_filename}': {e}")
- return 0, num_potential_files_in_post, [], [], [], None, None
-
- if not self .extract_links_only and self .use_subfolders and self .skip_words_list :
- if self ._check_pause (f"Folder keyword skip check for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[],None
- for folder_name_to_check in base_folder_names_for_post_content :
- if not folder_name_to_check :continue
- if any (skip_word .lower ()in folder_name_to_check .lower ()for skip_word in self .skip_words_list ):
- matched_skip =next ((sw for sw in self .skip_words_list if sw .lower ()in folder_name_to_check .lower ()),"unknown_skip_word")
- self .logger (f" -> Skip Post (Folder Keyword): Potential folder '{folder_name_to_check }' contains '{matched_skip }'.")
- return 0 ,num_potential_files_in_post ,[],[],[],None, None
- if (self .show_external_links or self .extract_links_only )and post_content_html :
- if self ._check_pause (f"External link extraction for post {post_id }"):return 0 ,num_potential_files_in_post ,[],[],[],None
- try :
- mega_key_pattern =re .compile (r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')
- unique_links_data ={}
- for match in link_pattern .finditer (post_content_html ):
- link_url =match .group (1 ).strip ()
- link_url =html .unescape (link_url )
- link_inner_text =match .group (2 )
- if not any (ext in link_url .lower ()for ext in ['.css','.js','.ico','.xml','.svg'])and not link_url .startswith ('javascript:')and link_url not in unique_links_data :
- clean_link_text =re .sub (r'<.*?>','',link_inner_text )
- clean_link_text =html .unescape (clean_link_text ).strip ()
- display_text =clean_link_text if clean_link_text else "[Link]"
- unique_links_data [link_url ]=display_text
- links_emitted_count =0
- scraped_platforms ={'kemono','coomer','patreon'}
- for link_url ,link_text in unique_links_data .items ():
- platform =get_link_platform (link_url )
- decryption_key_found =""
- if platform =='mega':
- parsed_mega_url =urlparse (link_url )
- if parsed_mega_url .fragment :
- potential_key_from_fragment =parsed_mega_url .fragment .split ('!')[-1 ]
- if mega_key_pattern .fullmatch (potential_key_from_fragment ):
- decryption_key_found =potential_key_from_fragment
-
- if not decryption_key_found and link_text :
- key_match_in_text =mega_key_pattern .search (link_text )
- if key_match_in_text :
- decryption_key_found =key_match_in_text .group (1 )
- if not decryption_key_found and self .extract_links_only and post_content_html :
- key_match_in_content =mega_key_pattern .search (strip_html_tags (post_content_html ))
- if key_match_in_content :
- decryption_key_found =key_match_in_content .group (1 )
- if platform not in scraped_platforms :
- self ._emit_signal ('external_link',post_title ,link_text ,link_url ,platform ,decryption_key_found or "")
- links_emitted_count +=1
- if links_emitted_count >0 :self .logger (f" 🔗 Found {links_emitted_count } potential external link(s) in post content.")
- except Exception as e :self .logger (f"⚠️ Error parsing post content for links: {e }\n{traceback .format_exc (limit =2 )}")
- if self .extract_links_only :
- self .logger (f" Extract Links Only mode: Finished processing post {post_id } for links.")
- return 0 ,0 ,[],[],[],None
- all_files_from_post_api =[]
- api_file_domain =urlparse (self .api_url_input ).netloc
- if not api_file_domain or not any (d in api_file_domain .lower ()for d in ['kemono.su','kemono.party','coomer.su','coomer.party']):
- api_file_domain ="kemono.su"if "kemono"in self .service .lower ()else "coomer.party"
- if post_main_file_info and isinstance (post_main_file_info ,dict )and post_main_file_info .get ('path'):
- file_path =post_main_file_info ['path'].lstrip ('/')
- original_api_name =post_main_file_info .get ('name')or os .path .basename (file_path )
- if original_api_name :
- all_files_from_post_api .append ({
- 'url':f"https://{api_file_domain }{file_path }"if file_path .startswith ('/')else f"https://{api_file_domain }/data/{file_path }",
- 'name':original_api_name ,
- '_original_name_for_log':original_api_name ,
- '_is_thumbnail':is_image (original_api_name )
- })
- else :self .logger (f" ⚠️ Skipping main file for post {post_id }: Missing name (Path: {file_path })")
- for idx ,att_info in enumerate (post_attachments ):
- if isinstance (att_info ,dict )and att_info .get ('path'):
- att_path =att_info ['path'].lstrip ('/')
- original_api_att_name =att_info .get ('name')or os .path .basename (att_path )
- if original_api_att_name :
- all_files_from_post_api .append ({
- 'url':f"https://{api_file_domain }{att_path }"if att_path .startswith ('/')else f"https://{api_file_domain }/data/{att_path }",
- 'name':original_api_att_name ,
- '_original_name_for_log':original_api_att_name ,
- '_is_thumbnail':is_image (original_api_att_name )
- })
- else :self .logger (f" ⚠️ Skipping attachment {idx +1 } for post {post_id }: Missing name (Path: {att_path })")
- else :self .logger (f" ⚠️ Skipping invalid attachment {idx +1 } for post {post_id }: {str (att_info )[:100 ]}")
- if self .scan_content_for_images and post_content_html and not self .extract_links_only :
- self .logger (f" Scanning post content for additional image URLs (Post ID: {post_id })...")
- parsed_input_url =urlparse (self .api_url_input )
- base_url_for_relative_paths =f"{parsed_input_url .scheme }://{parsed_input_url .netloc }"
- img_ext_pattern ="|".join (ext .lstrip ('.')for ext in IMAGE_EXTENSIONS )
- direct_url_pattern_str =r"""(?i)\b(https?://[^\s"'<>\[\]\{\}\|\^\\^~\[\]`]+\.(?:"""+img_ext_pattern +r"""))\b"""
- img_tag_src_pattern_str =r"""]*?src\s*=\s*["']([^"']+)["']"""
- found_image_sources =set ()
- for direct_url_match in re .finditer (direct_url_pattern_str ,post_content_html ):
- found_image_sources .add (direct_url_match .group (1 ))
- for img_tag_match in re .finditer (img_tag_src_pattern_str ,post_content_html ,re .IGNORECASE ):
- src_attr =img_tag_match .group (1 ).strip ()
- src_attr =html .unescape (src_attr )
- if not src_attr :continue
- resolved_src_url =""
- if src_attr .startswith (('http://','https://')):
- resolved_src_url =src_attr
- elif src_attr .startswith ('//'):
- resolved_src_url =f"{parsed_input_url .scheme }:{src_attr }"
- elif src_attr .startswith ('/'):
- resolved_src_url =f"{base_url_for_relative_paths }{src_attr }"
- if resolved_src_url :
- parsed_resolved_url =urlparse (resolved_src_url )
- if any (parsed_resolved_url .path .lower ().endswith (ext )for ext in IMAGE_EXTENSIONS ):
- found_image_sources .add (resolved_src_url )
- if found_image_sources :
- self .logger (f" Found {len (found_image_sources )} potential image URLs/sources in content.")
- existing_urls_in_api_list ={f_info ['url']for f_info in all_files_from_post_api }
- for found_url in found_image_sources :
- if self .check_cancel ():break
- if found_url in existing_urls_in_api_list :
- self .logger (f" Skipping URL from content (already in API list or previously added from content): {found_url [:70 ]}...")
- continue
- try :
- parsed_found_url =urlparse (found_url )
- url_filename =os .path .basename (parsed_found_url .path )
- if not url_filename or not is_image (url_filename ):
- self .logger (f" Skipping URL from content (no filename part or not an image extension): {found_url [:70 ]}...")
- continue
- self .logger (f" Adding image from content: {url_filename } (URL: {found_url [:70 ]}...)")
- all_files_from_post_api .append ({
- 'url':found_url ,
- 'name':url_filename ,
- '_original_name_for_log':url_filename ,
- '_is_thumbnail':False ,
- '_from_content_scan':True
- })
- existing_urls_in_api_list .add (found_url )
- except Exception as e_url_parse :
- self .logger (f" Error processing URL from content '{found_url [:70 ]}...': {e_url_parse }")
- else :
- self .logger (f" No additional image URLs found in post content scan for post {post_id }.")
- if self .download_thumbnails :
- if self .scan_content_for_images :
- self .logger (f" Mode: 'Download Thumbnails Only' + 'Scan Content for Images' active. Prioritizing images from content scan for post {post_id }.")
- all_files_from_post_api =[finfo for finfo in all_files_from_post_api if finfo .get ('_from_content_scan')]
- if not all_files_from_post_api :
- self .logger (f" -> No images found via content scan for post {post_id } in this combined mode.")
- return 0 ,0 ,[],[],[],None
- else :
- self .logger (f" Mode: 'Download Thumbnails Only' active. Filtering for API thumbnails for post {post_id }.")
- all_files_from_post_api =[finfo for finfo in all_files_from_post_api if finfo .get ('_is_thumbnail')]
- if not all_files_from_post_api :
- self .logger (f" -> No API image thumbnails found for post {post_id } in thumbnail-only mode.")
- return 0 ,0 ,[],[],[],None
- if self .manga_mode_active and self .manga_filename_style ==STYLE_DATE_BASED :
- def natural_sort_key_for_files (file_api_info ):
- name =file_api_info .get ('_original_name_for_log','').lower ()
- return [int (text )if text .isdigit ()else text for text in re .split ('([0-9]+)',name )]
- all_files_from_post_api .sort (key =natural_sort_key_for_files )
- self .logger (f" Manga Date Mode: Sorted {len (all_files_from_post_api )} files within post {post_id } by original name for sequential numbering.")
- if not all_files_from_post_api :
- self .logger (f" No files found to download for post {post_id }.")
- return 0 ,0 ,[],[],[],None
- files_to_download_info_list =[]
- processed_original_filenames_in_this_post =set ()
-
- if self.keep_in_post_duplicates:
- # If we keep duplicates, just add every file to the list to be processed.
- # The downstream hash check and rename-on-collision logic will handle them.
- files_to_download_info_list.extend(all_files_from_post_api)
- self.logger(f" ℹ️ 'Keep Duplicates' is on. All {len(all_files_from_post_api)} files from post will be processed.")
- else:
- # This is the original logic that skips duplicates by name within a post.
- for file_info in all_files_from_post_api:
- current_api_original_filename = file_info.get('_original_name_for_log')
- if current_api_original_filename in processed_original_filenames_in_this_post:
- self.logger(f" -> Skip Duplicate Original Name (within post {post_id}): '{current_api_original_filename}' already processed/listed for this post.")
- total_skipped_this_post += 1
- else:
- files_to_download_info_list.append(file_info)
- if current_api_original_filename:
- processed_original_filenames_in_this_post.add(current_api_original_filename)
-
- if not files_to_download_info_list:
-
- self .logger (f" All files for post {post_id } were duplicate original names or skipped earlier.")
- return 0 ,total_skipped_this_post ,[],[],[],None
-
- self .logger (f" Identified {len (files_to_download_info_list )} unique original file(s) for potential download from post {post_id }.")
- with ThreadPoolExecutor (max_workers =self .num_file_threads ,thread_name_prefix =f'P{post_id }File_')as file_pool :
- futures_list =[]
- for file_idx ,file_info_to_dl in enumerate (files_to_download_info_list ):
- if self ._check_pause (f"File processing loop for post {post_id }, file {file_idx }"):break
- if self .check_cancel ():break
- current_api_original_filename =file_info_to_dl .get ('_original_name_for_log')
- file_is_candidate_by_char_filter_scope =False
- char_filter_info_that_matched_file =None
- if not current_character_filters :
- file_is_candidate_by_char_filter_scope =True
- else :
- if self .char_filter_scope ==CHAR_SCOPE_FILES :
- for filter_item_obj in current_character_filters :
- terms_to_check_for_file =list (filter_item_obj ["aliases"])
- if filter_item_obj ["is_group"]and filter_item_obj ["name"]not in terms_to_check_for_file :
- terms_to_check_for_file .append (filter_item_obj ["name"])
- unique_terms_for_file_check =list (set (terms_to_check_for_file ))
- for term_to_match in unique_terms_for_file_check :
- if is_filename_match_for_character (current_api_original_filename ,term_to_match ):
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =filter_item_obj
- self .logger (f" File '{current_api_original_filename }' matches char filter term '{term_to_match }' (from '{filter_item_obj ['name']}'). Scope: Files.")
- break
- if file_is_candidate_by_char_filter_scope :break
- elif self .char_filter_scope ==CHAR_SCOPE_TITLE :
- if post_is_candidate_by_title_char_match :
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =char_filter_that_matched_title
- self .logger (f" File '{current_api_original_filename }' is candidate because post title matched. Scope: Title.")
- elif self .char_filter_scope ==CHAR_SCOPE_BOTH :
- if post_is_candidate_by_title_char_match :
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =char_filter_that_matched_title
- self .logger (f" File '{current_api_original_filename }' is candidate because post title matched. Scope: Both (Title part).")
- else :
- for filter_item_obj_both_file in current_character_filters :
- terms_to_check_for_file_both =list (filter_item_obj_both_file ["aliases"])
- if filter_item_obj_both_file ["is_group"]and filter_item_obj_both_file ["name"]not in terms_to_check_for_file_both :
- terms_to_check_for_file_both .append (filter_item_obj_both_file ["name"])
- unique_terms_for_file_both_check =list (set (terms_to_check_for_file_both ))
- for term_to_match in unique_terms_for_file_both_check :
- if is_filename_match_for_character (current_api_original_filename ,term_to_match ):
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =filter_item_obj_both_file
- self .logger (f" File '{current_api_original_filename }' matches char filter term '{term_to_match }' (from '{filter_item_obj ['name']}'). Scope: Both (File part).")
- break
- if file_is_candidate_by_char_filter_scope :break
- elif self .char_filter_scope ==CHAR_SCOPE_COMMENTS :
- if post_is_candidate_by_file_char_match_in_comment_scope :
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =char_filter_that_matched_file_in_comment_scope
- self .logger (f" File '{current_api_original_filename }' is candidate because a file in this post matched char filter (Overall Scope: Comments).")
- elif post_is_candidate_by_comment_char_match :
- file_is_candidate_by_char_filter_scope =True
- char_filter_info_that_matched_file =char_filter_that_matched_comment
- self .logger (f" File '{current_api_original_filename }' is candidate because post comments matched char filter (Overall Scope: Comments).")
- if not file_is_candidate_by_char_filter_scope :
- self .logger (f" -> Skip File (Char Filter Scope '{self .char_filter_scope }'): '{current_api_original_filename }' no match.")
- total_skipped_this_post +=1
- continue
-
-
- target_base_folders_for_this_file_iteration =[]
-
- if current_character_filters :
- char_title_subfolder_name =None
- if self .target_post_id_from_initial_url and self .custom_folder_name :
- char_title_subfolder_name =self .custom_folder_name
- elif char_filter_info_that_matched_file :
- char_title_subfolder_name =clean_folder_name (char_filter_info_that_matched_file ["name"])
- elif char_filter_that_matched_title :
- char_title_subfolder_name =clean_folder_name (char_filter_that_matched_title ["name"])
- elif char_filter_that_matched_comment :
- char_title_subfolder_name =clean_folder_name (char_filter_that_matched_comment ["name"])
- if char_title_subfolder_name :
- target_base_folders_for_this_file_iteration .append (char_title_subfolder_name )
- else :
- self .logger (f"⚠️ File '{current_api_original_filename }' candidate by char filter, but no folder name derived. Using post title.")
- target_base_folders_for_this_file_iteration .append (clean_folder_name (post_title ))
- else :
- if base_folder_names_for_post_content :
- target_base_folders_for_this_file_iteration .extend (base_folder_names_for_post_content )
- else :
- target_base_folders_for_this_file_iteration .append (clean_folder_name (post_title ))
-
- if not target_base_folders_for_this_file_iteration :
- target_base_folders_for_this_file_iteration .append (clean_folder_name (post_title if post_title else "Uncategorized_Post_Content"))
-
- for target_base_folder_name_for_instance in target_base_folders_for_this_file_iteration :
- current_path_for_file_instance =self .override_output_dir if self .override_output_dir else self .download_root
- if self .use_subfolders and target_base_folder_name_for_instance :
- current_path_for_file_instance =os .path .join (current_path_for_file_instance ,target_base_folder_name_for_instance )
- if self .use_post_subfolders :
-
- current_path_for_file_instance =os .path .join (current_path_for_file_instance ,final_post_subfolder_name )
-
- manga_date_counter_to_pass =self .manga_date_file_counter_ref if self .manga_mode_active and self .manga_filename_style ==STYLE_DATE_BASED else None
- manga_global_counter_to_pass =self .manga_global_file_counter_ref if self .manga_mode_active and self .manga_filename_style ==STYLE_POST_TITLE_GLOBAL_NUMBERING else None
-
-
- folder_context_for_file =target_base_folder_name_for_instance if self .use_subfolders and target_base_folder_name_for_instance else clean_folder_name (post_title )
-
- futures_list .append (file_pool .submit (
- self ._download_single_file ,
- file_info =file_info_to_dl ,
- target_folder_path =current_path_for_file_instance ,
- headers =headers ,original_post_id_for_log =post_id ,skip_event =self .skip_current_file_flag ,
- post_title =post_title ,manga_date_file_counter_ref =manga_date_counter_to_pass ,
- manga_global_file_counter_ref =manga_global_counter_to_pass ,folder_context_name_for_history =folder_context_for_file ,
- file_index_in_post =file_idx ,num_files_in_this_post =len (files_to_download_info_list )
- ))
-
- for future in as_completed (futures_list ):
- if self .check_cancel ():
- for f_to_cancel in futures_list :
- if not f_to_cancel .done ():
- f_to_cancel .cancel ()
- break
- try :
- dl_count ,skip_count ,actual_filename_saved ,original_kept_flag ,status ,details_for_dialog_or_retry =future .result ()
- total_downloaded_this_post +=dl_count
- total_skipped_this_post +=skip_count
- if original_kept_flag and dl_count >0 and actual_filename_saved :
- kept_original_filenames_for_log .append (actual_filename_saved )
- if status ==FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER and details_for_dialog_or_retry :
- retryable_failures_this_post .append (details_for_dialog_or_retry )
- elif status ==FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION and details_for_dialog_or_retry :
- permanent_failures_this_post .append (details_for_dialog_or_retry )
- except CancelledError :
- self .logger (f" File download task for post {post_id } was cancelled.")
- total_skipped_this_post +=1
- except Exception as exc_f :
- self .logger (f"❌ File download task for post {post_id } resulted in error: {exc_f }")
- total_skipped_this_post +=1
- self ._emit_signal ('file_progress',"",None )
-
- # After a post's files are all processed, update the session file to mark this post as done.
- if self.session_file_path and self.session_lock:
- try:
- with self.session_lock:
- if os.path.exists(self.session_file_path): # Only update if the session file exists
- # Read current state
- with open(self.session_file_path, 'r', encoding='utf-8') as f:
- session_data = json.load(f)
-
- if 'download_state' not in session_data:
- session_data['download_state'] = {}
-
- # Add processed ID
- if not isinstance(session_data['download_state'].get('processed_post_ids'), list):
- session_data['download_state']['processed_post_ids'] = []
- session_data['download_state']['processed_post_ids'].append(self.post.get('id'))
-
- # Add any permanent failures from this worker to the session file
- if permanent_failures_this_post:
- if not isinstance(session_data['download_state'].get('permanently_failed_files'), list):
- session_data['download_state']['permanently_failed_files'] = []
- # To avoid duplicates if the same post is somehow re-processed
- existing_failed_urls = {f.get('file_info', {}).get('url') for f in session_data['download_state']['permanently_failed_files']}
- for failure in permanent_failures_this_post:
- if failure.get('file_info', {}).get('url') not in existing_failed_urls:
- session_data['download_state']['permanently_failed_files'].append(failure)
-
- # Write to temp file and then atomically replace
- temp_file_path = self.session_file_path + ".tmp"
- with open(temp_file_path, 'w', encoding='utf-8') as f_tmp:
- json.dump(session_data, f_tmp, indent=2)
- os.replace(temp_file_path, self.session_file_path)
- except Exception as e:
- self.logger(f"⚠️ Could not update session file for post {post_id}: {e}")
-
- if not self .extract_links_only and (total_downloaded_this_post >0 or not (
- (current_character_filters and (
- (self .char_filter_scope ==CHAR_SCOPE_TITLE and not post_is_candidate_by_title_char_match )or
- (self .char_filter_scope ==CHAR_SCOPE_COMMENTS and not post_is_candidate_by_file_char_match_in_comment_scope and not post_is_candidate_by_comment_char_match )
- ))or
- (self .skip_words_list and (self .skip_words_scope ==SKIP_SCOPE_POSTS or self .skip_words_scope ==SKIP_SCOPE_BOTH )and any (sw .lower ()in post_title .lower ()for sw in self .skip_words_list ))
- )):
- top_file_name_for_history ="N/A"
- if post_main_file_info and post_main_file_info .get ('name'):
- top_file_name_for_history =post_main_file_info ['name']
- elif post_attachments and post_attachments [0 ].get ('name'):
- top_file_name_for_history =post_attachments [0 ]['name']
-
- history_data_for_this_post ={
- 'post_title':post_title ,'post_id':post_id ,
- 'top_file_name':top_file_name_for_history ,
- 'num_files':num_potential_files_in_post ,
- 'upload_date_str':post_data .get ('published')or post_data .get ('added')or "Unknown",
- 'download_location':determined_post_save_path_for_history ,
- 'service':self .service ,'user_id':self .user_id ,
- }
- if not self.check_cancel():
- self.logger(f" Post {post_id} Summary: Downloaded={total_downloaded_this_post}, Skipped Files={total_skipped_this_post}")
-
- if not self .extract_links_only and self .use_post_subfolders and total_downloaded_this_post ==0 :
-
- path_to_check_for_emptiness =determined_post_save_path_for_history
- try :
- if os .path .isdir (path_to_check_for_emptiness )and not os .listdir (path_to_check_for_emptiness ):
- self .logger (f" 🗑️ Removing empty post-specific subfolder: '{path_to_check_for_emptiness }'")
- os .rmdir (path_to_check_for_emptiness )
- except OSError as e_rmdir :
- self .logger (f" ⚠️ Could not remove empty post-specific subfolder '{path_to_check_for_emptiness }': {e_rmdir }")
-
- result_tuple = (total_downloaded_this_post, total_skipped_this_post,
- kept_original_filenames_for_log, retryable_failures_this_post,
- permanent_failures_this_post, history_data_for_this_post,
- None) # The 7th item is None because we already saved the temp file
-
- # In Single PDF mode, the 7th item is the temp file path we created.
- if self.single_pdf_mode and os.path.exists(temp_filepath):
- result_tuple = (0, 0, [], [], [], None, temp_filepath)
-
- self._emit_signal('worker_finished', result_tuple)
- return # The method now returns nothing.
-
-class DownloadThread (QThread ):
- progress_signal =pyqtSignal (str )
- add_character_prompt_signal =pyqtSignal (str )
- file_download_status_signal =pyqtSignal (bool )
- finished_signal =pyqtSignal (int ,int ,bool ,list )
- external_link_signal =pyqtSignal (str ,str ,str ,str ,str )
- file_successfully_downloaded_signal =pyqtSignal (dict )
- file_progress_signal =pyqtSignal (str ,object )
- retryable_file_failed_signal =pyqtSignal (list )
- missed_character_post_signal =pyqtSignal (str ,str )
- post_processed_for_history_signal =pyqtSignal (dict )
- final_history_entries_signal =pyqtSignal (list )
- permanent_file_failed_signal =pyqtSignal (list )
- def __init__ (self ,api_url_input ,output_dir ,known_names_copy ,
- cancellation_event ,
- pause_event ,filter_character_list =None ,dynamic_character_filter_holder =None ,
- filter_mode ='all',skip_zip =True ,skip_rar =True ,
- use_subfolders =True ,use_post_subfolders =False ,custom_folder_name =None ,compress_images =False ,
- download_thumbnails =False ,service =None ,user_id =None ,
- downloaded_files =None ,downloaded_file_hashes =None ,downloaded_files_lock =None ,downloaded_file_hashes_lock =None ,
- skip_words_list =None ,
- skip_words_scope =SKIP_SCOPE_FILES ,
- show_external_links =False ,
- extract_links_only =False ,
- num_file_threads_for_worker =1 ,
- skip_current_file_flag =None ,
- start_page =None ,end_page =None ,
- target_post_id_from_initial_url =None ,
- manga_mode_active =False ,
- unwanted_keywords =None ,
- manga_filename_style =STYLE_POST_TITLE ,
- char_filter_scope =CHAR_SCOPE_FILES ,
- remove_from_filename_words_list =None ,
- manga_date_prefix =MANGA_DATE_PREFIX_DEFAULT ,
- allow_multipart_download =True ,
- selected_cookie_file =None ,
- override_output_dir =None ,
- app_base_dir =None ,
- manga_date_file_counter_ref =None ,
- manga_global_file_counter_ref =None ,
- use_cookie =False ,
- scan_content_for_images =False ,
- creator_download_folder_ignore_words =None ,
- use_date_prefix_for_subfolder=False,
- keep_in_post_duplicates=False,
- cookie_text ="",
- session_file_path=None,
- session_lock=None,
- text_only_scope=None,
- text_export_format='txt',
- single_pdf_mode=False,
- project_root_dir=None,
- ):
- super ().__init__ ()
- self .api_url_input =api_url_input
- self .output_dir =output_dir
- self .known_names =list (known_names_copy )
- self .cancellation_event =cancellation_event
- self .pause_event =pause_event
- self .skip_current_file_flag =skip_current_file_flag
- self .initial_target_post_id =target_post_id_from_initial_url
- self .filter_character_list_objects_initial =filter_character_list if filter_character_list else []
- self .dynamic_filter_holder =dynamic_character_filter_holder
- self .filter_mode =filter_mode
- self .skip_zip =skip_zip
- self .skip_rar =skip_rar
- self .use_subfolders =use_subfolders
- self .use_post_subfolders =use_post_subfolders
- self .custom_folder_name =custom_folder_name
- self .compress_images =compress_images
- self .download_thumbnails =download_thumbnails
- self .service =service
- self .user_id =user_id
- self .skip_words_list =skip_words_list if skip_words_list is not None else []
- self .skip_words_scope =skip_words_scope
- self .downloaded_files =downloaded_files
- self .downloaded_files_lock =downloaded_files_lock
- self .downloaded_file_hashes =downloaded_file_hashes
- self .downloaded_file_hashes_lock =downloaded_file_hashes_lock
- self ._add_character_response =None
- self .prompt_mutex =QMutex ()
- self .show_external_links =show_external_links
- self .extract_links_only =extract_links_only
- self .num_file_threads_for_worker =num_file_threads_for_worker
- self .start_page =start_page
- self .end_page =end_page
- self .manga_mode_active =manga_mode_active
- self .unwanted_keywords =unwanted_keywords if unwanted_keywords is not None else {'spicy','hd','nsfw','4k','preview','teaser','clip'}
- self .manga_filename_style =manga_filename_style
- self .char_filter_scope =char_filter_scope
- self .remove_from_filename_words_list =remove_from_filename_words_list
- self .manga_date_prefix =manga_date_prefix
- self .allow_multipart_download =allow_multipart_download
- self .selected_cookie_file =selected_cookie_file
- self .app_base_dir =app_base_dir
- self .cookie_text =cookie_text
- self .use_cookie =use_cookie
- self .override_output_dir =override_output_dir
- self .manga_date_file_counter_ref =manga_date_file_counter_ref
- self .scan_content_for_images =scan_content_for_images
- self .creator_download_folder_ignore_words =creator_download_folder_ignore_words
- self.use_date_prefix_for_subfolder = use_date_prefix_for_subfolder
- self.keep_in_post_duplicates = keep_in_post_duplicates
- self .manga_global_file_counter_ref =manga_global_file_counter_ref
- self.session_file_path = session_file_path
- self.session_lock = session_lock
- self.history_candidates_buffer =deque (maxlen =8 )
- self.text_only_scope = text_only_scope
- self.text_export_format = text_export_format
- self.single_pdf_mode = single_pdf_mode # <-- ADD THIS LINE
- self.project_root_dir = project_root_dir # Add this assignment
-
- if self .compress_images and Image is None :
- self .logger ("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
- self .compress_images =False
- def logger (self ,message ):
- self .progress_signal .emit (str (message ))
- def isInterruptionRequested (self ):
- return self .cancellation_event .is_set ()or super ().isInterruptionRequested ()
- def _check_pause_self (self ,context_message ="DownloadThread operation"):
- if self .pause_event and self .pause_event .is_set ():
- self .logger (f" {context_message } paused...")
- while self .pause_event .is_set ():
- if self .isInterruptionRequested ():
- self .logger (f" {context_message } cancelled while paused.")
- return True
- time .sleep (0.5 )
- if not self .isInterruptionRequested ():self .logger (f" {context_message } resumed.")
- return False
- def skip_file (self ):
- if self .isRunning ()and self .skip_current_file_flag :
- self .logger ("⏭️ Skip requested for current file (single-thread mode).")
- self .skip_current_file_flag .set ()
- else :self .logger ("ℹ️ Skip file: No download active or skip flag not available for current context.")
-
- def run (self ):
- """
- The main execution method for the single-threaded download process.
- This version is corrected to handle 7 return values from the worker and
- to pass the 'single_pdf_mode' setting correctly.
- """
- grand_total_downloaded_files =0
- grand_total_skipped_files =0
- grand_list_of_kept_original_filenames =[]
- was_process_cancelled =False
-
- # This block for initializing manga mode counters remains unchanged
- if self .manga_mode_active and self .manga_filename_style ==STYLE_DATE_BASED and not self .extract_links_only and self .manga_date_file_counter_ref is None :
- # ... (existing manga counter initialization logic) ...
- pass
- if self .manga_mode_active and self .manga_filename_style ==STYLE_POST_TITLE_GLOBAL_NUMBERING and not self .extract_links_only and self .manga_global_file_counter_ref is None :
- # ... (existing manga counter initialization logic) ...
- pass
-
- worker_signals_obj = PostProcessorSignals()
- try :
- # Connect signals
- worker_signals_obj.progress_signal.connect(self.progress_signal)
- worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal)
- worker_signals_obj.file_progress_signal.connect(self.file_progress_signal)
- worker_signals_obj.external_link_signal.connect(self.external_link_signal)
- worker_signals_obj.missed_character_post_signal.connect(self.missed_character_post_signal)
- worker_signals_obj.file_successfully_downloaded_signal.connect(self.file_successfully_downloaded_signal)
- worker_signals_obj.worker_finished_signal.connect(lambda result: None) # Connect to dummy lambda to avoid errors
-
- self.logger(" Starting post fetch (single-threaded download process)...")
- post_generator = download_from_api(
- self.api_url_input,
- logger=self.logger,
- start_page=self.start_page,
- end_page=self.end_page,
- manga_mode=self.manga_mode_active,
- cancellation_event=self.cancellation_event,
- pause_event=self.pause_event,
- use_cookie=self.use_cookie,
- cookie_text=self.cookie_text,
- selected_cookie_file=self.selected_cookie_file,
- app_base_dir=self.app_base_dir,
- manga_filename_style_for_sort_check=self.manga_filename_style if self.manga_mode_active else None
- )
-
- for posts_batch_data in post_generator:
- if self.isInterruptionRequested():
- was_process_cancelled = True
- break
- for individual_post_data in posts_batch_data:
- if self.isInterruptionRequested():
- was_process_cancelled = True
- break
-
- # Create the worker, now correctly passing single_pdf_mode
- post_processing_worker = PostProcessorWorker(
- post_data=individual_post_data,
- download_root=self.output_dir,
- known_names=self.known_names,
- filter_character_list=self.filter_character_list_objects_initial,
- dynamic_character_filter_holder=self.dynamic_filter_holder,
- unwanted_keywords=self.unwanted_keywords,
- filter_mode=self.filter_mode,
- skip_zip=self.skip_zip, skip_rar=self.skip_rar,
- use_subfolders=self.use_subfolders, use_post_subfolders=self.use_post_subfolders,
- target_post_id_from_initial_url=self.initial_target_post_id,
- custom_folder_name=self.custom_folder_name,
- compress_images=self.compress_images, download_thumbnails=self.download_thumbnails,
- service=self.service, user_id=self.user_id,
- api_url_input=self.api_url_input,
- pause_event=self.pause_event,
- cancellation_event=self.cancellation_event,
- emitter=worker_signals_obj,
- downloaded_files=self.downloaded_files,
- downloaded_file_hashes=self.downloaded_file_hashes,
- downloaded_files_lock=self.downloaded_files_lock,
- downloaded_file_hashes_lock=self.downloaded_file_hashes_lock,
- skip_words_list=self.skip_words_list,
- skip_words_scope=self.skip_words_scope,
- show_external_links=self.show_external_links,
- extract_links_only=self.extract_links_only,
- num_file_threads=self.num_file_threads_for_worker,
- skip_current_file_flag=self.skip_current_file_flag,
- manga_mode_active=self.manga_mode_active,
- manga_filename_style=self.manga_filename_style,
- manga_date_prefix=self.manga_date_prefix,
- char_filter_scope=self.char_filter_scope,
- remove_from_filename_words_list=self.remove_from_filename_words_list,
- allow_multipart_download=self.allow_multipart_download,
- selected_cookie_file=self.selected_cookie_file,
- app_base_dir=self.app_base_dir,
- cookie_text=self.cookie_text,
- override_output_dir=self.override_output_dir,
- manga_global_file_counter_ref=self.manga_global_file_counter_ref,
- use_cookie=self.use_cookie,
- manga_date_file_counter_ref=self.manga_date_file_counter_ref,
- use_date_prefix_for_subfolder=self.use_date_prefix_for_subfolder,
- keep_in_post_duplicates=self.keep_in_post_duplicates,
- creator_download_folder_ignore_words=self.creator_download_folder_ignore_words,
- session_file_path=self.session_file_path,
- session_lock=self.session_lock,
- text_only_scope=self.text_only_scope,
- text_export_format=self.text_export_format,
- single_pdf_mode=self.single_pdf_mode, # <-- This is now correctly passed
- project_root_dir=self.project_root_dir
- )
- try:
- # Correctly unpack the 7 values returned from the worker
- (dl_count, skip_count, kept_originals_this_post,
- retryable_failures, permanent_failures,
- history_data, temp_filepath) = post_processing_worker.process()
-
- grand_total_downloaded_files += dl_count
- grand_total_skipped_files += skip_count
-
- if kept_originals_this_post:
- grand_list_of_kept_original_filenames.extend(kept_originals_this_post)
- if retryable_failures:
- self.retryable_file_failed_signal.emit(retryable_failures)
- if history_data:
- if len(self.history_candidates_buffer) < 8:
- self.post_processed_for_history_signal.emit(history_data)
- if permanent_failures:
- self.permanent_file_failed_signal.emit(permanent_failures)
-
- # In single-threaded text mode, pass the temp file path back to the main window
- if self.single_pdf_mode and temp_filepath:
- self.progress_signal.emit(f"TEMP_FILE_PATH:{temp_filepath}")
-
- except Exception as proc_err:
- post_id_for_err = individual_post_data.get('id', 'N/A')
- self.logger(f"❌ Error processing post {post_id_for_err} in DownloadThread: {proc_err}")
- traceback.print_exc()
- num_potential_files_est = len(individual_post_data.get('attachments', [])) + (1 if individual_post_data.get('file') else 0)
- grand_total_skipped_files += num_potential_files_est
-
- if self.skip_current_file_flag and self.skip_current_file_flag.is_set():
- self.skip_current_file_flag.clear()
- self.logger(" Skip current file flag was processed and cleared by DownloadThread.")
- self.msleep(10)
- if was_process_cancelled:
- break
- if not was_process_cancelled and not self.isInterruptionRequested():
- self.logger("✅ All posts processed or end of content reached by DownloadThread.")
-
- except Exception as main_thread_err:
- self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
- traceback.print_exc()
- finally:
- try:
- # Disconnect signals
- if worker_signals_obj:
- worker_signals_obj.progress_signal.disconnect(self.progress_signal)
- worker_signals_obj.file_download_status_signal.disconnect(self.file_download_status_signal)
- worker_signals_obj.external_link_signal.disconnect(self.external_link_signal)
- worker_signals_obj.file_progress_signal.disconnect(self.file_progress_signal)
- worker_signals_obj.missed_character_post_signal.disconnect(self.missed_character_post_signal)
- worker_signals_obj.file_successfully_downloaded_signal.disconnect(self.file_successfully_downloaded_signal)
- except (TypeError, RuntimeError) as e:
- self.logger(f"ℹ️ Note during DownloadThread signal disconnection: {e}")
-
- # Emit the final signal with all collected results
- self.finished_signal.emit(grand_total_downloaded_files, grand_total_skipped_files, self.isInterruptionRequested(), grand_list_of_kept_original_filenames)
-
- def receive_add_character_result (self ,result ):
- with QMutexLocker (self .prompt_mutex ):
- self ._add_character_response =result
- self .logger (f" (DownloadThread) Received character prompt response: {'Yes (added/confirmed)'if result else 'No (declined/failed)'}")
-
-class InterruptedError(Exception):
- """Custom exception for handling cancellations gracefully."""
- pass
\ No newline at end of file