2025-05-07 07:20:40 +05:30
import os
import time
import requests
import re
import threading
2025-05-10 11:07:27 +05:30
import queue # Not directly used for link queue, but kept for historical reasons
2025-05-07 07:20:40 +05:30
import hashlib
2025-05-08 19:49:50 +05:30
import http . client
import traceback
from concurrent . futures import ThreadPoolExecutor , Future , CancelledError , as_completed
2025-05-10 11:07:27 +05:30
import html
2025-05-07 07:20:40 +05:30
from PyQt5 . QtCore import QObject , pyqtSignal , QThread , QMutex , QMutexLocker
from urllib . parse import urlparse
try :
from PIL import Image
except ImportError :
print ( " ERROR: Pillow library not found. Please install it: pip install Pillow " )
Image = None
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
from io import BytesIO
2025-05-10 11:07:27 +05:30
# Constants for filename styles, mirroring main.py for clarity if used directly here
STYLE_POST_TITLE = " post_title "
STYLE_ORIGINAL_NAME = " original_name "
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
# Constants for skip_words_scope, mirroring main.py
SKIP_SCOPE_FILES = " files "
SKIP_SCOPE_POSTS = " posts "
SKIP_SCOPE_BOTH = " both "
fastapi_app = None
KNOWN_NAMES = [ ]
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
IMAGE_EXTENSIONS = {
' .jpg ' , ' .jpeg ' , ' .png ' , ' .gif ' , ' .bmp ' , ' .tiff ' , ' .tif ' , ' .webp ' ,
' .heic ' , ' .heif ' , ' .svg ' , ' .ico ' , ' .jfif ' , ' .pjpeg ' , ' .pjp ' , ' .avif '
}
VIDEO_EXTENSIONS = {
' .mp4 ' , ' .mov ' , ' .mkv ' , ' .webm ' , ' .avi ' , ' .wmv ' , ' .flv ' , ' .mpeg ' ,
' .mpg ' , ' .m4v ' , ' .3gp ' , ' .ogv ' , ' .ts ' , ' .vob '
}
2025-05-10 11:07:27 +05:30
# ADDED: Archive Extensions
ARCHIVE_EXTENSIONS = {
' .zip ' , ' .rar ' , ' .7z ' , ' .tar ' , ' .gz ' , ' .bz2 ' # Added more common archive types
}
2025-05-08 19:49:50 +05:30
def is_title_match_for_character ( post_title , character_name_filter ) :
""" Checks if a post title contains a specific character name (case-insensitive, whole word). """
2025-05-09 19:03:01 +05:30
if not post_title or not character_name_filter :
2025-05-08 19:49:50 +05:30
return False
pattern = r " (?i) \ b " + re . escape ( character_name_filter ) + r " \ b "
2025-05-09 19:03:01 +05:30
return bool ( re . search ( pattern , post_title ) )
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
def is_filename_match_for_character ( filename , character_name_filter ) :
""" Checks if a filename contains a specific character name (case-insensitive, substring). """
if not filename or not character_name_filter :
return False
return character_name_filter . lower ( ) in filename . lower ( )
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def clean_folder_name ( name ) :
2025-05-08 19:49:50 +05:30
""" Cleans a string to be suitable for a folder name. """
2025-05-09 19:03:01 +05:30
if not isinstance ( name , str ) : name = str ( name )
2025-05-10 11:07:27 +05:30
cleaned = re . sub ( r ' [^ \ w \ s \ - \ _ \ . \ ( \ )] ' , ' ' , name )
cleaned = cleaned . strip ( )
2025-05-09 19:03:01 +05:30
cleaned = re . sub ( r ' \ s+ ' , ' _ ' , cleaned )
return cleaned if cleaned else " untitled_folder "
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def clean_filename ( name ) :
2025-05-09 19:03:01 +05:30
""" Cleans a string to be suitable for a file name. """
if not isinstance ( name , str ) : name = str ( name )
2025-05-10 11:07:27 +05:30
cleaned = re . sub ( r ' [^ \ w \ s \ - \ _ \ . \ ( \ )] ' , ' ' , name )
cleaned = cleaned . strip ( )
2025-05-09 19:03:01 +05:30
cleaned = re . sub ( r ' \ s+ ' , ' _ ' , cleaned )
return cleaned if cleaned else " untitled_file "
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def extract_folder_name_from_title ( title , unwanted_keywords ) :
2025-05-09 19:03:01 +05:30
""" Extracts a potential folder name from a title, avoiding unwanted keywords. """
2025-05-07 07:20:40 +05:30
if not title : return ' Uncategorized '
title_lower = title . lower ( )
2025-05-10 11:07:27 +05:30
tokens = re . findall ( r ' \ b[ \ w \ -]+ \ b ' , title_lower )
2025-05-07 07:20:40 +05:30
for token in tokens :
2025-05-10 11:07:27 +05:30
clean_token = clean_folder_name ( token )
if clean_token and clean_token . lower ( ) not in unwanted_keywords :
2025-05-09 19:03:01 +05:30
return clean_token
2025-05-08 19:49:50 +05:30
cleaned_full_title = clean_folder_name ( title )
return cleaned_full_title if cleaned_full_title else ' Uncategorized '
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def match_folders_from_title ( title , names_to_match , unwanted_keywords ) :
2025-05-09 19:03:01 +05:30
"""
Matches names from a list against a title to determine potential folder names .
Prioritizes longer matches .
"""
2025-05-08 19:49:50 +05:30
if not title or not names_to_match : return [ ]
title_lower = title . lower ( )
2025-05-07 07:20:40 +05:30
matched_cleaned_names = set ( )
2025-05-08 19:49:50 +05:30
sorted_names_to_match = sorted ( names_to_match , key = len , reverse = True )
for name in sorted_names_to_match :
name_lower = name . lower ( )
2025-05-10 11:07:27 +05:30
if not name_lower : continue
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
pattern = r ' \ b ' + re . escape ( name_lower ) + r ' \ b '
2025-05-08 19:49:50 +05:30
if re . search ( pattern , title_lower ) :
2025-05-09 19:03:01 +05:30
cleaned_name_for_folder = clean_folder_name ( name )
2025-05-10 11:07:27 +05:30
if cleaned_name_for_folder . lower ( ) not in unwanted_keywords :
matched_cleaned_names . add ( cleaned_name_for_folder )
2025-05-08 19:49:50 +05:30
return sorted ( list ( matched_cleaned_names ) )
2025-05-07 07:20:40 +05:30
def is_image ( filename ) :
2025-05-09 19:03:01 +05:30
""" Checks if the filename has a common image extension. """
2025-05-07 07:20:40 +05:30
if not filename : return False
2025-05-09 19:03:01 +05:30
_ , ext = os . path . splitext ( filename )
return ext . lower ( ) in IMAGE_EXTENSIONS
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def is_video ( filename ) :
2025-05-09 19:03:01 +05:30
""" Checks if the filename has a common video extension. """
2025-05-07 07:20:40 +05:30
if not filename : return False
2025-05-09 19:03:01 +05:30
_ , ext = os . path . splitext ( filename )
return ext . lower ( ) in VIDEO_EXTENSIONS
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def is_zip ( filename ) :
2025-05-09 19:03:01 +05:30
""" Checks if the filename ends with .zip (case-insensitive). """
2025-05-07 07:20:40 +05:30
if not filename : return False
return filename . lower ( ) . endswith ( ' .zip ' )
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def is_rar ( filename ) :
2025-05-09 19:03:01 +05:30
""" Checks if the filename ends with .rar (case-insensitive). """
2025-05-07 07:20:40 +05:30
if not filename : return False
return filename . lower ( ) . endswith ( ' .rar ' )
2025-05-10 11:07:27 +05:30
# ADDED: Generic is_archive function
def is_archive ( filename ) :
""" Checks if the filename has a common archive extension. """
if not filename : return False
_ , ext = os . path . splitext ( filename )
return ext . lower ( ) in ARCHIVE_EXTENSIONS
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def is_post_url ( url ) :
2025-05-09 19:03:01 +05:30
""" Checks if the URL likely points to a specific post. """
2025-05-07 07:20:40 +05:30
if not isinstance ( url , str ) : return False
return ' /post/ ' in urlparse ( url ) . path
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
def extract_post_info ( url_string ) :
2025-05-08 19:49:50 +05:30
""" Extracts service, user ID, and post ID from a Kemono/Coomer URL. """
2025-05-07 07:20:40 +05:30
service , user_id , post_id = None , None , None
2025-05-08 19:49:50 +05:30
if not isinstance ( url_string , str ) or not url_string . strip ( ) : return None , None , None
2025-05-07 07:20:40 +05:30
try :
parsed_url = urlparse ( url_string . strip ( ) )
domain = parsed_url . netloc . lower ( )
2025-05-08 19:49:50 +05:30
is_kemono = any ( d in domain for d in [ ' kemono.su ' , ' kemono.party ' ] )
is_coomer = any ( d in domain for d in [ ' coomer.su ' , ' coomer.party ' ] )
2025-05-10 11:07:27 +05:30
if not ( is_kemono or is_coomer ) : return None , None , None
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
path_parts = [ part for part in parsed_url . path . strip ( ' / ' ) . split ( ' / ' ) if part ]
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
if len ( path_parts ) > = 3 and path_parts [ 1 ] . lower ( ) == ' user ' :
service = path_parts [ 0 ]
user_id = path_parts [ 2 ]
if len ( path_parts ) > = 5 and path_parts [ 3 ] . lower ( ) == ' post ' :
post_id = path_parts [ 4 ]
return service , user_id , post_id
2025-05-08 19:49:50 +05:30
if len ( path_parts ) > = 5 and path_parts [ 0 ] . lower ( ) == ' api ' and \
path_parts [ 1 ] . lower ( ) == ' v1 ' and path_parts [ 3 ] . lower ( ) == ' user ' :
2025-05-07 07:20:40 +05:30
service = path_parts [ 2 ]
user_id = path_parts [ 4 ]
if len ( path_parts ) > = 7 and path_parts [ 5 ] . lower ( ) == ' post ' :
2025-05-08 19:49:50 +05:30
post_id = path_parts [ 6 ]
2025-05-07 07:20:40 +05:30
return service , user_id , post_id
2025-05-09 19:03:01 +05:30
2025-05-07 07:20:40 +05:30
except Exception as e :
print ( f " Debug: Exception during extract_post_info for URL ' { url_string } ' : { e } " )
2025-05-10 11:07:27 +05:30
return None , None , None
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def fetch_posts_paginated ( api_url_base , headers , offset , logger , cancellation_event = None ) :
2025-05-09 19:03:01 +05:30
""" Fetches a single page of posts from the API. """
2025-05-08 19:49:50 +05:30
if cancellation_event and cancellation_event . is_set ( ) :
logger ( " Fetch cancelled before request. " )
2025-05-10 11:07:27 +05:30
raise RuntimeError ( " Fetch operation cancelled by user. " )
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
paginated_url = f ' { api_url_base } ?o= { offset } '
2025-05-10 11:07:27 +05:30
logger ( f " Fetching: { paginated_url } (Page approx. { offset / / 50 + 1 } ) " )
2025-05-07 07:20:40 +05:30
try :
2025-05-10 11:07:27 +05:30
response = requests . get ( paginated_url , headers = headers , timeout = ( 10 , 60 ) )
response . raise_for_status ( )
2025-05-08 19:49:50 +05:30
if ' application/json ' not in response . headers . get ( ' Content-Type ' , ' ' ) . lower ( ) :
logger ( f " ⚠️ Unexpected content type from API: { response . headers . get ( ' Content-Type ' ) } . Body: { response . text [ : 200 ] } " )
2025-05-10 11:07:27 +05:30
return [ ] # Return empty list on unexpected content type
2025-05-07 07:20:40 +05:30
return response . json ( )
except requests . exceptions . Timeout :
2025-05-08 19:49:50 +05:30
raise RuntimeError ( f " Timeout fetching offset { offset } from { paginated_url } " )
2025-05-07 07:20:40 +05:30
except requests . exceptions . RequestException as e :
2025-05-08 19:49:50 +05:30
err_msg = f " Error fetching offset { offset } from { paginated_url } : { e } "
2025-05-07 07:20:40 +05:30
if e . response is not None :
err_msg + = f " (Status: { e . response . status_code } , Body: { e . response . text [ : 200 ] } ) "
raise RuntimeError ( err_msg )
2025-05-10 11:07:27 +05:30
except ValueError as e : # JSONDecodeError is a subclass of ValueError
2025-05-08 19:49:50 +05:30
raise RuntimeError ( f " Error decoding JSON from offset { offset } ( { paginated_url } ): { e } . Response text: { response . text [ : 200 ] } " )
2025-05-09 19:03:01 +05:30
except Exception as e :
2025-05-08 19:49:50 +05:30
raise RuntimeError ( f " Unexpected error fetching offset { offset } ( { paginated_url } ): { e } " )
def download_from_api ( api_url_input , logger = print , start_page = None , end_page = None , manga_mode = False , cancellation_event = None ) :
"""
2025-05-09 19:03:01 +05:30
Generator function to fetch post data from Kemono / Coomer API .
Handles pagination and yields batches of posts .
In Manga Mode , fetches all posts first , then yields them in reverse order ( oldest first ) .
2025-05-10 11:07:27 +05:30
If target_post_id is specified , it will paginate until that post is found or all pages are exhausted .
2025-05-08 19:49:50 +05:30
"""
2025-05-10 11:07:27 +05:30
headers = { ' User-Agent ' : ' Mozilla/5.0 ' , ' Accept ' : ' application/json ' }
2025-05-07 07:20:40 +05:30
service , user_id , target_post_id = extract_post_info ( api_url_input )
2025-05-08 19:49:50 +05:30
if cancellation_event and cancellation_event . is_set ( ) :
logger ( " Download_from_api cancelled at start. " )
2025-05-07 07:20:40 +05:30
return
2025-05-08 19:49:50 +05:30
if not service or not user_id :
logger ( f " ❌ Invalid URL or could not extract service/user: { api_url_input } " )
2025-05-09 19:03:01 +05:30
return
2025-05-08 19:49:50 +05:30
if target_post_id and ( start_page or end_page ) :
2025-05-10 11:07:27 +05:30
logger ( " ⚠️ Page range (start/end page) is ignored when a specific post URL is provided (searching all pages for the post). " )
start_page = end_page = None # Ensure no page limits when searching for a specific post
2025-05-08 19:49:50 +05:30
is_creator_feed_for_manga = manga_mode and not target_post_id
2025-05-07 07:20:40 +05:30
parsed_input = urlparse ( api_url_input )
2025-05-08 19:49:50 +05:30
api_domain = parsed_input . netloc
if not any ( d in api_domain . lower ( ) for d in [ ' kemono.su ' , ' kemono.party ' , ' coomer.su ' , ' coomer.party ' ] ) :
logger ( f " ⚠️ Unrecognized domain ' { api_domain } ' . Defaulting to kemono.su for API calls. " )
2025-05-10 11:07:27 +05:30
api_domain = " kemono.su "
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
api_base_url = f " https:// { api_domain } /api/v1/ { service } /user/ { user_id } "
2025-05-09 19:03:01 +05:30
page_size = 50 # Kemono API typically returns 50 posts per page
2025-05-08 19:49:50 +05:30
if is_creator_feed_for_manga :
logger ( " Manga Mode: Fetching all posts to reverse order (oldest posts processed first)... " )
all_posts_for_manga_mode = [ ]
current_offset_manga = 0
2025-05-09 19:03:01 +05:30
while True :
2025-05-08 19:49:50 +05:30
if cancellation_event and cancellation_event . is_set ( ) :
logger ( " Manga mode post fetching cancelled. " )
break
try :
posts_batch_manga = fetch_posts_paginated ( api_base_url , headers , current_offset_manga , logger , cancellation_event )
2025-05-10 11:07:27 +05:30
if not isinstance ( posts_batch_manga , list ) :
2025-05-08 19:49:50 +05:30
logger ( f " ❌ API Error (Manga Mode): Expected list of posts, got { type ( posts_batch_manga ) } . " )
break
2025-05-10 11:07:27 +05:30
if not posts_batch_manga :
2025-05-08 19:49:50 +05:30
logger ( " ✅ Reached end of posts (Manga Mode fetch all). " )
break
all_posts_for_manga_mode . extend ( posts_batch_manga )
2025-05-10 11:07:27 +05:30
current_offset_manga + = len ( posts_batch_manga ) # Use actual length
time . sleep ( 0.6 )
except RuntimeError as e :
2025-05-09 19:03:01 +05:30
if " cancelled by user " in str ( e ) . lower ( ) :
2025-05-08 19:49:50 +05:30
logger ( f " ℹ ️ Manga mode pagination stopped due to cancellation: { e } " )
else :
logger ( f " ❌ { e } \n Aborting manga mode pagination. " )
2025-05-10 11:07:27 +05:30
break # Stop on runtime error
2025-05-09 19:03:01 +05:30
except Exception as e : # Catch any other unexpected errors
2025-05-08 19:49:50 +05:30
logger ( f " ❌ Unexpected error during manga mode fetch: { e } " )
traceback . print_exc ( )
2025-05-10 11:07:27 +05:30
break # Stop on other errors
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
if cancellation_event and cancellation_event . is_set ( ) : return
2025-05-08 19:49:50 +05:30
if all_posts_for_manga_mode :
logger ( f " Manga Mode: Fetched { len ( all_posts_for_manga_mode ) } total posts. Reversing order... " )
2025-05-10 11:07:27 +05:30
all_posts_for_manga_mode . reverse ( ) # Oldest first
2025-05-08 19:49:50 +05:30
for i in range ( 0 , len ( all_posts_for_manga_mode ) , page_size ) :
if cancellation_event and cancellation_event . is_set ( ) :
logger ( " Manga mode post yielding cancelled. " )
break
yield all_posts_for_manga_mode [ i : i + page_size ]
else :
logger ( " Manga Mode: No posts found to process. " )
2025-05-10 11:07:27 +05:30
return # End of manga mode logic
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
# --- Regular pagination (Creator feed or Single Post search) ---
2025-05-08 19:49:50 +05:30
current_page_num = 1
current_offset = 0
2025-05-10 11:07:27 +05:30
processed_target_post_flag = False
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
if start_page and start_page > 1 and not target_post_id : # Only apply start_page if not searching for a specific post
current_offset = ( start_page - 1 ) * page_size
2025-05-08 19:49:50 +05:30
current_page_num = start_page
logger ( f " Starting from page { current_page_num } (calculated offset { current_offset } ). " )
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
while True :
2025-05-08 19:49:50 +05:30
if cancellation_event and cancellation_event . is_set ( ) :
logger ( " Post fetching loop cancelled. " )
break
2025-05-10 11:07:27 +05:30
if target_post_id and processed_target_post_flag : # If target post was found and yielded in a previous iteration
# logger(f"✅ Target post {target_post_id} was processed. Stopping pagination.") # Logged when found
2025-05-08 19:49:50 +05:30
break
2025-05-10 11:07:27 +05:30
# For creator feeds (not target_post_id mode), check end_page limit
if not target_post_id and end_page and current_page_num > end_page :
logger ( f " ✅ Reached specified end page ( { end_page } ) for creator feed. Stopping. " )
2025-05-07 07:20:40 +05:30
break
try :
2025-05-08 19:49:50 +05:30
posts_batch = fetch_posts_paginated ( api_base_url , headers , current_offset , logger , cancellation_event )
2025-05-07 07:20:40 +05:30
if not isinstance ( posts_batch , list ) :
2025-05-10 11:07:27 +05:30
logger ( f " ❌ API Error: Expected list of posts, got { type ( posts_batch ) } at page { current_page_num } (offset { current_offset } ). " )
2025-05-08 19:49:50 +05:30
break
2025-05-10 11:07:27 +05:30
except RuntimeError as e :
2025-05-08 19:49:50 +05:30
if " cancelled by user " in str ( e ) . lower ( ) :
logger ( f " ℹ ️ Pagination stopped due to cancellation: { e } " )
else :
2025-05-10 11:07:27 +05:30
logger ( f " ❌ { e } \n Aborting pagination at page { current_page_num } (offset { current_offset } ). " )
break # Stop on runtime error
2025-05-09 19:03:01 +05:30
except Exception as e : # Catch any other unexpected errors
2025-05-10 11:07:27 +05:30
logger ( f " ❌ Unexpected error fetching page { current_page_num } (offset { current_offset } ): { e } " )
2025-05-08 19:49:50 +05:30
traceback . print_exc ( )
2025-05-10 11:07:27 +05:30
break # Stop on other errors
if not posts_batch : # API returned an empty list, meaning no more posts
if target_post_id and not processed_target_post_flag :
logger ( f " ❌ Target post { target_post_id } not found after checking all available pages (API returned no more posts at offset { current_offset } ). " )
elif not target_post_id : # Normal creator feed end
if current_page_num == ( start_page or 1 ) : # Check if it was the first page attempted
logger ( f " 😕 No posts found on the first page checked (page { current_page_num } , offset { current_offset } ). " )
else :
logger ( f " ✅ Reached end of posts (no more content from API at offset { current_offset } ). " )
break # Exit while loop
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
if target_post_id and not processed_target_post_flag :
2025-05-08 19:49:50 +05:30
matching_post = next ( ( p for p in posts_batch if str ( p . get ( ' id ' ) ) == str ( target_post_id ) ) , None )
2025-05-07 07:20:40 +05:30
if matching_post :
2025-05-10 11:07:27 +05:30
logger ( f " 🎯 Found target post { target_post_id } on page { current_page_num } (offset { current_offset } ). " )
yield [ matching_post ] # Yield only the matching post as a list
processed_target_post_flag = True
# Loop will break at the top in the next iteration due to processed_target_post_flag
# If not found in this batch, the loop continues to the next page.
# Logger message for "not found in batch" is removed here to avoid spam if post is on a later page.
elif not target_post_id : # Processing a creator feed (no specific target post)
yield posts_batch
if processed_target_post_flag : # If we just found and yielded the target post, stop.
2025-05-09 19:03:01 +05:30
break
2025-05-10 11:07:27 +05:30
# Increment page and offset for the next iteration
current_offset + = len ( posts_batch ) # Use actual length of batch for offset
current_page_num + = 1
time . sleep ( 0.6 ) # Keep the delay
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# Final check after the loop, specifically if a target post was being searched for but not found
2025-05-08 19:49:50 +05:30
if target_post_id and not processed_target_post_flag and not ( cancellation_event and cancellation_event . is_set ( ) ) :
2025-05-10 11:07:27 +05:30
# This log might be redundant if the one inside "if not posts_batch:" already covered it,
# but it serves as a final confirmation if the loop exited for other reasons before exhausting pages.
logger ( f " ❌ Target post { target_post_id } could not be found after checking all relevant pages (final check after loop). " )
2025-05-08 19:49:50 +05:30
def get_link_platform ( url ) :
2025-05-09 19:03:01 +05:30
""" Attempts to identify the platform of an external link from its domain. """
2025-05-08 19:49:50 +05:30
try :
domain = urlparse ( url ) . netloc . lower ( )
if ' drive.google.com ' in domain : return ' google drive '
if ' mega.nz ' in domain or ' mega.io ' in domain : return ' mega '
if ' dropbox.com ' in domain : return ' dropbox '
if ' patreon.com ' in domain : return ' patreon '
if ' instagram.com ' in domain : return ' instagram '
if ' twitter.com ' in domain or ' x.com ' in domain : return ' twitter/x '
if ' discord.gg ' in domain or ' discord.com/invite ' in domain : return ' discord invite '
if ' pixiv.net ' in domain : return ' pixiv '
2025-05-10 11:07:27 +05:30
if ' kemono.su ' in domain or ' kemono.party ' in domain : return ' kemono '
if ' coomer.su ' in domain or ' coomer.party ' in domain : return ' coomer '
2025-05-09 19:03:01 +05:30
2025-05-08 19:49:50 +05:30
parts = domain . split ( ' . ' )
if len ( parts ) > = 2 :
if parts [ - 2 ] not in [ ' com ' , ' org ' , ' net ' , ' gov ' , ' edu ' , ' co ' ] or len ( parts ) == 2 :
2025-05-09 19:03:01 +05:30
return parts [ - 2 ]
elif len ( parts ) > = 3 and parts [ - 3 ] not in [ ' com ' , ' org ' , ' net ' , ' gov ' , ' edu ' , ' co ' ] :
2025-05-08 19:49:50 +05:30
return parts [ - 3 ]
2025-05-10 11:07:27 +05:30
else :
2025-05-09 19:03:01 +05:30
return domain
2025-05-10 11:07:27 +05:30
return ' external '
except Exception : return ' unknown '
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
class PostProcessorSignals ( QObject ) :
2025-05-08 19:49:50 +05:30
""" Defines signals used by PostProcessorWorker to communicate with the GUI thread. """
2025-05-10 11:07:27 +05:30
progress_signal = pyqtSignal ( str )
file_download_status_signal = pyqtSignal ( bool )
external_link_signal = pyqtSignal ( str , str , str , str )
2025-05-09 19:03:01 +05:30
file_progress_signal = pyqtSignal ( str , int , int )
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
class PostProcessorWorker :
2025-05-08 19:49:50 +05:30
""" Processes a single post: determines save paths, downloads files, handles compression. """
def __init__ ( self , post_data , download_root , known_names ,
filter_character_list ,
2025-05-07 07:20:40 +05:30
unwanted_keywords , filter_mode , skip_zip , skip_rar ,
2025-05-08 19:49:50 +05:30
use_subfolders , use_post_subfolders , target_post_id_from_initial_url , custom_folder_name ,
2025-05-07 07:20:40 +05:30
compress_images , download_thumbnails , service , user_id ,
api_url_input , cancellation_event , signals ,
downloaded_files , downloaded_file_hashes , downloaded_files_lock , downloaded_file_hashes_lock ,
2025-05-10 11:07:27 +05:30
skip_words_list = None ,
skip_words_scope = SKIP_SCOPE_FILES , # New parameter with default
show_external_links = False ,
2025-05-09 19:03:01 +05:30
extract_links_only = False ,
num_file_threads = 4 , skip_current_file_flag = None ,
2025-05-10 11:07:27 +05:30
manga_mode_active = False ,
manga_filename_style = STYLE_POST_TITLE
2025-05-08 19:49:50 +05:30
) :
2025-05-07 07:20:40 +05:30
self . post = post_data
self . download_root = download_root
self . known_names = known_names
2025-05-08 19:49:50 +05:30
self . filter_character_list = filter_character_list if filter_character_list else [ ]
2025-05-09 19:03:01 +05:30
self . unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else set ( )
2025-05-10 11:07:27 +05:30
self . filter_mode = filter_mode
2025-05-07 07:20:40 +05:30
self . skip_zip = skip_zip
self . skip_rar = skip_rar
self . use_subfolders = use_subfolders
2025-05-08 19:49:50 +05:30
self . use_post_subfolders = use_post_subfolders
2025-05-10 11:07:27 +05:30
self . target_post_id_from_initial_url = target_post_id_from_initial_url
self . custom_folder_name = custom_folder_name
2025-05-07 07:20:40 +05:30
self . compress_images = compress_images
self . download_thumbnails = download_thumbnails
self . service = service
self . user_id = user_id
2025-05-10 11:07:27 +05:30
self . api_url_input = api_url_input
2025-05-07 07:20:40 +05:30
self . cancellation_event = cancellation_event
2025-05-10 11:07:27 +05:30
self . signals = signals
self . skip_current_file_flag = skip_current_file_flag
2025-05-09 19:03:01 +05:30
2025-05-08 19:49:50 +05:30
self . downloaded_files = downloaded_files if downloaded_files is not None else set ( )
self . downloaded_file_hashes = downloaded_file_hashes if downloaded_file_hashes is not None else set ( )
self . downloaded_files_lock = downloaded_files_lock if downloaded_files_lock is not None else threading . Lock ( )
self . downloaded_file_hashes_lock = downloaded_file_hashes_lock if downloaded_file_hashes_lock is not None else threading . Lock ( )
2025-05-09 19:03:01 +05:30
2025-05-07 07:20:40 +05:30
self . skip_words_list = skip_words_list if skip_words_list is not None else [ ]
2025-05-10 11:07:27 +05:30
self . skip_words_scope = skip_words_scope # Store the new scope
self . show_external_links = show_external_links
self . extract_links_only = extract_links_only
self . num_file_threads = num_file_threads
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
self . manga_mode_active = manga_mode_active
self . manga_filename_style = manga_filename_style
2025-05-08 19:49:50 +05:30
2025-05-07 07:20:40 +05:30
if self . compress_images and Image is None :
2025-05-08 19:49:50 +05:30
self . logger ( " ⚠️ Image compression disabled: Pillow library not found. " )
2025-05-07 07:20:40 +05:30
self . compress_images = False
def logger ( self , message ) :
2025-05-08 19:49:50 +05:30
""" Emits a log message via the progress_signal if available. """
2025-05-07 07:20:40 +05:30
if self . signals and hasattr ( self . signals , ' progress_signal ' ) :
self . signals . progress_signal . emit ( message )
2025-05-10 11:07:27 +05:30
else :
2025-05-09 19:03:01 +05:30
print ( f " (Worker Log - No Signal): { message } " )
2025-05-07 07:20:40 +05:30
def check_cancel ( self ) :
2025-05-08 19:49:50 +05:30
""" Checks if cancellation has been requested. """
return self . cancellation_event . is_set ( )
2025-05-07 07:20:40 +05:30
2025-05-09 19:03:01 +05:30
def _download_single_file ( self , file_info , target_folder_path , headers , original_post_id_for_log , skip_event ,
2025-05-10 11:07:27 +05:30
post_title = " " , file_index_in_post = 0 , num_files_in_this_post = 1 ) :
"""
Downloads a single file , handles retries , compression , and hash checking .
Returns :
( int , int , str , bool ) : ( downloaded_count , skipped_count , final_filename_saved , was_original_name_kept_flag )
"""
was_original_name_kept_flag = False
final_filename_saved_for_return = " "
if self . check_cancel ( ) or ( skip_event and skip_event . is_set ( ) ) : return 0 , 1 , " " , False
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
file_url = file_info . get ( ' url ' )
api_original_filename = file_info . get ( ' _original_name_for_log ' , file_info . get ( ' name ' ) )
if not file_url or not api_original_filename :
self . logger ( f " ⚠️ Skipping file from post { original_post_id_for_log } : Missing URL or original filename. Info: { str ( file_info ) [ : 100 ] } " )
2025-05-10 11:07:27 +05:30
return 0 , 1 , api_original_filename or " " , False
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
final_filename_saved_for_return = api_original_filename
# Apply skip_words_list based on skip_words_scope (for files)
if self . skip_words_list and ( self . skip_words_scope == SKIP_SCOPE_FILES or self . skip_words_scope == SKIP_SCOPE_BOTH ) :
filename_to_check_for_skip_words = api_original_filename . lower ( )
2025-05-09 19:03:01 +05:30
for skip_word in self . skip_words_list :
2025-05-10 11:07:27 +05:30
if skip_word . lower ( ) in filename_to_check_for_skip_words :
self . logger ( f " -> Skip File (Keyword in Original Name ' { skip_word } ' ): ' { api_original_filename } ' . Scope: { self . skip_words_scope } " )
return 0 , 1 , api_original_filename , False
2025-05-09 19:03:01 +05:30
if self . filter_character_list :
matches_any_character_filter = False
2025-05-10 11:07:27 +05:30
if self . manga_mode_active : # In manga mode, character filter applies to post title primarily
2025-05-09 19:03:01 +05:30
if any ( is_title_match_for_character ( post_title , char_filter ) for char_filter in self . filter_character_list ) :
matches_any_character_filter = True
2025-05-10 11:07:27 +05:30
# Fallback: if title doesn't match, but filename does, still consider it a match for manga mode if desired
# For now, let's stick to title match for manga post filtering, file name match for file filtering.
# If you want manga mode character filter to also check filenames, uncomment below:
# if not matches_any_character_filter and any(is_filename_match_for_character(api_original_filename, char_filter) for char_filter in self.filter_character_list):
# matches_any_character_filter = True
else : # Normal mode, character filter applies to filename
2025-05-09 19:03:01 +05:30
if any ( is_filename_match_for_character ( api_original_filename , char_filter ) for char_filter in self . filter_character_list ) :
matches_any_character_filter = True
2025-05-10 11:07:27 +05:30
if not matches_any_character_filter : # If no character filter matched (based on mode)
self . logger ( f " -> Skip File (No Char Match): ' { api_original_filename } ' (Post: ' { post_title [ : 30 ] } ... ' ) doesn ' t match character filters. " )
return 0 , 1 , api_original_filename , False
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
original_filename_cleaned_base , original_ext = os . path . splitext ( clean_filename ( api_original_filename ) )
if not original_ext . startswith ( ' . ' ) : original_ext = ' . ' + original_ext if original_ext else ' '
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
filename_to_save = " "
2025-05-08 19:49:50 +05:30
if self . manga_mode_active :
2025-05-10 11:07:27 +05:30
if self . manga_filename_style == STYLE_ORIGINAL_NAME :
filename_to_save = clean_filename ( api_original_filename )
was_original_name_kept_flag = True # Original name is kept by definition here
elif self . manga_filename_style == STYLE_POST_TITLE :
if post_title and post_title . strip ( ) :
cleaned_post_title_base = clean_filename ( post_title . strip ( ) )
if num_files_in_this_post > 1 : # Multi-file post
if file_index_in_post == 0 : # First file of multi-file post
filename_to_save = f " { cleaned_post_title_base } { original_ext } "
was_original_name_kept_flag = False
else : # Subsequent files of multi-file post
filename_to_save = clean_filename ( api_original_filename ) # Keep original for subsequent
was_original_name_kept_flag = True
else : # Single file post in manga mode
filename_to_save = f " { cleaned_post_title_base } { original_ext } "
was_original_name_kept_flag = False
else : # Manga mode, post title style, but post_title is missing
filename_to_save = clean_filename ( api_original_filename )
was_original_name_kept_flag = False # Not truly "kept original" in the spirit of the style choice
self . logger ( f " ⚠️ Manga mode (Post Title Style): Post title missing for post { original_post_id_for_log } . Using cleaned original filename ' { filename_to_save } ' . " )
else : # Unknown manga style
self . logger ( f " ⚠️ Manga mode: Unknown filename style ' { self . manga_filename_style } ' . Defaulting to original filename for ' { api_original_filename } ' . " )
filename_to_save = clean_filename ( api_original_filename )
was_original_name_kept_flag = False # Or True, depending on interpretation. Let's say False as it's a fallback.
# Collision handling for manga mode filenames
if filename_to_save :
2025-05-08 19:49:50 +05:30
counter = 1
base_name_coll , ext_coll = os . path . splitext ( filename_to_save )
temp_filename_for_collision_check = filename_to_save
2025-05-10 11:07:27 +05:30
# Ensure unique filename in target folder
2025-05-08 19:49:50 +05:30
while os . path . exists ( os . path . join ( target_folder_path , temp_filename_for_collision_check ) ) :
2025-05-10 11:07:27 +05:30
# If it's the first file of a multi-file post using post_title style, append _N
if self . manga_filename_style == STYLE_POST_TITLE and file_index_in_post == 0 and num_files_in_this_post > 1 :
temp_filename_for_collision_check = f " { base_name_coll } _ { counter } { ext_coll } "
# If it's original name style, or subsequent file, or single file post, append _N to its base
else :
temp_filename_for_collision_check = f " { base_name_coll } _ { counter } { ext_coll } "
2025-05-08 19:49:50 +05:30
counter + = 1
if temp_filename_for_collision_check != filename_to_save :
filename_to_save = temp_filename_for_collision_check
2025-05-10 11:07:27 +05:30
else : # Fallback if filename_to_save ended up empty
filename_to_save = f " manga_file_ { original_post_id_for_log } _ { file_index_in_post + 1 } { original_ext } "
self . logger ( f " ⚠️ Manga mode: Generated filename was empty. Using generic fallback: ' { filename_to_save } ' . " )
was_original_name_kept_flag = False
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
else : # Not Manga Mode
filename_to_save = clean_filename ( api_original_filename )
was_original_name_kept_flag = False # Not manga mode, so this flag isn't relevant in the same way
# Collision handling for non-manga mode
counter = 1
base_name_coll , ext_coll = os . path . splitext ( filename_to_save )
temp_filename_for_collision_check = filename_to_save
while os . path . exists ( os . path . join ( target_folder_path , temp_filename_for_collision_check ) ) :
temp_filename_for_collision_check = f " { base_name_coll } _ { counter } { ext_coll } "
counter + = 1
if temp_filename_for_collision_check != filename_to_save :
filename_to_save = temp_filename_for_collision_check
final_filename_for_sets_and_saving = filename_to_save
final_filename_saved_for_return = final_filename_for_sets_and_saving
if not self . download_thumbnails :
# Determine file type based on the original API filename
is_img_type = is_image ( api_original_filename )
2025-05-08 19:49:50 +05:30
is_vid_type = is_video ( api_original_filename )
2025-05-10 11:07:27 +05:30
# Use the generic is_archive function
is_archive_type = is_archive ( api_original_filename )
# ===== MODIFICATION START =====
if self . filter_mode == ' archive ' :
if not is_archive_type : # If in 'archive' mode and the file is NOT an archive
self . logger ( f " -> Filter Skip (Archive Mode): ' { api_original_filename } ' (Not an Archive). " )
return 0 , 1 , api_original_filename , False
# If it IS an archive, it will proceed.
# self.skip_zip and self.skip_rar are False in this mode (set in main.py), so they won't cause a skip.
elif self . filter_mode == ' image ' :
if not is_img_type :
self . logger ( f " -> Filter Skip: ' { api_original_filename } ' (Not Image). " )
return 0 , 1 , api_original_filename , False
elif self . filter_mode == ' video ' :
if not is_vid_type :
self . logger ( f " -> Filter Skip: ' { api_original_filename } ' (Not Video). " )
return 0 , 1 , api_original_filename , False
# No specific 'elif self.filter_mode == 'all':' is needed here, as 'all' implies no primary type filtering.
# The self.skip_zip / self.skip_rar checks below will handle user preference for skipping archives in 'all' mode.
# These skip checks are now primarily for 'all' mode or if filter_mode is something else.
# In 'archive' mode, self.skip_zip and self.skip_rar will be False.
if self . skip_zip and is_zip ( api_original_filename ) : # Use specific is_zip for the skip_zip flag
2025-05-09 19:03:01 +05:30
self . logger ( f " -> Pref Skip: ' { api_original_filename } ' (ZIP). " )
2025-05-10 11:07:27 +05:30
return 0 , 1 , api_original_filename , False
if self . skip_rar and is_rar ( api_original_filename ) : # Use specific is_rar for the skip_rar flag
2025-05-09 19:03:01 +05:30
self . logger ( f " -> Pref Skip: ' { api_original_filename } ' (RAR). " )
2025-05-10 11:07:27 +05:30
return 0 , 1 , api_original_filename , False
# ===== MODIFICATION END =====
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
target_folder_basename = os . path . basename ( target_folder_path )
2025-05-08 19:49:50 +05:30
current_save_path = os . path . join ( target_folder_path , final_filename_for_sets_and_saving )
if os . path . exists ( current_save_path ) and os . path . getsize ( current_save_path ) > 0 :
self . logger ( f " -> Exists (Path): ' { final_filename_for_sets_and_saving } ' in ' { target_folder_basename } ' . " )
2025-05-10 11:07:27 +05:30
with self . downloaded_files_lock : self . downloaded_files . add ( final_filename_for_sets_and_saving ) # Add final name
return 0 , 1 , final_filename_for_sets_and_saving , was_original_name_kept_flag
2025-05-08 19:49:50 +05:30
with self . downloaded_files_lock :
if final_filename_for_sets_and_saving in self . downloaded_files :
2025-05-10 11:07:27 +05:30
self . logger ( f " -> Global Skip (Filename): ' { final_filename_for_sets_and_saving } ' already recorded this session. " )
return 0 , 1 , final_filename_for_sets_and_saving , was_original_name_kept_flag
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
max_retries = 3
2025-05-10 11:07:27 +05:30
retry_delay = 5
downloaded_size_bytes = 0
2025-05-09 19:03:01 +05:30
calculated_file_hash = None
2025-05-10 11:07:27 +05:30
file_content_bytes = None
total_size_bytes = 0 # Initialize total_size_bytes for this download attempt
2025-05-08 19:49:50 +05:30
download_successful_flag = False
2025-05-10 11:07:27 +05:30
for attempt_num in range ( max_retries + 1 ) :
2025-05-09 19:03:01 +05:30
if self . check_cancel ( ) or ( skip_event and skip_event . is_set ( ) ) :
2025-05-10 11:07:27 +05:30
break
2025-05-07 07:20:40 +05:30
try :
2025-05-09 19:03:01 +05:30
if attempt_num > 0 :
self . logger ( f " Retrying ' { api_original_filename } ' (Attempt { attempt_num } / { max_retries } )... " )
time . sleep ( retry_delay * ( 2 * * ( attempt_num - 1 ) ) ) # Exponential backoff
if self . signals and hasattr ( self . signals , ' file_download_status_signal ' ) :
2025-05-10 11:07:27 +05:30
self . signals . file_download_status_signal . emit ( True ) # Indicate download attempt start
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
response = requests . get ( file_url , headers = headers , timeout = ( 15 , 300 ) , stream = True ) # Generous timeout
2025-05-09 19:03:01 +05:30
response . raise_for_status ( ) # Check for HTTP errors
2025-05-08 19:49:50 +05:30
2025-05-09 19:03:01 +05:30
current_total_size_bytes_from_headers = int ( response . headers . get ( ' Content-Length ' , 0 ) )
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
if attempt_num == 0 : # Only set total_size_bytes on the first attempt from headers
2025-05-09 19:03:01 +05:30
total_size_bytes = current_total_size_bytes_from_headers
size_str = f " { total_size_bytes / ( 1024 * 1024 ) : .2f } MB " if total_size_bytes > 0 else " unknown size "
self . logger ( f " ⬇️ Downloading: ' { api_original_filename } ' (Size: { size_str } ) [Saving as: ' { final_filename_for_sets_and_saving } ' ] " )
2025-05-10 11:07:27 +05:30
current_attempt_total_size = total_size_bytes # Use the initial total_size for progress calculation
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
file_content_buffer = BytesIO ( )
2025-05-09 19:03:01 +05:30
current_attempt_downloaded_bytes = 0
md5_hasher = hashlib . md5 ( )
2025-05-08 19:49:50 +05:30
last_progress_time = time . time ( )
for chunk in response . iter_content ( chunk_size = 1 * 1024 * 1024 ) : # 1MB chunks
2025-05-09 19:03:01 +05:30
if self . check_cancel ( ) or ( skip_event and skip_event . is_set ( ) ) :
2025-05-10 11:07:27 +05:30
break
2025-05-08 19:49:50 +05:30
if chunk :
2025-05-09 19:03:01 +05:30
file_content_buffer . write ( chunk )
md5_hasher . update ( chunk )
current_attempt_downloaded_bytes + = len ( chunk )
if time . time ( ) - last_progress_time > 1 and current_attempt_total_size > 0 and \
self . signals and hasattr ( self . signals , ' file_progress_signal ' ) :
self . signals . file_progress_signal . emit (
api_original_filename , # Show original name in progress
current_attempt_downloaded_bytes ,
2025-05-10 11:07:27 +05:30
current_attempt_total_size
2025-05-09 19:03:01 +05:30
)
2025-05-08 19:49:50 +05:30
last_progress_time = time . time ( )
2025-05-09 19:03:01 +05:30
if self . check_cancel ( ) or ( skip_event and skip_event . is_set ( ) ) :
if file_content_buffer : file_content_buffer . close ( )
2025-05-10 11:07:27 +05:30
break # Exit retry loop if cancelled
# After loop, check if download was successful for this attempt
if current_attempt_downloaded_bytes > 0 or ( current_attempt_total_size == 0 and response . status_code == 200 ) : # Successfully downloaded something or it's a valid 0-byte file
2025-05-08 19:49:50 +05:30
calculated_file_hash = md5_hasher . hexdigest ( )
2025-05-09 19:03:01 +05:30
downloaded_size_bytes = current_attempt_downloaded_bytes
2025-05-10 11:07:27 +05:30
if file_content_bytes : file_content_bytes . close ( ) # Close previous buffer if any
file_content_bytes = file_content_buffer # Assign the new buffer
file_content_bytes . seek ( 0 ) # Rewind for reading
2025-05-08 19:49:50 +05:30
download_successful_flag = True
2025-05-10 11:07:27 +05:30
break # Successful download, exit retry loop
else : # No bytes downloaded, and not a 0-byte file case
if file_content_buffer : file_content_buffer . close ( )
# Continue to next retry if not max retries
2025-05-09 19:03:01 +05:30
2025-05-08 19:49:50 +05:30
except ( requests . exceptions . ConnectionError , requests . exceptions . Timeout , http . client . IncompleteRead ) as e :
2025-05-09 19:03:01 +05:30
self . logger ( f " ❌ Download Error (Retryable): { api_original_filename } . Error: { e } " )
if ' file_content_buffer ' in locals ( ) and file_content_buffer : file_content_buffer . close ( )
2025-05-10 11:07:27 +05:30
# Continue to next retry if not max retries
except requests . exceptions . RequestException as e : # Non-retryable HTTP errors
2025-05-09 19:03:01 +05:30
self . logger ( f " ❌ Download Error (Non-Retryable): { api_original_filename } . Error: { e } " )
if ' file_content_buffer ' in locals ( ) and file_content_buffer : file_content_buffer . close ( )
2025-05-10 11:07:27 +05:30
break # Exit retry loop
2025-05-09 19:03:01 +05:30
except Exception as e : # Other unexpected errors
self . logger ( f " ❌ Unexpected Download Error: { api_original_filename } : { e } \n { traceback . format_exc ( limit = 2 ) } " )
if ' file_content_buffer ' in locals ( ) and file_content_buffer : file_content_buffer . close ( )
2025-05-10 11:07:27 +05:30
break # Exit retry loop
2025-05-08 19:49:50 +05:30
finally :
2025-05-09 19:03:01 +05:30
if self . signals and hasattr ( self . signals , ' file_download_status_signal ' ) :
2025-05-10 11:07:27 +05:30
self . signals . file_download_status_signal . emit ( False ) # Indicate download attempt end
# Final progress update after all retries or success
2025-05-09 19:03:01 +05:30
if self . signals and hasattr ( self . signals , ' file_progress_signal ' ) :
final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
self . signals . file_progress_signal . emit ( api_original_filename , downloaded_size_bytes , final_total_for_progress )
2025-05-08 19:49:50 +05:30
if self . check_cancel ( ) or ( skip_event and skip_event . is_set ( ) ) :
2025-05-09 19:03:01 +05:30
self . logger ( f " ⚠️ Download interrupted for { api_original_filename } . " )
2025-05-08 19:49:50 +05:30
if file_content_bytes : file_content_bytes . close ( )
2025-05-10 11:07:27 +05:30
return 0 , 1 , final_filename_saved_for_return , was_original_name_kept_flag
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
if not download_successful_flag :
2025-05-09 19:03:01 +05:30
self . logger ( f " ❌ Download failed for ' { api_original_filename } ' after { max_retries + 1 } attempts. " )
2025-05-10 11:07:27 +05:30
if file_content_bytes : file_content_bytes . close ( )
return 0 , 1 , final_filename_saved_for_return , was_original_name_kept_flag
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
# Check hash against already downloaded files (session-based)
2025-05-08 19:49:50 +05:30
with self . downloaded_file_hashes_lock :
if calculated_file_hash in self . downloaded_file_hashes :
2025-05-09 19:03:01 +05:30
self . logger ( f " -> Content Skip (Hash): ' { api_original_filename } ' (Hash: { calculated_file_hash [ : 8 ] } ...) already downloaded this session. " )
2025-05-10 11:07:27 +05:30
with self . downloaded_files_lock : self . downloaded_files . add ( final_filename_for_sets_and_saving ) # Add final name
2025-05-08 19:49:50 +05:30
if file_content_bytes : file_content_bytes . close ( )
2025-05-10 11:07:27 +05:30
return 0 , 1 , final_filename_for_sets_and_saving , was_original_name_kept_flag
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
bytes_to_write = file_content_bytes # This is the BytesIO object with downloaded content
final_filename_after_processing = final_filename_for_sets_and_saving
current_save_path_final = current_save_path # Path with potentially collided name
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
is_img_for_compress_check = is_image ( api_original_filename ) # Check original name for image type
if is_img_for_compress_check and self . compress_images and Image and downloaded_size_bytes > ( 1.5 * 1024 * 1024 ) : # 1.5MB threshold
2025-05-08 19:49:50 +05:30
self . logger ( f " Compressing ' { api_original_filename } ' ( { downloaded_size_bytes / ( 1024 * 1024 ) : .2f } MB)... " )
try :
2025-05-10 11:07:27 +05:30
bytes_to_write . seek ( 0 ) # Ensure buffer is at the beginning
2025-05-08 19:49:50 +05:30
with Image . open ( bytes_to_write ) as img_obj :
2025-05-10 11:07:27 +05:30
# Handle palette mode images by converting to RGBA/RGB
if img_obj . mode == ' P ' : img_obj = img_obj . convert ( ' RGBA ' )
2025-05-09 19:03:01 +05:30
elif img_obj . mode not in [ ' RGB ' , ' RGBA ' , ' L ' ] : img_obj = img_obj . convert ( ' RGB ' )
2025-05-08 19:49:50 +05:30
compressed_bytes_io = BytesIO ( )
2025-05-10 11:07:27 +05:30
img_obj . save ( compressed_bytes_io , format = ' WebP ' , quality = 80 , method = 4 ) # method=4 is a good balance
2025-05-08 19:49:50 +05:30
compressed_size = compressed_bytes_io . getbuffer ( ) . nbytes
2025-05-10 11:07:27 +05:30
if compressed_size < downloaded_size_bytes * 0.9 : # Only save if significantly smaller (e.g., 10% reduction)
2025-05-08 19:49:50 +05:30
self . logger ( f " Compression success: { compressed_size / ( 1024 * 1024 ) : .2f } MB. " )
2025-05-10 11:07:27 +05:30
bytes_to_write . close ( ) # Close original downloaded buffer
bytes_to_write = compressed_bytes_io # Switch to compressed buffer
bytes_to_write . seek ( 0 ) # Rewind compressed buffer
2025-05-09 19:03:01 +05:30
2025-05-08 19:49:50 +05:30
base_name_orig , _ = os . path . splitext ( final_filename_for_sets_and_saving )
2025-05-10 11:07:27 +05:30
final_filename_after_processing = base_name_orig + ' .webp '
current_save_path_final = os . path . join ( target_folder_path , final_filename_after_processing ) # Update save path
2025-05-08 19:49:50 +05:30
self . logger ( f " Updated filename (compressed): { final_filename_after_processing } " )
2025-05-07 07:20:40 +05:30
else :
2025-05-10 11:07:27 +05:30
self . logger ( f " Compression skipped: WebP not significantly smaller. " ) ; bytes_to_write . seek ( 0 ) # Rewind original if not using compressed
2025-05-08 19:49:50 +05:30
except Exception as comp_e :
2025-05-10 11:07:27 +05:30
self . logger ( f " ❌ Compression failed for ' { api_original_filename } ' : { comp_e } . Saving original. " ) ; bytes_to_write . seek ( 0 ) # Rewind original
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
final_filename_saved_for_return = final_filename_after_processing # This is the name that will be saved
# Final check if the (potentially new, e.g. .webp) filename already exists
2025-05-08 19:49:50 +05:30
if final_filename_after_processing != final_filename_for_sets_and_saving and \
os . path . exists ( current_save_path_final ) and os . path . getsize ( current_save_path_final ) > 0 :
self . logger ( f " -> Exists (Path - Post-Compress): ' { final_filename_after_processing } ' in ' { target_folder_basename } ' . " )
with self . downloaded_files_lock : self . downloaded_files . add ( final_filename_after_processing )
bytes_to_write . close ( )
2025-05-10 11:07:27 +05:30
return 0 , 1 , final_filename_after_processing , was_original_name_kept_flag
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
try :
2025-05-10 11:07:27 +05:30
os . makedirs ( os . path . dirname ( current_save_path_final ) , exist_ok = True )
2025-05-08 19:49:50 +05:30
with open ( current_save_path_final , ' wb ' ) as f_out :
2025-05-10 11:07:27 +05:30
f_out . write ( bytes_to_write . getvalue ( ) )
2025-05-08 19:49:50 +05:30
with self . downloaded_file_hashes_lock : self . downloaded_file_hashes . add ( calculated_file_hash )
2025-05-10 11:07:27 +05:30
with self . downloaded_files_lock : self . downloaded_files . add ( final_filename_after_processing ) # Add final name
2025-05-08 19:49:50 +05:30
self . logger ( f " ✅ Saved: ' { final_filename_after_processing } ' (from ' { api_original_filename } ' , { downloaded_size_bytes / ( 1024 * 1024 ) : .2f } MB) in ' { target_folder_basename } ' " )
2025-05-10 11:07:27 +05:30
time . sleep ( 0.05 ) # Small delay
return 1 , 0 , final_filename_after_processing , was_original_name_kept_flag
2025-05-08 19:49:50 +05:30
except Exception as save_err :
self . logger ( f " ❌ Save Fail for ' { final_filename_after_processing } ' : { save_err } " )
2025-05-10 11:07:27 +05:30
if os . path . exists ( current_save_path_final ) : # Attempt to clean up partial file
2025-05-09 19:03:01 +05:30
try : os . remove ( current_save_path_final ) ;
2025-05-08 19:49:50 +05:30
except OSError : self . logger ( f " -> Failed to remove partially saved file: { current_save_path_final } " )
2025-05-10 11:07:27 +05:30
return 0 , 1 , final_filename_saved_for_return , was_original_name_kept_flag # Return the name it attempted to save as
2025-05-08 19:49:50 +05:30
finally :
2025-05-10 11:07:27 +05:30
if bytes_to_write : bytes_to_write . close ( )
2025-05-08 19:49:50 +05:30
def process ( self ) :
""" Main processing logic for a single post. """
2025-05-10 11:07:27 +05:30
if self . check_cancel ( ) : return 0 , 0 , [ ]
kept_original_filenames_for_log = [ ]
2025-05-08 19:49:50 +05:30
total_downloaded_this_post = 0
total_skipped_this_post = 0
2025-05-10 11:07:27 +05:30
parsed_api_url = urlparse ( self . api_url_input )
2025-05-09 19:03:01 +05:30
referer_url = f " https:// { parsed_api_url . netloc } / "
headers = { ' User-Agent ' : ' Mozilla/5.0 ' , ' Referer ' : referer_url , ' Accept ' : ' */* ' }
link_pattern = re . compile ( r """ <a \ s+.*?href=[ " ' ](https?://[^ " ' ]+)[ " ' ][^>]*>(.*?)</a> """ ,
re . IGNORECASE | re . DOTALL )
post_data = self . post
post_title = post_data . get ( ' title ' , ' ' ) or ' untitled_post '
2025-05-08 19:49:50 +05:30
post_id = post_data . get ( ' id ' , ' unknown_id ' )
2025-05-10 11:07:27 +05:30
post_main_file_info = post_data . get ( ' file ' ) # This is a dict if present
post_attachments = post_data . get ( ' attachments ' , [ ] ) # This is a list of dicts
post_content_html = post_data . get ( ' content ' , ' ' )
2025-05-08 19:49:50 +05:30
self . logger ( f " \n --- Processing Post { post_id } ( ' { post_title [ : 50 ] } ... ' ) (Thread: { threading . current_thread ( ) . name } ) --- " )
2025-05-10 11:07:27 +05:30
num_potential_files_in_post = len ( post_attachments or [ ] ) + ( 1 if post_main_file_info and post_main_file_info . get ( ' path ' ) else 0 )
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# Apply skip_words_list based on skip_words_scope (for posts)
if self . skip_words_list and ( self . skip_words_scope == SKIP_SCOPE_POSTS or self . skip_words_scope == SKIP_SCOPE_BOTH ) :
post_title_lower = post_title . lower ( )
for skip_word in self . skip_words_list :
if skip_word . lower ( ) in post_title_lower :
self . logger ( f " -> Skip Post (Keyword in Title ' { skip_word } ' ): ' { post_title [ : 50 ] } ... ' . Scope: { self . skip_words_scope } " )
return 0 , num_potential_files_in_post , [ ] # Skip all files in this post
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# Character filter for Manga Mode (applies to post title)
2025-05-09 19:03:01 +05:30
if not self . extract_links_only and self . manga_mode_active and self . filter_character_list :
if not any ( is_title_match_for_character ( post_title , char_name ) for char_name in self . filter_character_list ) :
self . logger ( f " -> Skip Post (Manga Mode - Title No Char Match): Title ' { post_title [ : 50 ] } ' doesn ' t match active character filters. " )
2025-05-10 11:07:27 +05:30
return 0 , num_potential_files_in_post , [ ]
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
if not isinstance ( post_attachments , list ) : # Basic sanity check
2025-05-08 19:49:50 +05:30
self . logger ( f " ⚠️ Corrupt attachment data for post { post_id } (expected list, got { type ( post_attachments ) } ). Skipping attachments. " )
post_attachments = [ ]
2025-05-10 11:07:27 +05:30
potential_base_save_folders = [ ]
if not self . extract_links_only :
2025-05-09 19:03:01 +05:30
if self . use_subfolders :
2025-05-10 11:07:27 +05:30
# If character filters are active and it's manga mode, folder name comes from character filter matching post title
if self . filter_character_list and self . manga_mode_active :
for char_filter_name in self . filter_character_list :
if is_title_match_for_character ( post_title , char_filter_name ) :
2025-05-09 19:03:01 +05:30
cleaned_folder = clean_folder_name ( char_filter_name )
if cleaned_folder : potential_base_save_folders . append ( cleaned_folder )
2025-05-10 11:07:27 +05:30
# If not manga mode with character filter, or if manga mode didn't find a match, try known names / title
if not potential_base_save_folders :
2025-05-09 19:03:01 +05:30
derived_folders = match_folders_from_title ( post_title , self . known_names , self . unwanted_keywords )
if derived_folders :
potential_base_save_folders . extend ( derived_folders )
self . logger ( f " Folder Target(s) (Derived from Title & Known Names): { ' , ' . join ( derived_folders ) } " )
2025-05-10 11:07:27 +05:30
else :
2025-05-09 19:03:01 +05:30
fallback_folder = extract_folder_name_from_title ( post_title , self . unwanted_keywords )
potential_base_save_folders . append ( fallback_folder )
self . logger ( f " Folder Target (Fallback from Title): { fallback_folder } " )
2025-05-10 11:07:27 +05:30
if not potential_base_save_folders : # Absolute fallback
2025-05-09 19:03:01 +05:30
potential_base_save_folders . append ( clean_folder_name ( post_title if post_title else " untitled_creator_content " ) )
self . logger ( f " Folder Target (Final Fallback): { potential_base_save_folders [ 0 ] } " )
2025-05-10 11:07:27 +05:30
else : # Not using subfolders, save to root
potential_base_save_folders = [ " " ]
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# Skip post if folder name contains skip words (only if subfolders are used)
2025-05-09 19:03:01 +05:30
if not self . extract_links_only and self . use_subfolders and self . skip_words_list :
for folder_name_to_check in potential_base_save_folders :
2025-05-10 11:07:27 +05:30
if not folder_name_to_check : continue # Skip if base folder is root
2025-05-09 19:03:01 +05:30
if any ( skip_word . lower ( ) in folder_name_to_check . lower ( ) for skip_word in self . skip_words_list ) :
matched_skip = next ( ( sw for sw in self . skip_words_list if sw . lower ( ) in folder_name_to_check . lower ( ) ) , " unknown_skip_word " )
self . logger ( f " -> Skip Post (Folder Keyword): Potential folder ' { folder_name_to_check } ' contains ' { matched_skip } ' . " )
2025-05-10 11:07:27 +05:30
return 0 , num_potential_files_in_post , [ ]
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# External Link Extraction
2025-05-08 19:49:50 +05:30
if ( self . show_external_links or self . extract_links_only ) and post_content_html :
try :
2025-05-10 11:07:27 +05:30
unique_links_data = { }
2025-05-09 19:03:01 +05:30
for match in link_pattern . finditer ( post_content_html ) :
link_url = match . group ( 1 ) . strip ( )
2025-05-10 11:07:27 +05:30
link_inner_text = match . group ( 2 )
2025-05-09 19:03:01 +05:30
if not any ( ext in link_url . lower ( ) for ext in [ ' .css ' , ' .js ' , ' .ico ' , ' .xml ' , ' .svg ' ] ) \
and not link_url . startswith ( ' javascript: ' ) \
and link_url not in unique_links_data :
clean_link_text = re . sub ( r ' <.*?> ' , ' ' , link_inner_text )
clean_link_text = html . unescape ( clean_link_text ) . strip ( )
display_text = clean_link_text if clean_link_text else " [Link] "
unique_links_data [ link_url ] = display_text
links_emitted_count = 0
scraped_platforms = { ' kemono ' , ' coomer ' , ' patreon ' }
for link_url , link_text in unique_links_data . items ( ) :
platform = get_link_platform ( link_url )
if platform not in scraped_platforms :
if self . signals and hasattr ( self . signals , ' external_link_signal ' ) :
self . signals . external_link_signal . emit ( post_title , link_text , link_url , platform )
links_emitted_count + = 1
if links_emitted_count > 0 : self . logger ( f " 🔗 Found { links_emitted_count } potential external link(s) in post content. " )
2025-05-08 19:49:50 +05:30
except Exception as e : self . logger ( f " ⚠️ Error parsing post content for links: { e } \n { traceback . format_exc ( limit = 2 ) } " )
if self . extract_links_only :
2025-05-09 19:03:01 +05:30
self . logger ( f " Extract Links Only mode: Finished processing post { post_id } for links. " )
2025-05-10 11:07:27 +05:30
return 0 , 0 , [ ] # No files downloaded or skipped in this mode for this counter
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
# --- Prepare list of files to download from this post ---
all_files_from_post_api = [ ]
api_file_domain = urlparse ( self . api_url_input ) . netloc # Use domain from input URL
if not api_file_domain or not any ( d in api_file_domain . lower ( ) for d in [ ' kemono.su ' , ' kemono.party ' , ' coomer.su ' , ' coomer.party ' ] ) :
# Fallback if input URL isn't a direct site URL (e.g. API URL was passed, though less common for user input)
2025-05-09 19:03:01 +05:30
api_file_domain = " kemono.su " if " kemono " in self . service . lower ( ) else " coomer.party "
2025-05-10 11:07:27 +05:30
if post_main_file_info and isinstance ( post_main_file_info , dict ) and post_main_file_info . get ( ' path ' ) :
file_path = post_main_file_info [ ' path ' ] . lstrip ( ' / ' )
original_api_name = post_main_file_info . get ( ' name ' ) or os . path . basename ( file_path )
if original_api_name :
all_files_from_post_api . append ( {
' url ' : f " https:// { api_file_domain } { file_path } " if file_path . startswith ( ' / ' ) else f " https:// { api_file_domain } /data/ { file_path } " ,
' name ' : original_api_name , # This 'name' might be used for initial filtering if _original_name_for_log isn't set
' _original_name_for_log ' : original_api_name , # Store the true original for logging/type checks
' _is_thumbnail ' : self . download_thumbnails and is_image ( original_api_name )
} )
else : self . logger ( f " ⚠️ Skipping main file for post { post_id } : Missing name (Path: { file_path } ) " )
for idx , att_info in enumerate ( post_attachments ) :
if isinstance ( att_info , dict ) and att_info . get ( ' path ' ) :
att_path = att_info [ ' path ' ] . lstrip ( ' / ' )
original_api_att_name = att_info . get ( ' name ' ) or os . path . basename ( att_path )
if original_api_att_name :
all_files_from_post_api . append ( {
' url ' : f " https:// { api_file_domain } { att_path } " if att_path . startswith ( ' / ' ) else f " https:// { api_file_domain } /data/ { att_path } " ,
' name ' : original_api_att_name ,
' _original_name_for_log ' : original_api_att_name ,
' _is_thumbnail ' : self . download_thumbnails and is_image ( original_api_att_name )
2025-05-08 19:49:50 +05:30
} )
2025-05-10 11:07:27 +05:30
else : self . logger ( f " ⚠️ Skipping attachment { idx + 1 } for post { post_id } : Missing name (Path: { att_path } ) " )
else : self . logger ( f " ⚠️ Skipping invalid attachment { idx + 1 } for post { post_id } : { str ( att_info ) [ : 100 ] } " )
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
if self . download_thumbnails : # Filter non-images if in thumbnail mode
all_files_from_post_api = [ finfo for finfo in all_files_from_post_api if finfo [ ' _is_thumbnail ' ] ]
if not all_files_from_post_api :
self . logger ( f " -> No image thumbnails found for post { post_id } in thumbnail-only mode. " )
return 0 , 0 , [ ]
if not all_files_from_post_api :
2025-05-07 07:20:40 +05:30
self . logger ( f " No files found to download for post { post_id } . " )
2025-05-10 11:07:27 +05:30
return 0 , 0 , [ ]
# --- Filter out duplicates based on original API filename WITHIN THIS POST ---
files_to_download_info_list = [ ]
processed_original_filenames_in_this_post = set ( )
for file_info in all_files_from_post_api :
current_api_original_filename = file_info . get ( ' _original_name_for_log ' )
if current_api_original_filename in processed_original_filenames_in_this_post :
self . logger ( f " -> Skip Duplicate Original Name (within post { post_id } ): ' { current_api_original_filename } ' already processed/listed for this post. " )
total_skipped_this_post + = 1
else :
files_to_download_info_list . append ( file_info )
if current_api_original_filename :
processed_original_filenames_in_this_post . add ( current_api_original_filename )
if not files_to_download_info_list :
self . logger ( f " All files for post { post_id } were duplicate original names or skipped earlier. " )
return 0 , total_skipped_this_post , [ ]
num_files_in_this_post_for_naming = len ( files_to_download_info_list )
self . logger ( f " Identified { num_files_in_this_post_for_naming } unique original file(s) for potential download from post { post_id } . " )
2025-05-08 19:49:50 +05:30
with ThreadPoolExecutor ( max_workers = self . num_file_threads , thread_name_prefix = f ' P { post_id } File_ ' ) as file_pool :
futures_list = [ ]
2025-05-09 19:03:01 +05:30
for file_idx , file_info_to_dl in enumerate ( files_to_download_info_list ) :
2025-05-07 07:20:40 +05:30
if self . check_cancel ( ) : break
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
actual_target_full_paths_for_this_file = [ ]
2025-05-09 19:03:01 +05:30
if self . use_subfolders :
2025-05-10 11:07:27 +05:30
# If character filters are active and NOT manga mode, folder name comes from char filter matching filename
if self . filter_character_list and not self . manga_mode_active :
2025-05-09 19:03:01 +05:30
for char_name_from_filter_list in self . filter_character_list :
2025-05-10 11:07:27 +05:30
if is_filename_match_for_character ( file_info_to_dl . get ( ' _original_name_for_log ' ) , char_name_from_filter_list ) :
2025-05-09 19:03:01 +05:30
base_char_folder_path = os . path . join ( self . download_root , clean_folder_name ( char_name_from_filter_list ) )
if self . use_post_subfolders :
cleaned_title_for_subfolder = clean_folder_name ( post_title )
post_specific_subfolder_name = f " { post_id } _ { cleaned_title_for_subfolder } " if cleaned_title_for_subfolder else f " { post_id } _untitled "
actual_target_full_paths_for_this_file . append ( os . path . join ( base_char_folder_path , post_specific_subfolder_name ) )
else :
actual_target_full_paths_for_this_file . append ( base_char_folder_path )
2025-05-10 11:07:27 +05:30
else : # Manga mode with char filter (already handled for potential_base_save_folders) OR no char filter OR char filter didn't match filename in normal mode
for base_folder_name in potential_base_save_folders : # These were determined earlier
2025-05-09 19:03:01 +05:30
base_folder_path = os . path . join ( self . download_root , base_folder_name )
if self . use_post_subfolders :
cleaned_title_for_subfolder = clean_folder_name ( post_title )
post_specific_subfolder_name = f " { post_id } _ { cleaned_title_for_subfolder } " if cleaned_title_for_subfolder else f " { post_id } _untitled "
actual_target_full_paths_for_this_file . append ( os . path . join ( base_folder_path , post_specific_subfolder_name ) )
else :
actual_target_full_paths_for_this_file . append ( base_folder_path )
2025-05-10 11:07:27 +05:30
else : # Not using subfolders at all
2025-05-09 19:03:01 +05:30
actual_target_full_paths_for_this_file = [ self . download_root ]
2025-05-10 11:07:27 +05:30
# Override with custom folder name if it's a single post download and custom name is provided
if self . target_post_id_from_initial_url and self . custom_folder_name : # custom_folder_name is already cleaned
2025-05-09 19:03:01 +05:30
custom_full_path = os . path . join ( self . download_root , self . custom_folder_name )
actual_target_full_paths_for_this_file = [ custom_full_path ]
2025-05-10 11:07:27 +05:30
# Fallback if no specific target paths were determined (e.g. char filter normal mode no match)
2025-05-09 19:03:01 +05:30
if not actual_target_full_paths_for_this_file :
2025-05-10 11:07:27 +05:30
default_target_for_non_match = self . download_root
if self . use_subfolders : # Should use one of the potential_base_save_folders if subfolders enabled
gen_folder_name = potential_base_save_folders [ 0 ] if potential_base_save_folders and potential_base_save_folders [ 0 ] else clean_folder_name ( post_title )
default_target_for_non_match = os . path . join ( self . download_root , gen_folder_name )
if self . use_post_subfolders :
cleaned_title_for_subfolder = clean_folder_name ( post_title )
post_specific_subfolder_name = f " { post_id } _ { cleaned_title_for_subfolder } " if cleaned_title_for_subfolder else f " { post_id } _untitled "
default_target_for_non_match = os . path . join ( default_target_for_non_match , post_specific_subfolder_name )
actual_target_full_paths_for_this_file = [ default_target_for_non_match ]
for target_path in set ( actual_target_full_paths_for_this_file ) : # Use set to avoid duplicate downloads to same path
2025-05-08 19:49:50 +05:30
if self . check_cancel ( ) : break
futures_list . append ( file_pool . submit (
self . _download_single_file ,
file_info_to_dl ,
2025-05-10 11:07:27 +05:30
target_path ,
2025-05-08 19:49:50 +05:30
headers ,
2025-05-10 11:07:27 +05:30
post_id ,
self . skip_current_file_flag ,
post_title , # Pass post_title for manga naming
file_idx ,
num_files_in_this_post_for_naming
2025-05-08 19:49:50 +05:30
) )
2025-05-10 11:07:27 +05:30
if self . check_cancel ( ) : break
2025-05-09 19:03:01 +05:30
for future in as_completed ( futures_list ) :
2025-05-10 11:07:27 +05:30
if self . check_cancel ( ) :
for f_to_cancel in futures_list : # Attempt to cancel pending futures
2025-05-09 19:03:01 +05:30
if not f_to_cancel . done ( ) :
f_to_cancel . cancel ( )
2025-05-10 11:07:27 +05:30
break
2025-05-07 07:20:40 +05:30
try :
2025-05-10 11:07:27 +05:30
dl_count , skip_count , actual_filename_saved , original_kept_flag = future . result ( )
2025-05-08 19:49:50 +05:30
total_downloaded_this_post + = dl_count
total_skipped_this_post + = skip_count
2025-05-10 11:07:27 +05:30
if original_kept_flag and dl_count > 0 and actual_filename_saved : # Ensure filename is not empty
kept_original_filenames_for_log . append ( actual_filename_saved )
except CancelledError :
self . logger ( f " File download task for post { post_id } was cancelled. " )
total_skipped_this_post + = 1 # Assume one file per cancelled future
2025-05-09 19:03:01 +05:30
except Exception as exc_f :
2025-05-08 19:49:50 +05:30
self . logger ( f " ❌ File download task for post { post_id } resulted in error: { exc_f } " )
2025-05-10 11:07:27 +05:30
total_skipped_this_post + = 1 # Assume one file failed
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
# Clear file progress after all files for this post are done or cancelled
2025-05-08 19:49:50 +05:30
if self . signals and hasattr ( self . signals , ' file_progress_signal ' ) :
2025-05-10 11:07:27 +05:30
self . signals . file_progress_signal . emit ( " " , 0 , 0 )
2025-05-07 07:20:40 +05:30
2025-05-09 19:03:01 +05:30
if self . check_cancel ( ) : self . logger ( f " Post { post_id } processing interrupted/cancelled. " ) ;
2025-05-08 19:49:50 +05:30
else : self . logger ( f " Post { post_id } Summary: Downloaded= { total_downloaded_this_post } , Skipped Files= { total_skipped_this_post } " )
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
return total_downloaded_this_post , total_skipped_this_post , kept_original_filenames_for_log
2025-05-08 19:49:50 +05:30
class DownloadThread ( QThread ) :
2025-05-09 19:03:01 +05:30
"""
2025-05-10 11:07:27 +05:30
Manages the overall download process .
2025-05-09 19:03:01 +05:30
Fetches posts using download_from_api and then processes each post using PostProcessorWorker .
"""
2025-05-10 11:07:27 +05:30
progress_signal = pyqtSignal ( str )
add_character_prompt_signal = pyqtSignal ( str ) # For main app to show prompt
file_download_status_signal = pyqtSignal ( bool ) # True when a file dl starts, False when ends/fails
finished_signal = pyqtSignal ( int , int , bool , list ) # dl_count, skip_count, was_cancelled, kept_original_names
external_link_signal = pyqtSignal ( str , str , str , str ) # post_title, link_text, link_url, platform
file_progress_signal = pyqtSignal ( str , int , int ) # filename, downloaded_bytes, total_bytes
2025-05-08 19:49:50 +05:30
def __init__ ( self , api_url_input , output_dir , known_names_copy ,
2025-05-10 11:07:27 +05:30
cancellation_event , # This is a threading.Event from the main app
2025-05-08 19:49:50 +05:30
filter_character_list = None ,
filter_mode = ' all ' , skip_zip = True , skip_rar = True ,
use_subfolders = True , use_post_subfolders = False , custom_folder_name = None , compress_images = False ,
2025-05-07 07:20:40 +05:30
download_thumbnails = False , service = None , user_id = None ,
2025-05-08 19:49:50 +05:30
downloaded_files = None , downloaded_file_hashes = None , downloaded_files_lock = None , downloaded_file_hashes_lock = None ,
skip_words_list = None ,
2025-05-10 11:07:27 +05:30
skip_words_scope = SKIP_SCOPE_FILES ,
2025-05-08 19:49:50 +05:30
show_external_links = False ,
2025-05-09 19:03:01 +05:30
extract_links_only = False ,
2025-05-10 11:07:27 +05:30
num_file_threads_for_worker = 1 , # For PostProcessorWorker's internal pool
skip_current_file_flag = None , # This is a threading.Event
2025-05-09 19:03:01 +05:30
start_page = None , end_page = None ,
2025-05-10 11:07:27 +05:30
target_post_id_from_initial_url = None , # The specific post ID if single post URL
2025-05-08 19:49:50 +05:30
manga_mode_active = False ,
2025-05-10 11:07:27 +05:30
unwanted_keywords = None ,
manga_filename_style = STYLE_POST_TITLE
2025-05-08 19:49:50 +05:30
) :
2025-05-07 07:20:40 +05:30
super ( ) . __init__ ( )
2025-05-08 19:49:50 +05:30
self . api_url_input = api_url_input
2025-05-07 07:20:40 +05:30
self . output_dir = output_dir
2025-05-10 11:07:27 +05:30
self . known_names = list ( known_names_copy ) # Make a copy
self . cancellation_event = cancellation_event # Use the shared event
self . skip_current_file_flag = skip_current_file_flag # Use the shared event
self . initial_target_post_id = target_post_id_from_initial_url # Store the original target
2025-05-08 19:49:50 +05:30
self . filter_character_list = filter_character_list if filter_character_list else [ ]
2025-05-07 07:20:40 +05:30
self . filter_mode = filter_mode
self . skip_zip = skip_zip
self . skip_rar = skip_rar
self . use_subfolders = use_subfolders
2025-05-08 19:49:50 +05:30
self . use_post_subfolders = use_post_subfolders
2025-05-07 07:20:40 +05:30
self . custom_folder_name = custom_folder_name
self . compress_images = compress_images
self . download_thumbnails = download_thumbnails
self . service = service
self . user_id = user_id
self . skip_words_list = skip_words_list if skip_words_list is not None else [ ]
2025-05-10 11:07:27 +05:30
self . skip_words_scope = skip_words_scope
self . downloaded_files = downloaded_files # Should be the shared set from main app
self . downloaded_files_lock = downloaded_files_lock # Shared lock
self . downloaded_file_hashes = downloaded_file_hashes # Shared set
self . downloaded_file_hashes_lock = downloaded_file_hashes_lock # Shared lock
2025-05-09 19:03:01 +05:30
2025-05-10 11:07:27 +05:30
self . _add_character_response = None # For sync prompt result
self . prompt_mutex = QMutex ( ) # For sync prompt result
2025-05-09 19:03:01 +05:30
2025-05-08 19:49:50 +05:30
self . show_external_links = show_external_links
2025-05-09 19:03:01 +05:30
self . extract_links_only = extract_links_only
2025-05-08 19:49:50 +05:30
self . num_file_threads_for_worker = num_file_threads_for_worker
self . start_page = start_page
self . end_page = end_page
self . manga_mode_active = manga_mode_active
2025-05-09 19:03:01 +05:30
self . unwanted_keywords = unwanted_keywords if unwanted_keywords is not None else \
2025-05-10 11:07:27 +05:30
{ ' spicy ' , ' hd ' , ' nsfw ' , ' 4k ' , ' preview ' , ' teaser ' , ' clip ' }
self . manga_filename_style = manga_filename_style
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
if self . compress_images and Image is None : # Check Pillow again
2025-05-08 19:49:50 +05:30
self . logger ( " ⚠️ Image compression disabled: Pillow library not found (DownloadThread). " )
self . compress_images = False
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def logger ( self , message ) :
""" Emits a log message via the progress_signal. """
self . progress_signal . emit ( str ( message ) )
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def isInterruptionRequested ( self ) :
2025-05-09 19:03:01 +05:30
""" Checks if Qt interruption or manual cancellation event is set. """
2025-05-10 11:07:27 +05:30
# QThread's interruption is different from threading.Event
# We primarily use the threading.Event (self.cancellation_event)
return self . cancellation_event . is_set ( ) or super ( ) . isInterruptionRequested ( )
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def skip_file ( self ) :
2025-05-09 19:03:01 +05:30
""" Sets the flag to skip the currently processing file (if any). """
2025-05-10 11:07:27 +05:30
# This method is called from the main thread via the GUI button.
# It needs to signal the PostProcessorWorker's skip_event if one is active.
# However, the DownloadThread itself doesn't directly manage the skip_event for individual files.
# The skip_current_file_flag is passed to PostProcessorWorker.
2025-05-08 19:49:50 +05:30
if self . isRunning ( ) and self . skip_current_file_flag :
self . logger ( " ⏭️ Skip requested for current file (single-thread mode). " )
2025-05-10 11:07:27 +05:30
self . skip_current_file_flag . set ( ) # Signal the event
else : self . logger ( " ℹ ️ Skip file: No download active or skip flag not available for current context." )
2025-05-07 07:20:40 +05:30
2025-05-08 19:49:50 +05:30
def run ( self ) :
2025-05-09 19:03:01 +05:30
""" Main execution logic for the download thread. """
2025-05-08 19:49:50 +05:30
grand_total_downloaded_files = 0
grand_total_skipped_files = 0
2025-05-10 11:07:27 +05:30
grand_list_of_kept_original_filenames = [ ]
2025-05-08 19:49:50 +05:30
was_process_cancelled = False
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
# Create a PostProcessorSignals instance for this thread's workers
2025-05-08 19:49:50 +05:30
worker_signals_obj = PostProcessorSignals ( )
try :
2025-05-10 11:07:27 +05:30
# Connect signals from this worker_signals_obj to the DownloadThread's own signals
2025-05-08 19:49:50 +05:30
worker_signals_obj . progress_signal . connect ( self . progress_signal )
worker_signals_obj . file_download_status_signal . connect ( self . file_download_status_signal )
worker_signals_obj . file_progress_signal . connect ( self . file_progress_signal )
worker_signals_obj . external_link_signal . connect ( self . external_link_signal )
self . logger ( " Starting post fetch (single-threaded download process)... " )
post_generator = download_from_api (
self . api_url_input ,
2025-05-10 11:07:27 +05:30
logger = self . logger ,
2025-05-08 19:49:50 +05:30
start_page = self . start_page ,
end_page = self . end_page ,
manga_mode = self . manga_mode_active ,
2025-05-10 11:07:27 +05:30
cancellation_event = self . cancellation_event # Pass the shared event
2025-05-08 19:49:50 +05:30
)
2025-05-07 07:20:40 +05:30
2025-05-10 11:07:27 +05:30
for posts_batch_data in post_generator : # download_from_api yields batches
2025-05-08 19:49:50 +05:30
if self . isInterruptionRequested ( ) : was_process_cancelled = True ; break
2025-05-10 11:07:27 +05:30
for individual_post_data in posts_batch_data : # Iterate through posts in the batch
2025-05-08 19:49:50 +05:30
if self . isInterruptionRequested ( ) : was_process_cancelled = True ; break
2025-05-10 11:07:27 +05:30
# Create and run PostProcessorWorker for each post
# The PostProcessorWorker will use its own ThreadPoolExecutor for files if num_file_threads_for_worker > 1
2025-05-08 19:49:50 +05:30
post_processing_worker = PostProcessorWorker (
post_data = individual_post_data ,
download_root = self . output_dir ,
2025-05-10 11:07:27 +05:30
known_names = self . known_names , # Pass the copy
2025-05-08 19:49:50 +05:30
filter_character_list = self . filter_character_list ,
unwanted_keywords = self . unwanted_keywords ,
filter_mode = self . filter_mode ,
skip_zip = self . skip_zip , skip_rar = self . skip_rar ,
use_subfolders = self . use_subfolders , use_post_subfolders = self . use_post_subfolders ,
2025-05-10 11:07:27 +05:30
target_post_id_from_initial_url = self . initial_target_post_id , # Pass the original target
2025-05-08 19:49:50 +05:30
custom_folder_name = self . custom_folder_name ,
compress_images = self . compress_images , download_thumbnails = self . download_thumbnails ,
service = self . service , user_id = self . user_id ,
2025-05-10 11:07:27 +05:30
api_url_input = self . api_url_input , # Pass the original input URL
cancellation_event = self . cancellation_event , # Pass the shared event
signals = worker_signals_obj , # Pass the signals object for this thread
downloaded_files = self . downloaded_files , # Pass shared set
downloaded_file_hashes = self . downloaded_file_hashes , # Pass shared set
downloaded_files_lock = self . downloaded_files_lock , # Pass shared lock
downloaded_file_hashes_lock = self . downloaded_file_hashes_lock , # Pass shared lock
2025-05-08 19:49:50 +05:30
skip_words_list = self . skip_words_list ,
2025-05-10 11:07:27 +05:30
skip_words_scope = self . skip_words_scope ,
2025-05-08 19:49:50 +05:30
show_external_links = self . show_external_links ,
2025-05-09 19:03:01 +05:30
extract_links_only = self . extract_links_only ,
2025-05-10 11:07:27 +05:30
num_file_threads = self . num_file_threads_for_worker , # Threads for files within this post
skip_current_file_flag = self . skip_current_file_flag , # Pass the shared event
manga_mode_active = self . manga_mode_active ,
manga_filename_style = self . manga_filename_style
2025-05-08 19:49:50 +05:30
)
try :
2025-05-10 11:07:27 +05:30
# The process method of PostProcessorWorker handles its internal file downloads
dl_count , skip_count , kept_originals_this_post = post_processing_worker . process ( )
2025-05-08 19:49:50 +05:30
grand_total_downloaded_files + = dl_count
grand_total_skipped_files + = skip_count
2025-05-10 11:07:27 +05:30
if kept_originals_this_post : # This is a list
grand_list_of_kept_original_filenames . extend ( kept_originals_this_post )
2025-05-08 19:49:50 +05:30
except Exception as proc_err :
post_id_for_err = individual_post_data . get ( ' id ' , ' N/A ' )
self . logger ( f " ❌ Error processing post { post_id_for_err } in DownloadThread: { proc_err } " )
traceback . print_exc ( )
2025-05-10 11:07:27 +05:30
# Estimate skipped files for this post if worker crashes
2025-05-09 19:03:01 +05:30
num_potential_files_est = len ( individual_post_data . get ( ' attachments ' , [ ] ) ) + \
( 1 if individual_post_data . get ( ' file ' ) else 0 )
grand_total_skipped_files + = num_potential_files_est
2025-05-08 19:49:50 +05:30
if self . skip_current_file_flag and self . skip_current_file_flag . is_set ( ) :
2025-05-10 11:07:27 +05:30
self . skip_current_file_flag . clear ( ) # Reset for the next file/post
2025-05-09 19:03:01 +05:30
self . logger ( " Skip current file flag was processed and cleared by DownloadThread. " )
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
self . msleep ( 10 ) # Small delay between processing posts in single-thread mode
if was_process_cancelled : break # Break from outer loop (batches)
2025-05-08 19:49:50 +05:30
2025-05-10 11:07:27 +05:30
if not was_process_cancelled and not self . isInterruptionRequested ( ) : # Check again after loops
self . logger ( " ✅ All posts processed or end of content reached by DownloadThread. " )
2025-05-08 19:49:50 +05:30
except Exception as main_thread_err :
self . logger ( f " \n ❌ Critical error within DownloadThread run loop: { main_thread_err } " )
traceback . print_exc ( )
2025-05-10 11:07:27 +05:30
# Don't assume cancelled if an unexpected error occurs, let was_process_cancelled reflect actual interruption
if not self . isInterruptionRequested ( ) : was_process_cancelled = False
2025-05-08 19:49:50 +05:30
finally :
2025-05-10 11:07:27 +05:30
# Disconnect signals
2025-05-08 19:49:50 +05:30
try :
2025-05-09 19:03:01 +05:30
if worker_signals_obj : # Check if it was initialized
2025-05-08 19:49:50 +05:30
worker_signals_obj . progress_signal . disconnect ( self . progress_signal )
worker_signals_obj . file_download_status_signal . disconnect ( self . file_download_status_signal )
worker_signals_obj . external_link_signal . disconnect ( self . external_link_signal )
worker_signals_obj . file_progress_signal . disconnect ( self . file_progress_signal )
2025-05-10 11:07:27 +05:30
except ( TypeError , RuntimeError ) as e : #TypeError if not connected, RuntimeError if object deleted
2025-05-09 19:03:01 +05:30
self . logger ( f " ℹ ️ Note during DownloadThread signal disconnection: { e } " )
2025-05-10 11:07:27 +05:30
# Emit finished signal with final counts and status
self . finished_signal . emit ( grand_total_downloaded_files , grand_total_skipped_files , self . isInterruptionRequested ( ) , grand_list_of_kept_original_filenames )
2025-05-07 07:20:40 +05:30
def receive_add_character_result ( self , result ) :
2025-05-09 19:03:01 +05:30
""" Slot to receive the result from a character add prompt shown in the main thread. """
2025-05-10 11:07:27 +05:30
# This is called by a signal from the main thread
with QMutexLocker ( self . prompt_mutex ) :
2025-05-07 07:20:40 +05:30
self . _add_character_response = result
2025-05-10 11:07:27 +05:30
self . logger ( f " (DownloadThread) Received character prompt response: { ' Yes (added/confirmed) ' if result else ' No (declined/failed) ' } " )