29 Commits

Author SHA1 Message Date
Yuvi9587
0316813792 Delete dist directory 2025-05-26 13:55:54 +05:30
Yuvi9587
d201a5396c Delete build/Kemono Downloader directory 2025-05-26 13:55:25 +05:30
Yuvi9587
86f9396b6c Commit 2025-05-26 13:52:34 +05:30
Yuvi9587
0fb4bb3cb0 Commit 2025-05-26 13:52:07 +05:30
Yuvi9587
1528d7ce25 Update Read.png 2025-05-26 09:54:26 +05:30
Yuvi9587
4e7eeb7989 Commit 2025-05-26 09:52:06 +05:30
Yuvi9587
7f2976a4f4 Commit 2025-05-26 09:48:00 +05:30
Yuvi9587
8928cb92da readme.md 2025-05-26 01:39:39 +05:30
Yuvi9587
a181b76124 Update main.py 2025-05-25 17:18:11 +05:30
Yuvi9587
8f085a8f63 Commit 2025-05-25 21:52:04 +05:30
Yuvi9587
93a997351b Update readme.md 2025-05-25 21:22:47 +05:30
Yuvi9587
b3af6c1c15 Commit 2025-05-25 21:21:00 +05:30
Yuvi9587
4a65263f7d Commit 2025-05-25 19:49:17 +05:30
Yuvi9587
1091b5b9b4 Commit 2025-05-25 19:48:08 +05:30
Yuvi9587
f6b3ff2f5c Update main.py 2025-05-25 11:36:35 +05:30
Yuvi9587
b399bdf5cf readme.md 2025-05-25 16:54:35 +05:30
Yuvi9587
9ace161bc8 Update downloader_utils.py 2025-05-25 11:22:04 +05:30
Yuvi9587
66e52cfd78 Commit 2025-05-25 12:27:15 +05:30
Yuvi9587
e665fd3cde Commit 2025-05-25 11:38:38 +05:30
Yuvi9587
fc94f4c691 Commit 2025-05-24 22:55:23 +05:30
Yuvi9587
78e2012f04 Commit 2025-05-24 13:30:06 +05:30
Yuvi9587
3fe9dbacc6 Commit 2025-05-24 13:15:08 +05:30
Yuvi9587
004dea06e0 Commit 2025-05-24 16:22:47 +05:30
Yuvi9587
8994a69c34 Add files via upload 2025-05-24 10:36:15 +05:30
Yuvi9587
f4a692673e main.py 2025-05-24 10:35:46 +05:30
Yuvi9587
4cb5f14ef6 Delete Known.txt 2025-05-23 21:01:05 +05:30
Yuvi9587
a596c4f350 Update main.py 2025-05-23 20:59:35 +05:30
Yuvi9587
e091c60d29 Commit 2025-05-23 20:23:36 +05:30
Yuvi9587
d2ea026a41 Commit 2025-05-23 19:11:52 +05:30
9 changed files with 1279 additions and 171 deletions

BIN
Kemono.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -1 +0,0 @@
([Yor], Yor Briar, Yor Forger)

BIN
Read.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 165 KiB

After

Width:  |  Height:  |  Size: 162 KiB

BIN
assets/discord.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

BIN
assets/github.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
assets/instagram.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

View File

@@ -31,6 +31,8 @@ from io import BytesIO
STYLE_POST_TITLE = "post_title"
STYLE_ORIGINAL_NAME = "original_name"
STYLE_DATE_BASED = "date_based" # For manga date-based sequential naming
MANGA_DATE_PREFIX_DEFAULT = "" # Default for the new prefix
STYLE_POST_TITLE_GLOBAL_NUMBERING = "post_title_global_numbering" # For manga post title + global counter
SKIP_SCOPE_FILES = "files"
SKIP_SCOPE_POSTS = "posts"
@@ -50,6 +52,9 @@ KNOWN_NAMES = [] # This will now store dicts: {'name': str, 'is_group': bool, 'a
MIN_SIZE_FOR_MULTIPART_DOWNLOAD = 10 * 1024 * 1024 # 10 MB - Stays the same
MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 15 # Max concurrent connections for a single file
# Max length for a single filename or folder name component to ensure cross-OS compatibility
# Windows MAX_PATH is 260 for the full path. Individual components are usually shorter.
MAX_FILENAME_COMPONENT_LENGTH = 150
IMAGE_EXTENSIONS = {
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp',
@@ -62,6 +67,11 @@ VIDEO_EXTENSIONS = {
ARCHIVE_EXTENSIONS = {
'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'
}
AUDIO_EXTENSIONS = {
'.mp3', '.wav', '.aac', '.flac', '.ogg', '.wma', '.m4a', '.opus',
'.aiff', '.ape', '.mid', '.midi'
}
def parse_cookie_string(cookie_string):
"""Parses a 'name=value; name2=value2' cookie string into a dict."""
cookies = {}
@@ -130,18 +140,46 @@ def clean_folder_name(name):
if not cleaned: # If empty after initial cleaning
return "untitled_folder"
# Truncate if too long
if len(cleaned) > MAX_FILENAME_COMPONENT_LENGTH:
cleaned = cleaned[:MAX_FILENAME_COMPONENT_LENGTH]
# After truncation, it's possible a new trailing space/dot is at the end
# or an existing one remains. So, strip them using the loop below.
# Strip trailing dots/spaces (original logic, now applied to potentially truncated name)
temp_name = cleaned
while len(temp_name) > 0 and (temp_name.endswith('.') or temp_name.endswith(' ')):
temp_name = temp_name[:-1]
return temp_name if temp_name else "untitled_folder"
def clean_filename(name, max_length=None):
    """Return a filesystem-safe file name derived from *name*.

    Characters other than word characters, whitespace, ``-``, ``_``, ``.``,
    ``(`` and ``)`` are removed, surrounding whitespace is stripped and
    internal whitespace runs collapse to a single space. The result is then
    shortened to fit the length limit, keeping the extension intact whenever
    the extension itself fits.

    Args:
        name: Proposed file name. Non-string values are converted with
            ``str()``.
        max_length: Optional override for the maximum length of the returned
            name. Defaults to ``MAX_FILENAME_COMPONENT_LENGTH``.

    Returns:
        The cleaned (and possibly truncated) name, or ``"untitled_file"`` when
        nothing usable remains after cleaning.

    Raises:
        ValueError: If an explicit ``max_length`` smaller than 1 is given.
    """
    limit = MAX_FILENAME_COMPONENT_LENGTH if max_length is None else max_length
    if limit < 1:
        raise ValueError("max_length must be a positive integer")

    if not isinstance(name, str):
        name = str(name)

    # Drop disallowed characters first, then normalise whitespace. Spaces are
    # kept (not turned into underscores) so names stay human-readable.
    cleaned = re.sub(r'[^\w\s\-\_\.\(\)]', '', name).strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    if not cleaned:
        return "untitled_file"

    base_name, ext = os.path.splitext(cleaned)
    # Reserve room for the extension so truncation only eats the base name.
    max_base_len = limit - len(ext)
    if len(base_name) > max_base_len:
        if max_base_len <= 0:
            # The extension alone fills (or exceeds) the limit: the only way
            # to honour the limit is to cut the whole string, extension
            # included.
            return cleaned[:limit]
        base_name = base_name[:max_base_len]

    return (base_name + ext) or "untitled_file"
def strip_html_tags(html_text):
if not html_text: return ""
@@ -217,6 +255,12 @@ def is_archive(filename):
_, ext = os.path.splitext(filename)
return ext.lower() in ARCHIVE_EXTENSIONS
def is_audio(filename):
    """Report whether *filename* ends with a known audio extension.

    The extension is compared case-insensitively against
    ``AUDIO_EXTENSIONS``. Falsy input (``None`` or an empty string) is never
    considered audio.
    """
    if filename:
        extension = os.path.splitext(filename)[1]
        return extension.lower() in AUDIO_EXTENSIONS
    return False
def is_post_url(url):
if not isinstance(url, str): return False
@@ -276,7 +320,7 @@ def prepare_cookies_for_request(use_cookie_flag, cookie_text_input, selected_coo
return None
def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None, pause_event=None, cookies_dict=None):
if cancellation_event and cancellation_event.is_set():
if cancellation_event and cancellation_event.is_set(): # type: ignore
logger(" Fetch cancelled before request.")
raise RuntimeError("Fetch operation cancelled by user.")
@@ -284,7 +328,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
logger(" Post fetching paused...")
while pause_event.is_set():
if cancellation_event and cancellation_event.is_set():
logger(" Post fetching cancelled while paused.")
logger(" Post fetching cancelled while paused.") # type: ignore
raise RuntimeError("Fetch operation cancelled by user.")
time.sleep(0.5)
logger(" Post fetching resumed.")
@@ -379,21 +423,37 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
page_size = 50
if is_creator_feed_for_manga:
logger(" Manga Mode: Fetching all posts to reverse order (oldest posts processed first)...")
logger(" Manga Mode: Fetching posts to sort by date (oldest processed first)...")
all_posts_for_manga_mode = []
current_offset_manga = 0
# Determine starting page and offset for manga mode
if start_page and start_page > 1:
current_offset_manga = (start_page - 1) * page_size
logger(f" Manga Mode: Starting fetch from page {start_page} (offset {current_offset_manga}).")
elif start_page: # start_page is 1
logger(f" Manga Mode: Starting fetch from page 1 (offset 0).")
if end_page:
logger(f" Manga Mode: Will fetch up to page {end_page}.")
while True:
if pause_event and pause_event.is_set():
logger(" Manga mode post fetching paused...") # type: ignor
logger(" Manga mode post fetching paused...") # type: ignore
while pause_event.is_set():
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post fetching cancelled while paused.")
logger(" Manga mode post fetching cancelled while paused.") # type: ignore
break
time.sleep(0.5)
if not (cancellation_event and cancellation_event.is_set()): logger(" Manga mode post fetching resumed.")
if cancellation_event and cancellation_event.is_set():
logger(" Manga mode post fetching cancelled.")
break
current_page_num_manga = (current_offset_manga // page_size) + 1
if end_page and current_page_num_manga > end_page:
logger(f" Manga Mode: Reached specified end page ({end_page}). Stopping post fetch.")
break
try:
posts_batch_manga = fetch_posts_paginated(api_base_url, headers, current_offset_manga, logger, cancellation_event, pause_event, cookies_dict=cookies_for_api)
if not isinstance(posts_batch_manga, list):
@@ -401,7 +461,11 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
break
if not posts_batch_manga:
logger("✅ Reached end of posts (Manga Mode fetch all).")
break
if start_page and not end_page and current_page_num_manga < start_page: # Started on a page with no posts
logger(f" Manga Mode: No posts found on or after specified start page {start_page}.")
elif end_page and current_page_num_manga <= end_page and not all_posts_for_manga_mode: # Range specified but no posts in it
logger(f" Manga Mode: No posts found within the specified page range ({start_page or 1}-{end_page}).")
break # No more posts from API
all_posts_for_manga_mode.extend(posts_batch_manga)
current_offset_manga += page_size # Increment by page_size for the next API call's 'o' parameter
time.sleep(0.6)
@@ -420,7 +484,7 @@ def download_from_api(api_url_input, logger=print, start_page=None, end_page=Non
if all_posts_for_manga_mode:
logger(f" Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Sorting by publication date (oldest first)...")
# ... (rest of sorting and yielding logic for manga mode remains the same) ...
def sort_key_tuple(post):
published_date_str = post.get('published')
added_date_str = post.get('added')
@@ -583,8 +647,11 @@ class PostProcessorWorker:
use_cookie=False, # Added missing parameter
selected_cookie_file=None, # Added missing parameter
app_base_dir=None, # New parameter for app's base directory
manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT, # New parameter for date-based prefix
manga_date_file_counter_ref=None, # New parameter for date-based manga naming
):
scan_content_for_images=False, # New flag for scanning HTML content
manga_global_file_counter_ref=None, # New parameter for global numbering
): # type: ignore
self.post = post_data
self.download_root = download_root
self.known_names = known_names
@@ -630,7 +697,10 @@ class PostProcessorWorker:
self.selected_cookie_file = selected_cookie_file # Store selected cookie file path
self.app_base_dir = app_base_dir # Store app base dir
self.cookie_text = cookie_text # Store cookie text
self.manga_date_prefix = manga_date_prefix # Store the prefix
self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store global counter
self.use_cookie = use_cookie # Store cookie setting
self.scan_content_for_images = scan_content_for_images # Store new flag
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found.")
@@ -666,7 +736,8 @@ class PostProcessorWorker:
def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event, # skip_event is threading.Event
post_title="", file_index_in_post=0, num_files_in_this_post=1,
manga_date_file_counter_ref=None): # Added manga_date_file_counter_ref
was_original_name_kept_flag = False
was_original_name_kept_flag = False
manga_global_file_counter_ref = None # Placeholder, will be passed from process()
final_filename_saved_for_return = ""
def _get_current_character_filters(self):
@@ -677,7 +748,8 @@ class PostProcessorWorker:
def _download_single_file(self, file_info, target_folder_path, headers, original_post_id_for_log, skip_event,
post_title="", file_index_in_post=0, num_files_in_this_post=1, # Added manga_date_file_counter_ref
manga_date_file_counter_ref=None,
forced_filename_override=None): # New for retries
forced_filename_override=None, # New for retries
manga_global_file_counter_ref=None): # New for global numbering
was_original_name_kept_flag = False
final_filename_saved_for_return = ""
retry_later_details = None # For storing info if retryable failure
@@ -709,6 +781,14 @@ class PostProcessorWorker:
if self.manga_mode_active: # Note: duplicate_file_mode is overridden to "Delete" in main.py if manga_mode is on
if self.manga_filename_style == STYLE_ORIGINAL_NAME:
filename_to_save_in_main_path = clean_filename(api_original_filename)
# Apply prefix if provided for Original Name style
if self.manga_date_prefix and self.manga_date_prefix.strip():
cleaned_prefix = clean_filename(self.manga_date_prefix.strip())
if cleaned_prefix:
filename_to_save_in_main_path = f"{cleaned_prefix} {filename_to_save_in_main_path}"
else:
self.logger(f"⚠️ Manga Original Name Mode: Provided prefix '{self.manga_date_prefix}' was empty after cleaning. Using original name only.")
was_original_name_kept_flag = True
elif self.manga_filename_style == STYLE_POST_TITLE:
if post_title and post_title.strip():
@@ -717,8 +797,8 @@ class PostProcessorWorker:
if file_index_in_post == 0:
filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
else:
filename_to_save_in_main_path = clean_filename(api_original_filename)
was_original_name_kept_flag = True
filename_to_save_in_main_path = f"{cleaned_post_title_base}_{file_index_in_post}{original_ext}"
was_original_name_kept_flag = False # Name is derived, not original
else:
filename_to_save_in_main_path = f"{cleaned_post_title_base}{original_ext}"
else:
@@ -734,11 +814,32 @@ class PostProcessorWorker:
counter_val_for_filename = manga_date_file_counter_ref[0]
manga_date_file_counter_ref[0] += 1
filename_to_save_in_main_path = f"{counter_val_for_filename:03d}{original_ext}"
base_numbered_name = f"{counter_val_for_filename:03d}"
if self.manga_date_prefix and self.manga_date_prefix.strip():
cleaned_prefix = clean_filename(self.manga_date_prefix.strip())
if cleaned_prefix: # Ensure prefix is not empty after cleaning
filename_to_save_in_main_path = f"{cleaned_prefix} {base_numbered_name}{original_ext}"
else: # Prefix became empty after cleaning
filename_to_save_in_main_path = f"{base_numbered_name}{original_ext}"; self.logger(f"⚠️ Manga Date Mode: Provided prefix '{self.manga_date_prefix}' was empty after cleaning. Using number only.")
else: # No prefix provided
filename_to_save_in_main_path = f"{base_numbered_name}{original_ext}"
else:
self.logger(f"⚠️ Manga Date Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_date_file_counter_ref}")
filename_to_save_in_main_path = clean_filename(api_original_filename)
self.logger(f"⚠️ Manga mode (Date Based Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path}' for post {original_post_id_for_log}.")
elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING:
if manga_global_file_counter_ref is not None and len(manga_global_file_counter_ref) == 2:
counter_val_for_filename = -1
counter_lock = manga_global_file_counter_ref[1]
with counter_lock:
counter_val_for_filename = manga_global_file_counter_ref[0]
manga_global_file_counter_ref[0] += 1
cleaned_post_title_base_for_global = clean_filename(post_title.strip() if post_title and post_title.strip() else "post")
filename_to_save_in_main_path = f"{cleaned_post_title_base_for_global}_{counter_val_for_filename:03d}{original_ext}"
else:
self.logger(f"⚠️ Manga Title+GlobalNum Mode: Counter ref not provided or malformed for '{api_original_filename}'. Using original. Ref: {manga_global_file_counter_ref}")
self.logger(f"⚠️ Manga mode (Date Based Style Fallback): Using cleaned original filename '{filename_to_save_in_main_path}' for post {original_post_id_for_log}.")
else:
self.logger(f"⚠️ Manga mode: Unknown filename style '{self.manga_filename_style}'. Defaulting to original filename for '{api_original_filename}'.")
filename_to_save_in_main_path = clean_filename(api_original_filename)
@@ -758,8 +859,10 @@ class PostProcessorWorker:
if not word_to_remove: continue
pattern = re.compile(re.escape(word_to_remove), re.IGNORECASE)
modified_base_name = pattern.sub("", modified_base_name)
modified_base_name = re.sub(r'[_.\s-]+', '_', modified_base_name)
modified_base_name = modified_base_name.strip('_')
# After removals, normalize all seps (underscore, dot, multiple spaces, hyphen) to a single space, then strip.
modified_base_name = re.sub(r'[_.\s-]+', ' ', modified_base_name) # Convert all separators to spaces
modified_base_name = re.sub(r'\s+', ' ', modified_base_name) # Condense multiple spaces to one
modified_base_name = modified_base_name.strip() # Remove leading/trailing spaces
if modified_base_name and modified_base_name != ext_for_removal.lstrip('.'):
filename_to_save_in_main_path = modified_base_name + ext_for_removal
else:
@@ -769,6 +872,7 @@ class PostProcessorWorker:
is_img_type = is_image(api_original_filename)
is_vid_type = is_video(api_original_filename)
is_archive_type = is_archive(api_original_filename)
is_audio_type = is_audio(api_original_filename)
if self.filter_mode == 'archive':
if not is_archive_type:
@@ -782,6 +886,10 @@ class PostProcessorWorker:
if not is_vid_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Video).")
return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
elif self.filter_mode == 'audio': # New audio filter mode
if not is_audio_type:
self.logger(f" -> Filter Skip: '{api_original_filename}' (Not Audio).")
return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
if self.skip_zip and is_zip(api_original_filename):
self.logger(f" -> Pref Skip: '{api_original_filename}' (ZIP).")
@@ -1280,14 +1388,14 @@ class PostProcessorWorker:
if original_api_name:
all_files_from_post_api.append({
'url': f"https://{api_file_domain}{file_path}" if file_path.startswith('/') else f"https://{api_file_domain}/data/{file_path}",
'name': original_api_name,
'name': original_api_name, # This is the cleaned/API provided name
'_original_name_for_log': original_api_name,
'_is_thumbnail': self.download_thumbnails and is_image(original_api_name)
'_is_thumbnail': is_image(original_api_name) # Mark if it's an image from API
})
else: self.logger(f" ⚠️ Skipping main file for post {post_id}: Missing name (Path: {file_path})")
for idx, att_info in enumerate(post_attachments):
if isinstance(att_info, dict) and att_info.get('path'):
if isinstance(att_info, dict) and att_info.get('path'): # Ensure att_info is a dict
att_path = att_info['path'].lstrip('/')
original_api_att_name = att_info.get('name') or os.path.basename(att_path)
if original_api_att_name:
@@ -1295,16 +1403,99 @@ class PostProcessorWorker:
'url': f"https://{api_file_domain}{att_path}" if att_path.startswith('/') else f"https://{api_file_domain}/data/{att_path}",
'name': original_api_att_name,
'_original_name_for_log': original_api_att_name,
'_is_thumbnail': self.download_thumbnails and is_image(original_api_att_name)
'_is_thumbnail': is_image(original_api_att_name) # Mark if it's an image from API
})
else: self.logger(f" ⚠️ Skipping attachment {idx+1} for post {post_id}: Missing name (Path: {att_path})")
else: self.logger(f" ⚠️ Skipping invalid attachment {idx+1} for post {post_id}: {str(att_info)[:100]}")
# --- New: Scan post content for additional image URLs if enabled ---
if self.scan_content_for_images and post_content_html and not self.extract_links_only: # This block was duplicated, ensure only one exists
self.logger(f" Scanning post content for additional image URLs (Post ID: {post_id})...")
parsed_input_url = urlparse(self.api_url_input)
base_url_for_relative_paths = f"{parsed_input_url.scheme}://{parsed_input_url.netloc}"
img_ext_pattern = "|".join(ext.lstrip('.') for ext in IMAGE_EXTENSIONS)
# 1. Regex for direct absolute image URLs in text
direct_url_pattern_str = r"""(?i)\b(https?://[^\s"'<>\[\]\{\}\|\^\\^~\[\]`]+\.(?:""" + img_ext_pattern + r"""))\b"""
# 2. Regex for <img> tags (captures src content)
img_tag_src_pattern_str = r"""<img\s+[^>]*?src\s*=\s*["']([^"']+)["']"""
found_image_sources = set()
for direct_url_match in re.finditer(direct_url_pattern_str, post_content_html):
found_image_sources.add(direct_url_match.group(1))
for img_tag_match in re.finditer(img_tag_src_pattern_str, post_content_html, re.IGNORECASE):
src_attr = img_tag_match.group(1).strip()
src_attr = html.unescape(src_attr)
if not src_attr: continue
resolved_src_url = ""
if src_attr.startswith(('http://', 'https://')):
resolved_src_url = src_attr
elif src_attr.startswith('//'):
resolved_src_url = f"{parsed_input_url.scheme}:{src_attr}"
elif src_attr.startswith('/'):
resolved_src_url = f"{base_url_for_relative_paths}{src_attr}"
if resolved_src_url:
parsed_resolved_url = urlparse(resolved_src_url)
if any(parsed_resolved_url.path.lower().endswith(ext) for ext in IMAGE_EXTENSIONS):
found_image_sources.add(resolved_src_url)
if found_image_sources:
self.logger(f" Found {len(found_image_sources)} potential image URLs/sources in content.")
existing_urls_in_api_list = {f_info['url'] for f_info in all_files_from_post_api}
for found_url in found_image_sources: # Iterate over the unique, resolved URLs
if self.check_cancel(): break
if found_url in existing_urls_in_api_list:
self.logger(f" Skipping URL from content (already in API list or previously added from content): {found_url[:70]}...")
continue
try:
parsed_found_url = urlparse(found_url)
url_filename = os.path.basename(parsed_found_url.path)
if not url_filename or not is_image(url_filename):
self.logger(f" Skipping URL from content (no filename part or not an image extension): {found_url[:70]}...")
continue
self.logger(f" Adding image from content: {url_filename} (URL: {found_url[:70]}...)")
all_files_from_post_api.append({
'url': found_url,
'name': url_filename,
'_original_name_for_log': url_filename,
'_is_thumbnail': False, # Images from content are not API thumbnails
'_from_content_scan': True
})
existing_urls_in_api_list.add(found_url)
except Exception as e_url_parse:
self.logger(f" Error processing URL from content '{found_url[:70]}...': {e_url_parse}")
else:
self.logger(f" No additional image URLs found in post content scan for post {post_id}.")
# --- End of new content scanning logic ---
# --- Final filtering based on download_thumbnails and scan_content_for_images flags ---
if self.download_thumbnails:
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo['_is_thumbnail']]
if not all_files_from_post_api:
self.logger(f" -> No image thumbnails found for post {post_id} in thumbnail-only mode.")
return 0, 0, [], []
if self.scan_content_for_images:
# Both "Download Thumbnails Only" AND "Scan Content for Images" are checked.
# Prioritize images from content scan.
self.logger(f" Mode: 'Download Thumbnails Only' + 'Scan Content for Images' active. Prioritizing images from content scan for post {post_id}.")
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo.get('_from_content_scan')]
if not all_files_from_post_api:
self.logger(f" -> No images found via content scan for post {post_id} in this combined mode.")
return 0, 0, [], [] # No files to download for this post
else:
# Only "Download Thumbnails Only" is checked. Filter for API thumbnails.
self.logger(f" Mode: 'Download Thumbnails Only' active. Filtering for API thumbnails for post {post_id}.")
all_files_from_post_api = [finfo for finfo in all_files_from_post_api if finfo.get('_is_thumbnail')]
if not all_files_from_post_api:
self.logger(f" -> No API image thumbnails found for post {post_id} in thumbnail-only mode.")
return 0, 0, [], [] # No files to download for this post
# If self.download_thumbnails is False, all_files_from_post_api remains as is.
# It will contain all API files (images marked with _is_thumbnail: True, others False)
# and potentially content-scanned images (marked with _from_content_scan: True).
if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED:
def natural_sort_key_for_files(file_api_info):
name = file_api_info.get('_original_name_for_log', '').lower()
@@ -1429,6 +1620,14 @@ class PostProcessorWorker:
target_folder_path_for_this_file = current_path_for_file
manga_date_counter_to_pass = None
manga_global_counter_to_pass = None
if self.manga_mode_active:
if self.manga_filename_style == STYLE_DATE_BASED:
manga_date_counter_to_pass = self.manga_date_file_counter_ref
elif self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING:
manga_global_counter_to_pass = self.manga_global_file_counter_ref if self.manga_global_file_counter_ref is not None else self.manga_date_file_counter_ref
futures_list.append(file_pool.submit(
self._download_single_file,
file_info_to_dl,
@@ -1436,8 +1635,9 @@ class PostProcessorWorker:
headers,
post_id,
self.skip_current_file_flag,
post_title=post_title, # Keyword argument
manga_date_file_counter_ref=self.manga_date_file_counter_ref if self.manga_mode_active and self.manga_filename_style == STYLE_DATE_BASED else None,
post_title=post_title,
manga_date_file_counter_ref=manga_date_counter_to_pass,
manga_global_file_counter_ref=manga_global_counter_to_pass,
file_index_in_post=file_idx, # Changed to keyword argument
num_files_in_this_post=num_files_in_this_post_for_naming # Changed to keyword argument
))
@@ -1501,10 +1701,15 @@ class DownloadThread(QThread):
manga_filename_style=STYLE_POST_TITLE,
char_filter_scope=CHAR_SCOPE_FILES, # manga_date_file_counter_ref removed from here
remove_from_filename_words_list=None,
manga_date_prefix=MANGA_DATE_PREFIX_DEFAULT, # New parameter
allow_multipart_download=True,
selected_cookie_file=None, # New parameter for selected cookie file
app_base_dir=None, # New parameter
manga_date_file_counter_ref=None, # New parameter
manga_global_file_counter_ref=None, # New parameter for global numbering
use_cookie=False, # Added: Expected by main.py
scan_content_for_images=False, # Added new flag
cookie_text="", # Added: Expected by main.py
):
super().__init__()
self.api_url_input = api_url_input
@@ -1547,12 +1752,15 @@ class DownloadThread(QThread):
self.manga_filename_style = manga_filename_style
self.char_filter_scope = char_filter_scope
self.remove_from_filename_words_list = remove_from_filename_words_list
self.manga_date_prefix = manga_date_prefix # Store the prefix
self.allow_multipart_download = allow_multipart_download
self.selected_cookie_file = selected_cookie_file # Store selected cookie file
self.app_base_dir = app_base_dir # Store app base dir
self.cookie_text = cookie_text # Store cookie text
self.use_cookie = use_cookie # Store cookie setting
self.manga_date_file_counter_ref = manga_date_file_counter_ref # Store for passing to worker by DownloadThread
self.scan_content_for_images = scan_content_for_images # Store new flag
self.manga_global_file_counter_ref = manga_global_file_counter_ref # Store for global numbering
if self.compress_images and Image is None:
self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
self.compress_images = False
@@ -1591,8 +1799,8 @@ class DownloadThread(QThread):
not self.extract_links_only and current_manga_date_file_counter_ref is None: # Check if it needs calculation
series_scan_dir = self.output_dir
if self.use_subfolders:
if self.filter_character_list_objects and self.filter_character_list_objects[0] and self.filter_character_list_objects[0].get("name"):
series_folder_name = clean_folder_name(self.filter_character_list_objects[0]["name"])
if self.filter_character_list_objects_initial and self.filter_character_list_objects_initial[0] and self.filter_character_list_objects_initial[0].get("name"):
series_folder_name = clean_folder_name(self.filter_character_list_objects_initial[0]["name"])
series_scan_dir = os.path.join(series_scan_dir, series_folder_name)
elif self.service and self.user_id:
creator_based_folder_name = clean_folder_name(self.user_id)
@@ -1605,9 +1813,16 @@ class DownloadThread(QThread):
for filename_to_check in filenames_in_dir:
base_name_no_ext = os.path.splitext(filename_to_check)[0]
match = re.match(r"(\d{3,})", base_name_no_ext)
if match: highest_num = max(highest_num, int(match.group(1)))
if match: highest_num = max(highest_num, int(match.group(1))) # Corrected indentation
current_manga_date_file_counter_ref = [highest_num + 1, threading.Lock()]
self.logger(f" [Thread] Manga Date Mode: Initialized counter at {current_manga_date_file_counter_ref[0]}.")
elif self.manga_mode_active and self.manga_filename_style == STYLE_POST_TITLE_GLOBAL_NUMBERING and not self.extract_links_only and current_manga_date_file_counter_ref is None: # Use current_manga_date_file_counter_ref for STYLE_POST_TITLE_GLOBAL_NUMBERING as well
# For global numbering, we always start from 1 for the session unless a ref is passed.
# If you need to resume global numbering across sessions, similar scanning logic would be needed.
# For now, it starts at 1 per session if no ref is provided.
current_manga_date_file_counter_ref = [1, threading.Lock()] # Start global numbering at 1
self.logger(f" [Thread] Manga Title+GlobalNum Mode: Initialized counter at {current_manga_date_file_counter_ref[0]}.")
worker_signals_obj = PostProcessorSignals()
try:
worker_signals_obj.progress_signal.connect(self.progress_signal)
@@ -1668,14 +1883,17 @@ class DownloadThread(QThread):
skip_current_file_flag=self.skip_current_file_flag,
manga_mode_active=self.manga_mode_active,
manga_filename_style=self.manga_filename_style,
manga_date_prefix=self.manga_date_prefix, # Pass the prefix
char_filter_scope=self.char_filter_scope,
remove_from_filename_words_list=self.remove_from_filename_words_list,
allow_multipart_download=self.allow_multipart_download,
selected_cookie_file=self.selected_cookie_file, # Pass selected cookie file
app_base_dir=self.app_base_dir, # Pass app_base_dir
cookie_text=self.cookie_text, # Pass cookie text
manga_global_file_counter_ref=self.manga_global_file_counter_ref, # Pass the ref
use_cookie=self.use_cookie, # Pass cookie setting to worker
manga_date_file_counter_ref=current_manga_date_file_counter_ref, # Pass the calculated or passed-in ref
scan_content_for_images=self.scan_content_for_images, # Pass new flag
)
try:
dl_count, skip_count, kept_originals_this_post, retryable_failures = post_processing_worker.process()

1100
main.py

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
<h1 align="center">Kemono Downloader v4.0.0</h1>
<h1 align="center">Kemono Downloader v4.1.1</h1>
<div align="center">
<img src="https://github.com/Yuvi9587/Kemono-Downloader/blob/main/Read.png" alt="Kemono Downloader"/>
@@ -11,9 +11,33 @@ Built with **PyQt5**, this tool is ideal for users who want deep filtering, cust
---
## What's New in v4.0.0?
## What's New in v4.1.1?
Version 3.5.0 focuses on enhancing access to content and providing even smarter organization:
Version 4.1.1 introduces a smarter way to capture images that might be embedded directly within post descriptions, enhancing content discovery.
### "Scan Content for Images" Feature
- **Enhanced Image Discovery:** A new checkbox, "**Scan Content for Images**," has been added to the UI (grouped with "Download Thumbnails Only" and "Compress Large Images").
- **How it Works:**
- When enabled, the downloader scans the HTML content of posts (e.g., the description area).
- It looks for images embedded via HTML `<img>` tags or as direct absolute URL links (e.g., `https://.../image.png`).
- It intelligently resolves relative image paths found in `<img>` tags (like `/data/image.jpg`) into full, downloadable URLs.
- This is particularly useful for capturing images that are part of the post's narrative but not formally listed in the API's file or attachment sections.
- **Default State:** This option is **unchecked by default**.
- **Interaction with "Download Thumbnails Only":**
- If you check "Download Thumbnails Only":
- The "Scan Content for Images" checkbox will **automatically become checked and disabled** (locked).
- In this combined mode, the downloader will **only download images found by the content scan**. API-listed thumbnails will be ignored, prioritizing images from the post's body.
- If you uncheck "Download Thumbnails Only":
- The "Scan Content for Images" checkbox will become **enabled again and revert to being unchecked**. You can then manually enable it if you wish to scan content without being in thumbnail-only mode.
This feature ensures a more comprehensive download experience, especially for posts where images are integrated directly into the text.
---
## Previous Update: What's New in v4.0.1?
Version 4.0.1 focuses on enhancing access to content and providing even smarter organization:
### Cookie Management
@@ -71,13 +95,30 @@ This field allows for dynamic filtering for the current download session and pro
- **Adding New Names from Filters:** When you use the "Filter by Character(s)" input, if any names or groups are new (not already in `Known.txt`), a dialog will appear after you start the download. This dialog allows you to select which of these new names/groups should be added to `Known.txt`, formatted according to the rules described above.
- **Intelligent Fallback:** If "Separate Folders by Name/Title" is active, and content doesn't match the "Filter by Character(s)" UI input, the downloader consults your `Known.txt` file for folder naming.
- **Direct Management:** You can add simple entries directly to `Known.txt` using the list and "Add" button in the UI's `Known.txt` management section. For creating or modifying complex grouped alias entries directly in the file, or for bulk edits, click the "Open Known.txt" button. The application reloads `Known.txt` on startup or before a download process begins.
- **Using Known Names to Populate Filters (via "Add to Filter" Button):**
- Next to the "Add" button in the `Known.txt` management section, a "⤵️ Add to Filter" button provides a quick way to use your existing known names.
- Clicking this opens a popup window displaying all entries from your `Known.txt` file, each with a checkbox.
- The popup includes:
- A search bar to quickly filter the list of names.
- "Select All" and "Deselect All" buttons for convenience.
- After selecting the desired names, click "Add Selected".
- The chosen names will be inserted into the "Filter by Character(s)" input field.
- **Important Formatting:** If a selected entry from `Known.txt` is a group (e.g., `(Boa Hancock)`, which implies the aliases "Boa" and "Hancock"), it is added to the filter field as `(Boa, Hancock)~`. Simple names are added as-is.
---
## What's in v3.5.0? (Previous Update)
This version brings significant enhancements to manga/comic downloading, filtering capabilities, and user experience:
This version brought significant enhancements to manga/comic downloading, filtering capabilities, and user experience:
### Enhanced Manga/Comic Mode
- **Optional Filename Prefix:**
- When using the "Date Based" or "Original File Name" manga styles, an optional prefix can be specified in the UI.
- This prefix will be prepended to each filename generated by these styles.
- **Example (Date Based):** If prefix is `MySeries_`, files become `MySeries_001.jpg`, `MySeries_002.png`, etc.
- **Example (Original File Name):** If prefix is `Comic_Vol1_`, an original file `page_01.jpg` becomes `Comic_Vol1_page_01.jpg`.
- This input field appears automatically when either of these two manga naming styles is selected.
- **New "Date Based" Filename Style:**
- Perfect for truly sequential content! Files are named numerically (e.g., `001.jpg`, `002.jpg`, `003.ext`...) across an *entire creator's feed*, strictly following post publication order.
@@ -87,6 +128,13 @@ This version brings significant enhancements to manga/comic downloading, filteri
- **Guaranteed Order:** Disables multi-threading for post processing to ensure sequential accuracy.
- Works alongside the existing "Post Title" and "Original File Name" styles.
- **New "Title+G.Num (Post Title + Global Numbering)" Filename Style:**
- Ideal for series where you want each file to be prefixed by its post title but still maintain a global sequential number across all posts from a single download session.
- **Naming Convention:** Files are named using the cleaned post title as a prefix, followed by an underscore and a globally incrementing number (e.g., `Post Title_001.ext`, `Post Title_002.ext`).
- **Example:**
- Post "Chapter 1: The Adventure Begins" (contains 2 files: `imageA.jpg`, `imageB.png`) -> `Chapter 1 The Adventure Begins_001.jpg`, `Chapter 1 The Adventure Begins_002.png`
- Next Post "Chapter 2: New Friends" (contains 1 file: `cover.jpg`) -> `Chapter 2 New Friends_003.jpg`
- **Sequential Integrity:** Multi-threading for post processing is automatically disabled when this style is selected, so that the global numbering stays strictly sequential.
---
@@ -169,6 +217,7 @@ This version brings significant enhancements to manga/comic downloading, filteri
- `Nami` (simple character)
- `(Boa Hancock)~` (aliases for one character, session folder "Boa Hancock", adds `(Boa Hancock)` to `Known.txt`)
- `(Vivi, Uta)` (distinct characters, session folder "Vivi Uta", adds `Vivi` and `Uta` separately to `Known.txt`)
- A "⤵️ Add to Filter" button (near the `Known.txt` management UI) allows you to quickly populate this field by selecting from your existing `Known.txt` entries via a popup with search and checkbox selection.
- See "Advanced `Known.txt` and Character Filtering" for full details.
- **Filter Scopes:**
- `Files`
@@ -200,6 +249,7 @@ This version brings significant enhancements to manga/comic downloading, filteri
- `Name: Post Title (Default)`
- `Name: Original File`
- `Name: Date Based (New)`
- `Name: Title+G.Num (Post Title + Global Numbering)`
- **Best With:** Character filters set to manga/series title
@@ -217,12 +267,17 @@ This version brings significant enhancements to manga/comic downloading, filteri
---
### Thumbnail & Compression Tools
- **Download Thumbnails Only**
- **Download Thumbnails Only:**
- Downloads small preview images from the API instead of full-sized files (if available).
- **Interaction with "Scan Content for Images" (New in v4.1.1):** When "Download Thumbnails Only" is active, "Scan Content for Images" is automatically enabled and locked, and only images found by the content scan are downloaded. See the "What's New in v4.1.1?" section for details.
- **Scan Content for Images (New in v4.1.1):**
- A UI option to scan the HTML content of posts for embedded image URLs (from `<img>` tags or direct links).
- Resolves relative paths and helps capture images not listed in the API's formal attachments.
- See the "What's New in v4.1.1?" section for a comprehensive explanation.
- **Compress to WebP (via Pillow):**
- Converts large images to smaller WebP versions.
---
### Performance Features