# Kemono-Downloader/src/core/Hentai2read_client.py
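"""Hentai2Read downloader client.

Scrapes a series page for its title, artist, and chapter list, then downloads
each chapter's images with a producer-consumer pipeline: the main thread walks
the reader pages while a pool of worker threads fetches the images through
cloudscraper.
"""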

import re
import os
import time
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import queue


def run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback, check_pause_func):
    """
    Orchestrates the download process using a producer-consumer model.
    """
    scraper = cloudscraper.create_scraper()
    all_failed_files = []  # Track all failures across chapters
    try:
        progress_callback(" [Hentai2Read] Scraping series page for all metadata...")
        top_level_folder_name, chapters_to_process = _get_series_metadata(start_url, progress_callback, scraper)
        if not chapters_to_process:
            progress_callback("❌ No chapters found to download. Aborting.")
            return 0, 0
        total_chapters = len(chapters_to_process)
        overall_progress_callback(total_chapters, 0)
        total_downloaded_count = 0
        total_skipped_count = 0
        for idx, chapter in enumerate(chapters_to_process):
            if check_pause_func(): break
            progress_callback(f"\n-- Processing and Downloading Chapter {idx + 1}/{total_chapters}: '{chapter['title']}' --")
            series_folder = re.sub(r'[\\/*?:"<>|]', "", top_level_folder_name).strip()
            chapter_folder = re.sub(r'[\\/*?:"<>|]', "", chapter['title']).strip()
            final_save_path = os.path.join(output_dir, series_folder, chapter_folder)
            os.makedirs(final_save_path, exist_ok=True)
            dl_count, skip_count, chapter_failures = _process_and_download_chapter(
                chapter_url=chapter['url'],
                save_path=final_save_path,
                scraper=scraper,
                progress_callback=progress_callback,
                check_pause_func=check_pause_func
            )
            total_downloaded_count += dl_count
            total_skipped_count += skip_count
            if chapter_failures:
                all_failed_files.extend(chapter_failures)
            overall_progress_callback(total_chapters, idx + 1)
            if check_pause_func(): break
        # --- FINAL SUMMARY OF FAILURES ---
        if all_failed_files:
            progress_callback("\n" + "=" * 40)
            progress_callback(f"❌ SUMMARY: {len(all_failed_files)} files failed permanently after 10 retries:")
            for fail_msg in all_failed_files:
                progress_callback(f"{fail_msg}")
            progress_callback("=" * 40 + "\n")
        else:
            progress_callback("\n✅ All chapters processed successfully with no permanent failures.")
        return total_downloaded_count, total_skipped_count
    except Exception as e:
        progress_callback(f"❌ A critical error occurred in the Hentai2Read client: {e}")
        return 0, 0


def _get_series_metadata(start_url, progress_callback, scraper):
    """
    Scrapes the main series page to get the Artist Name, Series Title, and chapter list.
    """
    max_retries = 4
    last_exception = None
    soup = None
    for attempt in range(max_retries):
        try:
            if attempt > 0:
                progress_callback(f" [Hentai2Read] ⚠️ Retrying connection (Attempt {attempt + 1}/{max_retries})...")
            response = scraper.get(start_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            last_exception = None
            break
        except Exception as e:
            last_exception = e
            progress_callback(f" [Hentai2Read] ⚠️ Connection attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))
                continue
    if last_exception:
        progress_callback(f" [Hentai2Read] ❌ Error getting series metadata after {max_retries} attempts: {last_exception}")
        return "Unknown Series", []
    try:
        series_title = "Unknown Series"
        artist_name = None
        # 1. Try fetching Title
        title_tag = soup.select_one("h3.block-title a")
        if title_tag:
            series_title = title_tag.get_text(strip=True)
        else:
            meta_title = soup.select_one("meta[property='og:title']")
            if meta_title:
                series_title = meta_title.get("content", "Unknown Series").replace(" - Hentai2Read", "")
        # 2. Try fetching Artist
        metadata_list = soup.select_one("ul.list.list-simple-mini")
        if metadata_list:
            for b_tag in metadata_list.find_all('b'):
                label = b_tag.get_text(strip=True)
                if "Artist" in label or "Author" in label:
                    a_tag = b_tag.find_next_sibling('a')
                    if a_tag:
                        artist_name = a_tag.get_text(strip=True)
                    break
        if not artist_name:
            artist_link = soup.find('a', href=re.compile(r'/hentai-list/artist/'))
            if artist_link:
                artist_name = artist_link.get_text(strip=True)
        if artist_name:
            top_level_folder_name = f"{artist_name} - {series_title}"
        else:
            top_level_folder_name = series_title
        chapter_links = soup.select("div.media a.pull-left.font-w600")
        if not chapter_links:
            chapters_to_process = [{'url': start_url, 'title': series_title}]
        else:
            chapters_to_process = [
                {'url': urljoin(start_url, link['href']), 'title': " ".join(link.stripped_strings)}
                for link in chapter_links
            ]
            chapters_to_process.reverse()
        progress_callback(f" [Hentai2Read] ✅ Found Metadata: '{top_level_folder_name}'")
        progress_callback(f" [Hentai2Read] ✅ Found {len(chapters_to_process)} chapters to process.")
        return top_level_folder_name, chapters_to_process
    except Exception as e:
        progress_callback(f" [Hentai2Read] ❌ Error parsing metadata after successful connection: {e}")
        return "Unknown Series", []


def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func):
    """
    Uses a producer-consumer pattern to download a chapter: the main thread
    scrapes the reader pages and queues image URLs, while worker threads
    download them with retry logic and per-file progress logging.
    """
    task_queue = queue.Queue()
    num_download_threads = 8
    download_stats = {'downloaded': 0, 'skipped': 0}
    failed_files_list = []

    def downloader_worker():
        worker_scraper = cloudscraper.create_scraper()
        while True:
            task = task_queue.get()
            if task is None:
                task_queue.task_done()
                break
            filepath, img_url = task
            filename = os.path.basename(filepath)
            if os.path.exists(filepath):
                # Log skips so the file-existence checks are visible
                progress_callback(f" -> Skip (Exists): '{filename}'")
                download_stats['skipped'] += 1
                task_queue.task_done()
                continue
            # --- RETRY LOGIC START ---
            success = False
            # Log the start of the download so there is visible activity
            progress_callback(f" Downloading: '{filename}'...")
            for attempt in range(10):  # Try 10 times
                try:
                    if attempt > 0:
                        progress_callback(f" ⚠️ Retrying '{filename}' (Attempt {attempt+1}/10)...")
                        time.sleep(2)
                    response = worker_scraper.get(img_url, stream=True, timeout=60, headers={'Referer': chapter_url})
                    response.raise_for_status()
                    with open(filepath, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    download_stats['downloaded'] += 1
                    success = True
                    progress_callback(f" ✅ Downloaded: '{filename}'")
                    break
                except Exception as e:
                    if attempt == 9:
                        progress_callback(f" ❌ Failed '{filename}' after 10 attempts: {e}")
            if not success:
                failed_files_list.append(f"{filename} (Chapter: {os.path.basename(save_path)})")
                # Clean up the partially written file if the download failed
                if os.path.exists(filepath):
                    try:
                        os.remove(filepath)
                    except OSError:
                        pass
            task_queue.task_done()

    executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader')
    for _ in range(num_download_threads):
        executor.submit(downloader_worker)
    page_number = 1
    progress_callback(" [Hentai2Read] Scanning pages...")  # Initial log
    while True:
        if check_pause_func(): break
        if page_number > 300:
            progress_callback(" [Hentai2Read] ⚠️ Safety break: Reached 300 pages.")
            break
        # Log occasionally to show that scanning is still alive
        if page_number % 10 == 0:
            progress_callback(f" [Hentai2Read] Scanned {page_number} pages so far...")
        page_url_to_check = f"{chapter_url}{page_number}/"
        try:
            page_response = None
            page_last_exception = None
            for page_attempt in range(3):
                try:
                    page_response = scraper.get(page_url_to_check, timeout=30)
                    page_last_exception = None
                    break
                except Exception as e:
                    page_last_exception = e
                    time.sleep(1)
            if page_last_exception:
                raise page_last_exception
            if page_response.history or page_response.status_code != 200:
                progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.")
                break
            soup = BeautifulSoup(page_response.text, 'html.parser')
            img_tag = soup.select_one("img#arf-reader")
            img_src = img_tag.get("src") if img_tag else None
            if not img_tag or img_src == "https://static.hentai.direct/hentai":
                progress_callback(f" [Hentai2Read] End of chapter detected (Last page reached at {page_number}).")
                break
            normalized_img_src = urljoin(page_response.url, img_src)
            ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg"
            filename = f"{page_number:03d}{ext}"
            filepath = os.path.join(save_path, filename)
            task_queue.put((filepath, normalized_img_src))
            page_number += 1
            time.sleep(0.1)
        except Exception as e:
            progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}")
            break
    # Signal workers to exit
    for _ in range(num_download_threads):
        task_queue.put(None)
    # Wait for all tasks to complete
    task_queue.join()
    executor.shutdown(wait=True)
    progress_callback(f" Chapter complete. Processed {page_number - 1} images.")
    return download_stats['downloaded'], download_stats['skipped'], failed_files_list
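

# --- Minimal usage sketch (not part of the application) ---
# A hedged example of how this client could be driven from the command line.
# The real project wires progress_callback / overall_progress_callback /
# check_pause_func into its GUI; the print-based stand-ins below, and the
# example URL and output directory, are illustrative assumptions only.
if __name__ == "__main__":
    def _log(message):
        # progress_callback receives human-readable status strings
        print(message)

    def _overall(total_chapters, completed_chapters):
        # overall_progress_callback receives (total, completed) chapter counts
        print(f"[Progress] {completed_chapters}/{total_chapters} chapters")

    def _never_pause():
        # check_pause_func returns a truthy value to request a stop
        return False

    downloaded, skipped = run_hentai2read_download(
        start_url="https://hentai2read.com/example-series/",  # hypothetical URL
        output_dir="downloads",
        progress_callback=_log,
        overall_progress_callback=_overall,
        check_pause_func=_never_pause,
    )
    print(f"Done. Downloaded {downloaded}, skipped {skipped}.")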