mirror of
https://github.com/Yuvi9587/Kemono-Downloader.git
synced 2025-12-29 16:14:44 +00:00
Commit
This commit is contained in:
@@ -2,71 +2,206 @@
|
||||
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import queue
|
||||
|
||||
def fetch_hentai2read_data(url, logger, session):
|
||||
def run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback, check_pause_func):
|
||||
"""
|
||||
Scrapes a SINGLE Hentai2Read chapter page using a provided session.
|
||||
Orchestrates the download process using a producer-consumer model.
|
||||
The main thread scrapes image URLs and puts them in a queue.
|
||||
A pool of worker threads consumes from the queue to download images concurrently.
|
||||
"""
|
||||
logger(f"Attempting to fetch chapter data from: {url}")
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
try:
|
||||
response = session.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
progress_callback(" [Hentai2Read] Scraping series page for all metadata...")
|
||||
top_level_folder_name, chapters_to_process = _get_series_metadata(start_url, progress_callback, scraper)
|
||||
|
||||
if not chapters_to_process:
|
||||
progress_callback("❌ No chapters found to download. Aborting.")
|
||||
return 0, 0
|
||||
|
||||
page_content_text = response.text
|
||||
soup = BeautifulSoup(page_content_text, 'html.parser')
|
||||
|
||||
album_title = ""
|
||||
title_tags = soup.select('span[itemprop="name"]')
|
||||
if title_tags:
|
||||
album_title = title_tags[-1].text.strip()
|
||||
total_chapters = len(chapters_to_process)
|
||||
overall_progress_callback(total_chapters, 0)
|
||||
|
||||
if not album_title:
|
||||
title_tag = soup.select_one('h1.title')
|
||||
if title_tag:
|
||||
album_title = title_tag.text.strip()
|
||||
total_downloaded_count = 0
|
||||
total_skipped_count = 0
|
||||
|
||||
if not album_title:
|
||||
logger("❌ Could not find album title on page.")
|
||||
return None, None
|
||||
for idx, chapter in enumerate(chapters_to_process):
|
||||
if check_pause_func(): break
|
||||
|
||||
progress_callback(f"\n-- Processing and Downloading Chapter {idx + 1}/{total_chapters}: '{chapter['title']}' --")
|
||||
|
||||
series_folder = re.sub(r'[\\/*?:"<>|]', "", top_level_folder_name).strip()
|
||||
chapter_folder = re.sub(r'[\\/*?:"<>|]', "", chapter['title']).strip()
|
||||
final_save_path = os.path.join(output_dir, series_folder, chapter_folder)
|
||||
os.makedirs(final_save_path, exist_ok=True)
|
||||
|
||||
# This function now scrapes and downloads simultaneously
|
||||
dl_count, skip_count = _process_and_download_chapter(
|
||||
chapter_url=chapter['url'],
|
||||
save_path=final_save_path,
|
||||
scraper=scraper,
|
||||
progress_callback=progress_callback,
|
||||
check_pause_func=check_pause_func
|
||||
)
|
||||
|
||||
total_downloaded_count += dl_count
|
||||
total_skipped_count += skip_count
|
||||
|
||||
overall_progress_callback(total_chapters, idx + 1)
|
||||
if check_pause_func(): break
|
||||
|
||||
image_urls = []
|
||||
try:
|
||||
start_index = page_content_text.index("'images' : ") + len("'images' : ")
|
||||
end_index = page_content_text.index(",\n", start_index)
|
||||
images_json_str = page_content_text[start_index:end_index]
|
||||
image_paths = json.loads(images_json_str)
|
||||
image_urls = ["https://hentaicdn.com/hentai" + part for part in image_paths]
|
||||
except (ValueError, json.JSONDecodeError):
|
||||
logger("❌ Could not find or parse image JSON data for this chapter.")
|
||||
return None, None
|
||||
return total_downloaded_count, total_skipped_count
|
||||
|
||||
if not image_urls:
|
||||
logger("❌ No image URLs found for this chapter.")
|
||||
return None, None
|
||||
|
||||
logger(f" Found {len(image_urls)} images for album '{album_title}'.")
|
||||
|
||||
files_to_download = []
|
||||
for i, img_url in enumerate(image_urls):
|
||||
page_num = i + 1
|
||||
extension = os.path.splitext(img_url)[1].split('?')[0]
|
||||
if not extension: extension = ".jpg"
|
||||
filename = f"{page_num:03d}{extension}"
|
||||
files_to_download.append({'url': img_url, 'filename': filename})
|
||||
|
||||
return album_title, files_to_download
|
||||
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code == 404:
|
||||
logger(f" Chapter not found (404 Error). This likely marks the end of the series.")
|
||||
else:
|
||||
logger(f"❌ An HTTP error occurred: {e}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
logger(f"❌ An unexpected error occurred while fetching data: {e}")
|
||||
return None, None
|
||||
progress_callback(f"❌ A critical error occurred in the Hentai2Read client: {e}")
|
||||
return 0, 0
|
||||
|
||||
def _get_series_metadata(start_url, progress_callback, scraper):
|
||||
"""
|
||||
Scrapes the main series page to get the Artist Name, Series Title, and chapter list.
|
||||
"""
|
||||
try:
|
||||
response = scraper.get(start_url, timeout=30)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
series_title = "Unknown Series"
|
||||
artist_name = None
|
||||
metadata_list = soup.select_one("ul.list.list-simple-mini")
|
||||
|
||||
if metadata_list:
|
||||
first_li = metadata_list.find('li', recursive=False)
|
||||
if first_li and not first_li.find('a'):
|
||||
series_title = first_li.get_text(strip=True)
|
||||
|
||||
for b_tag in metadata_list.find_all('b'):
|
||||
label = b_tag.get_text(strip=True)
|
||||
if label in ("Artist", "Author"):
|
||||
a_tag = b_tag.find_next_sibling('a')
|
||||
if a_tag:
|
||||
artist_name = a_tag.get_text(strip=True)
|
||||
if label == "Artist":
|
||||
break
|
||||
|
||||
top_level_folder_name = artist_name if artist_name else series_title
|
||||
|
||||
chapter_links = soup.select("div.media a.pull-left.font-w600")
|
||||
if not chapter_links:
|
||||
chapters_to_process = [{'url': start_url, 'title': series_title}]
|
||||
else:
|
||||
chapters_to_process = [
|
||||
{'url': urljoin(start_url, link['href']), 'title': " ".join(link.stripped_strings)}
|
||||
for link in chapter_links
|
||||
]
|
||||
chapters_to_process.reverse()
|
||||
|
||||
progress_callback(f" [Hentai2Read] ✅ Found Artist/Series: '{top_level_folder_name}'")
|
||||
progress_callback(f" [Hentai2Read] ✅ Found {len(chapters_to_process)} chapters to process.")
|
||||
|
||||
return top_level_folder_name, chapters_to_process
|
||||
|
||||
except Exception as e:
|
||||
progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}")
|
||||
return "Unknown Series", []
|
||||
|
||||
### NEW: This function contains the pipeline logic ###
|
||||
def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func):
|
||||
"""
|
||||
Uses a producer-consumer pattern to download a chapter.
|
||||
The main thread (producer) scrapes URLs one by one.
|
||||
Worker threads (consumers) download the URLs as they are found.
|
||||
"""
|
||||
task_queue = queue.Queue()
|
||||
num_download_threads = 8
|
||||
|
||||
# These will be updated by the worker threads
|
||||
download_stats = {'downloaded': 0, 'skipped': 0}
|
||||
|
||||
def downloader_worker():
|
||||
"""The function that each download thread will run."""
|
||||
# Create a unique session for each thread to avoid conflicts
|
||||
worker_scraper = cloudscraper.create_scraper()
|
||||
while True:
|
||||
try:
|
||||
# Get a task from the queue
|
||||
task = task_queue.get()
|
||||
# The sentinel value to signal the end
|
||||
if task is None:
|
||||
break
|
||||
|
||||
filepath, img_url = task
|
||||
if os.path.exists(filepath):
|
||||
progress_callback(f" -> Skip: '{os.path.basename(filepath)}'")
|
||||
download_stats['skipped'] += 1
|
||||
else:
|
||||
progress_callback(f" Downloading: '{os.path.basename(filepath)}'...")
|
||||
response = worker_scraper.get(img_url, stream=True, timeout=60, headers={'Referer': chapter_url})
|
||||
response.raise_for_status()
|
||||
with open(filepath, 'wb') as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
download_stats['downloaded'] += 1
|
||||
except Exception as e:
|
||||
progress_callback(f" ❌ Download failed for task. Error: {e}")
|
||||
download_stats['skipped'] += 1
|
||||
finally:
|
||||
task_queue.task_done()
|
||||
|
||||
# --- Start the downloader threads ---
|
||||
executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader')
|
||||
for _ in range(num_download_threads):
|
||||
executor.submit(downloader_worker)
|
||||
|
||||
# --- Main thread acts as the scraper (producer) ---
|
||||
page_number = 1
|
||||
while True:
|
||||
if check_pause_func(): break
|
||||
if page_number > 300: # Safety break
|
||||
progress_callback(" [Hentai2Read] ⚠️ Safety break: Reached 300 pages.")
|
||||
break
|
||||
|
||||
page_url_to_check = f"{chapter_url}{page_number}/"
|
||||
try:
|
||||
response = scraper.get(page_url_to_check, timeout=30)
|
||||
if response.history or response.status_code != 200:
|
||||
progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.")
|
||||
break
|
||||
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
img_tag = soup.select_one("img#arf-reader")
|
||||
img_src = img_tag.get("src") if img_tag else None
|
||||
|
||||
if not img_tag or img_src == "https://static.hentai.direct/hentai":
|
||||
progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).")
|
||||
break
|
||||
|
||||
normalized_img_src = urljoin(response.url, img_src)
|
||||
ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg"
|
||||
filename = f"{page_number:03d}{ext}"
|
||||
filepath = os.path.join(save_path, filename)
|
||||
|
||||
# Put the download task into the queue for a worker to pick up
|
||||
task_queue.put((filepath, normalized_img_src))
|
||||
|
||||
page_number += 1
|
||||
time.sleep(0.1) # Small delay between scraping pages
|
||||
except Exception as e:
|
||||
progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}")
|
||||
break
|
||||
|
||||
# --- Shutdown sequence ---
|
||||
# Tell all worker threads to exit by sending the sentinel value
|
||||
for _ in range(num_download_threads):
|
||||
task_queue.put(None)
|
||||
|
||||
# Wait for all download tasks to be completed
|
||||
executor.shutdown(wait=True)
|
||||
|
||||
progress_callback(f" Found and processed {page_number - 1} images for this chapter.")
|
||||
return download_stats['downloaded'], download_stats['skipped']
|
||||
116
src/core/allcomic_client.py
Normal file
116
src/core/allcomic_client.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import requests
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import cloudscraper
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def get_chapter_list(series_url, logger_func):
    """
    Return the chapter URLs listed on an AllComic series page.

    The page is requested up to eight times, waiting ``2 * attempt`` seconds
    between failed attempts.

    Args:
        series_url: URL to inspect.
        logger_func: Callable that receives status lines.

    Returns:
        Chapter URLs in reversed page order (oldest first), or an empty list
        when the page cannot be fetched, cannot be parsed, or contains no
        chapter list (i.e. it is presumably a single chapter page).
    """
    logger_func(f" [AllComic] Checking for chapter list at: {series_url}")

    total_attempts = 8
    session = cloudscraper.create_scraper()
    page = None

    for attempt_no in range(1, total_attempts + 1):
        try:
            page = session.get(series_url, timeout=30)
            page.raise_for_status()
            logger_func(f" [AllComic] Successfully connected to series page on attempt {attempt_no}.")
            break  # Success, exit the loop
        except requests.RequestException as e:
            logger_func(f" [AllComic] ⚠️ Series page check attempt {attempt_no}/{total_attempts} failed: {e}")
            if attempt_no == total_attempts:
                logger_func(f" [AllComic] ❌ All attempts to check series page failed.")
                return []  # Return empty on final failure
            delay = 2 * attempt_no
            logger_func(f" Retrying in {delay} seconds...")
            time.sleep(delay)

    if not page:
        return []

    try:
        anchors = BeautifulSoup(page.text, 'html.parser').select('li.wp-manga-chapter a')
        if anchors:
            # The site lists newest first; flip to reading order.
            chapter_urls = [anchor['href'] for anchor in anchors][::-1]
            logger_func(f" [AllComic] ✅ Found {len(chapter_urls)} chapters.")
            return chapter_urls

        logger_func(" [AllComic] ℹ️ No chapter list found. Assuming this is a single chapter page.")
        return []

    except Exception as e:
        logger_func(f" [AllComic] ❌ Error parsing chapters after successful connection: {e}")
        return []
|
||||
|
||||
def fetch_chapter_data(chapter_url, logger_func):
    """
    Collect the comic title, chapter title and image URLs of one chapter page.

    The page is requested up to eight times with a growing delay between
    failed attempts.

    Args:
        chapter_url: URL of the chapter page.
        logger_func: Callable that receives status lines.

    Returns:
        ``(comic_title, chapter_title, image_urls)`` on success, or
        ``(None, None, None)`` when the page cannot be fetched or parsed, or
        when no usable title or no images are found.
    """
    logger_func(f" [AllComic] Fetching page: {chapter_url}")

    session = cloudscraper.create_scraper(
        browser={'browser': 'firefox', 'platform': 'windows', 'desktop': True}
    )
    request_headers = {'Referer': 'https://allporncomic.com/'}

    total_attempts = 8
    page = None
    for attempt_no in range(1, total_attempts + 1):
        try:
            page = session.get(chapter_url, headers=request_headers, timeout=30)
            page.raise_for_status()
            break
        except requests.RequestException as e:
            if attempt_no == total_attempts:
                logger_func(f" [AllComic] ❌ All connection attempts failed for chapter: {chapter_url}")
                return None, None, None
            time.sleep(2 * attempt_no)

    try:
        soup = BeautifulSoup(page.text, 'html.parser')

        comic_title = None
        heading = soup.find('h1', class_='post-title')
        if heading:
            comic_title = heading.text.strip()
        else:
            # No heading: fall back to the comic slug in the URL path.
            try:
                segments = urlparse(chapter_url).path.strip('/').split('/')
                if len(segments) >= 3 and segments[-3] == 'porncomic':
                    comic_title = segments[-2].replace('-', ' ').title()
            except Exception:
                comic_title = "Unknown Comic"

        # The chapter title is always derived from the last URL segment.
        chapter_title = chapter_url.strip('/').split('/')[-1].replace('-', ' ').title()

        list_of_image_urls = []
        reader = soup.find('div', class_='reading-content')
        if reader:
            candidates = (
                (img.get('data-src') or img.get('src', '')).strip()
                for img in reader.find_all('img', class_='wp-manga-chapter-img')
            )
            list_of_image_urls = [candidate for candidate in candidates if candidate]

        title_usable = bool(comic_title) and comic_title != "Unknown Comic"
        if not (title_usable and list_of_image_urls):
            logger_func(f" [AllComic] ❌ Could not find a valid title or images on the page. Title found: '{comic_title}'")
            return None, None, None

        return comic_title, chapter_title, list_of_image_urls

    except Exception as e:
        logger_func(f" [AllComic] ❌ An unexpected error occurred while parsing the page: {e}")
        return None, None, None
|
||||
@@ -33,7 +33,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
|
||||
if cancellation_event and cancellation_event.is_set():
|
||||
raise RuntimeError("Fetch operation cancelled by user during retry loop.")
|
||||
|
||||
log_message = f" Fetching post list: {paginated_url} (Page approx. {offset // 50 + 1})"
|
||||
log_message = f" Fetching post list: {api_url_base} (Page approx. {offset // 50 + 1})"
|
||||
if attempt > 0:
|
||||
log_message += f" (Attempt {attempt + 1}/{max_retries})"
|
||||
logger(log_message)
|
||||
@@ -247,7 +247,7 @@ def download_from_api(
|
||||
break
|
||||
all_posts_for_manga_mode.extend(posts_batch_manga)
|
||||
|
||||
logger(f"MANGA_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}")
|
||||
logger(f"RENAMING_MODE_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}")
|
||||
|
||||
current_offset_manga += page_size
|
||||
time.sleep(0.6)
|
||||
@@ -265,7 +265,7 @@ def download_from_api(
|
||||
if cancellation_event and cancellation_event.is_set(): return
|
||||
|
||||
if all_posts_for_manga_mode:
|
||||
logger(f"MANGA_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}")
|
||||
logger(f"RENAMING_MODE_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}")
|
||||
|
||||
if all_posts_for_manga_mode:
|
||||
if processed_post_ids:
|
||||
|
||||
375
src/core/booru_client.py
Normal file
375
src/core/booru_client.py
Normal file
@@ -0,0 +1,375 @@
|
||||
# src/core/booru_client.py
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import datetime
|
||||
import urllib.parse
|
||||
import requests
|
||||
import logging
|
||||
import cloudscraper
|
||||
# --- Start of Combined Code from 1.py ---
|
||||
|
||||
# Part 1: Essential Utilities & Exceptions
|
||||
|
||||
class BooruClientException(Exception):
    """Base class for exceptions in this client."""


class HttpError(BooruClientException):
    """
    HTTP request during data extraction failed.

    Attributes:
        response: The response object that triggered the error, or ``None``.
        status: ``response.status_code`` when a response is given, else 0.
    """

    def __init__(self, message="", response=None):
        # Compare against None explicitly: a requests.Response evaluates to
        # False for 4xx/5xx status codes, which are exactly the responses
        # this exception is raised for.
        has_response = response is not None
        self.response = response
        self.status = response.status_code if has_response else 0
        if has_response and not message:
            message = f"'{response.status_code} {response.reason}' for '{response.url}'"
        super().__init__(message)


class NotFoundError(BooruClientException):
    """Requested resource does not exist."""
|
||||
|
||||
def unquote(s):
    """Return *s* with percent-escapes decoded (thin wrapper over ``urllib.parse.unquote``)."""
    decoded = urllib.parse.unquote(s)
    return decoded
|
||||
|
||||
def parse_datetime(date_string, fmt):
    """
    Parse *date_string* according to the ``strptime`` format *fmt*.

    Returns:
        The parsed ``datetime.datetime``, or ``None`` when the value does not
        match the format (``ValueError``) or is not a string (``TypeError``).
    """
    strptime = datetime.datetime.strptime
    try:
        parsed = strptime(date_string, fmt)
    except (TypeError, ValueError):
        parsed = None
    return parsed
|
||||
|
||||
def nameext_from_url(url, data=None):
    """
    Split the last path component of *url* into file name and extension.

    Args:
        url: URL whose path is inspected; the query string is ignored.
        data: Optional dict to update in place; a new dict is used when omitted.

    Returns:
        *data* with ``"filename"`` (percent-decoded, without extension) and
        ``"extension"`` (lower-cased, without the dot; ``""`` when the name
        has no dot).  Both are set to ``""`` if the URL cannot be processed.
    """
    data = {} if data is None else data
    try:
        basename = os.path.basename(urllib.parse.urlparse(url).path)
        filename = urllib.parse.unquote(basename)
        # Split on the last dot only, so "a.tar.gz" -> ("a.tar", "gz").
        stem, dot, suffix = filename.rpartition('.')
        if dot:
            data["filename"], data["extension"] = stem, suffix.lower()
        else:
            data["filename"], data["extension"] = filename, ""
    except Exception:
        data["filename"], data["extension"] = "", ""
    return data
|
||||
|
||||
# Browser-like User-Agent string installed on every Extractor session.
USERAGENT_FIREFOX = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"
|
||||
|
||||
# Part 2: Core Extractor Logic
|
||||
|
||||
class Extractor:
    """
    Base class for site extractors.

    An extractor is built from a regex match on the input URL.  Subclasses
    supply ``metadata()`` and ``posts()``; ``items()`` combines the two into
    the stream of post dicts handed to the caller.
    """

    category = ""
    subcategory = ""
    # Site origin prepended to relative file URLs; subclasses set the real value.
    root = ""
    directory_fmt = ("{category}", "{id}")
    filename_fmt = "{filename}.{extension}"
    _retries = 3
    _timeout = 30

    def __init__(self, match, logger_func=print):
        """Store the match and create an HTTP session with a browser User-Agent."""
        self.url = match.string
        self.match = match
        self.groups = match.groups()
        self.session = cloudscraper.create_scraper()
        self.session.headers["User-Agent"] = USERAGENT_FIREFOX
        self.log = logger_func
        self.api_key = None
        self.user_id = None

    def set_auth(self, api_key, user_id):
        """Remember the credentials and let the subclass apply them."""
        self.api_key = api_key
        self.user_id = user_id
        self._init_auth()

    def _init_auth(self):
        """Placeholder for extractor-specific auth setup."""
        pass

    def metadata(self):
        """Return values merged into every post; overridden by subclasses."""
        return {}

    def posts(self):
        """Return an iterable of raw post dicts; overridden by subclasses."""
        return []

    def request(self, url, method="GET", fatal=True, **kwargs):
        """
        Perform an HTTP request, retrying with exponential back-off.

        Args:
            url: Target URL.
            method: HTTP method name.
            fatal: When true, failure raises; when false, ``None`` is returned.
            **kwargs: Passed to ``session.request``; ``timeout`` defaults to
                ``self._timeout`` when not given.

        Returns:
            The response whose status code is below 400, or ``None`` when all
            attempts failed and ``fatal`` is false.

        Raises:
            NotFoundError: On a 404 response when ``fatal`` is true.
            HttpError: When every attempt failed and ``fatal`` is true.
        """
        # setdefault lets callers override the timeout without a duplicate
        # keyword-argument TypeError.
        kwargs.setdefault("timeout", self._timeout)
        last_response = None
        for attempt in range(self._retries + 1):
            try:
                response = self.session.request(method, url, **kwargs)
                if response.status_code < 400:
                    return response
                last_response = response
                if response.status_code == 404 and fatal:
                    raise NotFoundError(f"Resource not found at {url}")
                self.log(f"Request for {url} failed with status {response.status_code}. Retrying...")
            except requests.exceptions.RequestException as e:
                self.log(f"Request for {url} failed: {e}. Retrying...")
            if attempt < self._retries:
                time.sleep(2 ** attempt)
        if fatal:
            raise HttpError(f"Failed to retrieve {url} after {self._retries} retries.", last_response)
        return None

    def request_json(self, url, **kwargs):
        """
        Request *url* and decode the body as JSON.

        Returns:
            The decoded JSON value, or ``None`` when the request failed
            non-fatally (``fatal=False`` was passed through).

        Raises:
            BooruClientException: If the body is not valid JSON.
        """
        response = self.request(url, **kwargs)
        if response is None:
            return None
        try:
            return response.json()
        except (ValueError, TypeError) as exc:
            self.log(f"Failed to decode JSON from {url}: {exc}")
            raise BooruClientException("Invalid JSON response") from exc

    def items(self):
        """
        Yield downloadable posts enriched with file name, date and metadata.

        ``('PAGE_UPDATE', n)`` tuples produced by ``posts()`` are passed
        through untouched; posts without a ``file_url`` are dropped.
        """
        data = self.metadata()
        for item in self.posts():
            # Check for our special page update message
            if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE':
                yield item
                continue

            # Otherwise, process it as a post
            post = item
            url = post.get("file_url")
            if not url:
                continue

            nameext_from_url(url, post)
            post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")

            if url.startswith("/"):
                url = self.root + url
                post['file_url'] = url  # Ensure full URL

            post.update(data)
            yield post
|
||||
|
||||
class BaseExtractor(Extractor):
    """
    Extractor for a family of sites that share one URL layout.

    ``update()`` registers the known instances and returns the host part of
    the URL pattern; each instance contributes one capture group, and the
    group that matched decides the extractor's ``category``.
    """

    # Sequence of (category, root, info) entries; replaced by update().
    instances = ()

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._init_category()

    def _init_category(self):
        """Derive ``root`` from the URL and ``category`` from the first matched group."""
        parts = urllib.parse.urlparse(self.url)
        self.root = f"{parts.scheme}://{parts.netloc}"
        for index, matched in enumerate(self.groups):
            if matched is None:
                continue
            try:
                self.category = self.instances[index][0]
            except IndexError:
                # Group index beyond the registered instances: keep looking.
                continue
            return

    @classmethod
    def update(cls, instances):
        """
        Register *instances* on the class and build the matching host regex.

        Args:
            instances: Mapping of category name to an info dict with a
                ``"root"`` entry (may be ``None``) and an optional
                ``"pattern"`` regex; without a pattern the escaped host part
                of the root is used.

        Returns:
            A regex string with an optional scheme followed by one capture
            group per instance, joined as alternatives.
        """
        cls.instances = registered = []
        alternatives = []
        for category, info in instances.items():
            configured_root = info["root"]
            root = configured_root.rstrip("/") if configured_root else ""
            registered.append((category, root, info))
            fallback = re.escape(root.partition("://")[2])
            alternatives.append("(" + info.get("pattern", fallback) + ")")
        return r"(?:https?://)?(?:" + "|".join(alternatives) + r")"
|
||||
|
||||
# Part 3: Danbooru Extractor
|
||||
|
||||
class DanbooruExtractor(BaseExtractor):
    """
    Shared logic for Danbooru extractors: HTTP basic auth and API pagination.

    ``items()`` is inherited from ``Extractor``; the copy that used to live
    here was identical to the base implementation.
    """

    filename_fmt = "{category}_{id}_{filename}.{extension}"
    per_page = 200

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._auth_logged = False

    def _init_auth(self):
        """Install ``(user_id, api_key)`` as session auth when both are set."""
        if self.user_id and self.api_key:
            if not self._auth_logged:
                # Log only once per extractor instance.
                self.log("Danbooru auth set.")
                self._auth_logged = True
            self.session.auth = (self.user_id, self.api_key)

    def metadata(self):
        """Return values merged into every post; none by default."""
        return {}

    def posts(self):
        """Return the raw posts; none by default."""
        return []

    def _pagination(self, endpoint, params, prefix="b"):
        """
        Yield all posts of a paginated API listing.

        Before the posts of each page a ``('PAGE_UPDATE', count)`` tuple is
        yielded.

        Args:
            endpoint: API path appended to ``self.root``.
            params: Query parameters; the mapping is copied, not modified.
            prefix: When truthy, the next page is requested as
                ``"<prefix><id of the last post>"``; otherwise the numeric
                page counter is incremented.
        """
        url = self.root + endpoint
        # Work on a copy so the caller's dict is not altered.
        params = dict(params)
        params["limit"] = self.per_page
        params["page"] = 1
        # A page more than 20 posts short of per_page is treated as the last
        # one (presumably the API may omit some posts from a full page —
        # TODO confirm).
        threshold = self.per_page - 20

        while True:
            posts = self.request_json(url, params=params)
            if not posts:
                break
            yield ('PAGE_UPDATE', len(posts))
            yield from posts
            if len(posts) < threshold:
                return
            if prefix:
                params["page"] = f"{prefix}{posts[-1]['id']}"
            else:
                params["page"] += 1
|
||||
|
||||
# Registers the Danbooru instances on the extractor class and keeps the
# resulting host regex as the common prefix of the Danbooru URL patterns.
BASE_PATTERN = DanbooruExtractor.update(
    {"danbooru": {"root": None, "pattern": r"(?:danbooru|safebooru)\.donmai\.us"}}
)
|
||||
|
||||
class DanbooruTagExtractor(DanbooruExtractor):
    """Extractor for Danbooru tag-search URLs (``/posts?tags=...``)."""

    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"(/posts\?(?:[^&#]*&)*tags=([^&#]*))"

    def metadata(self):
        """Decode the searched tags, store them on ``self.tags`` and return a path-safe copy."""
        raw_tags = self.groups[-1]
        # "+" stands for a space in the query string; convert before decoding
        # so an escaped "%2B" stays a literal plus.
        self.tags = unquote(raw_tags.replace("+", " ")).strip()
        return {"search_tags": re.sub(r'[\\/*?:"<>|]', "_", self.tags)}

    def posts(self):
        """Page through ``/posts.json`` for ``self.tags`` (set by ``metadata()``)."""
        query = {"tags": self.tags}
        return self._pagination("/posts.json", query)
|
||||
|
||||
class DanbooruPostExtractor(DanbooruExtractor):
    """Extractor for a single Danbooru post URL (``/posts/<id>`` or ``/post/show/<id>``)."""

    subcategory = "post"
    pattern = BASE_PATTERN + r"(/post(?:s|/show)/(\d+))"

    def posts(self):
        """Return a one-element tuple with the post's JSON, or an empty tuple if it is empty."""
        post = self.request_json(f"{self.root}/posts/{self.groups[-1]}.json")
        if post:
            return (post,)
        return ()
|
||||
|
||||
class GelbooruBase(Extractor):
    """Shared logic for Gelbooru extractors: API access and post assembly."""

    category = "gelbooru"
    root = "https://gelbooru.com"

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._auth_logged = False

    def _api_request(self, params, key="post"):
        """
        Query the JSON API and return the entries stored under *key*.

        Args:
            params: Query parameters; the mapping is copied, so credentials
                are never written into the caller's dict.
            key: Name of the field to extract.  When falsy, the decoded JSON
                is returned unchanged.

        Returns:
            A list of entries; a single non-list entry is wrapped in a list
            and a missing or empty field gives ``[]``.
        """
        query = dict(params)
        # Auth is now added dynamically
        if self.api_key and self.user_id:
            if not self._auth_logged:
                self.log("Gelbooru auth set.")
                self._auth_logged = True
            query.update({"api_key": self.api_key, "user_id": self.user_id})

        url = self.root + "/index.php?page=dapi&q=index&json=1"
        data = self.request_json(url, params=query)

        if not key:
            return data
        if isinstance(data, list):
            # A bare JSON array has no `key` wrapper; pass it through as the item list.
            return data
        if not isinstance(data, dict):
            # e.g. a JSON `null` or scalar body: nothing to extract.
            return []
        posts = data.get(key, [])
        return posts if isinstance(posts, list) else [posts] if posts else []

    def items(self):
        """
        Yield one dict per downloadable post.

        Each dict is the metadata (plus ``category``) overlaid with the post
        fields and the file name/extension taken from ``file_url``.
        ``('PAGE_UPDATE', n)`` tuples are passed through; posts without a
        ``file_url`` are dropped.
        """
        base_data = self.metadata()
        base_data['category'] = self.category

        for item in self.posts():
            # Check for our special page update message
            if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE':
                yield item
                continue

            # Otherwise, process it as a post
            post = item
            url = post.get("file_url")
            if not url:
                continue

            data = base_data.copy()
            data.update(post)
            nameext_from_url(url, data)
            yield data

    def metadata(self):
        """Return values merged into every post; none by default."""
        return {}

    def posts(self):
        """Return the raw posts; none by default."""
        return []
|
||||
|
||||
# Host part shared by the Gelbooru URL patterns: optional scheme and optional "www.".
GELBOORU_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com"
|
||||
|
||||
class GelbooruTagExtractor(GelbooruBase):
    """Extractor for Gelbooru tag-search URLs (``page=post&s=list&tags=...``)."""

    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=list&tags=([^&#]*))"

    def metadata(self):
        """Decode the searched tags, store them on ``self.tags`` and return a path-safe copy."""
        raw_tags = self.groups[-1]
        self.tags = unquote(raw_tags.replace("+", " ")).strip()
        return {"search_tags": re.sub(r'[\\/*?:"<>|]', "_", self.tags)}

    def posts(self):
        """
        Scrape the HTML search pages and look each post up through the API.

        Yields a ``('PAGE_UPDATE', count)`` tuple per result page followed by
        the API entries of every post id found on it.  Stops at the first
        page without post ids.
        """
        offset = 0
        listing_url = self.root + "/index.php"
        query = {"page": "post", "s": "list", "tags": self.tags}

        while True:
            query['pid'] = offset
            self.log(f"Scraping search results page (offset: {offset})...")
            html = self.request(listing_url, params=query).text
            found_ids = re.findall(r'id="p(\d+)"', html)

            if not found_ids:
                self.log("No more posts found on page. Ending scrape.")
                return

            yield ('PAGE_UPDATE', len(found_ids))
            for found_id in found_ids:
                for post in self._api_request({"s": "post", "id": found_id}):
                    yield post

            # The offset advances by the fixed page size of 42 results.
            offset += 42
|
||||
|
||||
class GelbooruPostExtractor(GelbooruBase):
    """Extractor for a single Gelbooru post URL (``page=post&s=view&id=<id>``)."""

    subcategory = "post"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=view&id=(\d+))"

    def posts(self):
        """Return the API entries for the post id captured from the URL."""
        query = {"s": "post", "id": self.groups[-1]}
        return self._api_request(query)
|
||||
|
||||
# --- Main Entry Point ---
|
||||
|
||||
# Extractor classes tried in this order by find_extractor(); the first whose
# `pattern` matches the URL is used.
EXTRACTORS = [
    DanbooruTagExtractor, DanbooruPostExtractor,
    GelbooruTagExtractor, GelbooruPostExtractor,
]
|
||||
|
||||
def find_extractor(url, logger_func):
    """
    Instantiate the first extractor in ``EXTRACTORS`` whose pattern matches *url*.

    Returns:
        The extractor instance built from the match, or ``None`` when no
        pattern matches.
    """
    attempts = ((cls, re.search(cls.pattern, url)) for cls in EXTRACTORS)
    for cls, hit in attempts:
        if hit:
            return cls(hit, logger_func)
    return None
|
||||
|
||||
def fetch_booru_data(url, api_key, user_id, logger_func):
    """
    Generator entry point: pick an extractor for *url* and yield its items.

    Args:
        url: Booru URL to process.
        api_key: API key handed to the extractor's ``set_auth``.
        user_id: User id handed to the extractor's ``set_auth``.
        logger_func: Callable that receives status lines.

    Yields:
        Whatever the extractor's ``items()`` yields.  Nothing is yielded when
        no extractor matches the URL.
    """
    extractor = find_extractor(url, logger_func)
    if extractor:
        logger_func(f"Using extractor: {extractor.__class__.__name__}")
        extractor.set_auth(api_key, user_id)
        # The 'items' method yields the data dictionaries directly
        yield from extractor.items()
    else:
        logger_func(f"No suitable Booru extractor found for URL: {url}")
|
||||
@@ -207,7 +207,7 @@ def get_bunkr_extractor(url, logger):
|
||||
def fetch_bunkr_data(url, logger):
|
||||
"""
|
||||
Main function to be called from the GUI.
|
||||
It extracts all file information from a Bunkr URL.
|
||||
It extracts all file information from a Bunkr URL, now handling both albums and direct file links.
|
||||
|
||||
Returns:
|
||||
A tuple of (album_name, list_of_files)
|
||||
@@ -215,6 +215,30 @@ def fetch_bunkr_data(url, logger):
|
||||
- list_of_files (list): A list of dicts, each containing 'url', 'name', and '_http_headers'.
|
||||
Returns (None, None) on failure.
|
||||
"""
|
||||
# --- START: New logic to handle direct CDN file URLs ---
|
||||
try:
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
# Check if the hostname contains 'cdn' and the path has a common file extension
|
||||
is_direct_cdn_file = (parsed_url.hostname and 'cdn' in parsed_url.hostname and 'bunkr' in parsed_url.hostname and
|
||||
any(parsed_url.path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.webm', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar']))
|
||||
|
||||
if is_direct_cdn_file:
|
||||
logger.info("Bunkr direct file URL detected.")
|
||||
filename = os.path.basename(parsed_url.path)
|
||||
# Use the filename (without extension) as a sensible album name
|
||||
album_name = os.path.splitext(filename)[0]
|
||||
|
||||
files_to_download = [{
|
||||
'url': url,
|
||||
'name': filename,
|
||||
'_http_headers': {'Referer': 'https://bunkr.ru/'} # Use a generic Referer
|
||||
}]
|
||||
return album_name, files_to_download
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not parse Bunkr URL for direct file check: {e}")
|
||||
# --- END: New logic ---
|
||||
|
||||
# This is the original logic for album and media pages
|
||||
extractor = get_bunkr_extractor(url, logger)
|
||||
if not extractor:
|
||||
return None, None
|
||||
@@ -238,4 +262,4 @@ def fetch_bunkr_data(url, logger):
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"An error occurred while extracting Bunkr info: {e}", exc_info=True)
|
||||
return None, None
|
||||
return None, None
|
||||
|
||||
125
src/core/fap_nation_client.py
Normal file
125
src/core/fap_nation_client.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import re
|
||||
import os
|
||||
import cloudscraper
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from ..utils.file_utils import clean_folder_name
|
||||
|
||||
def fetch_fap_nation_data(album_url, logger_func):
    """
    Scrape a fap-nation page and pick a single video source to download.

    HLS streams are preferred: first the ``<source>`` of an embedded
    mediadelivery.net iframe, then any absolute ``.m3u8`` URL found on the
    main page. Direct ``<a href>`` media links are only used as a fallback.
    When the chosen HLS URL is a master playlist, the highest-resolution
    variant replaces it.

    Args:
        album_url: URL of the fap-nation page to scrape.
        logger_func: Callable that receives one log message string.

    Returns:
        ``(album_title, files)`` where ``files`` is a one-element list of
        dicts with the keys ``'url'``, ``'filename'`` and ``'type'``
        (``'hls'`` or ``'direct'``). ``(None, [])`` is returned when no
        source is found or when fetching the page fails.
    """
    logger_func(f" [Fap-Nation] Fetching album data from: {album_url}")
    scraper = cloudscraper.create_scraper()

    try:
        response = scraper.get(album_url, timeout=45)
        response.raise_for_status()
        html_content = response.text

        title_match = re.search(r'<h1[^>]*itemprop="name"[^>]*>(.*?)</h1>', html_content, re.IGNORECASE)
        album_slug = clean_folder_name(os.path.basename(urlparse(album_url).path.strip('/')))
        album_title = clean_folder_name(title_match.group(1).strip()) if title_match else album_slug

        final_url = None
        link_type = None
        filename_from_video_tag = None

        video_tag_title_match = re.search(r'data-plyr-config=.*?"title":.*?"([^&]+?\.mp4)"', html_content, re.IGNORECASE)
        if video_tag_title_match:
            filename_from_video_tag = clean_folder_name(video_tag_title_match.group(1))
            logger_func(f" [Fap-Nation] Found high-quality filename in video tag: {filename_from_video_tag}")

        # 1. Prioritize finding an HLS stream.
        logger_func(" [Fap-Nation] Priority 1: Searching for HLS stream...")
        iframe_match = re.search(r'<iframe[^>]+src="([^"]+mediadelivery\.net[^"]+)"', html_content, re.IGNORECASE)

        if iframe_match:
            # The src may be relative or protocol-relative ("//host/..."), so
            # resolve it against the page URL before requesting it.
            iframe_url = urljoin(album_url, iframe_match.group(1))
            logger_func(f" [Fap-Nation] Found video iframe. Visiting: {iframe_url}")
            try:
                iframe_response = scraper.get(iframe_url, timeout=30)
                iframe_response.raise_for_status()
                iframe_html = iframe_response.text

                playlist_match = re.search(r'<source[^>]+src="([^"]+\.m3u8)"', iframe_html, re.IGNORECASE)
                if playlist_match:
                    final_url = urljoin(iframe_url, playlist_match.group(1))
                    link_type = 'hls'
                    logger_func(f" [Fap-Nation] Found embedded HLS stream in iframe: {final_url}")
            except Exception as e:
                # Best effort: a broken iframe must not prevent the fallbacks below.
                logger_func(f" [Fap-Nation] ⚠️ Error fetching or parsing iframe content: {e}")

        if not final_url:
            logger_func(" [Fap-Nation] No stream found in iframe. Checking main page content as a last resort...")
            js_var_match = re.search(r'"(https?://[^"]+\.m3u8)"', html_content, re.IGNORECASE)
            if js_var_match:
                final_url = js_var_match.group(1)
                link_type = 'hls'
                logger_func(f" [Fap-Nation] Found HLS stream on main page: {final_url}")

        # 2. Fallback: If no HLS stream was found, search for direct links.
        if not final_url:
            logger_func(" [Fap-Nation] No HLS stream found. Priority 2 (Fallback): Searching for direct download links...")
            direct_link_pattern = r'<a\s+[^>]*href="([^"]+\.(?:mp4|webm|mkv|mov))"[^>]*>'
            direct_links_found = re.findall(direct_link_pattern, html_content, re.IGNORECASE)

            if direct_links_found:
                logger_func(f" [Fap-Nation] Found {len(direct_links_found)} direct media link(s). Selecting the best quality...")
                # Prefer the first link that advertises 1080p, else the first link.
                best_link = next(
                    (link for link in direct_links_found if '1080p' in link.lower()),
                    direct_links_found[0],
                )
                # hrefs may be relative to the page; make them absolute.
                final_url = urljoin(album_url, best_link)
                link_type = 'direct'
                logger_func(f" [Fap-Nation] Identified direct media link: {final_url}")

        # If after all checks, we still have no URL, then fail.
        if not final_url:
            logger_func(" [Fap-Nation] ❌ Stage 1 Failed: Could not find any HLS stream or direct link.")
            return None, []

        # --- HLS Quality Selection Logic ---
        if link_type == 'hls':
            logger_func(" [Fap-Nation] HLS stream found. Checking for higher quality variants...")
            try:
                master_playlist_response = scraper.get(final_url, timeout=20)
                master_playlist_response.raise_for_status()
                best_variant = _select_best_hls_variant(final_url, master_playlist_response.text)

                if best_variant:
                    new_final_url, height = best_variant
                    logger_func(f" [Fap-Nation] ✅ Best quality found: {height}p. Updating URL to: {new_final_url}")
                    final_url = new_final_url
                else:
                    logger_func(" [Fap-Nation] ℹ️ No alternate quality streams found in playlist. Using original.")
            except Exception as e:
                # Quality selection is optional; keep the URL we already have.
                logger_func(f" [Fap-Nation] ⚠️ Could not parse HLS master playlist for quality selection: {e}. Using original URL.")

        # From here on final_url and link_type are always set together.
        if filename_from_video_tag:
            base_name, _ = os.path.splitext(filename_from_video_tag)
            new_filename = f"{base_name}.mp4"
        else:
            new_filename = f"{album_slug}.mp4"

        files_to_download = [{'url': final_url, 'filename': new_filename, 'type': link_type}]
        logger_func(f" [Fap-Nation] ✅ Ready to download '{new_filename}' ({link_type} method).")
        return album_title, files_to_download

    except Exception as e:
        logger_func(f" [Fap-Nation] ❌ Error fetching Fap-Nation data: {e}")
        return None, []


def _select_best_hls_variant(playlist_url, playlist_text):
    """Return ``(absolute_url, height)`` of the largest variant in a master playlist, or ``None``."""
    # Each variant is an #EXT-X-STREAM-INF tag line followed by its URI line.
    # The URI is matched as a run of non-whitespace so the last variant is
    # still found when the playlist does not end with a newline.
    variants = re.findall(
        r'#EXT-X-STREAM-INF:[^\n]*?RESOLUTION=(\d+)x(\d+)[^\n]*\n([^#\s]\S*)',
        playlist_text,
    )
    if not variants:
        return None
    _width, height, uri = max(variants, key=lambda v: int(v[0]) * int(v[1]))
    return urljoin(playlist_url, uri), height
|
||||
189
src/core/mangadex_client.py
Normal file
189
src/core/mangadex_client.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# src/core/mangadex_client.py
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import cloudscraper
|
||||
from collections import defaultdict
|
||||
from ..utils.file_utils import clean_folder_name
|
||||
|
||||
def fetch_mangadex_data(start_url, output_dir, logger_func, file_progress_callback, overall_progress_callback, pause_event, cancellation_event):
    """
    Fetches and downloads all content from a MangaDex series or chapter URL.

    Pages are saved as ``<output_dir>/<series>/<chapter folder>/NNN.<ext>``.
    Each page is streamed to a ``.part`` file and renamed only once complete,
    so an interrupted or failed download never leaves a truncated image that
    a later run would skip as "already exists".

    Args:
        start_url: A ``mangadex.org/title/<id>``, ``/manga/<id>`` or
            ``/chapter/<id>`` URL.
        output_dir: Directory under which the series folder is created.
        logger_func: Callable that receives one log message string.
        file_progress_callback: Optional object whose ``emit(name, (done, total))``
            is called while a page downloads.
        overall_progress_callback: Optional object whose ``emit(total, done)``
            is called with chapter counts.
        pause_event: Optional event; while set, downloading waits.
        cancellation_event: Optional event; once set, downloading stops.

    Returns:
        A tuple of (downloaded_count, skipped_count). Pages that already
        exist or fail to download are counted as skipped.
    """
    grand_total_dl = 0
    grand_total_skip = 0

    api = _MangadexAPI(logger_func)

    def _is_cancelled():
        # Both events are optional, so never call is_set() on None.
        return bool(cancellation_event and cancellation_event.is_set())

    def _check_pause():
        """Block while paused; return True when the run should stop."""
        if _is_cancelled():
            return True
        if pause_event and pause_event.is_set():
            logger_func(" Download paused...")
            while pause_event.is_set():
                if _is_cancelled():
                    return True
                time.sleep(0.5)
            logger_func(" Download resumed.")
        return _is_cancelled()

    series_match = re.search(r"mangadex\.org/(?:title|manga)/([0-9a-f-]+)", start_url)
    chapter_match = re.search(r"mangadex\.org/chapter/([0-9a-f-]+)", start_url)

    chapters_to_process = []
    if series_match:
        series_id = series_match.group(1)
        logger_func(f" Series detected. Fetching chapter list for ID: {series_id}")
        chapters_to_process = api.get_manga_chapters(series_id, cancellation_event, pause_event)
    elif chapter_match:
        chapter_id = chapter_match.group(1)
        logger_func(f" Single chapter detected. Fetching info for ID: {chapter_id}")
        chapter_info = api.get_chapter_info(chapter_id)
        if chapter_info:
            chapters_to_process = [chapter_info]

    if not chapters_to_process:
        logger_func("❌ No chapters found or failed to fetch chapter info.")
        return 0, 0

    logger_func(f"✅ Found {len(chapters_to_process)} chapter(s) to download.")
    if overall_progress_callback:
        overall_progress_callback.emit(len(chapters_to_process), 0)

    for chap_idx, chapter_json in enumerate(chapters_to_process):
        if _check_pause():
            break
        try:
            metadata = api.transform_chapter_data(chapter_json)
            logger_func("-" * 40)
            logger_func(f"Processing Chapter {chap_idx + 1}/{len(chapters_to_process)}: Vol. {metadata['volume']} Ch. {metadata['chapter']}{metadata['chapter_minor']} - {metadata['title']}")

            server_info = api.get_at_home_server(chapter_json["id"])
            if not server_info:
                logger_func(" ❌ Could not get image server for this chapter. Skipping.")
                continue

            base_url = f"{server_info['baseUrl']}/data/{server_info['chapter']['hash']}/"
            image_files = server_info['chapter']['data']

            series_folder = clean_folder_name(metadata['manga'])
            chapter_folder_title = metadata['title'] or ''
            chapter_folder = clean_folder_name(f"Vol {metadata['volume']:02d} Chap {metadata['chapter']:03d}{metadata['chapter_minor']} - {chapter_folder_title}".strip().strip('-').strip())
            final_save_path = os.path.join(output_dir, series_folder, chapter_folder)
            os.makedirs(final_save_path, exist_ok=True)

            for img_idx, filename in enumerate(image_files):
                if _check_pause():
                    break

                full_img_url = base_url + filename
                img_path = os.path.join(final_save_path, f"{img_idx + 1:03d}{os.path.splitext(filename)[1]}")

                if os.path.exists(img_path):
                    logger_func(f" -> Skip ({img_idx+1}/{len(image_files)}): '{os.path.basename(img_path)}' already exists.")
                    grand_total_skip += 1
                    continue

                logger_func(f" Downloading ({img_idx+1}/{len(image_files)}): '{os.path.basename(img_path)}'...")

                # Stream into a temporary file; only a complete download is
                # promoted to img_path.
                part_path = img_path + ".part"
                cancelled = False
                try:
                    response = api.session.get(full_img_url, stream=True, timeout=60, headers={'Referer': 'https://mangadex.org/'})
                    try:
                        response.raise_for_status()
                        total_size = int(response.headers.get('content-length', 0))

                        if file_progress_callback:
                            file_progress_callback.emit(os.path.basename(img_path), (0, total_size))

                        with open(part_path, 'wb') as f:
                            downloaded_bytes = 0
                            for chunk in response.iter_content(chunk_size=8192):
                                if _check_pause():
                                    cancelled = True
                                    break
                                f.write(chunk)
                                downloaded_bytes += len(chunk)
                                if file_progress_callback:
                                    file_progress_callback.emit(os.path.basename(img_path), (downloaded_bytes, total_size))
                    finally:
                        # Release the connection even when streaming is cut short.
                        response.close()

                    if cancelled:
                        break

                    os.replace(part_path, img_path)
                    grand_total_dl += 1
                except Exception as e:
                    logger_func(f" ❌ Failed to download page {img_idx+1}: {e}")
                    grand_total_skip += 1
                finally:
                    # Drop any leftover partial file (failure or cancellation).
                    if os.path.exists(part_path):
                        try:
                            os.remove(part_path)
                        except OSError:
                            # Best-effort cleanup; a stale .part file is never
                            # mistaken for a finished page.
                            pass

            if overall_progress_callback:
                overall_progress_callback.emit(len(chapters_to_process), chap_idx + 1)
            time.sleep(1)

        except Exception as e:
            logger_func(f" ❌ An unexpected error occurred while processing chapter {chapter_json.get('id')}: {e}")

    return grand_total_dl, grand_total_skip
|
||||
|
||||
class _MangadexAPI:
    """Small client for the MangaDex REST API rooted at https://api.mangadex.org."""

    # A persistently rate-limited endpoint must not retry forever.
    _MAX_RATE_LIMIT_RETRIES = 5
    # Seconds to wait when the rate-limit header is missing or unparsable.
    _DEFAULT_RETRY_DELAY = 5
    # Bounds applied to every computed wait, in seconds.
    _MIN_RETRY_DELAY = 1
    _MAX_RETRY_DELAY = 60

    def __init__(self, logger_func):
        """Store the logger callable and create the HTTP session."""
        self.logger_func = logger_func
        self.session = cloudscraper.create_scraper()
        self.root = "https://api.mangadex.org"

    def _rate_limit_delay(self, header_value):
        """Convert an ``X-RateLimit-Retry-After`` header value to seconds to wait."""
        try:
            value = float(header_value)
        except (TypeError, ValueError):
            return self._DEFAULT_RETRY_DELAY
        # MangaDex sends an absolute UNIX timestamp here, not a duration.
        # Anything this large cannot be a sensible number of seconds, so treat
        # it as "retry at" and convert; small values are used as a duration.
        if value > 1e9:
            value -= time.time()
        return min(max(value, self._MIN_RETRY_DELAY), self._MAX_RETRY_DELAY)

    def _call(self, endpoint, params=None, cancellation_event=None):
        """
        GET ``endpoint`` and return the decoded JSON body.

        HTTP 429 responses are retried a bounded number of times after
        waiting as instructed by the server. Returns ``None`` when the call
        is cancelled, fails, or stays rate limited.
        """
        for attempt in range(self._MAX_RATE_LIMIT_RETRIES + 1):
            if cancellation_event and cancellation_event.is_set():
                return None
            try:
                response = self.session.get(f"{self.root}{endpoint}", params=params, timeout=30)
                if response.status_code == 429:
                    if attempt == self._MAX_RATE_LIMIT_RETRIES:
                        break  # out of retries; do not sleep for nothing
                    retry_after = self._rate_limit_delay(response.headers.get("X-RateLimit-Retry-After"))
                    self.logger_func(f" ⚠️ Rate limited. Waiting for {retry_after:.0f} seconds...")
                    time.sleep(retry_after)
                    continue
                response.raise_for_status()
                return response.json()
            except Exception as e:
                self.logger_func(f" ❌ API call to '{endpoint}' failed: {e}")
                return None

        self.logger_func(f" ❌ API call to '{endpoint}' failed: still rate limited after {self._MAX_RATE_LIMIT_RETRIES} retries")
        return None

    def get_manga_chapters(self, series_id, cancellation_event, pause_event):
        """
        Return every English chapter entry of a series, paging through the feed.

        Either event may be ``None``. Cancellation stops paging (also while
        paused) and returns what has been collected so far.
        """
        all_chapters = []
        offset = 0
        limit = 500
        base_params = {
            "limit": limit, "order[volume]": "asc", "order[chapter]": "asc",
            "translatedLanguage[]": ["en"], "includes[]": ["scanlation_group", "user", "manga"]
        }

        def cancelled():
            return bool(cancellation_event and cancellation_event.is_set())

        while not cancelled():
            # Wait out a pause, but keep honouring cancellation meanwhile.
            while pause_event and pause_event.is_set() and not cancelled():
                time.sleep(0.5)
            if cancelled():
                break

            params = {**base_params, "offset": offset}
            data = self._call(f"/manga/{series_id}/feed", params, cancellation_event)
            if not data or data.get("result") != "ok":
                break

            all_chapters.extend(data.get("data", []))

            if (offset + limit) >= data.get("total", 0):
                break
            offset += limit
        return all_chapters

    def get_chapter_info(self, chapter_id):
        """Return the ``data`` object of one chapter, or ``None`` on failure."""
        params = {"includes[]": ["scanlation_group", "user", "manga"]}
        data = self._call(f"/chapter/{chapter_id}", params)
        return data.get("data") if data and data.get("result") == "ok" else None

    def get_at_home_server(self, chapter_id):
        """Return the raw ``/at-home/server`` response for a chapter, or ``None``."""
        return self._call(f"/at-home/server/{chapter_id}")

    def transform_chapter_data(self, chapter):
        """
        Flatten a chapter entry into the fields used for logging and folder names.

        Returns a dict with ``manga`` (English title, else the first available
        title, else ``"Unknown Series"``), ``title``, integer ``volume`` and
        ``chapter``, and ``chapter_minor`` (the ``".5"`` part of ``"12.5"``,
        or ``""``). Raises ``ValueError`` for non-numeric volume or chapter.
        """
        relationships = {item["type"]: item for item in chapter.get("relationships", [])}
        manga = relationships.get("manga", {})
        c_attrs = chapter.get("attributes", {})
        m_attrs = manga.get("attributes", {})

        # The API may send null for the chapter number; treat it as "0".
        chapter_num_str = c_attrs.get("chapter", "0") or "0"
        chnum, sep, minor = chapter_num_str.partition(".")

        return {
            "manga": (m_attrs.get("title", {}).get("en") or next(iter(m_attrs.get("title", {}).values()), "Unknown Series")),
            "title": c_attrs.get("title", ""),
            "volume": int(float(c_attrs.get("volume", 0) or 0)),
            "chapter": int(float(chnum or 0)),
            "chapter_minor": sep + minor if minor else ""
        }
|
||||
93
src/core/pixeldrain_client.py
Normal file
93
src/core/pixeldrain_client.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import os
|
||||
import re
|
||||
import cloudscraper
|
||||
from ..utils.file_utils import clean_folder_name
|
||||
# --- ADDED IMPORTS ---
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
def fetch_pixeldrain_data(url: str, logger):
    """
    Scrapes a given Pixeldrain URL to extract album or file information.
    Handles single files (/u/), albums/lists (/l/), and folders (/d/).

    Args:
        url: The Pixeldrain URL to resolve.
        logger: Callable that receives one log message string.

    Returns:
        ``(title, files)`` where ``files`` is a list of dicts with the keys
        ``'url'`` and ``'filename'``. ``(None, [])`` when the URL type is not
        recognised or an API request fails (including HTTP error statuses).
    """
    logger(f"Fetching data for Pixeldrain URL: {url}")
    scraper = cloudscraper.create_scraper()
    root = "https://pixeldrain.com"

    # Retry transient failures; configuring this is best effort, so a
    # problem here is logged and the plain session is used instead.
    try:
        retry_strategy = Retry(
            total=5,  # Total number of retries
            backoff_factor=1,  # Wait 1s, 2s, 4s, 8s between retries
            status_forcelist=[429, 500, 502, 503, 504],  # Retry on these server errors
            allowed_methods=["HEAD", "GET"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        scraper.mount("https://", adapter)
        scraper.mount("http://", adapter)
        logger(" [Pixeldrain] Configured retry strategy for network requests.")
    except Exception as e:
        logger(f" [Pixeldrain] ⚠️ Could not configure retry strategy: {e}")

    def _get_json(api_url):
        # A timeout keeps a stalled connection from hanging the caller, and
        # raise_for_status stops an error payload (e.g. a 404 body) from being
        # treated as valid file metadata.
        api_response = scraper.get(api_url, timeout=30)
        api_response.raise_for_status()
        return api_response.json()

    file_match = re.search(r"/u/(\w+)", url)
    album_match = re.search(r"/l/(\w+)", url)
    folder_match = re.search(r"/d/([^?]+)", url)

    try:
        if file_match:
            file_id = file_match.group(1)
            logger(f" Detected Pixeldrain File ID: {file_id}")
            data = _get_json(f"{root}/api/file/{file_id}/info")

            title = data.get("name", file_id)

            files = [{
                'url': f"{root}/api/file/{file_id}?download",
                'filename': data.get("name", f"{file_id}.tmp")
            }]
            return title, files

        elif album_match:
            album_id = album_match.group(1)
            logger(f" Detected Pixeldrain Album ID: {album_id}")
            data = _get_json(f"{root}/api/list/{album_id}")

            title = data.get("title", album_id)

            files = [
                {
                    'url': f"{root}/api/file/{file_info['id']}?download",
                    'filename': file_info.get("name", f"{file_info['id']}.tmp")
                }
                for file_info in data.get("files", [])
            ]
            return title, files

        elif folder_match:
            path_id = folder_match.group(1)
            logger(f" Detected Pixeldrain Folder Path: {path_id}")
            data = _get_json(f"{root}/api/filesystem/{path_id}?stat")

            path_info = data["path"][data["base_index"]]
            title = path_info.get("name", path_id)

            # Only direct children of type "file" are listed; other entries
            # are not descended into.
            files = [
                {
                    'url': f"{root}/api/filesystem{child['path']}?attach",
                    'filename': child.get("name")
                }
                for child in data.get("children", [])
                if child.get("type") == "file"
            ]
            return title, files

        else:
            logger(" ❌ Could not identify Pixeldrain URL type (file, album, or folder).")
            return None, []

    except Exception as e:
        logger(f"❌ An error occurred while fetching Pixeldrain data: {e}")
        return None, []
|
||||
100
src/core/simpcity_client.py
Normal file
100
src/core/simpcity_client.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# src/core/simpcity_client.py
|
||||
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, unquote
|
||||
import os
|
||||
import re
|
||||
from ..utils.file_utils import clean_folder_name
|
||||
import urllib.parse
|
||||
|
||||
def fetch_single_simpcity_page(url, logger_func, cookies=None, post_id=None):
    """
    Collect download jobs from a single SimpCity page.

    The page is searched for native ``bbImage`` images, ``<a>`` links to
    supported hosts (unwrapping ``/misc/goto?url=`` redirects), Saint2
    ``<video>`` sources and Saint2 ``<iframe>`` embeds. When ``post_id`` is
    given and its content container is found, only that container is searched.

    Returns ``(album_title, jobs)`` with jobs de-duplicated by URL, or
    ``(None, [])`` for an HTTP 404. Any other error is logged and re-raised.
    """
    # (pattern, job type) pairs, tried in order; the first match decides.
    link_rules = (
        (r'pixeldrain\.com/[lud]/', 'pixeldrain'),
        (r'saint2\.(su|pk|cr|to)/embed/', 'saint2'),
        (r'bunkr\.(?:cr|si|la|ws|is|ru|su|red|black|media|site|to|ac|ci|fi|pk|ps|sk|ph)|bunkrr\.ru', 'bunkr'),
        (r'mega\.(nz|io)', 'mega'),
        (r'gofile\.io', 'gofile'),
    )
    session = cloudscraper.create_scraper()

    try:
        page = session.get(url, timeout=30, headers={'Referer': 'https://simpcity.cr/'}, cookies=cookies)
        if page.status_code == 404:
            return None, []
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        heading = soup.find('h1', class_='p-title-value')
        album_title = heading.text.strip() if heading else None

        # Narrow the search to one post when asked to and when it can be found.
        scope = soup
        if post_id:
            container = soup.find('div', attrs={'data-lb-id': f'post-{post_id}'})
            if container:
                logger_func(f" [SimpCity] ✅ Isolating search to post content container for ID {post_id}.")
                scope = container
            else:
                logger_func(f" [SimpCity] ⚠️ Could not find content container for post ID {post_id}.")

        found = []

        # Native images: the ".md." marker is stripped from thumbnail URL and name.
        for img in scope.find_all('img', class_='bbImage'):
            thumb = img.get('src')
            if thumb and isinstance(thumb, str) and 'saint2.su' not in thumb:
                full_url = thumb.replace('.md.', '.')
                name = img.get('alt', '').replace('.md.', '.')
                if not name:
                    name = os.path.basename(unquote(urlparse(full_url).path))
                found.append({'type': 'image', 'filename': name, 'url': full_url})

        # Anchors: classify by the real destination, unwrapping redirect links.
        for anchor in scope.find_all('a', href=True):
            href = anchor.get('href', '')
            destination = href
            if '/misc/goto?url=' in href:
                try:
                    query = dict(urllib.parse.parse_qsl(urlparse(href).query))
                    if 'url' in query:
                        destination = unquote(query['url'])
                except Exception:
                    destination = href  # fall back to the raw href if parsing fails

            job_type = next(
                (kind for pattern, kind in link_rules if re.search(pattern, destination)),
                None,
            )
            if job_type is not None:
                found.append({'type': job_type, 'url': destination})

        # Saint2 videos embedded directly through <video><source>.
        for video in scope.find_all('video'):
            source = video.find('source')
            if not (source and source.get('src')):
                continue
            video_src = source['src']
            if re.search(r'saint2\.(su|pk|cr|to)', video_src):
                found.append({'type': 'saint2_direct', 'url': video_src})

        # Saint2 embeds in <iframe> tags (as a fallback).
        for frame in scope.find_all('iframe'):
            frame_src = frame.get('src')
            if not frame_src or not isinstance(frame_src, str):
                continue
            if re.search(r'saint2\.(su|pk|cr|to)/embed/', frame_src):
                found.append({'type': 'saint2', 'url': frame_src})

        if not found:
            return album_title, []

        # De-duplicate by URL: the position of the first sighting is kept,
        # the job recorded is the last one seen for that URL.
        by_url = {}
        for job in found:
            by_url[job['url']] = job
        unique_jobs = list(by_url.values())
        logger_func(f" [SimpCity] Scraper found jobs: {[job['type'] for job in unique_jobs]}")
        return album_title, unique_jobs

    except Exception as e:
        logger_func(f" [SimpCity] ❌ Error fetching page {url}: {e}")
        raise
|
||||
73
src/core/toonily_client.py
Normal file
73
src/core/toonily_client.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
|
||||
def get_chapter_list(series_url, logger_func):
    """
    Scrape a Toonily series page and return its chapter URLs.

    The links are returned in the reverse of the order in which they appear
    on the page. An empty list is returned when no chapter link is found or
    when the request fails.
    """
    logger_func(f" [Toonily] Scraping series page for chapter list: {series_url}")
    session = cloudscraper.create_scraper()
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': 'https://toonily.com/'
    }

    try:
        page = session.get(series_url, timeout=30, headers=request_headers)
        page.raise_for_status()
        anchors = BeautifulSoup(page.content, 'html.parser').select('li.wp-manga-chapter > a')

        if anchors:
            chapter_urls = [anchor['href'] for anchor in reversed(anchors)]
            logger_func(f" [Toonily] Found {len(chapter_urls)} chapters.")
            return chapter_urls

        logger_func(" [Toonily] ❌ Could not find any chapter links on the page.")
        return []

    except Exception as e:
        logger_func(f" [Toonily] ❌ Error getting chapter list: {e}")
        return []
|
||||
|
||||
|
||||
def fetch_chapter_data(chapter_url, logger_func, scraper_session):
    """
    Scrapes a single Toonily.com chapter page for its title and image URLs.

    Args:
        chapter_url: URL of the chapter page.
        logger_func: Callable that receives one log message string.
        scraper_session: Session object whose ``get`` is used for the request.

    Returns:
        ``(series_title, chapter_title, image_urls)``. ``series_title`` is the
        heading text before " - Chapter" (or the whole heading when that
        marker is absent) and ``chapter_title`` is the whole heading.
        ``(None, None, [])`` when the page lacks the expected elements or the
        request fails.
    """
    # Referer is the chapter URL with its last two '/'-separated parts removed.
    main_series_url = chapter_url.rsplit('/', 2)[0] + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': main_series_url
    }

    try:
        response = scraper_session.get(chapter_url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title_element = soup.select_one('h1#chapter-heading')
        image_container = soup.select_one('div.reading-content')

        if not title_element or not image_container:
            logger_func(" [Toonily] ❌ Page structure invalid. Could not find title or image container.")
            return None, None, []

        full_chapter_title = title_element.text.strip()

        if " - Chapter" in full_chapter_title:
            series_title = full_chapter_title.split(" - Chapter")[0].strip()
        else:
            series_title = full_chapter_title.strip()

        chapter_title = full_chapter_title  # The full string is best for the chapter folder name

        image_urls = []
        for img in image_container.select('img'):
            # Prefer data-src, but fall back to src when data-src is missing
            # OR empty; a blank data-src must not yield an empty URL.
            source = (img.get('data-src') or img.get('src') or '').strip()
            if source:
                image_urls.append(source)

        return series_title, chapter_title, image_urls

    except Exception as e:
        logger_func(f" [Toonily] ❌ An error occurred scraping chapter '{chapter_url}': {e}")
        return None, None, []
|
||||
1089
src/core/workers.py
1089
src/core/workers.py
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user