This commit is contained in:
Yuvi63771
2025-10-08 17:02:46 +05:30
parent df8a305e81
commit 8239fdb8f3
36 changed files with 5380 additions and 1468 deletions

View File

@@ -2,71 +2,206 @@
import re
import os
import json
import requests
import time
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import queue
# NOTE(review): the lines below come from a rendered diff hunk (header "@@ -2,71 +2,206 @@") whose
# +/- markers and indentation were lost. They appear to interleave the removed body of
# fetch_hentai2read_data(url, logger, session) with the added body of
# run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback,
# check_pause_func), so this span is not valid Python as written. Recover the real function from
# the repository before editing; the notes below are hedged for the same reason.
def fetch_hentai2read_data(url, logger, session):
def run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback, check_pause_func):
"""
Scrapes a SINGLE Hentai2Read chapter page using a provided session.
Orchestrates the download process using a producer-consumer model.
The main thread scrapes image URLs and puts them in a queue.
A pool of worker threads consumes from the queue to download images concurrently.
"""
logger(f"Attempting to fetch chapter data from: {url}")
scraper = cloudscraper.create_scraper()
try:
response = session.get(url, timeout=30)
response.raise_for_status()
progress_callback(" [Hentai2Read] Scraping series page for all metadata...")
top_level_folder_name, chapters_to_process = _get_series_metadata(start_url, progress_callback, scraper)
if not chapters_to_process:
progress_callback("❌ No chapters found to download. Aborting.")
return 0, 0
page_content_text = response.text
soup = BeautifulSoup(page_content_text, 'html.parser')
album_title = ""
title_tags = soup.select('span[itemprop="name"]')
if title_tags:
album_title = title_tags[-1].text.strip()
total_chapters = len(chapters_to_process)
overall_progress_callback(total_chapters, 0)
if not album_title:
title_tag = soup.select_one('h1.title')
if title_tag:
album_title = title_tag.text.strip()
total_downloaded_count = 0
total_skipped_count = 0
if not album_title:
logger("❌ Could not find album title on page.")
return None, None
# Presumably the new per-chapter loop: one folder per chapter under <output_dir>/<series folder>.
for idx, chapter in enumerate(chapters_to_process):
if check_pause_func(): break
progress_callback(f"\n-- Processing and Downloading Chapter {idx + 1}/{total_chapters}: '{chapter['title']}' --")
# Strips characters that are invalid in Windows file names from both folder names.
series_folder = re.sub(r'[\\/*?:"<>|]', "", top_level_folder_name).strip()
chapter_folder = re.sub(r'[\\/*?:"<>|]', "", chapter['title']).strip()
final_save_path = os.path.join(output_dir, series_folder, chapter_folder)
os.makedirs(final_save_path, exist_ok=True)
# This function now scrapes and downloads simultaneously
dl_count, skip_count = _process_and_download_chapter(
chapter_url=chapter['url'],
save_path=final_save_path,
scraper=scraper,
progress_callback=progress_callback,
check_pause_func=check_pause_func
)
total_downloaded_count += dl_count
total_skipped_count += skip_count
overall_progress_callback(total_chapters, idx + 1)
if check_pause_func(): break
# Presumably the removed implementation: reads the "'images' : [...]" JSON embedded in the page.
image_urls = []
try:
start_index = page_content_text.index("'images' : ") + len("'images' : ")
end_index = page_content_text.index(",\n", start_index)
images_json_str = page_content_text[start_index:end_index]
image_paths = json.loads(images_json_str)
image_urls = ["https://hentaicdn.com/hentai" + part for part in image_paths]
except (ValueError, json.JSONDecodeError):
logger("❌ Could not find or parse image JSON data for this chapter.")
return None, None
return total_downloaded_count, total_skipped_count
if not image_urls:
logger("❌ No image URLs found for this chapter.")
return None, None
logger(f" Found {len(image_urls)} images for album '{album_title}'.")
files_to_download = []
for i, img_url in enumerate(image_urls):
page_num = i + 1
extension = os.path.splitext(img_url)[1].split('?')[0]
if not extension: extension = ".jpg"
filename = f"{page_num:03d}{extension}"
files_to_download.append({'url': img_url, 'filename': filename})
return album_title, files_to_download
except requests.exceptions.HTTPError as e:
if e.response.status_code == 404:
logger(f" Chapter not found (404 Error). This likely marks the end of the series.")
else:
logger(f"❌ An HTTP error occurred: {e}")
return None, None
except Exception as e:
logger(f"❌ An unexpected error occurred while fetching data: {e}")
return None, None
progress_callback(f"❌ A critical error occurred in the Hentai2Read client: {e}")
return 0, 0
def _get_series_metadata(start_url, progress_callback, scraper):
"""
Scrapes the main series page to get the Artist Name, Series Title, and chapter list.
"""
try:
response = scraper.get(start_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
series_title = "Unknown Series"
artist_name = None
metadata_list = soup.select_one("ul.list.list-simple-mini")
if metadata_list:
first_li = metadata_list.find('li', recursive=False)
if first_li and not first_li.find('a'):
series_title = first_li.get_text(strip=True)
for b_tag in metadata_list.find_all('b'):
label = b_tag.get_text(strip=True)
if label in ("Artist", "Author"):
a_tag = b_tag.find_next_sibling('a')
if a_tag:
artist_name = a_tag.get_text(strip=True)
if label == "Artist":
break
top_level_folder_name = artist_name if artist_name else series_title
chapter_links = soup.select("div.media a.pull-left.font-w600")
if not chapter_links:
chapters_to_process = [{'url': start_url, 'title': series_title}]
else:
chapters_to_process = [
{'url': urljoin(start_url, link['href']), 'title': " ".join(link.stripped_strings)}
for link in chapter_links
]
chapters_to_process.reverse()
progress_callback(f" [Hentai2Read] ✅ Found Artist/Series: '{top_level_folder_name}'")
progress_callback(f" [Hentai2Read] ✅ Found {len(chapters_to_process)} chapters to process.")
return top_level_folder_name, chapters_to_process
except Exception as e:
progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}")
return "Unknown Series", []
### NEW: This function contains the pipeline logic ###
def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func):
    """
    Scrape one chapter page by page and download its images concurrently.

    The calling thread is the producer: it requests "<chapter_url><n>/" for n = 1, 2, ...
    through `scraper`, reads the `img#arf-reader` source and queues a (filepath, image_url)
    task. A fixed pool of worker threads consumes the queue and writes each image into
    `save_path` as "<n:03d><ext>".

    Args:
        chapter_url: Base URL of the chapter; page numbers are appended to it.
        save_path: Directory that receives the image files.
        scraper: Object with a requests-style `get`, used for scraping the reader pages.
        progress_callback: Callable that receives one status string per event.
        check_pause_func: Callable; a truthy return value stops the scraping loop.

    Returns:
        Tuple (downloaded_count, skipped_count). Files that already exist and files whose
        download failed are both counted as skipped.
    """
    task_queue = queue.Queue()
    num_download_threads = 8
    # Page URLs are built by appending "<n>/", so the base must end with a slash; a start URL
    # given without one would otherwise produce ".../chapter1/" instead of ".../chapter/1/".
    base_url = chapter_url if chapter_url.endswith('/') else chapter_url + '/'

    def downloader_worker():
        """Consume download tasks until the None sentinel; return (downloaded, skipped)."""
        # Each worker owns its session and its counters, so no state is shared between threads.
        worker_scraper = cloudscraper.create_scraper()
        downloaded = 0
        skipped = 0
        while True:
            task = task_queue.get()
            try:
                # The sentinel value to signal the end
                if task is None:
                    break
                filepath, img_url = task
                if os.path.exists(filepath):
                    progress_callback(f" -> Skip: '{os.path.basename(filepath)}'")
                    skipped += 1
                    continue
                progress_callback(f" Downloading: '{os.path.basename(filepath)}'...")
                # Write to a temporary name first: a partially written file under the final
                # name would be mistaken for a finished download by the exists() check above.
                temp_path = filepath + ".part"
                try:
                    with worker_scraper.get(img_url, stream=True, timeout=60,
                                            headers={'Referer': chapter_url}) as response:
                        response.raise_for_status()
                        with open(temp_path, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                    os.replace(temp_path, filepath)
                except Exception:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        # Best effort: the temp file may never have been created.
                        pass
                    raise
                downloaded += 1
            except Exception as e:
                progress_callback(f" ❌ Download failed for task. Error: {e}")
                skipped += 1
            finally:
                task_queue.task_done()
        return downloaded, skipped

    # --- Start the downloader threads ---
    executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader')
    worker_futures = [executor.submit(downloader_worker) for _ in range(num_download_threads)]

    # --- Main thread acts as the scraper (producer) ---
    page_number = 1
    try:
        while True:
            if check_pause_func():
                break
            if page_number > 300:  # Safety break
                progress_callback(" [Hentai2Read] ⚠️ Safety break: Reached 300 pages.")
                break
            page_url_to_check = f"{base_url}{page_number}/"
            try:
                response = scraper.get(page_url_to_check, timeout=30)
                # A redirect or a non-200 status is treated as running past the last page.
                if response.history or response.status_code != 200:
                    progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.")
                    break
                soup = BeautifulSoup(response.text, 'html.parser')
                img_tag = soup.select_one("img#arf-reader")
                img_src = img_tag.get("src") if img_tag else None
                if not img_tag or img_src == "https://static.hentai.direct/hentai":
                    progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).")
                    break
                normalized_img_src = urljoin(response.url, img_src)
                ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg"
                filename = f"{page_number:03d}{ext}"
                filepath = os.path.join(save_path, filename)
                # Put the download task into the queue for a worker to pick up
                task_queue.put((filepath, normalized_img_src))
                page_number += 1
                time.sleep(0.1)  # Small delay between scraping pages
            except Exception as e:
                progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}")
                break
    finally:
        # --- Shutdown sequence ---
        # Always release the workers, even if a callback raised in the producer; otherwise
        # they would block on task_queue.get() forever.
        for _ in range(num_download_threads):
            task_queue.put(None)
        executor.shutdown(wait=True)

    total_downloaded = 0
    total_skipped = 0
    for future in worker_futures:
        try:
            worker_downloaded, worker_skipped = future.result()
        except Exception as e:
            # A worker that could not start (for example, session creation failed) has no counts.
            progress_callback(f" ❌ Download worker stopped unexpectedly: {e}")
            continue
        total_downloaded += worker_downloaded
        total_skipped += worker_skipped

    progress_callback(f" Found and processed {page_number - 1} images for this chapter.")
    return total_downloaded, total_skipped

116
src/core/allcomic_client.py Normal file
View File

@@ -0,0 +1,116 @@
import requests
import re
from bs4 import BeautifulSoup
import cloudscraper
import time
from urllib.parse import urlparse
def get_chapter_list(series_url, logger_func):
    """
    Return the chapter URLs listed on a series page, in reversed page order.

    The page is requested up to 8 times, waiting 2 * attempt seconds between tries.
    An empty list is returned when every attempt fails, when the page holds no
    'li.wp-manga-chapter a' links (treated as a single-chapter page), or when parsing fails.
    """
    logger_func(f" [AllComic] Checking for chapter list at: {series_url}")
    scraper = cloudscraper.create_scraper()
    max_retries = 8
    response = None

    for attempt_no in range(1, max_retries + 1):
        try:
            response = scraper.get(series_url, timeout=30)
            response.raise_for_status()
            logger_func(f" [AllComic] Successfully connected to series page on attempt {attempt_no}.")
            break  # Success, exit the loop
        except requests.RequestException as e:
            logger_func(f" [AllComic] ⚠️ Series page check attempt {attempt_no}/{max_retries} failed: {e}")
            if attempt_no == max_retries:
                logger_func(f" [AllComic] ❌ All attempts to check series page failed.")
                return []  # Return empty on final failure
            wait_time = 2 * attempt_no
            logger_func(f" Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    if not response:
        return []

    try:
        anchors = BeautifulSoup(response.text, 'html.parser').select('li.wp-manga-chapter a')
        if not anchors:
            logger_func(" [AllComic] No chapter list found. Assuming this is a single chapter page.")
            return []
        # Reverse for oldest-to-newest reading order
        chapter_urls = [anchor['href'] for anchor in reversed(anchors)]
        logger_func(f" [AllComic] ✅ Found {len(chapter_urls)} chapters.")
        return chapter_urls
    except Exception as e:
        logger_func(f" [AllComic] ❌ Error parsing chapters after successful connection: {e}")
        return []
def fetch_chapter_data(chapter_url, logger_func):
    """
    Load one chapter page and return (comic_title, chapter_title, image_urls).

    The comic title comes from the 'h1.post-title' element, falling back to the URL slug
    that follows a 'porncomic' path segment. The chapter title is derived from the last
    URL segment. Returns (None, None, None) when the page cannot be fetched after 8
    attempts, when no usable title or no images are found, or when parsing fails.
    """
    logger_func(f" [AllComic] Fetching page: {chapter_url}")
    scraper = cloudscraper.create_scraper(
        browser={'browser': 'firefox', 'platform': 'windows', 'desktop': True}
    )
    request_headers = {'Referer': 'https://allporncomic.com/'}
    max_retries = 8
    response = None

    for attempt_no in range(1, max_retries + 1):
        try:
            response = scraper.get(chapter_url, headers=request_headers, timeout=30)
            response.raise_for_status()
            break
        except requests.RequestException:
            if attempt_no == max_retries:
                logger_func(f" [AllComic] ❌ All connection attempts failed for chapter: {chapter_url}")
                return None, None, None
            time.sleep(2 * attempt_no)

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1', class_='post-title')
        if heading:
            comic_title = heading.text.strip()
        else:
            comic_title = None
            try:
                segments = urlparse(chapter_url).path.strip('/').split('/')
                if len(segments) >= 3 and segments[-3] == 'porncomic':
                    comic_title = segments[-2].replace('-', ' ').title()
            except Exception:
                comic_title = "Unknown Comic"

        last_segment = chapter_url.strip('/').split('/')[-1]
        chapter_title = last_segment.replace('-', ' ').title()

        list_of_image_urls = []
        reader = soup.find('div', class_='reading-content')
        if reader:
            for image in reader.find_all('img', class_='wp-manga-chapter-img'):
                # Lazy-loaded images keep the real address in data-src.
                candidate = image.get('data-src') or image.get('src', '')
                candidate = candidate.strip()
                if candidate:
                    list_of_image_urls.append(candidate)

        title_is_usable = bool(comic_title) and comic_title != "Unknown Comic"
        if not (title_is_usable and list_of_image_urls):
            logger_func(f" [AllComic] ❌ Could not find a valid title or images on the page. Title found: '{comic_title}'")
            return None, None, None

        return comic_title, chapter_title, list_of_image_urls
    except Exception as e:
        logger_func(f" [AllComic] ❌ An unexpected error occurred while parsing the page: {e}")
        return None, None, None

View File

@@ -33,7 +33,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
if cancellation_event and cancellation_event.is_set():
raise RuntimeError("Fetch operation cancelled by user during retry loop.")
log_message = f" Fetching post list: {paginated_url} (Page approx. {offset // 50 + 1})"
log_message = f" Fetching post list: {api_url_base} (Page approx. {offset // 50 + 1})"
if attempt > 0:
log_message += f" (Attempt {attempt + 1}/{max_retries})"
logger(log_message)
@@ -247,7 +247,7 @@ def download_from_api(
break
all_posts_for_manga_mode.extend(posts_batch_manga)
logger(f"MANGA_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}")
logger(f"RENAMING_MODE_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}")
current_offset_manga += page_size
time.sleep(0.6)
@@ -265,7 +265,7 @@ def download_from_api(
if cancellation_event and cancellation_event.is_set(): return
if all_posts_for_manga_mode:
logger(f"MANGA_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}")
logger(f"RENAMING_MODE_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}")
if all_posts_for_manga_mode:
if processed_post_ids:

375
src/core/booru_client.py Normal file
View File

@@ -0,0 +1,375 @@
# src/core/booru_client.py
import os
import re
import time
import datetime
import urllib.parse
import requests
import logging
import cloudscraper
# --- Start of Combined Code from 1.py ---
# Part 1: Essential Utilities & Exceptions
class BooruClientException(Exception):
    """Base class for exceptions in this client."""


class HttpError(BooruClientException):
    """HTTP request during data extraction failed.

    Attributes:
        response: The response object passed in, or None.
        status: `response.status_code`, or 0 when no response is available.
    """

    def __init__(self, message="", response=None):
        self.response = response
        # Compare against None explicitly: requests.Response is falsy for 4xx/5xx
        # responses, which are exactly the ones this exception is raised for.
        if response is None:
            self.status = 0
        else:
            self.status = response.status_code
            if not message:
                message = f"'{response.status_code} {response.reason}' for '{response.url}'"
        super().__init__(message)


class NotFoundError(BooruClientException):
    """Raised by Extractor.request when a fatal request gets a 404 response."""
def unquote(s):
    """Return *s* with its %xx escapes decoded via urllib.parse.unquote."""
    decoded = urllib.parse.unquote(s)
    return decoded
def parse_datetime(date_string, fmt):
    """Parse *date_string* with strptime format *fmt*; return None if it cannot be parsed."""
    try:
        parsed = datetime.datetime.strptime(date_string, fmt)
    except (ValueError, TypeError):
        # ValueError: text does not match fmt; TypeError: input is not a string (e.g. None).
        parsed = None
    return parsed
def nameext_from_url(url, data=None):
    """
    Store the file name and extension found in *url* into *data* and return *data*.

    The last path component is percent-decoded and split at its final dot into
    data["filename"] and a lower-cased data["extension"]. Without a dot the extension is
    "". If anything fails, both entries are set to "". A new dict is created when *data*
    is None.
    """
    if data is None:
        data = {}
    try:
        last_component = os.path.basename(urllib.parse.urlparse(url).path)
        filename = urllib.parse.unquote(last_component)
        stem, dot, suffix = filename.rpartition('.')
        if dot:
            data["filename"], data["extension"] = stem, suffix.lower()
        else:
            data["filename"], data["extension"] = filename, ""
    except Exception:
        data["filename"], data["extension"] = "", ""
    return data
# User-Agent string that Extractor.__init__ sets on every session it creates.
USERAGENT_FIREFOX = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"
# Part 2: Core Extractor Logic
class Extractor:
    """
    Base class for the site extractors: owns the HTTP session, retry logic and post iteration.

    Subclasses override `metadata()` and `posts()` and provide `root` (the site's base URL).
    """

    category = ""
    subcategory = ""
    directory_fmt = ("{category}", "{id}")
    filename_fmt = "{filename}.{extension}"
    # Base URL prepended to relative file URLs in items(); subclasses set the real value.
    root = ""
    _retries = 3
    _timeout = 30

    def __init__(self, match, logger_func=print):
        """Store the regex *match* for the input URL and create the HTTP session."""
        self.url = match.string
        self.match = match
        self.groups = match.groups()
        self.session = cloudscraper.create_scraper()
        self.session.headers["User-Agent"] = USERAGENT_FIREFOX
        self.log = logger_func
        self.api_key = None
        self.user_id = None

    def set_auth(self, api_key, user_id):
        """Remember the credentials and let the subclass apply them via _init_auth()."""
        self.api_key = api_key
        self.user_id = user_id
        self._init_auth()

    def _init_auth(self):
        """Placeholder for extractor-specific auth setup."""
        pass

    def metadata(self):
        """Return extra key/value pairs merged into every post; empty by default."""
        return {}

    def posts(self):
        """Return an iterable of post dicts (and optional PAGE_UPDATE tuples); empty by default."""
        return []

    def request(self, url, method="GET", fatal=True, **kwargs):
        """
        Send a request, retrying on errors with exponential back-off (1s, 2s, 4s, ...).

        Returns the response for any status below 400. With fatal=True a 404 raises
        NotFoundError immediately and exhausting the retries raises HttpError; with
        fatal=False None is returned instead.
        """
        for attempt in range(self._retries + 1):
            try:
                response = self.session.request(method, url, timeout=self._timeout, **kwargs)
                if response.status_code < 400:
                    return response
                if response.status_code == 404 and fatal:
                    raise NotFoundError(f"Resource not found at {url}")
                self.log(f"Request for {url} failed with status {response.status_code}. Retrying...")
            except requests.exceptions.RequestException as e:
                self.log(f"Request for {url} failed: {e}. Retrying...")
            if attempt < self._retries:
                time.sleep(2 ** attempt)
        if fatal:
            raise HttpError(f"Failed to retrieve {url} after {self._retries} retries.")
        return None

    def request_json(self, url, **kwargs):
        """
        Request *url* and return the decoded JSON body.

        Returns None when request() itself returned None (possible only with fatal=False).
        Raises BooruClientException when the body is not valid JSON.
        """
        response = self.request(url, **kwargs)
        if response is None:
            # Non-fatal request that failed: there is no body to decode.
            return None
        try:
            return response.json()
        except (ValueError, TypeError) as exc:
            self.log(f"Failed to decode JSON from {url}: {exc}")
            raise BooruClientException("Invalid JSON response") from exc

    def items(self):
        """
        Yield the posts from posts(), enriched with file name, date and metadata.

        ('PAGE_UPDATE', n) tuples are passed through unchanged. Posts without a
        "file_url" are dropped.
        """
        data = self.metadata()
        for item in self.posts():
            # Check for our special page update message
            if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE':
                yield item
                continue
            # Otherwise, process it as a post
            post = item
            url = post.get("file_url")
            if not url:
                continue
            nameext_from_url(url, post)
            post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
            if url.startswith("/"):
                url = self.root + url
            post['file_url'] = url  # Ensure full URL
            post.update(data)
            yield post
class BaseExtractor(Extractor):
    """
    Extractor for a site family whose hosts are registered through update().

    `instances` holds one (category, root, info) tuple per registered site, in the same
    order as the capture groups that update() puts into the returned pattern.
    """

    instances = ()

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._init_category()

    def _init_category(self):
        """Derive `root` from the input URL and `category` from the matching instance group."""
        parsed_url = urllib.parse.urlparse(self.url)
        if not parsed_url.netloc:
            # The pattern built by update() accepts URLs without "http(s)://"; urlparse then
            # puts the host into the path, so re-parse it as a network location.
            parsed_url = urllib.parse.urlparse("//" + self.url)
        self.root = f"{parsed_url.scheme or 'https'}://{parsed_url.netloc}"
        for i, group in enumerate(self.groups):
            if group is None:
                continue
            try:
                self.category = self.instances[i][0]
                return
            except IndexError:
                # Groups appended by subclass patterns have no instance entry.
                continue

    @classmethod
    def update(cls, instances):
        """
        Register *instances* on the class and return the host-matching regex prefix.

        *instances* maps a category name to a dict with a "root" entry (may be None) and
        an optional "pattern"; without a pattern, the escaped host part of the root is used.
        """
        pattern_list = []
        instance_list = cls.instances = []
        for category, info in instances.items():
            root = info["root"].rstrip("/") if info["root"] else ""
            instance_list.append((category, root, info))
            pattern = info.get("pattern", re.escape(root.partition("://")[2]))
            pattern_list.append(f"({pattern})")
        return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
# Part 3: Danbooru Extractor
class DanbooruExtractor(BaseExtractor):
    """Shared behaviour for the Danbooru extractors: HTTP auth, post enrichment, pagination."""

    filename_fmt = "{category}_{id}_{filename}.{extension}"
    per_page = 200

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._auth_logged = False

    def _init_auth(self):
        """Install (user_id, api_key) as session auth; the log line is written only once."""
        if not (self.user_id and self.api_key):
            return
        if not self._auth_logged:
            self.log("Danbooru auth set.")
            self._auth_logged = True
        self.session.auth = (self.user_id, self.api_key)

    def items(self):
        """Yield PAGE_UPDATE tuples unchanged and each post with file name, date and metadata."""
        shared = self.metadata()
        for entry in self.posts():
            # Page-progress markers are forwarded as they are.
            is_page_marker = isinstance(entry, tuple) and entry[0] == 'PAGE_UPDATE'
            if is_page_marker:
                yield entry
                continue

            file_url = entry.get("file_url")
            if not file_url:
                continue
            nameext_from_url(file_url, entry)
            entry["date"] = parse_datetime(entry.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z")
            if file_url.startswith("/"):
                # Relative URL: make it absolute against the site root.
                file_url = self.root + file_url
            entry['file_url'] = file_url
            entry.update(shared)
            yield entry

    def metadata(self):
        return {}

    def posts(self):
        return []

    def _pagination(self, endpoint, params, prefix="b"):
        """
        Yield ('PAGE_UPDATE', count) followed by the posts of every result page.

        Stops on an empty page or on a page with fewer than per_page - 20 posts. With a
        *prefix*, the next page is "<prefix><id of the last post>"; otherwise the page
        number is incremented.
        """
        url = self.root + endpoint
        params["limit"] = self.per_page
        params["page"] = 1
        threshold = self.per_page - 20

        while True:
            batch = self.request_json(url, params=params)
            if not batch:
                return
            yield ('PAGE_UPDATE', len(batch))
            for post in batch:
                yield post
            if len(batch) < threshold:
                return
            if prefix:
                last_id = batch[-1]['id']
                params["page"] = f"{prefix}{last_id}"
            else:
                params["page"] += 1
# Registers the Danbooru hosts on DanbooruExtractor.instances and keeps the returned
# host-matching regex prefix, which the Danbooru URL patterns below are built on.
BASE_PATTERN = DanbooruExtractor.update({
"danbooru": {"root": None, "pattern": r"(?:danbooru|safebooru)\.donmai\.us"},
})
class DanbooruTagExtractor(DanbooruExtractor):
    """Extractor for Danbooru tag-search URLs (/posts?tags=...)."""

    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"(/posts\?(?:[^&#]*&)*tags=([^&#]*))"

    def metadata(self):
        """Decode the tags from the URL, keep them on self.tags, return a folder-safe copy."""
        raw_tags = self.groups[-1].replace("+", " ")
        self.tags = unquote(raw_tags).strip()
        # Replace characters that are not allowed in Windows file names.
        folder_safe = re.sub(r'[\\/*?:"<>|]', "_", self.tags)
        return {"search_tags": folder_safe}

    def posts(self):
        """Paginate /posts.json for self.tags, which metadata() must have set first."""
        query = {"tags": self.tags}
        return self._pagination("/posts.json", query)
class DanbooruPostExtractor(DanbooruExtractor):
    """Extractor for a single Danbooru post URL (/posts/<id> or /post/show/<id>)."""

    subcategory = "post"
    pattern = BASE_PATTERN + r"(/post(?:s|/show)/(\d+))"

    def posts(self):
        """Return a one-element tuple with the post's JSON, or an empty tuple if it is falsy."""
        post_id = self.groups[-1]
        post = self.request_json(f"{self.root}/posts/{post_id}.json")
        if post:
            return (post,)
        return ()
class GelbooruBase(Extractor):
    """Shared behaviour for the Gelbooru extractors: API access and post enrichment."""

    category = "gelbooru"
    root = "https://gelbooru.com"

    def __init__(self, match, logger_func=print):
        super().__init__(match, logger_func)
        self._auth_logged = False

    def _api_request(self, params, key="post"):
        """
        Call the dapi JSON endpoint with *params* and return the entries stored under *key*.

        Credentials are added to *params* when both are set. The result is always a list:
        a single non-list value is wrapped, a missing or empty value gives []. With a falsy
        *key* the decoded response is returned unchanged.
        """
        # Auth is now added dynamically
        if self.api_key and self.user_id:
            if not self._auth_logged:
                self.log("Gelbooru auth set.")
                self._auth_logged = True
            params["api_key"] = self.api_key
            params["user_id"] = self.user_id

        endpoint = self.root + "/index.php?page=dapi&q=index&json=1"
        payload = self.request_json(endpoint, params=params)
        if not key:
            return payload

        found = payload.get(key, [])
        if isinstance(found, list):
            return found
        return [found] if found else []

    def items(self):
        """Yield PAGE_UPDATE tuples unchanged and, per post, a new dict of metadata plus post."""
        base_data = self.metadata()
        base_data['category'] = self.category
        for entry in self.posts():
            # Page-progress markers are forwarded as they are.
            if isinstance(entry, tuple) and entry[0] == 'PAGE_UPDATE':
                yield entry
                continue

            file_url = entry.get("file_url")
            if not file_url:
                continue
            # Post values override the shared metadata on key clashes.
            merged = {**base_data, **entry}
            nameext_from_url(file_url, merged)
            yield merged

    def metadata(self):
        return {}

    def posts(self):
        return []
# Host part shared by the Gelbooru URL patterns below; scheme and "www." are optional.
GELBOORU_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com"
class GelbooruTagExtractor(GelbooruBase):
    """Extractor for Gelbooru tag-search URLs (index.php?page=post&s=list&tags=...)."""

    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=list&tags=([^&#]*))"

    def metadata(self):
        """Decode the tags from the URL, keep them on self.tags, return a folder-safe copy."""
        raw_tags = self.groups[-1].replace("+", " ")
        self.tags = unquote(raw_tags).strip()
        # Replace characters that are not allowed in Windows file names.
        folder_safe = re.sub(r'[\\/*?:"<>|]', "_", self.tags)
        return {"search_tags": folder_safe}

    def posts(self):
        """Scrapes HTML search pages as API can be restrictive for tags."""
        posts_per_page = 42
        listing_url = self.root + "/index.php"
        query = {"page": "post", "s": "list", "tags": self.tags}
        offset = 0

        while True:
            query['pid'] = offset
            self.log(f"Scraping search results page (offset: {offset})...")
            listing_html = self.request(listing_url, params=query).text
            found_ids = re.findall(r'id="p(\d+)"', listing_html)
            if not found_ids:
                self.log("No more posts found on page. Ending scrape.")
                return

            yield ('PAGE_UPDATE', len(found_ids))
            # Each post id found in the listing is resolved through the API individually.
            for found_id in found_ids:
                for post in self._api_request({"s": "post", "id": found_id}):
                    yield post
            offset += posts_per_page
class GelbooruPostExtractor(GelbooruBase):
    """Extractor for a single Gelbooru post URL (index.php?page=post&s=view&id=<id>)."""

    subcategory = "post"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=view&id=(\d+))"

    def posts(self):
        """Return the API entries for the post id captured from the URL."""
        query = {"s": "post", "id": self.groups[-1]}
        return self._api_request(query)
# --- Main Entry Point ---
# Extractor classes tried by find_extractor, in this order; the first whose `pattern`
# matches the URL is used.
EXTRACTORS = [
DanbooruTagExtractor,
DanbooruPostExtractor,
GelbooruTagExtractor,
GelbooruPostExtractor,
]
def find_extractor(url, logger_func):
    """Return an instance of the first class in EXTRACTORS whose pattern matches *url*, else None."""
    candidates = ((cls, re.search(cls.pattern, url)) for cls in EXTRACTORS)
    for cls, hit in candidates:
        if hit:
            return cls(hit, logger_func)
    return None
def fetch_booru_data(url, api_key, user_id, logger_func):
    """
    Generator: pick the extractor for *url*, apply the credentials and yield its items.

    Yields whatever the extractor's items() yields (post dicts and PAGE_UPDATE tuples).
    When no extractor matches, a message is logged and nothing is yielded.
    """
    extractor = find_extractor(url, logger_func)
    if extractor:
        logger_func(f"Using extractor: {extractor.__class__.__name__}")
        extractor.set_auth(api_key, user_id)
        for item in extractor.items():
            yield item
    else:
        logger_func(f"No suitable Booru extractor found for URL: {url}")

View File

@@ -207,7 +207,7 @@ def get_bunkr_extractor(url, logger):
def fetch_bunkr_data(url, logger):
"""
Main function to be called from the GUI.
It extracts all file information from a Bunkr URL.
It extracts all file information from a Bunkr URL, now handling both albums and direct file links.
Returns:
A tuple of (album_name, list_of_files)
@@ -215,6 +215,30 @@ def fetch_bunkr_data(url, logger):
- list_of_files (list): A list of dicts, each containing 'url', 'name', and '_http_headers'.
Returns (None, None) on failure.
"""
# --- START: New logic to handle direct CDN file URLs ---
try:
parsed_url = urllib.parse.urlparse(url)
# Check if the hostname contains 'cdn' and the path has a common file extension
is_direct_cdn_file = (parsed_url.hostname and 'cdn' in parsed_url.hostname and 'bunkr' in parsed_url.hostname and
any(parsed_url.path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.webm', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar']))
if is_direct_cdn_file:
logger.info("Bunkr direct file URL detected.")
filename = os.path.basename(parsed_url.path)
# Use the filename (without extension) as a sensible album name
album_name = os.path.splitext(filename)[0]
files_to_download = [{
'url': url,
'name': filename,
'_http_headers': {'Referer': 'https://bunkr.ru/'} # Use a generic Referer
}]
return album_name, files_to_download
except Exception as e:
logger.warning(f"Could not parse Bunkr URL for direct file check: {e}")
# --- END: New logic ---
# This is the original logic for album and media pages
extractor = get_bunkr_extractor(url, logger)
if not extractor:
return None, None
@@ -238,4 +262,4 @@ def fetch_bunkr_data(url, logger):
except Exception as e:
logger.error(f"An error occurred while extracting Bunkr info: {e}", exc_info=True)
return None, None
return None, None

View File

@@ -0,0 +1,125 @@
import re
import os
import cloudscraper
from urllib.parse import urlparse, urljoin
from ..utils.file_utils import clean_folder_name
# NOTE(review): indentation was lost in this copy of the file, so the nesting of the branches
# below is not visible here. The comments describe the statements as written; confirm the exact
# control flow against the real source.
def fetch_fap_nation_data(album_url, logger_func):
"""
Scrapes a fap-nation page by prioritizing HLS streams first, then falling
back to direct download links. Selects the highest quality available.

Returns (album_title, files_to_download) on success, where files_to_download holds one
dict with the keys 'url', 'filename' and 'type' ('hls' or 'direct'). Returns (None, [])
when no media link is found or an error occurs.
"""
logger_func(f" [Fap-Nation] Fetching album data from: {album_url}")
scraper = cloudscraper.create_scraper()
try:
response = scraper.get(album_url, timeout=45)
response.raise_for_status()
html_content = response.text
# Title comes from the <h1 itemprop="name"> element; the last URL path segment is the fallback.
title_match = re.search(r'<h1[^>]*itemprop="name"[^>]*>(.*?)</h1>', html_content, re.IGNORECASE)
album_slug = clean_folder_name(os.path.basename(urlparse(album_url).path.strip('/')))
album_title = clean_folder_name(title_match.group(1).strip()) if title_match else album_slug
files_to_download = []
final_url = None
link_type = None
filename_from_video_tag = None
# The player config is HTML-escaped in the page, hence the literal &quot; in the pattern.
video_tag_title_match = re.search(r'data-plyr-config=.*?&quot;title&quot;:.*?&quot;([^&]+?\.mp4)&quot;', html_content, re.IGNORECASE)
if video_tag_title_match:
filename_from_video_tag = clean_folder_name(video_tag_title_match.group(1))
logger_func(f" [Fap-Nation] Found high-quality filename in video tag: {filename_from_video_tag}")
# --- REVISED LOGIC: HLS FIRST ---
# 1. Prioritize finding an HLS stream.
logger_func(" [Fap-Nation] Priority 1: Searching for HLS stream...")
# Looks for an embedded player iframe hosted on mediadelivery.net.
iframe_match = re.search(r'<iframe[^>]+src="([^"]+mediadelivery\.net[^"]+)"', html_content, re.IGNORECASE)
if iframe_match:
iframe_url = iframe_match.group(1)
logger_func(f" [Fap-Nation] Found video iframe. Visiting: {iframe_url}")
try:
iframe_response = scraper.get(iframe_url, timeout=30)
iframe_response.raise_for_status()
iframe_html = iframe_response.text
playlist_match = re.search(r'<source[^>]+src="([^"]+\.m3u8)"', iframe_html, re.IGNORECASE)
if playlist_match:
final_url = playlist_match.group(1)
link_type = 'hls'
logger_func(f" [Fap-Nation] Found embedded HLS stream in iframe: {final_url}")
except Exception as e:
# An iframe failure is only logged; the later fallbacks still run.
logger_func(f" [Fap-Nation] ⚠️ Error fetching or parsing iframe content: {e}")
if not final_url:
logger_func(" [Fap-Nation] No stream found in iframe. Checking main page content as a last resort...")
# Any quoted absolute .m3u8 URL in the main page counts as an HLS stream.
js_var_match = re.search(r'"(https?://[^"]+\.m3u8)"', html_content, re.IGNORECASE)
if js_var_match:
final_url = js_var_match.group(1)
link_type = 'hls'
logger_func(f" [Fap-Nation] Found HLS stream on main page: {final_url}")
# 2. Fallback: If no HLS stream was found, search for direct links.
if not final_url:
logger_func(" [Fap-Nation] No HLS stream found. Priority 2 (Fallback): Searching for direct download links...")
direct_link_pattern = r'<a\s+[^>]*href="([^"]+\.(?:mp4|webm|mkv|mov))"[^>]*>'
direct_links_found = re.findall(direct_link_pattern, html_content, re.IGNORECASE)
if direct_links_found:
logger_func(f" [Fap-Nation] Found {len(direct_links_found)} direct media link(s). Selecting the best quality...")
# Default to the first link; prefer the first one whose URL contains "1080p".
best_link = direct_links_found[0]
for link in direct_links_found:
if '1080p' in link.lower():
best_link = link
break
final_url = best_link
link_type = 'direct'
logger_func(f" [Fap-Nation] Identified direct media link: {final_url}")
# If after all checks, we still have no URL, then fail.
if not final_url:
logger_func(" [Fap-Nation] ❌ Stage 1 Failed: Could not find any HLS stream or direct link.")
return None, []
# --- HLS Quality Selection Logic ---
if link_type == 'hls' and final_url:
logger_func(" [Fap-Nation] HLS stream found. Checking for higher quality variants...")
try:
master_playlist_response = scraper.get(final_url, timeout=20)
master_playlist_response.raise_for_status()
playlist_content = master_playlist_response.text
# Each match is (width, height, uri): the RESOLUTION of a variant and the line following it.
streams = re.findall(r'#EXT-X-STREAM-INF:.*?RESOLUTION=(\d+)x(\d+).*?\n(.*?)\s', playlist_content)
if streams:
# Picks the variant with the largest pixel area (width * height).
best_stream = max(streams, key=lambda s: int(s[0]) * int(s[1]))
height = best_stream[1]
relative_path = best_stream[2]
new_final_url = urljoin(final_url, relative_path)
logger_func(f" [Fap-Nation] ✅ Best quality found: {height}p. Updating URL to: {new_final_url}")
final_url = new_final_url
else:
logger_func(" [Fap-Nation] No alternate quality streams found in playlist. Using original.")
except Exception as e:
logger_func(f" [Fap-Nation] ⚠️ Could not parse HLS master playlist for quality selection: {e}. Using original URL.")
if final_url and link_type:
# The output name always ends in ".mp4", whichever link type or source extension was found.
if filename_from_video_tag:
base_name, _ = os.path.splitext(filename_from_video_tag)
new_filename = f"{base_name}.mp4"
else:
new_filename = f"{album_slug}.mp4"
files_to_download.append({'url': final_url, 'filename': new_filename, 'type': link_type})
logger_func(f" [Fap-Nation] ✅ Ready to download '{new_filename}' ({link_type} method).")
return album_title, files_to_download
logger_func(f" [Fap-Nation] ❌ Could not determine a valid download link.")
return None, []
except Exception as e:
logger_func(f" [Fap-Nation] ❌ Error fetching Fap-Nation data: {e}")
return None, []

189
src/core/mangadex_client.py Normal file
View File

@@ -0,0 +1,189 @@
# src/core/mangadex_client.py
import os
import re
import time
import cloudscraper
from collections import defaultdict
from ..utils.file_utils import clean_folder_name
def fetch_mangadex_data(start_url, output_dir, logger_func, file_progress_callback, overall_progress_callback, pause_event, cancellation_event):
    """
    Fetches and downloads all content from a MangaDex series or chapter URL.

    Args:
        start_url: A mangadex.org URL containing /title/<id>, /manga/<id> or /chapter/<id>.
        output_dir: Directory under which '<series>/<chapter>/' folders are created.
        logger_func: Callable that receives each log message as a string.
        file_progress_callback: Optional object exposing emit(filename, (done_bytes, total_bytes)).
        overall_progress_callback: Optional object exposing emit(total_chapters, chapters_done).
        pause_event: Optional object exposing is_set(); downloading blocks while it is set.
        cancellation_event: Optional object exposing is_set(); downloading stops once it is set.

    Returns a tuple of (downloaded_count, skipped_count). Pages that already
    exist on disk and pages that failed to download both count as skipped.
    """
    grand_total_dl = 0
    grand_total_skip = 0
    api = _MangadexAPI(logger_func)

    def _is_cancelled():
        # Either event may be None, so every access has to be guarded.
        return bool(cancellation_event and cancellation_event.is_set())

    def _check_pause():
        """Block while paused; return True once the download has been cancelled."""
        if _is_cancelled():
            return True
        if pause_event and pause_event.is_set():
            logger_func(" Download paused...")
            while pause_event.is_set():
                if _is_cancelled():
                    return True
                time.sleep(0.5)
            logger_func(" Download resumed.")
        return _is_cancelled()

    def _discard_partial(path):
        """Best-effort removal of an incomplete page file."""
        try:
            if os.path.exists(path):
                os.remove(path)
        except OSError as cleanup_error:
            logger_func(f" ⚠️ Could not remove incomplete file '{os.path.basename(path)}': {cleanup_error}")

    series_match = re.search(r"mangadex\.org/(?:title|manga)/([0-9a-f-]+)", start_url)
    chapter_match = re.search(r"mangadex\.org/chapter/([0-9a-f-]+)", start_url)
    chapters_to_process = []
    if series_match:
        series_id = series_match.group(1)
        logger_func(f" Series detected. Fetching chapter list for ID: {series_id}")
        chapters_to_process = api.get_manga_chapters(series_id, cancellation_event, pause_event)
    elif chapter_match:
        chapter_id = chapter_match.group(1)
        logger_func(f" Single chapter detected. Fetching info for ID: {chapter_id}")
        chapter_info = api.get_chapter_info(chapter_id)
        if chapter_info:
            chapters_to_process = [chapter_info]

    if not chapters_to_process:
        logger_func("❌ No chapters found or failed to fetch chapter info.")
        return 0, 0

    total_chapters = len(chapters_to_process)
    logger_func(f"✅ Found {total_chapters} chapter(s) to download.")
    if overall_progress_callback:
        overall_progress_callback.emit(total_chapters, 0)

    for chap_idx, chapter_json in enumerate(chapters_to_process):
        if _check_pause():
            break
        try:
            metadata = api.transform_chapter_data(chapter_json)
            logger_func("-" * 40)
            logger_func(f"Processing Chapter {chap_idx + 1}/{total_chapters}: Vol. {metadata['volume']} Ch. {metadata['chapter']}{metadata['chapter_minor']} - {metadata['title']}")
            server_info = api.get_at_home_server(chapter_json["id"])
            if not server_info:
                logger_func(" ❌ Could not get image server for this chapter. Skipping.")
                continue

            base_url = f"{server_info['baseUrl']}/data/{server_info['chapter']['hash']}/"
            image_files = server_info['chapter']['data']
            total_pages = len(image_files)

            series_folder = clean_folder_name(metadata['manga'])
            chapter_folder_title = metadata['title'] or ''
            chapter_folder = clean_folder_name(f"Vol {metadata['volume']:02d} Chap {metadata['chapter']:03d}{metadata['chapter_minor']} - {chapter_folder_title}".strip().strip('-').strip())
            final_save_path = os.path.join(output_dir, series_folder, chapter_folder)
            os.makedirs(final_save_path, exist_ok=True)

            for img_idx, filename in enumerate(image_files):
                if _check_pause():
                    break
                full_img_url = base_url + filename
                # Pages are renamed to a zero-padded index, keeping the server's extension.
                img_name = f"{img_idx + 1:03d}{os.path.splitext(filename)[1]}"
                img_path = os.path.join(final_save_path, img_name)
                if os.path.exists(img_path):
                    logger_func(f" -> Skip ({img_idx+1}/{total_pages}): '{img_name}' already exists.")
                    grand_total_skip += 1
                    continue

                logger_func(f" Downloading ({img_idx+1}/{total_pages}): '{img_name}'...")
                try:
                    response = api.session.get(full_img_url, stream=True, timeout=60, headers={'Referer': 'https://mangadex.org/'})
                    try:
                        response.raise_for_status()
                        total_size = int(response.headers.get('content-length', 0))
                        if file_progress_callback:
                            file_progress_callback.emit(img_name, (0, total_size))
                        with open(img_path, 'wb') as f:
                            downloaded_bytes = 0
                            for chunk in response.iter_content(chunk_size=8192):
                                if _check_pause():
                                    break
                                f.write(chunk)
                                downloaded_bytes += len(chunk)
                                if file_progress_callback:
                                    file_progress_callback.emit(img_name, (downloaded_bytes, total_size))
                    finally:
                        # Streamed responses hold their connection until closed.
                        response.close()
                    if _check_pause():
                        # Cancelled: the file on disk may be truncated, so drop it.
                        _discard_partial(img_path)
                        break
                    grand_total_dl += 1
                except Exception as e:
                    # A truncated file would be mistaken for a finished page on the
                    # next run (the "already exists" check above), so remove it.
                    _discard_partial(img_path)
                    logger_func(f" ❌ Failed to download page {img_idx+1}: {e}")
                    grand_total_skip += 1

            if overall_progress_callback:
                overall_progress_callback.emit(total_chapters, chap_idx + 1)
            time.sleep(1)
        except Exception as e:
            logger_func(f" ❌ An unexpected error occurred while processing chapter {chapter_json.get('id')}: {e}")

    return grand_total_dl, grand_total_skip
class _MangadexAPI:
    """Small client for the MangaDex REST API endpoints used by the downloader."""

    # Upper bound on retries after HTTP 429 so a persistent limit cannot loop forever.
    _MAX_RATE_LIMIT_RETRIES = 5
    # Wait (seconds) used when the rate-limit header is missing or unparsable.
    _DEFAULT_RETRY_WAIT = 5
    # Longest single wait (seconds) honoured for a rate-limit response.
    _MAX_RETRY_WAIT = 120
    # Header values at or above this are treated as UNIX timestamps, not second counts.
    _EPOCH_THRESHOLD = 10 ** 9

    def __init__(self, logger_func):
        """Create the HTTP session; logger_func receives each log message as a string."""
        self.logger_func = logger_func
        self.session = cloudscraper.create_scraper()
        self.root = "https://api.mangadex.org"

    @staticmethod
    def _is_set(event):
        """Return True when an optional event object exists and is set."""
        return bool(event and event.is_set())

    @staticmethod
    def _to_int(value, default=0):
        """Parse a numeric string such as '3' or '3.0' to int; fall back to default."""
        try:
            return int(float(value or default))
        except (TypeError, ValueError, OverflowError):
            return default

    @classmethod
    def _retry_delay(cls, header_value, now=None):
        """
        Convert an X-RateLimit-Retry-After header value into seconds to wait.

        The value is accepted either as a UNIX timestamp (the form MangaDex is
        understood to send) or as a plain number of seconds. The result is
        clamped to the range 1.._MAX_RETRY_WAIT.
        """
        try:
            value = int(float(header_value))
        except (TypeError, ValueError, OverflowError):
            return cls._DEFAULT_RETRY_WAIT
        if value >= cls._EPOCH_THRESHOLD:
            current = time.time() if now is None else now
            value -= int(current)
        return max(1, min(value, cls._MAX_RETRY_WAIT))

    def _sleep_unless_cancelled(self, seconds, cancellation_event):
        """Sleep in short slices; return False if cancellation was requested meanwhile."""
        deadline = time.monotonic() + seconds
        while not self._is_set(cancellation_event):
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return True
            time.sleep(min(0.5, remaining))
        return False

    def _call(self, endpoint, params=None, cancellation_event=None):
        """
        GET an API endpoint and return the decoded JSON.

        Returns None when cancelled, when the request fails, or when the server
        keeps answering 429 after _MAX_RATE_LIMIT_RETRIES retries. Failures are
        logged rather than raised.
        """
        for _ in range(self._MAX_RATE_LIMIT_RETRIES + 1):
            if self._is_set(cancellation_event):
                return None
            try:
                response = self.session.get(f"{self.root}{endpoint}", params=params, timeout=30)
                if response.status_code == 429:
                    retry_after = self._retry_delay(response.headers.get("X-RateLimit-Retry-After"))
                    self.logger_func(f" ⚠️ Rate limited. Waiting for {retry_after} seconds...")
                    if not self._sleep_unless_cancelled(retry_after, cancellation_event):
                        return None
                    continue
                response.raise_for_status()
                return response.json()
            except Exception as e:
                self.logger_func(f" ❌ API call to '{endpoint}' failed: {e}")
                return None
        self.logger_func(f" ❌ API call to '{endpoint}' failed: still rate limited after {self._MAX_RATE_LIMIT_RETRIES} retries")
        return None

    def get_manga_chapters(self, series_id, cancellation_event, pause_event):
        """
        Return the English chapter entries of a series feed, fetched page by page.

        Either event may be None. Collection stops early (returning what was
        gathered so far) on cancellation or on the first failed page.
        """
        all_chapters = []
        offset = 0
        limit = 500
        base_params = {
            "limit": limit, "order[volume]": "asc", "order[chapter]": "asc",
            "translatedLanguage[]": ["en"], "includes[]": ["scanlation_group", "user", "manga"]
        }
        while True:
            # Wait out a pause, but keep honouring cancellation while waiting.
            while self._is_set(pause_event) and not self._is_set(cancellation_event):
                time.sleep(0.5)
            if self._is_set(cancellation_event):
                break
            params = {**base_params, "offset": offset}
            data = self._call(f"/manga/{series_id}/feed", params, cancellation_event)
            if not data or data.get("result") != "ok":
                break
            all_chapters.extend(data.get("data", []))
            if (offset + limit) >= data.get("total", 0):
                break
            offset += limit
        return all_chapters

    def get_chapter_info(self, chapter_id):
        """Return the 'data' object for one chapter, or None if the call did not succeed."""
        params = {"includes[]": ["scanlation_group", "user", "manga"]}
        data = self._call(f"/chapter/{chapter_id}", params)
        return data.get("data") if data and data.get("result") == "ok" else None

    def get_at_home_server(self, chapter_id):
        """Return the raw /at-home/server response for a chapter, or None on failure."""
        return self._call(f"/at-home/server/{chapter_id}")

    def transform_chapter_data(self, chapter):
        """
        Flatten a chapter API object into the fields used for naming.

        Returns a dict with 'manga' (str), 'title' (str, possibly empty),
        'volume' (int), 'chapter' (int) and 'chapter_minor' (str such as '.5',
        or ''). Missing or non-numeric volume/chapter values become 0.
        """
        relationships = {item["type"]: item for item in chapter.get("relationships", [])}
        manga = relationships.get("manga", {})
        c_attrs = chapter.get("attributes", {})
        m_attrs = manga.get("attributes", {})
        titles = m_attrs.get("title", {})

        chapter_num_str = c_attrs.get("chapter", "0") or "0"
        chnum, sep, minor = chapter_num_str.partition(".")
        return {
            # Prefer the English title, otherwise the first title listed.
            "manga": titles.get("en") or next(iter(titles.values()), "Unknown Series"),
            "title": c_attrs.get("title") or "",
            "volume": self._to_int(c_attrs.get("volume", 0)),
            "chapter": self._to_int(chnum),
            "chapter_minor": sep + minor if minor else ""
        }

View File

@@ -0,0 +1,93 @@
import os
import re
import cloudscraper
from ..utils.file_utils import clean_folder_name
# --- ADDED IMPORTS ---
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def fetch_pixeldrain_data(url: str, logger):
    """
    Scrapes a given Pixeldrain URL to extract album or file information.
    Handles single files (/u/), albums/lists (/l/), and folders (/d/).

    Args:
        url: The Pixeldrain URL to inspect.
        logger: Callable that receives each log message as a string.

    Returns:
        A tuple (title, files) where files is a list of dicts with 'url' and
        'filename' keys. Returns (None, []) when the URL type is not
        recognised or when any request or parsing step fails.
    """
    logger(f"Fetching data for Pixeldrain URL: {url}")
    scraper = cloudscraper.create_scraper()
    root = "https://pixeldrain.com"
    # Without a timeout a stalled connection would block the caller indefinitely.
    request_timeout = 30

    # Retry transient failures (rate limiting and 5xx responses) with exponential backoff.
    # NOTE(review): mounting a plain HTTPAdapter presumably replaces whatever adapter
    # cloudscraper installed for these prefixes - confirm that is acceptable.
    try:
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        scraper.mount("https://", adapter)
        scraper.mount("http://", adapter)
        logger(" [Pixeldrain] Configured retry strategy for network requests.")
    except Exception as e:
        logger(f" [Pixeldrain] ⚠️ Could not configure retry strategy: {e}")

    def _get_json(api_url):
        """GET an API URL and decode its JSON body, raising on HTTP error statuses."""
        response = scraper.get(api_url, timeout=request_timeout)
        # An error status must not be turned into download jobs built from an error body.
        response.raise_for_status()
        return response.json()

    file_match = re.search(r"/u/(\w+)", url)
    album_match = re.search(r"/l/(\w+)", url)
    folder_match = re.search(r"/d/([^?]+)", url)
    try:
        if file_match:
            file_id = file_match.group(1)
            logger(f" Detected Pixeldrain File ID: {file_id}")
            data = _get_json(f"{root}/api/file/{file_id}/info")
            title = data.get("name", file_id)
            files = [{
                'url': f"{root}/api/file/{file_id}?download",
                'filename': data.get("name", f"{file_id}.tmp")
            }]
            return title, files
        elif album_match:
            album_id = album_match.group(1)
            logger(f" Detected Pixeldrain Album ID: {album_id}")
            data = _get_json(f"{root}/api/list/{album_id}")
            title = data.get("title", album_id)
            files = [
                {
                    'url': f"{root}/api/file/{file_info['id']}?download",
                    'filename': file_info.get("name", f"{file_info['id']}.tmp")
                }
                for file_info in data.get("files", [])
            ]
            return title, files
        elif folder_match:
            path_id = folder_match.group(1)
            logger(f" Detected Pixeldrain Folder Path: {path_id}")
            data = _get_json(f"{root}/api/filesystem/{path_id}?stat")
            path_info = data["path"][data["base_index"]]
            title = path_info.get("name", path_id)
            files = [
                {
                    'url': f"{root}/api/filesystem{child['path']}?attach",
                    # Fall back to the last path segment so the filename is never None.
                    'filename': child.get("name") or child['path'].rstrip('/').rsplit('/', 1)[-1]
                }
                for child in data.get("children", [])
                if child.get("type") == "file"
            ]
            return title, files
        else:
            logger(" ❌ Could not identify Pixeldrain URL type (file, album, or folder).")
            return None, []
    except Exception as e:
        logger(f"❌ An error occurred while fetching Pixeldrain data: {e}")
        return None, []

100
src/core/simpcity_client.py Normal file
View File

@@ -0,0 +1,100 @@
# src/core/simpcity_client.py
import cloudscraper
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
import os
import re
from ..utils.file_utils import clean_folder_name
import urllib.parse
def fetch_single_simpcity_page(url, logger_func, cookies=None, post_id=None):
    """
    Scrapes a single SimpCity page for images, external links, video tags, and iframes.

    Args:
        url: Page URL to fetch.
        logger_func: Callable that receives each log message as a string.
        cookies: Optional cookies passed through to the HTTP request.
        post_id: Optional post ID; when its content container is found, the
            search is limited to that container instead of the whole page.

    Returns:
        A tuple (album_title, jobs). album_title is the page heading text or
        None; jobs is a list of dicts with 'type' and 'url' keys (image jobs
        also carry 'filename'), de-duplicated by URL. A 404 response yields
        (None, []).

    Raises:
        Any exception from the request or parsing is logged and re-raised.
    """
    scraper = cloudscraper.create_scraper()
    headers = {'Referer': 'https://simpcity.cr/'}
    # Checked in order; the first pattern that matches a link decides its job type.
    link_patterns = (
        ('pixeldrain', r'pixeldrain\.com/[lud]/'),
        ('saint2', r'saint2\.(su|pk|cr|to)/embed/'),
        ('bunkr', r'bunkr\.(?:cr|si|la|ws|is|ru|su|red|black|media|site|to|ac|ci|fi|pk|ps|sk|ph)|bunkrr\.ru'),
        ('mega', r'mega\.(nz|io)'),
        ('gofile', r'gofile\.io'),
    )
    try:
        response = scraper.get(url, timeout=30, headers=headers, cookies=cookies)
        if response.status_code == 404:
            return None, []
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        album_title = None
        title_element = soup.find('h1', class_='p-title-value')
        if title_element:
            album_title = title_element.text.strip()

        search_scope = soup
        if post_id:
            post_content_container = soup.find('div', attrs={'data-lb-id': f'post-{post_id}'})
            if post_content_container:
                logger_func(f" [SimpCity] ✅ Isolating search to post content container for ID {post_id}.")
                search_scope = post_content_container
            else:
                logger_func(f" [SimpCity] ⚠️ Could not find content container for post ID {post_id}.")

        jobs_on_page = []

        # Native SimpCity images: the '.md.' marker is dropped to get the full-size URL.
        for img_tag in search_scope.find_all('img', class_='bbImage'):
            thumbnail_url = img_tag.get('src')
            if not thumbnail_url or not isinstance(thumbnail_url, str) or 'saint2.su' in thumbnail_url:
                continue
            full_url = thumbnail_url.replace('.md.', '.')
            filename = img_tag.get('alt', '').replace('.md.', '.') or os.path.basename(unquote(urlparse(full_url).path))
            jobs_on_page.append({'type': 'image', 'filename': filename, 'url': full_url})

        # Links in <a> tags, unwrapping the forum's '/misc/goto?url=' redirect first.
        for link in search_scope.find_all('a', href=True):
            href = link.get('href', '')
            actual_url = href
            if '/misc/goto?url=' in href:
                try:
                    query_params = dict(urllib.parse.parse_qsl(urlparse(href).query))
                    if 'url' in query_params:
                        # parse_qsl already percent-decodes values; decoding a second
                        # time would corrupt targets that contain literal '%xx'.
                        actual_url = query_params['url']
                except ValueError:
                    actual_url = href  # Fall back to the raw href if it cannot be parsed
            for job_type, pattern in link_patterns:
                if re.search(pattern, actual_url):
                    jobs_on_page.append({'type': job_type, 'url': actual_url})
                    break

        # Direct Saint2 video embeds in <video> tags
        for video in search_scope.find_all('video'):
            source_tag = video.find('source')
            if source_tag and source_tag.get('src'):
                src_url = source_tag['src']
                if re.search(r'saint2\.(su|pk|cr|to)', src_url):
                    jobs_on_page.append({'type': 'saint2_direct', 'url': src_url})

        # Embeds in <iframe> tags (as a fallback)
        for iframe in search_scope.find_all('iframe'):
            src_url = iframe.get('src')
            if src_url and isinstance(src_url, str):
                if re.search(r'saint2\.(su|pk|cr|to)/embed/', src_url):
                    jobs_on_page.append({'type': 'saint2', 'url': src_url})

        if jobs_on_page:
            # Keyed by URL so the same target found in several ways is kept once;
            # the entry found last wins, in the position of the first occurrence.
            unique_jobs = list({job['url']: job for job in jobs_on_page}.values())
            logger_func(f" [SimpCity] Scraper found jobs: {[job['type'] for job in unique_jobs]}")
            return album_title, unique_jobs
        return album_title, []
    except Exception as e:
        logger_func(f" [SimpCity] ❌ Error fetching page {url}: {e}")
        raise

View File

@@ -0,0 +1,73 @@
import cloudscraper
from bs4 import BeautifulSoup
import time
def get_chapter_list(series_url, logger_func):
    """
    Collect the chapter links from a Toonily series page.

    Returns the 'href' of every 'li.wp-manga-chapter > a' element in the
    reverse of the order they appear on the page (presumably so the oldest
    chapter comes first - verify against the site's listing order). Returns
    an empty list when no links are found or when any error occurs.
    """
    logger_func(f" [Toonily] Scraping series page for chapter list: {series_url}")
    session = cloudscraper.create_scraper()
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': 'https://toonily.com/'
    }
    try:
        page = session.get(series_url, timeout=30, headers=request_headers)
        page.raise_for_status()
        anchors = BeautifulSoup(page.content, 'html.parser').select('li.wp-manga-chapter > a')
        if anchors:
            chapter_urls = [anchor['href'] for anchor in reversed(anchors)]
            logger_func(f" [Toonily] Found {len(chapter_urls)} chapters.")
            return chapter_urls
        logger_func(" [Toonily] ❌ Could not find any chapter links on the page.")
        return []
    except Exception as exc:
        logger_func(f" [Toonily] ❌ Error getting chapter list: {exc}")
        return []
def fetch_chapter_data(chapter_url, logger_func, scraper_session):
    """
    Scrapes a single Toonily.com chapter page for its title and image URLs.

    Args:
        chapter_url: URL of the chapter page.
        logger_func: Callable that receives each log message as a string.
        scraper_session: Session object whose get() is used for the request.

    Returns:
        A tuple (series_title, chapter_title, image_urls). series_title is the
        heading text before " - Chapter" (or the whole heading when that
        marker is absent); chapter_title is the whole heading. Returns
        (None, None, []) when the page lacks the expected elements or an
        error occurs.
    """
    # The Referer is the chapter URL minus its last path segment; stripping the
    # trailing slash first keeps this correct whether or not the URL ends in '/'.
    main_series_url = chapter_url.rstrip('/').rsplit('/', 1)[0] + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': main_series_url
    }
    try:
        response = scraper_session.get(chapter_url, timeout=30, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_element = soup.select_one('h1#chapter-heading')
        image_container = soup.select_one('div.reading-content')
        if not title_element or not image_container:
            logger_func(" [Toonily] ❌ Page structure invalid. Could not find title or image container.")
            return None, None, []

        full_chapter_title = title_element.text.strip()
        if " - Chapter" in full_chapter_title:
            series_title = full_chapter_title.split(" - Chapter")[0].strip()
        else:
            series_title = full_chapter_title.strip()
        chapter_title = full_chapter_title  # The full string is best for the chapter folder name

        image_urls = []
        for img in image_container.select('img'):
            # Prefer 'data-src' but fall back to 'src' when it is missing OR empty,
            # and never emit a blank URL.
            candidate = (img.get('data-src') or img.get('src') or '').strip()
            if candidate:
                image_urls.append(candidate)
        return series_title, chapter_title, image_urls
    except Exception as e:
        logger_func(f" [Toonily] ❌ An error occurred scraping chapter '{chapter_url}': {e}")
        return None, None, []

File diff suppressed because it is too large Load Diff