diff --git a/LinkMaker/hentai2read.py b/LinkMaker/hentai2read.py new file mode 100644 index 0000000..e69de29 diff --git a/assets/Ko-fi.png b/assets/Ko-fi.png new file mode 100644 index 0000000..d2172ad Binary files /dev/null and b/assets/Ko-fi.png differ diff --git a/assets/buymeacoffee.png b/assets/buymeacoffee.png new file mode 100644 index 0000000..ce9988b Binary files /dev/null and b/assets/buymeacoffee.png differ diff --git a/assets/patreon.png b/assets/patreon.png new file mode 100644 index 0000000..2418803 Binary files /dev/null and b/assets/patreon.png differ diff --git a/src/config/constants.py b/src/config/constants.py index 1d49752..30aebf7 100644 --- a/src/config/constants.py +++ b/src/config/constants.py @@ -47,6 +47,8 @@ MAX_PARTS_FOR_MULTIPART_DOWNLOAD = 15 # --- UI and Settings Keys (for QSettings) --- TOUR_SHOWN_KEY = "neverShowTourAgainV19" MANGA_FILENAME_STYLE_KEY = "mangaFilenameStyleV1" +MANGA_CUSTOM_FORMAT_KEY = "mangaCustomFormatV1" +MANGA_CUSTOM_DATE_FORMAT_KEY = "mangaCustomDateFormatV1" SKIP_WORDS_SCOPE_KEY = "skipWordsScopeV1" ALLOW_MULTIPART_DOWNLOAD_KEY = "allowMultipartDownloadV1" USE_COOKIE_KEY = "useCookieV1" @@ -59,6 +61,8 @@ DOWNLOAD_LOCATION_KEY = "downloadLocationV1" RESOLUTION_KEY = "window_resolution" UI_SCALE_KEY = "ui_scale_factor" SAVE_CREATOR_JSON_KEY = "saveCreatorJsonProfile" +DATE_PREFIX_FORMAT_KEY = "datePrefixFormatV1" +AUTO_RETRY_ON_FINISH_KEY = "auto_retry_on_finish" FETCH_FIRST_KEY = "fetchAllPostsFirst" DISCORD_TOKEN_KEY = "discord/token" @@ -84,7 +88,7 @@ VIDEO_EXTENSIONS = { '.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob' } ARCHIVE_EXTENSIONS = { - '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2' + '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.bin' } AUDIO_EXTENSIONS = { '.mp3', '.wav', '.aac', '.flac', '.ogg', '.wma', '.m4a', '.opus', @@ -122,4 +126,5 @@ CREATOR_DOWNLOAD_DEFAULT_FOLDER_IGNORE_WORDS = { # --- Duplicate Handling Modes --- DUPLICATE_HANDLING_HASH = "hash" -DUPLICATE_HANDLING_KEEP_ALL = "keep_all" \ No newline at end of file +DUPLICATE_HANDLING_KEEP_ALL = "keep_all" +STYLE_CUSTOM = "custom" \ No newline at end of file diff --git a/src/core/Hentai2read_client.py b/src/core/Hentai2read_client.py index 38bbac5..a144276 100644 --- a/src/core/Hentai2read_client.py +++ b/src/core/Hentai2read_client.py @@ -2,71 +2,206 @@ import re import os -import json -import requests +import time import cloudscraper from bs4 import BeautifulSoup +from urllib.parse import urljoin +from concurrent.futures import ThreadPoolExecutor +import queue -def fetch_hentai2read_data(url, logger, session): +def run_hentai2read_download(start_url, output_dir, progress_callback, overall_progress_callback, check_pause_func): """ - Scrapes a SINGLE Hentai2Read chapter page using a provided session. + Orchestrates the download process using a producer-consumer model. + The main thread scrapes image URLs and puts them in a queue. + A pool of worker threads consumes from the queue to download images concurrently. """ - logger(f"Attempting to fetch chapter data from: {url}") + scraper = cloudscraper.create_scraper() try: - response = session.get(url, timeout=30) - response.raise_for_status() + progress_callback(" [Hentai2Read] Scraping series page for all metadata...") + top_level_folder_name, chapters_to_process = _get_series_metadata(start_url, progress_callback, scraper) + + if not chapters_to_process: + progress_callback("❌ No chapters found to download. Aborting.") + return 0, 0 - page_content_text = response.text - soup = BeautifulSoup(page_content_text, 'html.parser') - - album_title = "" - title_tags = soup.select('span[itemprop="name"]') - if title_tags: - album_title = title_tags[-1].text.strip() + total_chapters = len(chapters_to_process) + overall_progress_callback(total_chapters, 0) - if not album_title: - title_tag = soup.select_one('h1.title') - if title_tag: - album_title = title_tag.text.strip() + total_downloaded_count = 0 + total_skipped_count = 0 - if not album_title: - logger("❌ Could not find album title on page.") - return None, None + for idx, chapter in enumerate(chapters_to_process): + if check_pause_func(): break + + progress_callback(f"\n-- Processing and Downloading Chapter {idx + 1}/{total_chapters}: '{chapter['title']}' --") + + series_folder = re.sub(r'[\\/*?:"<>|]', "", top_level_folder_name).strip() + chapter_folder = re.sub(r'[\\/*?:"<>|]', "", chapter['title']).strip() + final_save_path = os.path.join(output_dir, series_folder, chapter_folder) + os.makedirs(final_save_path, exist_ok=True) + + # This function now scrapes and downloads simultaneously + dl_count, skip_count = _process_and_download_chapter( + chapter_url=chapter['url'], + save_path=final_save_path, + scraper=scraper, + progress_callback=progress_callback, + check_pause_func=check_pause_func + ) + + total_downloaded_count += dl_count + total_skipped_count += skip_count + + overall_progress_callback(total_chapters, idx + 1) + if check_pause_func(): break - image_urls = [] - try: - start_index = page_content_text.index("'images' : ") + len("'images' : ") - end_index = page_content_text.index(",\n", start_index) - images_json_str = page_content_text[start_index:end_index] - image_paths = json.loads(images_json_str) - image_urls = ["https://hentaicdn.com/hentai" + part for part in image_paths] - except (ValueError, json.JSONDecodeError): - logger("❌ Could not find or parse image JSON data for this chapter.") - return None, None + return total_downloaded_count, total_skipped_count - if not image_urls: - logger("❌ No image URLs found for this chapter.") - return None, None - - logger(f" Found {len(image_urls)} images for album '{album_title}'.") - - files_to_download = [] - for i, img_url in enumerate(image_urls): - page_num = i + 1 - extension = os.path.splitext(img_url)[1].split('?')[0] - if not extension: extension = ".jpg" - filename = f"{page_num:03d}{extension}" - files_to_download.append({'url': img_url, 'filename': filename}) - - return album_title, files_to_download - - except requests.exceptions.HTTPError as e: - if e.response.status_code == 404: - logger(f" Chapter not found (404 Error). This likely marks the end of the series.") - else: - logger(f"❌ An HTTP error occurred: {e}") - return None, None except Exception as e: - logger(f"❌ An unexpected error occurred while fetching data: {e}") - return None, None + progress_callback(f"❌ A critical error occurred in the Hentai2Read client: {e}") + return 0, 0 + +def _get_series_metadata(start_url, progress_callback, scraper): + """ + Scrapes the main series page to get the Artist Name, Series Title, and chapter list. + """ + try: + response = scraper.get(start_url, timeout=30) + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + + series_title = "Unknown Series" + artist_name = None + metadata_list = soup.select_one("ul.list.list-simple-mini") + + if metadata_list: + first_li = metadata_list.find('li', recursive=False) + if first_li and not first_li.find('a'): + series_title = first_li.get_text(strip=True) + + for b_tag in metadata_list.find_all('b'): + label = b_tag.get_text(strip=True) + if label in ("Artist", "Author"): + a_tag = b_tag.find_next_sibling('a') + if a_tag: + artist_name = a_tag.get_text(strip=True) + if label == "Artist": + break + + top_level_folder_name = artist_name if artist_name else series_title + + chapter_links = soup.select("div.media a.pull-left.font-w600") + if not chapter_links: + chapters_to_process = [{'url': start_url, 'title': series_title}] + else: + chapters_to_process = [ + {'url': urljoin(start_url, link['href']), 'title': " ".join(link.stripped_strings)} + for link in chapter_links + ] + chapters_to_process.reverse() + + progress_callback(f" [Hentai2Read] ✅ Found Artist/Series: '{top_level_folder_name}'") + progress_callback(f" [Hentai2Read] ✅ Found {len(chapters_to_process)} chapters to process.") + + return top_level_folder_name, chapters_to_process + + except Exception as e: + progress_callback(f" [Hentai2Read] ❌ Error getting series metadata: {e}") + return "Unknown Series", [] + +### NEW: This function contains the pipeline logic ### +def _process_and_download_chapter(chapter_url, save_path, scraper, progress_callback, check_pause_func): + """ + Uses a producer-consumer pattern to download a chapter. + The main thread (producer) scrapes URLs one by one. + Worker threads (consumers) download the URLs as they are found. + """ + task_queue = queue.Queue() + num_download_threads = 8 + + # These will be updated by the worker threads + download_stats = {'downloaded': 0, 'skipped': 0} + + def downloader_worker(): + """The function that each download thread will run.""" + # Create a unique session for each thread to avoid conflicts + worker_scraper = cloudscraper.create_scraper() + while True: + try: + # Get a task from the queue + task = task_queue.get() + # The sentinel value to signal the end + if task is None: + break + + filepath, img_url = task + if os.path.exists(filepath): + progress_callback(f" -> Skip: '{os.path.basename(filepath)}'") + download_stats['skipped'] += 1 + else: + progress_callback(f" Downloading: '{os.path.basename(filepath)}'...") + response = worker_scraper.get(img_url, stream=True, timeout=60, headers={'Referer': chapter_url}) + response.raise_for_status() + with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + download_stats['downloaded'] += 1 + except Exception as e: + progress_callback(f" ❌ Download failed for task. Error: {e}") + download_stats['skipped'] += 1 + finally: + task_queue.task_done() + + # --- Start the downloader threads --- + executor = ThreadPoolExecutor(max_workers=num_download_threads, thread_name_prefix='H2R_Downloader') + for _ in range(num_download_threads): + executor.submit(downloader_worker) + + # --- Main thread acts as the scraper (producer) --- + page_number = 1 + while True: + if check_pause_func(): break + if page_number > 300: # Safety break + progress_callback(" [Hentai2Read] ⚠️ Safety break: Reached 300 pages.") + break + + page_url_to_check = f"{chapter_url}{page_number}/" + try: + response = scraper.get(page_url_to_check, timeout=30) + if response.history or response.status_code != 200: + progress_callback(f" [Hentai2Read] End of chapter detected on page {page_number}.") + break + + soup = BeautifulSoup(response.text, 'html.parser') + img_tag = soup.select_one("img#arf-reader") + img_src = img_tag.get("src") if img_tag else None + + if not img_tag or img_src == "https://static.hentai.direct/hentai": + progress_callback(f" [Hentai2Read] End of chapter detected (Placeholder image on page {page_number}).") + break + + normalized_img_src = urljoin(response.url, img_src) + ext = os.path.splitext(normalized_img_src.split('/')[-1])[-1] or ".jpg" + filename = f"{page_number:03d}{ext}" + filepath = os.path.join(save_path, filename) + + # Put the download task into the queue for a worker to pick up + task_queue.put((filepath, normalized_img_src)) + + page_number += 1 + time.sleep(0.1) # Small delay between scraping pages + except Exception as e: + progress_callback(f" [Hentai2Read] ❌ Error while scraping page {page_number}: {e}") + break + + # --- Shutdown sequence --- + # Tell all worker threads to exit by sending the sentinel value + for _ in range(num_download_threads): + task_queue.put(None) + + # Wait for all download tasks to be completed + executor.shutdown(wait=True) + + progress_callback(f" Found and processed {page_number - 1} images for this chapter.") + return download_stats['downloaded'], download_stats['skipped'] \ No newline at end of file diff --git a/src/core/allcomic_client.py b/src/core/allcomic_client.py new file mode 100644 index 0000000..91ba1c4 --- /dev/null +++ b/src/core/allcomic_client.py @@ -0,0 +1,116 @@ +import requests +import re +from bs4 import BeautifulSoup +import cloudscraper +import time +from urllib.parse import urlparse + +def get_chapter_list(series_url, logger_func): + """ + Checks if a URL is a series page and returns a list of all chapter URLs if it is. + Includes a retry mechanism for robust connection. + """ + logger_func(f" [AllComic] Checking for chapter list at: {series_url}") + + scraper = cloudscraper.create_scraper() + response = None + max_retries = 8 + + for attempt in range(max_retries): + try: + response = scraper.get(series_url, timeout=30) + response.raise_for_status() + logger_func(f" [AllComic] Successfully connected to series page on attempt {attempt + 1}.") + break # Success, exit the loop + except requests.RequestException as e: + logger_func(f" [AllComic] ⚠️ Series page check attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt < max_retries - 1: + wait_time = 2 * (attempt + 1) + logger_func(f" Retrying in {wait_time} seconds...") + time.sleep(wait_time) + else: + logger_func(f" [AllComic] ❌ All attempts to check series page failed.") + return [] # Return empty on final failure + + if not response: + return [] + + try: + soup = BeautifulSoup(response.text, 'html.parser') + chapter_links = soup.select('li.wp-manga-chapter a') + + if not chapter_links: + logger_func(" [AllComic] ℹ️ No chapter list found. Assuming this is a single chapter page.") + return [] + + chapter_urls = [link['href'] for link in chapter_links] + chapter_urls.reverse() # Reverse for oldest-to-newest reading order + + logger_func(f" [AllComic] ✅ Found {len(chapter_urls)} chapters.") + return chapter_urls + + except Exception as e: + logger_func(f" [AllComic] ❌ Error parsing chapters after successful connection: {e}") + return [] + +def fetch_chapter_data(chapter_url, logger_func): + """ + Fetches the comic title, chapter title, and image URLs for a single chapter page. + """ + logger_func(f" [AllComic] Fetching page: {chapter_url}") + + scraper = cloudscraper.create_scraper( + browser={'browser': 'firefox', 'platform': 'windows', 'desktop': True} + ) + headers = {'Referer': 'https://allporncomic.com/'} + + response = None + max_retries = 8 + for attempt in range(max_retries): + try: + response = scraper.get(chapter_url, headers=headers, timeout=30) + response.raise_for_status() + break + except requests.RequestException as e: + if attempt < max_retries - 1: + time.sleep(2 * (attempt + 1)) + else: + logger_func(f" [AllComic] ❌ All connection attempts failed for chapter: {chapter_url}") + return None, None, None + + try: + soup = BeautifulSoup(response.text, 'html.parser') + title_element = soup.find('h1', class_='post-title') + comic_title = None + if title_element: + comic_title = title_element.text.strip() + else: + try: + path_parts = urlparse(chapter_url).path.strip('/').split('/') + if len(path_parts) >= 3 and path_parts[-3] == 'porncomic': + comic_slug = path_parts[-2] + comic_title = comic_slug.replace('-', ' ').title() + except Exception: + comic_title = "Unknown Comic" + + chapter_slug = chapter_url.strip('/').split('/')[-1] + chapter_title = chapter_slug.replace('-', ' ').title() + + reading_container = soup.find('div', class_='reading-content') + list_of_image_urls = [] + if reading_container: + image_elements = reading_container.find_all('img', class_='wp-manga-chapter-img') + for img in image_elements: + img_url = (img.get('data-src') or img.get('src', '')).strip() + if img_url: + list_of_image_urls.append(img_url) + + if not comic_title or comic_title == "Unknown Comic" or not list_of_image_urls: + logger_func(f" [AllComic] ❌ Could not find a valid title or images on the page. Title found: '{comic_title}'") + return None, None, None + + return comic_title, chapter_title, list_of_image_urls + + except Exception as e: + logger_func(f" [AllComic] ❌ An unexpected error occurred while parsing the page: {e}") + return None, None, None \ No newline at end of file diff --git a/src/core/api_client.py b/src/core/api_client.py index 138ad9d..0493d1f 100644 --- a/src/core/api_client.py +++ b/src/core/api_client.py @@ -33,7 +33,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev if cancellation_event and cancellation_event.is_set(): raise RuntimeError("Fetch operation cancelled by user during retry loop.") - log_message = f" Fetching post list: {paginated_url} (Page approx. {offset // 50 + 1})" + log_message = f" Fetching post list: {api_url_base} (Page approx. {offset // 50 + 1})" if attempt > 0: log_message += f" (Attempt {attempt + 1}/{max_retries})" logger(log_message) @@ -247,7 +247,7 @@ def download_from_api( break all_posts_for_manga_mode.extend(posts_batch_manga) - logger(f"MANGA_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}") + logger(f"RENAMING_MODE_FETCH_PROGRESS:{len(all_posts_for_manga_mode)}:{current_page_num_manga}") current_offset_manga += page_size time.sleep(0.6) @@ -265,7 +265,7 @@ def download_from_api( if cancellation_event and cancellation_event.is_set(): return if all_posts_for_manga_mode: - logger(f"MANGA_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}") + logger(f"RENAMING_MODE_FETCH_COMPLETE:{len(all_posts_for_manga_mode)}") if all_posts_for_manga_mode: if processed_post_ids: diff --git a/src/core/booru_client.py b/src/core/booru_client.py new file mode 100644 index 0000000..6ddcf13 --- /dev/null +++ b/src/core/booru_client.py @@ -0,0 +1,375 @@ +# src/core/booru_client.py + +import os +import re +import time +import datetime +import urllib.parse +import requests +import logging +import cloudscraper +# --- Start of Combined Code from 1.py --- + +# Part 1: Essential Utilities & Exceptions + +class BooruClientException(Exception): + """Base class for exceptions in this client.""" + pass + +class HttpError(BooruClientException): + """HTTP request during data extraction failed.""" + def __init__(self, message="", response=None): + self.response = response + self.status = response.status_code if response else 0 + if response and not message: + message = f"'{response.status_code} {response.reason}' for '{response.url}'" + super().__init__(message) + +class NotFoundError(BooruClientException): + pass + +def unquote(s): + return urllib.parse.unquote(s) + +def parse_datetime(date_string, fmt): + try: + # Assumes date_string is in a format that strptime can handle with timezone + return datetime.datetime.strptime(date_string, fmt) + except (ValueError, TypeError): + return None + +def nameext_from_url(url, data=None): + if data is None: data = {} + try: + path = urllib.parse.urlparse(url).path + filename = unquote(os.path.basename(path)) + if '.' in filename: + name, ext = filename.rsplit('.', 1) + data["filename"], data["extension"] = name, ext.lower() + else: + data["filename"], data["extension"] = filename, "" + except Exception: + data["filename"], data["extension"] = "", "" + return data + +USERAGENT_FIREFOX = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0" + +# Part 2: Core Extractor Logic + +class Extractor: + category = "" + subcategory = "" + directory_fmt = ("{category}", "{id}") + filename_fmt = "{filename}.{extension}" + _retries = 3 + _timeout = 30 + + def __init__(self, match, logger_func=print): + self.url = match.string + self.match = match + self.groups = match.groups() + self.session = cloudscraper.create_scraper() + self.session.headers["User-Agent"] = USERAGENT_FIREFOX + self.log = logger_func + self.api_key = None + self.user_id = None + + def set_auth(self, api_key, user_id): + self.api_key = api_key + self.user_id = user_id + self._init_auth() + + def _init_auth(self): + """Placeholder for extractor-specific auth setup.""" + pass + + def request(self, url, method="GET", fatal=True, **kwargs): + for attempt in range(self._retries + 1): + try: + response = self.session.request(method, url, timeout=self._timeout, **kwargs) + if response.status_code < 400: + return response + if response.status_code == 404 and fatal: + raise NotFoundError(f"Resource not found at {url}") + self.log(f"Request for {url} failed with status {response.status_code}. Retrying...") + except requests.exceptions.RequestException as e: + self.log(f"Request for {url} failed: {e}. Retrying...") + if attempt < self._retries: + time.sleep(2 ** attempt) + if fatal: + raise HttpError(f"Failed to retrieve {url} after {self._retries} retries.") + return None + + def request_json(self, url, **kwargs): + response = self.request(url, **kwargs) + try: + return response.json() + except (ValueError, TypeError) as exc: + self.log(f"Failed to decode JSON from {url}: {exc}") + raise BooruClientException("Invalid JSON response") + + def items(self): + data = self.metadata() + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + nameext_from_url(url, post) + post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z") + + if url.startswith("/"): + url = self.root + url + post['file_url'] = url # Ensure full URL + + post.update(data) + yield post + +class BaseExtractor(Extractor): + instances = () + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._init_category() + + def _init_category(self): + parsed_url = urllib.parse.urlparse(self.url) + self.root = f"{parsed_url.scheme}://{parsed_url.netloc}" + for i, group in enumerate(self.groups): + if group is not None: + try: + self.category = self.instances[i][0] + return + except IndexError: + continue + + @classmethod + def update(cls, instances): + pattern_list = [] + instance_list = cls.instances = [] + for category, info in instances.items(): + root = info["root"].rstrip("/") if info["root"] else "" + instance_list.append((category, root, info)) + pattern = info.get("pattern", re.escape(root.partition("://")[2])) + pattern_list.append(f"({pattern})") + return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")" + +# Part 3: Danbooru Extractor + +class DanbooruExtractor(BaseExtractor): + filename_fmt = "{category}_{id}_{filename}.{extension}" + per_page = 200 + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._auth_logged = False + + def _init_auth(self): + if self.user_id and self.api_key: + if not self._auth_logged: + self.log("Danbooru auth set.") + self._auth_logged = True + self.session.auth = (self.user_id, self.api_key) + + + def items(self): + data = self.metadata() + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + nameext_from_url(url, post) + post["date"] = parse_datetime(post.get("created_at"), "%Y-%m-%dT%H:%M:%S.%f%z") + + if url.startswith("/"): + url = self.root + url + post['file_url'] = url # Ensure full URL + + post.update(data) + yield post + + def metadata(self): + return {} + + def posts(self): + return [] + + def _pagination(self, endpoint, params, prefix="b"): + url = self.root + endpoint + params["limit"] = self.per_page + params["page"] = 1 + threshold = self.per_page - 20 + + while True: + posts = self.request_json(url, params=params) + if not posts: break + yield ('PAGE_UPDATE', len(posts)) + yield from posts + if len(posts) < threshold: return + if prefix: + params["page"] = f"{prefix}{posts[-1]['id']}" + else: + params["page"] += 1 + +BASE_PATTERN = DanbooruExtractor.update({ + "danbooru": {"root": None, "pattern": r"(?:danbooru|safebooru)\.donmai\.us"}, +}) + +class DanbooruTagExtractor(DanbooruExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"(/posts\?(?:[^]*&)*tags=([^]*))" + + def metadata(self): + self.tags = unquote(self.groups[-1].replace("+", " ")).strip() + sanitized_tags = re.sub(r'[\\/*?:"<>|]', "_", self.tags) + return {"search_tags": sanitized_tags} + + def posts(self): + return self._pagination("/posts.json", {"tags": self.tags}) + +class DanbooruPostExtractor(DanbooruExtractor): + subcategory = "post" + pattern = BASE_PATTERN + r"(/post(?:s|/show)/(\d+))" + + def posts(self): + post_id = self.groups[-1] + url = f"{self.root}/posts/{post_id}.json" + post = self.request_json(url) + return (post,) if post else () + +class GelbooruBase(Extractor): + category = "gelbooru" + root = "https://gelbooru.com" + + def __init__(self, match, logger_func=print): + super().__init__(match, logger_func) + self._auth_logged = False + + def _api_request(self, params, key="post"): + # Auth is now added dynamically + if self.api_key and self.user_id: + if not self._auth_logged: + self.log("Gelbooru auth set.") + self._auth_logged = True + params.update({"api_key": self.api_key, "user_id": self.user_id}) + + url = self.root + "/index.php?page=dapi&q=index&json=1" + data = self.request_json(url, params=params) + + if not key: return data + posts = data.get(key, []) + return posts if isinstance(posts, list) else [posts] if posts else [] + + def items(self): + base_data = self.metadata() + base_data['category'] = self.category + + for item in self.posts(): + # Check for our special page update message + if isinstance(item, tuple) and item[0] == 'PAGE_UPDATE': + yield item + continue + + # Otherwise, process it as a post + post = item + url = post.get("file_url") + if not url: continue + + data = base_data.copy() + data.update(post) + nameext_from_url(url, data) + yield data + + def metadata(self): return {} + def posts(self): return [] + +GELBOORU_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com" + +class GelbooruTagExtractor(GelbooruBase): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + filename_fmt = "{category}_{id}_{md5}.{extension}" + pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=list&tags=([^]*))" + + def metadata(self): + self.tags = unquote(self.groups[-1].replace("+", " ")).strip() + sanitized_tags = re.sub(r'[\\/*?:"<>|]', "_", self.tags) + return {"search_tags": sanitized_tags} + + def posts(self): + """Scrapes HTML search pages as API can be restrictive for tags.""" + pid = 0 + posts_per_page = 42 + search_url = self.root + "/index.php" + params = {"page": "post", "s": "list", "tags": self.tags} + + while True: + params['pid'] = pid + self.log(f"Scraping search results page (offset: {pid})...") + response = self.request(search_url, params=params) + html_content = response.text + post_ids = re.findall(r'id="p(\d+)"', html_content) + + if not post_ids: + self.log("No more posts found on page. Ending scrape.") + break + yield ('PAGE_UPDATE', len(post_ids)) + for post_id in post_ids: + post_data = self._api_request({"s": "post", "id": post_id}) + yield from post_data + + pid += posts_per_page + +class GelbooruPostExtractor(GelbooruBase): + subcategory = "post" + filename_fmt = "{category}_{id}_{md5}.{extension}" + pattern = GELBOORU_PATTERN + r"(/index\.php\?page=post&s=view&id=(\d+))" + + def posts(self): + post_id = self.groups[-1] + return self._api_request({"s": "post", "id": post_id}) + +# --- Main Entry Point --- + +EXTRACTORS = [ + DanbooruTagExtractor, + DanbooruPostExtractor, + GelbooruTagExtractor, + GelbooruPostExtractor, +] + +def find_extractor(url, logger_func): + for extractor_cls in EXTRACTORS: + match = re.search(extractor_cls.pattern, url) + if match: + return extractor_cls(match, logger_func) + return None + +def fetch_booru_data(url, api_key, user_id, logger_func): + """ + Main function to find an extractor and yield image data. + """ + extractor = find_extractor(url, logger_func) + if not extractor: + logger_func(f"No suitable Booru extractor found for URL: {url}") + return + + logger_func(f"Using extractor: {extractor.__class__.__name__}") + extractor.set_auth(api_key, user_id) + + # The 'items' method will now yield the data dictionaries directly + yield from extractor.items() \ No newline at end of file diff --git a/src/core/bunkr_client.py b/src/core/bunkr_client.py index 7821419..9480891 100644 --- a/src/core/bunkr_client.py +++ b/src/core/bunkr_client.py @@ -207,7 +207,7 @@ def get_bunkr_extractor(url, logger): def fetch_bunkr_data(url, logger): """ Main function to be called from the GUI. - It extracts all file information from a Bunkr URL. + It extracts all file information from a Bunkr URL, now handling both albums and direct file links. Returns: A tuple of (album_name, list_of_files) @@ -215,6 +215,30 @@ def fetch_bunkr_data(url, logger): - list_of_files (list): A list of dicts, each containing 'url', 'name', and '_http_headers'. Returns (None, None) on failure. """ + # --- START: New logic to handle direct CDN file URLs --- + try: + parsed_url = urllib.parse.urlparse(url) + # Check if the hostname contains 'cdn' and the path has a common file extension + is_direct_cdn_file = (parsed_url.hostname and 'cdn' in parsed_url.hostname and 'bunkr' in parsed_url.hostname and + any(parsed_url.path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.webm', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar'])) + + if is_direct_cdn_file: + logger.info("Bunkr direct file URL detected.") + filename = os.path.basename(parsed_url.path) + # Use the filename (without extension) as a sensible album name + album_name = os.path.splitext(filename)[0] + + files_to_download = [{ + 'url': url, + 'name': filename, + '_http_headers': {'Referer': 'https://bunkr.ru/'} # Use a generic Referer + }] + return album_name, files_to_download + except Exception as e: + logger.warning(f"Could not parse Bunkr URL for direct file check: {e}") + # --- END: New logic --- + + # This is the original logic for album and media pages extractor = get_bunkr_extractor(url, logger) if not extractor: return None, None @@ -238,4 +262,4 @@ def fetch_bunkr_data(url, logger): except Exception as e: logger.error(f"An error occurred while extracting Bunkr info: {e}", exc_info=True) - return None, None \ No newline at end of file + return None, None diff --git a/src/core/fap_nation_client.py b/src/core/fap_nation_client.py new file mode 100644 index 0000000..71d5aea --- /dev/null +++ b/src/core/fap_nation_client.py @@ -0,0 +1,125 @@ +import re +import os +import cloudscraper +from urllib.parse import urlparse, urljoin +from ..utils.file_utils import clean_folder_name + +def fetch_fap_nation_data(album_url, logger_func): + """ + Scrapes a fap-nation page by prioritizing HLS streams first, then falling + back to direct download links. Selects the highest quality available. + """ + logger_func(f" [Fap-Nation] Fetching album data from: {album_url}") + scraper = cloudscraper.create_scraper() + + try: + response = scraper.get(album_url, timeout=45) + response.raise_for_status() + html_content = response.text + + title_match = re.search(r'