Commit

2025-12-29 16:14:44 +00:00 · 2025-10-18 16:03:34 +05:30
parent 5d4e08f794
commit 9fe0c37127
25 changed files with 2502 additions and 2414 deletions
--- a/src/core/allcomic_client.py
+++ b/src/core/allcomic_client.py
@@ -1,36 +1,36 @@
 import requests
 import re
 from bs4 import BeautifulSoup
-import cloudscraper
 import time 
+import random
 from urllib.parse import urlparse

-def get_chapter_list(series_url, logger_func):
+def get_chapter_list(scraper, series_url, logger_func):
    """
    Checks if a URL is a series page and returns a list of all chapter URLs if it is.
-    Includes a retry mechanism for robust connection.
+    Relies on a passed-in scraper session for connection.
    """
    logger_func(f"   [AllComic] Checking for chapter list at: {series_url}")
    
-    scraper = cloudscraper.create_scraper()
+    headers = {'Referer': 'https://allporncomic.com/'}
    response = None
    max_retries = 8

    for attempt in range(max_retries):
        try:
-            response = scraper.get(series_url, timeout=30)
+            response = scraper.get(series_url, headers=headers, timeout=30)
            response.raise_for_status()
            logger_func(f"   [AllComic] Successfully connected to series page on attempt {attempt + 1}.")
-            break # Success, exit the loop
+            break 
        except requests.RequestException as e:
            logger_func(f"   [AllComic] ⚠️ Series page check attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
-                wait_time = 2 * (attempt + 1)
-                logger_func(f"      Retrying in {wait_time} seconds...")
+                wait_time = (2 ** attempt) + random.uniform(0, 2)
+                logger_func(f"      Retrying in {wait_time:.1f} seconds...")
                time.sleep(wait_time)
            else:
                logger_func(f"   [AllComic] ❌ All attempts to check series page failed.")
-                return [] # Return empty on final failure
+                return [] 
    
    if not response:
        return []
@@ -44,7 +44,7 @@ def get_chapter_list(series_url, logger_func):
            return []

        chapter_urls = [link['href'] for link in chapter_links]
-        chapter_urls.reverse() # Reverse for oldest-to-newest reading order
+        chapter_urls.reverse() 
        
        logger_func(f"   [AllComic] ✅ Found {len(chapter_urls)} chapters.")
        return chapter_urls
@@ -53,15 +53,13 @@ def get_chapter_list(series_url, logger_func):
        logger_func(f"   [AllComic] ❌ Error parsing chapters after successful connection: {e}")
        return []

-def fetch_chapter_data(chapter_url, logger_func):
+def fetch_chapter_data(scraper, chapter_url, logger_func):
    """
    Fetches the comic title, chapter title, and image URLs for a single chapter page.
+    Relies on a passed-in scraper session for connection.
    """
    logger_func(f"   [AllComic] Fetching page: {chapter_url}")

-    scraper = cloudscraper.create_scraper(
-        browser={'browser': 'firefox', 'platform': 'windows', 'desktop': True}
-    )
    headers = {'Referer': 'https://allporncomic.com/'}
    
    response = None
@@ -72,16 +70,23 @@ def fetch_chapter_data(chapter_url, logger_func):
            response.raise_for_status()
            break
        except requests.RequestException as e:
+            logger_func(f"   [AllComic] ⚠️ Chapter page connection attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
-                time.sleep(2 * (attempt + 1))
+                wait_time = (2 ** attempt) + random.uniform(0, 2)
+                logger_func(f"      Retrying in {wait_time:.1f} seconds...")
+                time.sleep(wait_time)
            else:
                logger_func(f"   [AllComic] ❌ All connection attempts failed for chapter: {chapter_url}")
                return None, None, None
    
+    if not response:
+        return None, None, None
+
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
+        
+        comic_title = "Unknown Comic"
        title_element = soup.find('h1', class_='post-title')
-        comic_title = None
        if title_element:
            comic_title = title_element.text.strip()
        else:
@@ -91,7 +96,7 @@ def fetch_chapter_data(chapter_url, logger_func):
                    comic_slug = path_parts[-2]
                    comic_title = comic_slug.replace('-', ' ').title()
            except Exception:
-                comic_title = "Unknown Comic"
+                pass 

        chapter_slug = chapter_url.strip('/').split('/')[-1]
        chapter_title = chapter_slug.replace('-', ' ').title()
@@ -105,8 +110,8 @@ def fetch_chapter_data(chapter_url, logger_func):
                if img_url:
                    list_of_image_urls.append(img_url)

-        if not comic_title or comic_title == "Unknown Comic" or not list_of_image_urls:
-            logger_func(f"   [AllComic] ❌ Could not find a valid title or images on the page. Title found: '{comic_title}'")
+        if not list_of_image_urls:
+            logger_func(f"   [AllComic] ❌ Could not find any images on the page.")
            return None, None, None

        return comic_title, chapter_title, list_of_image_urls
--- a/src/core/booru_client.py
+++ b/src/core/booru_client.py
@@ -1,4 +1,3 @@
-# src/core/booru_client.py

 import os
 import re
--- a/src/core/bunkr_client.py
+++ b/src/core/bunkr_client.py
@@ -164,17 +164,34 @@ class BunkrAlbumExtractor(Extractor):
    def _extract_file(self, webpage_url):
        page = self.request(webpage_url).text
        data_id = extr(page, 'data-file-id="', '"')
-        referer = self.root_dl + "/file/" + data_id
-        headers = {"Referer": referer, "Origin": self.root_dl}
+        
+        # This referer is for the API call only
+        api_referer = self.root_dl + "/file/" + data_id
+        headers = {"Referer": api_referer, "Origin": self.root_dl}
        data = self.request_json(self.endpoint, method="POST", headers=headers, json={"id": data_id})
        
+        # Get the raw file URL (no domain replacement)
        file_url = decrypt_xor(data["url"], f"SECRET_KEY_{data['timestamp'] // 3600}".encode()) if data.get("encrypted") else data["url"]
+        
        file_name = extr(page, "<h1", "<").rpartition(">")[2]

+        # --- NEW FIX ---
+        # The download thread uses a new `requests` call, so we must
+        # explicitly pass BOTH the User-Agent and the correct Referer.
+        
+        # 1. Get the User-Agent from this extractor's session
+        user_agent = self.session.headers.get("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0")
+        
+        # 2. Use the original album URL as the Referer
+        download_referer = self.url 
+
        return {
            "url": file_url, 
            "name": unescape(file_name), 
-            "_http_headers": {"Referer": referer}
+            "_http_headers": {
+                "Referer": download_referer,
+                "User-Agent": user_agent
+            }
        }

 class BunkrMediaExtractor(BunkrAlbumExtractor):
--- a/src/core/rule34video_client.py
+++ b/src/core/rule34video_client.py
@@ -0,0 +1,107 @@
+import cloudscraper
+from bs4 import BeautifulSoup
+import re
+import html
+
+def fetch_rule34video_data(video_url, logger_func):
+    """
+    Scrape a rule34video.com page and return its title and best download link.
+
+    Links are read only from the container that holds the 'Download' label.
+    Preference order is 1080p, 720p, 480p, 360p; if none of those is offered,
+    the highest remaining resolution is used rather than reporting a failure.
+
+    Args:
+        video_url (str): The full URL to the rule34video.com page.
+        logger_func (callable): Function to use for logging progress.
+
+    Returns:
+        tuple: (video_title, final_video_url) or (None, None) on failure.
+    """
+    logger_func(f"   [Rule34Video] Fetching page: {video_url}")
+    scraper = cloudscraper.create_scraper()
+
+    try:
+        main_page_response = scraper.get(video_url, timeout=20)
+        main_page_response.raise_for_status()
+
+        soup = BeautifulSoup(main_page_response.text, 'html.parser')
+
+        page_title_tag = soup.find('title')
+        video_title = page_title_tag.text.strip() if page_title_tag else "rule34video_file"
+
+        # Anchor on the "Download" label so unrelated tag_item links are ignored.
+        download_label = soup.find('div', class_='label', string='Download')
+
+        if not download_label:
+            logger_func("   [Rule34Video] ❌ Could not find the 'Download' label. Unable to locate the correct links div.")
+            return None, None
+
+        # Search for links only inside the label's parent container.
+        download_div = download_label.parent
+
+        link_tags = download_div.find_all('a', class_='tag_item')
+        if not link_tags:
+            logger_func("   [Rule34Video] ❌ Found the 'Download' div, but no download links were inside it.")
+            return None, None
+
+        # Map each quality label (e.g. '720p') to its href; later links win on duplicates.
+        links_by_quality = {}
+        quality_pattern = re.compile(r'(\d+p|4k)')
+
+        for tag in link_tags:
+            href = tag.get('href')
+            if not href:
+                continue
+
+            # Prefer the quality shown in the link text; fall back to the URL.
+            quality_match = quality_pattern.search(tag.text) or quality_pattern.search(href)
+            if quality_match:
+                links_by_quality[quality_match.group(1)] = href
+
+        if not links_by_quality:
+            logger_func("   [Rule34Video] ⚠️ Could not parse specific qualities. Using first available link as a fallback.")
+            final_video_url = link_tags[0].get('href')
+            if not final_video_url:
+                logger_func("   [Rule34Video] ❌ Fallback failed: First link tag had no href attribute.")
+                return None, None
+
+            final_video_url = html.unescape(final_video_url)
+            logger_func(f"   [Rule34Video] ✅ Selected first available link as fallback: {final_video_url}")
+            return video_title, final_video_url
+
+        logger_func(f"   [Rule34Video] Found available qualities: {list(links_by_quality.keys())}")
+
+        final_video_url = None
+        if '1080p' in links_by_quality:
+            final_video_url = links_by_quality['1080p']
+            logger_func("   [Rule34Video] ✅ Selected preferred 1080p link.")
+        elif '720p' in links_by_quality:
+            final_video_url = links_by_quality['720p']
+            logger_func("   [Rule34Video] ✅ 1080p not found. Selected fallback 720p link.")
+        else:
+            fallback_order = ['480p', '360p']
+            for quality in fallback_order:
+                if quality in links_by_quality:
+                    final_video_url = links_by_quality[quality]
+                    logger_func(f"   [Rule34Video] ⚠️ 1080p/720p not found. Selected best available fallback: {quality}")
+                    break
+
+        if not final_video_url:
+            # Only non-preferred qualities (e.g. 240p, 1440p, 4k) were offered:
+            # take the highest resolution instead of giving up on a valid page.
+            def resolution(label):
+                return 2160 if label == '4k' else int(label[:-1])
+
+            quality = max(links_by_quality, key=resolution)
+            final_video_url = links_by_quality[quality]
+            logger_func(f"   [Rule34Video] ⚠️ No preferred quality found. Selected highest available: {quality}")
+
+        final_video_url = html.unescape(final_video_url)
+        logger_func(f"   [Rule34Video] ✅ Selected direct download URL: {final_video_url}")
+
+        return video_title, final_video_url
+
+    except Exception as e:
+        logger_func(f"   [Rule34Video] ❌ An error occurred: {e}")
+        return None, None
--- a/src/core/simpcity_client.py
+++ b/src/core/simpcity_client.py
@@ -17,8 +17,10 @@ def fetch_single_simpcity_page(url, logger_func, cookies=None, post_id=None):
    
    try:
        response = scraper.get(url, timeout=30, headers=headers, cookies=cookies)
+        final_url = response.url # Capture the final URL after any redirects
+        
        if response.status_code == 404:
-            return None, []
+            return None, [], final_url
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

@@ -91,9 +93,9 @@ def fetch_single_simpcity_page(url, logger_func, cookies=None, post_id=None):
            # We use a set to remove duplicate URLs that might be found in multiple ways
            unique_jobs = list({job['url']: job for job in jobs_on_page}.values())
            logger_func(f"   [SimpCity] Scraper found jobs: {[job['type'] for job in unique_jobs]}")
-            return album_title, unique_jobs
+            return album_title, unique_jobs, final_url

-        return album_title, []
+        return album_title, [], final_url

    except Exception as e:
        logger_func(f"   [SimpCity] ❌ Error fetching page {url}: {e}")