Kemono-Downloader/src/core/rule34video_client.py

import cloudscraper
from bs4 import BeautifulSoup
import re
import html

def fetch_rule34video_data(video_url, logger_func):
    """
    Scrapes a rule34video.com page by specifically finding the 'Download' div,
    then selecting the best available quality link.
    
    Args:
        video_url (str): The full URL to the rule34video.com page.
        logger_func (callable): Function to use for logging progress.

    Returns:
        tuple: (video_title, final_video_url) or (None, None) on failure.
    """
    logger_func(f"   [Rule34Video] Fetching page: {video_url}")
    scraper = cloudscraper.create_scraper()
    
    try:
        main_page_response = scraper.get(video_url, timeout=20)
        main_page_response.raise_for_status()
        
        soup = BeautifulSoup(main_page_response.text, 'html.parser')
        
        page_title_tag = soup.find('title')
        video_title = page_title_tag.text.strip() if page_title_tag else "rule34video_file"

        # --- START OF FINAL FIX ---
        # 1. Find the SPECIFIC "Download" label first. This is the key.
        download_label = soup.find('div', class_='label', string='Download')

        if not download_label:
            logger_func("   [Rule34Video] ❌ Could not find the 'Download' label. Unable to locate the correct links div.")
            return None, None

        # 2. The correct container is the parent of this label.
        download_div = download_label.parent
        
        # 3. Now, find the links ONLY within this correct container.
        link_tags = download_div.find_all('a', class_='tag_item')
        if not link_tags:
            logger_func("   [Rule34Video] ❌ Found the 'Download' div, but no download links were inside it.")
            return None, None
        # --- END OF FINAL FIX ---

        links_by_quality = {}
        quality_pattern = re.compile(r'(\d+p|4k)')

        for tag in link_tags:
            href = tag.get('href')
            if not href:
                continue

            quality = None
            text_match = quality_pattern.search(tag.text)
            if text_match:
                quality = text_match.group(1)
            else:
                href_match = quality_pattern.search(href)
                if href_match:
                    quality = href_match.group(1)

            if quality:
                links_by_quality[quality] = href
        
        if not links_by_quality:
            logger_func("   [Rule34Video] ⚠️ Could not parse specific qualities. Using first available link as a fallback.")
            final_video_url = link_tags[0].get('href')
            if not final_video_url:
                logger_func("   [Rule34Video] ❌ Fallback failed: First link tag had no href attribute.")
                return None, None
            
            final_video_url = html.unescape(final_video_url)
            logger_func(f"   [Rule34Video] ✅ Selected first available link as fallback: {final_video_url}")
            return video_title, final_video_url
            
        logger_func(f"   [Rule34Video] Found available qualities: {list(links_by_quality.keys())}")

        final_video_url = None
        if '1080p' in links_by_quality:
            final_video_url = links_by_quality['1080p']
            logger_func("   [Rule34Video] ✅ Selected preferred 1080p link.")
        elif '720p' in links_by_quality:
            final_video_url = links_by_quality['720p']
            logger_func("   [Rule34Video] ✅ 1080p not found. Selected fallback 720p link.")
        else:
            fallback_order = ['480p', '360p']
            for quality in fallback_order:
                if quality in links_by_quality:
                    final_video_url = links_by_quality[quality]
                    logger_func(f"   [Rule34Video] ⚠️ 1080p/720p not found. Selected best available fallback: {quality}")
                    break
        
        if not final_video_url:
            logger_func("   [Rule34Video] ❌ Could not find a suitable download link.")
            return None, None
            
        final_video_url = html.unescape(final_video_url)
        logger_func(f"   [Rule34Video] ✅ Selected direct download URL: {final_video_url}")
        
        return video_title, final_video_url

    except Exception as e:
        logger_func(f"   [Rule34Video] ❌ An error occurred: {e}")
        return None, None
Commit 2025-10-18 16:03:34 +05:30			`import cloudscraper`
			`from bs4 import BeautifulSoup`
			`import re`
			`import html`

			`def fetch_rule34video_data(video_url, logger_func):`
			`"""`
			`Scrapes a rule34video.com page by specifically finding the 'Download' div,`
			`then selecting the best available quality link.`

			`Args:`
			`video_url (str): The full URL to the rule34video.com page.`
			`logger_func (callable): Function to use for logging progress.`

			`Returns:`
			`tuple: (video_title, final_video_url) or (None, None) on failure.`
			`"""`
			`logger_func(f" [Rule34Video] Fetching page: {video_url}")`
			`scraper = cloudscraper.create_scraper()`

			`try:`
			`main_page_response = scraper.get(video_url, timeout=20)`
			`main_page_response.raise_for_status()`

			`soup = BeautifulSoup(main_page_response.text, 'html.parser')`

			`page_title_tag = soup.find('title')`
			`video_title = page_title_tag.text.strip() if page_title_tag else "rule34video_file"`

			`# --- START OF FINAL FIX ---`
			`# 1. Find the SPECIFIC "Download" label first. This is the key.`
			`download_label = soup.find('div', class_='label', string='Download')`

			`if not download_label:`
			`logger_func(" [Rule34Video] ❌ Could not find the 'Download' label. Unable to locate the correct links div.")`
			`return None, None`

			`# 2. The correct container is the parent of this label.`
			`download_div = download_label.parent`

			`# 3. Now, find the links ONLY within this correct container.`
			`link_tags = download_div.find_all('a', class_='tag_item')`
			`if not link_tags:`
			`logger_func(" [Rule34Video] ❌ Found the 'Download' div, but no download links were inside it.")`
			`return None, None`
			`# --- END OF FINAL FIX ---`

			`links_by_quality = {}`
			`quality_pattern = re.compile(r'(\d+p\|4k)')`

			`for tag in link_tags:`
			`href = tag.get('href')`
			`if not href:`
			`continue`

			`quality = None`
			`text_match = quality_pattern.search(tag.text)`
			`if text_match:`
			`quality = text_match.group(1)`
			`else:`
			`href_match = quality_pattern.search(href)`
			`if href_match:`
			`quality = href_match.group(1)`

			`if quality:`
			`links_by_quality[quality] = href`

			`if not links_by_quality:`
			`logger_func(" [Rule34Video] ⚠️ Could not parse specific qualities. Using first available link as a fallback.")`
			`final_video_url = link_tags[0].get('href')`
			`if not final_video_url:`
			`logger_func(" [Rule34Video] ❌ Fallback failed: First link tag had no href attribute.")`
			`return None, None`

			`final_video_url = html.unescape(final_video_url)`
			`logger_func(f" [Rule34Video] ✅ Selected first available link as fallback: {final_video_url}")`
			`return video_title, final_video_url`

			`logger_func(f" [Rule34Video] Found available qualities: {list(links_by_quality.keys())}")`

			`final_video_url = None`
			`if '1080p' in links_by_quality:`
			`final_video_url = links_by_quality['1080p']`
			`logger_func(" [Rule34Video] ✅ Selected preferred 1080p link.")`
			`elif '720p' in links_by_quality:`
			`final_video_url = links_by_quality['720p']`
			`logger_func(" [Rule34Video] ✅ 1080p not found. Selected fallback 720p link.")`
			`else:`
			`fallback_order = ['480p', '360p']`
			`for quality in fallback_order:`
			`if quality in links_by_quality:`
			`final_video_url = links_by_quality[quality]`
			`logger_func(f" [Rule34Video] ⚠️ 1080p/720p not found. Selected best available fallback: {quality}")`
			`break`

			`if not final_video_url:`
			`logger_func(" [Rule34Video] ❌ Could not find a suitable download link.")`
			`return None, None`

			`final_video_url = html.unescape(final_video_url)`
			`logger_func(f" [Rule34Video] ✅ Selected direct download URL: {final_video_url}")`

			`return video_title, final_video_url`

			`except Exception as e:`
			`logger_func(f" [Rule34Video] ❌ An error occurred: {e}")`
			`return None, None`