import cloudscraper from bs4 import BeautifulSoup import re import html def fetch_rule34video_data(video_url, logger_func): """ Scrapes a rule34video.com page by specifically finding the 'Download' div, then selecting the best available quality link. Args: video_url (str): The full URL to the rule34video.com page. logger_func (callable): Function to use for logging progress. Returns: tuple: (video_title, final_video_url) or (None, None) on failure. """ logger_func(f" [Rule34Video] Fetching page: {video_url}") scraper = cloudscraper.create_scraper() try: main_page_response = scraper.get(video_url, timeout=20) main_page_response.raise_for_status() soup = BeautifulSoup(main_page_response.text, 'html.parser') page_title_tag = soup.find('title') video_title = page_title_tag.text.strip() if page_title_tag else "rule34video_file" # --- START OF FINAL FIX --- # 1. Find the SPECIFIC "Download" label first. This is the key. download_label = soup.find('div', class_='label', string='Download') if not download_label: logger_func(" [Rule34Video] ❌ Could not find the 'Download' label. Unable to locate the correct links div.") return None, None # 2. The correct container is the parent of this label. download_div = download_label.parent # 3. Now, find the links ONLY within this correct container. link_tags = download_div.find_all('a', class_='tag_item') if not link_tags: logger_func(" [Rule34Video] ❌ Found the 'Download' div, but no download links were inside it.") return None, None # --- END OF FINAL FIX --- links_by_quality = {} quality_pattern = re.compile(r'(\d+p|4k)') for tag in link_tags: href = tag.get('href') if not href: continue quality = None text_match = quality_pattern.search(tag.text) if text_match: quality = text_match.group(1) else: href_match = quality_pattern.search(href) if href_match: quality = href_match.group(1) if quality: links_by_quality[quality] = href if not links_by_quality: logger_func(" [Rule34Video] ⚠️ Could not parse specific qualities. Using first available link as a fallback.") final_video_url = link_tags[0].get('href') if not final_video_url: logger_func(" [Rule34Video] ❌ Fallback failed: First link tag had no href attribute.") return None, None final_video_url = html.unescape(final_video_url) logger_func(f" [Rule34Video] ✅ Selected first available link as fallback: {final_video_url}") return video_title, final_video_url logger_func(f" [Rule34Video] Found available qualities: {list(links_by_quality.keys())}") final_video_url = None if '1080p' in links_by_quality: final_video_url = links_by_quality['1080p'] logger_func(" [Rule34Video] ✅ Selected preferred 1080p link.") elif '720p' in links_by_quality: final_video_url = links_by_quality['720p'] logger_func(" [Rule34Video] ✅ 1080p not found. Selected fallback 720p link.") else: fallback_order = ['480p', '360p'] for quality in fallback_order: if quality in links_by_quality: final_video_url = links_by_quality[quality] logger_func(f" [Rule34Video] ⚠️ 1080p/720p not found. Selected best available fallback: {quality}") break if not final_video_url: logger_func(" [Rule34Video] ❌ Could not find a suitable download link.") return None, None final_video_url = html.unescape(final_video_url) logger_func(f" [Rule34Video] ✅ Selected direct download URL: {final_video_url}") return video_title, final_video_url except Exception as e: logger_func(f" [Rule34Video] ❌ An error occurred: {e}") return None, None