mirror of
https://github.com/Yuvi9587/Kemono-Downloader.git
synced 2025-12-29 16:14:44 +00:00
Commit
This commit is contained in:
@@ -3,6 +3,7 @@ import traceback
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
import requests
|
||||
import cloudscraper
|
||||
from ..utils.network_utils import extract_post_info, prepare_cookies_for_request
|
||||
from ..config.constants import (
|
||||
STYLE_DATE_POST_TITLE
|
||||
@@ -80,26 +81,26 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
|
||||
|
||||
def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logger, cookies_dict=None):
|
||||
"""
|
||||
--- NEW FUNCTION ---
|
||||
Fetches the full data, including the 'content' field, for a single post.
|
||||
--- MODIFIED FUNCTION ---
|
||||
Fetches the full data, including the 'content' field, for a single post using cloudscraper.
|
||||
"""
|
||||
post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{post_id}"
|
||||
logger(f" Fetching full content for post ID {post_id}...")
|
||||
try:
|
||||
with requests.get(post_api_url, headers=headers, timeout=(15, 300), cookies=cookies_dict, stream=True) as response:
|
||||
response.raise_for_status()
|
||||
response_body = b""
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
response_body += chunk
|
||||
|
||||
full_post_data = json.loads(response_body)
|
||||
|
||||
if isinstance(full_post_data, list) and full_post_data:
|
||||
return full_post_data[0]
|
||||
if isinstance(full_post_data, dict) and 'post' in full_post_data:
|
||||
return full_post_data['post']
|
||||
return full_post_data
|
||||
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
try:
|
||||
response = scraper.get(post_api_url, headers=headers, timeout=(15, 300), cookies=cookies_dict)
|
||||
response.raise_for_status()
|
||||
|
||||
full_post_data = response.json()
|
||||
|
||||
if isinstance(full_post_data, list) and full_post_data:
|
||||
return full_post_data[0]
|
||||
if isinstance(full_post_data, dict) and 'post' in full_post_data:
|
||||
return full_post_data['post']
|
||||
return full_post_data
|
||||
|
||||
except Exception as e:
|
||||
logger(f" ❌ Failed to fetch full content for post {post_id}: {e}")
|
||||
return None
|
||||
@@ -138,8 +139,7 @@ def download_from_api(
|
||||
manga_filename_style_for_sort_check=None,
|
||||
processed_post_ids=None,
|
||||
fetch_all_first=False
|
||||
):
|
||||
# FIX: Define api_domain FIRST, before it is used in the headers
|
||||
):
|
||||
parsed_input_url_for_domain = urlparse(api_url_input)
|
||||
api_domain = parsed_input_url_for_domain.netloc
|
||||
|
||||
|
||||
@@ -1,63 +1,70 @@
|
||||
import time
|
||||
import requests
|
||||
import cloudscraper
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def fetch_server_channels(server_id, logger, cookies=None, cancellation_event=None, pause_event=None):
|
||||
def fetch_server_channels(server_id, logger=print, cookies_dict=None):
|
||||
"""
|
||||
Fetches the list of channels for a given Discord server ID from the Kemono API.
|
||||
UPDATED to be pausable and cancellable.
|
||||
Fetches all channels for a given Discord server ID from the API.
|
||||
Uses cloudscraper to bypass Cloudflare.
|
||||
"""
|
||||
domains_to_try = ["kemono.cr", "kemono.su"]
|
||||
for domain in domains_to_try:
|
||||
if cancellation_event and cancellation_event.is_set():
|
||||
logger(" Channel fetching cancelled by user.")
|
||||
return None
|
||||
while pause_event and pause_event.is_set():
|
||||
if cancellation_event and cancellation_event.is_set(): break
|
||||
time.sleep(0.5)
|
||||
api_url = f"https://kemono.cr/api/v1/discord/server/{server_id}"
|
||||
logger(f" Fetching channels for server: {api_url}")
|
||||
|
||||
lookup_url = f"https://{domain}/api/v1/discord/channel/lookup/{server_id}"
|
||||
logger(f" Attempting to fetch channel list from: {lookup_url}")
|
||||
try:
|
||||
response = requests.get(lookup_url, cookies=cookies, timeout=15)
|
||||
response.raise_for_status()
|
||||
channels = response.json()
|
||||
if isinstance(channels, list):
|
||||
logger(f" ✅ Found {len(channels)} channels for server {server_id}.")
|
||||
return channels
|
||||
except (requests.exceptions.RequestException, json.JSONDecodeError):
|
||||
# This is a silent failure, we'll just try the next domain
|
||||
pass
|
||||
|
||||
logger(f" ❌ Failed to fetch channel list for server {server_id} from all available domains.")
|
||||
return None
|
||||
scraper = cloudscraper.create_scraper()
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': f'https://kemono.cr/discord/server/{server_id}',
|
||||
'Accept': 'text/css'
|
||||
}
|
||||
|
||||
def fetch_channel_messages(channel_id, logger, cancellation_event, pause_event, cookies=None):
|
||||
try:
|
||||
response = scraper.get(api_url, headers=headers, cookies=cookies_dict, timeout=30)
|
||||
response.raise_for_status()
|
||||
channels = response.json()
|
||||
if isinstance(channels, list):
|
||||
logger(f" ✅ Found {len(channels)} channels for server {server_id}.")
|
||||
return channels
|
||||
return None
|
||||
except Exception as e:
|
||||
logger(f" ❌ Error fetching server channels for {server_id}: {e}")
|
||||
return None
|
||||
|
||||
def fetch_channel_messages(channel_id, logger=print, cancellation_event=None, pause_event=None, cookies_dict=None):
|
||||
"""
|
||||
Fetches all messages from a Discord channel by looping through API pages (pagination).
|
||||
Uses a page size of 150 and handles the specific offset logic.
|
||||
A generator that fetches all messages for a specific Discord channel, handling pagination.
|
||||
Uses cloudscraper and proper headers to bypass server protection.
|
||||
"""
|
||||
offset = 0
|
||||
page_size = 150 # Corrected page size based on your findings
|
||||
api_base_url = f"https://kemono.cr/api/v1/discord/channel/{channel_id}"
|
||||
scraper = cloudscraper.create_scraper()
|
||||
base_url = f"https://kemono.cr/api/v1/discord/channel/{channel_id}"
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': f'https://kemono.cr/discord/channel/{channel_id}',
|
||||
'Accept': 'text/css'
|
||||
}
|
||||
|
||||
while not (cancellation_event and cancellation_event.is_set()):
|
||||
if pause_event and pause_event.is_set():
|
||||
logger(" Message fetching paused...")
|
||||
while pause_event.is_set():
|
||||
if cancellation_event and cancellation_event.is_set(): break
|
||||
time.sleep(0.5)
|
||||
logger(" Message fetching resumed.")
|
||||
offset = 0
|
||||
# --- FIX: Corrected the page size for Discord API pagination ---
|
||||
page_size = 150
|
||||
# --- END FIX ---
|
||||
|
||||
while True:
|
||||
if cancellation_event and cancellation_event.is_set():
|
||||
logger(" Discord message fetching cancelled.")
|
||||
break
|
||||
|
||||
paginated_url = f"{api_base_url}?o={offset}"
|
||||
if pause_event and pause_event.is_set():
|
||||
logger(" Discord message fetching paused...")
|
||||
while pause_event.is_set():
|
||||
if cancellation_event and cancellation_event.is_set():
|
||||
break
|
||||
time.sleep(0.5)
|
||||
if not (cancellation_event and cancellation_event.is_set()):
|
||||
logger(" Discord message fetching resumed.")
|
||||
|
||||
paginated_url = f"{base_url}?o={offset}"
|
||||
logger(f" Fetching messages from API: page starting at offset {offset}")
|
||||
|
||||
try:
|
||||
response = requests.get(paginated_url, cookies=cookies, timeout=20)
|
||||
response = scraper.get(paginated_url, headers=headers, cookies=cookies_dict, timeout=30)
|
||||
response.raise_for_status()
|
||||
messages_batch = response.json()
|
||||
|
||||
@@ -73,8 +80,11 @@ def fetch_channel_messages(channel_id, logger, cancellation_event, pause_event,
|
||||
break
|
||||
|
||||
offset += page_size
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.5) # Be respectful to the API
|
||||
|
||||
except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
|
||||
except (cloudscraper.exceptions.CloudflareException, json.JSONDecodeError) as e:
|
||||
logger(f" ❌ Error fetching messages at offset {offset}: {e}")
|
||||
break
|
||||
break
|
||||
except Exception as e:
|
||||
logger(f" ❌ An unexpected error occurred while fetching messages: {e}")
|
||||
break
|
||||
|
||||
45
src/core/nhentai_client.py
Normal file
45
src/core/nhentai_client.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import requests
|
||||
import cloudscraper
|
||||
import json
|
||||
|
||||
def fetch_nhentai_gallery(gallery_id, logger=print):
|
||||
"""
|
||||
Fetches the metadata for a single nhentai gallery using cloudscraper to bypass Cloudflare.
|
||||
|
||||
Args:
|
||||
gallery_id (str or int): The ID of the nhentai gallery.
|
||||
logger (function): A function to log progress and error messages.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the gallery's metadata if successful, otherwise None.
|
||||
"""
|
||||
api_url = f"https://nhentai.net/api/gallery/{gallery_id}"
|
||||
|
||||
# Create a cloudscraper instance
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
logger(f" Fetching nhentai gallery metadata from: {api_url}")
|
||||
|
||||
try:
|
||||
# Use the scraper to make the GET request
|
||||
response = scraper.get(api_url, timeout=20)
|
||||
|
||||
if response.status_code == 404:
|
||||
logger(f" ❌ Gallery not found (404): ID {gallery_id}")
|
||||
return None
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
gallery_data = response.json()
|
||||
|
||||
if "id" in gallery_data and "media_id" in gallery_data and "images" in gallery_data:
|
||||
logger(f" ✅ Successfully fetched metadata for '{gallery_data['title']['english']}'")
|
||||
gallery_data['pages'] = gallery_data.pop('images')['pages']
|
||||
return gallery_data
|
||||
else:
|
||||
logger(" ❌ API response is missing essential keys (id, media_id, or images).")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger(f" ❌ An error occurred while fetching gallery {gallery_id}: {e}")
|
||||
return None
|
||||
@@ -15,6 +15,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError,
|
||||
from io import BytesIO
|
||||
from urllib .parse import urlparse
|
||||
import requests
|
||||
import cloudscraper
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
@@ -58,18 +60,13 @@ def robust_clean_name(name):
|
||||
"""A more robust function to remove illegal characters for filenames and folders."""
|
||||
if not name:
|
||||
return ""
|
||||
# Removes illegal characters for Windows, macOS, and Linux: < > : " / \ | ? *
|
||||
# Also removes control characters (ASCII 0-31) which are invisible but invalid.
|
||||
illegal_chars_pattern = r'[\x00-\x1f<>:"/\\|?*]'
|
||||
illegal_chars_pattern = r'[\x00-\x1f<>:"/\\|?*\']'
|
||||
cleaned_name = re.sub(illegal_chars_pattern, '', name)
|
||||
|
||||
# Remove leading/trailing spaces or periods, which can cause issues.
|
||||
|
||||
cleaned_name = cleaned_name.strip(' .')
|
||||
|
||||
# If the name is empty after cleaning (e.g., it was only illegal chars),
|
||||
# provide a safe fallback name.
|
||||
|
||||
if not cleaned_name:
|
||||
return "untitled_folder" # Or "untitled_file" depending on context
|
||||
return "untitled_folder"
|
||||
return cleaned_name
|
||||
|
||||
class PostProcessorSignals (QObject ):
|
||||
@@ -271,7 +268,9 @@ class PostProcessorWorker:
|
||||
|
||||
file_download_headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||
'Referer': post_page_url
|
||||
'Referer': post_page_url,
|
||||
'Accept': 'text/css'
|
||||
|
||||
}
|
||||
|
||||
file_url = file_info.get('url')
|
||||
@@ -429,8 +428,26 @@ class PostProcessorWorker:
|
||||
self.logger(f"⚠️ Manga mode: Generated filename was empty. Using generic fallback: '{filename_to_save_in_main_path}'.")
|
||||
was_original_name_kept_flag = False
|
||||
else:
|
||||
filename_to_save_in_main_path = cleaned_original_api_filename
|
||||
was_original_name_kept_flag = True
|
||||
is_url_like = 'http' in api_original_filename.lower()
|
||||
is_too_long = len(cleaned_original_api_filename) > 100
|
||||
|
||||
if is_url_like or is_too_long:
|
||||
self.logger(f" ⚠️ Original filename is a URL or too long. Generating a shorter name.")
|
||||
name_hash = hashlib.md5(api_original_filename.encode()).hexdigest()[:12]
|
||||
_, ext = os.path.splitext(cleaned_original_api_filename)
|
||||
if not ext:
|
||||
try:
|
||||
path = urlparse(api_original_filename).path
|
||||
ext = os.path.splitext(path)[1] or ".file"
|
||||
except Exception:
|
||||
ext = ".file"
|
||||
|
||||
cleaned_post_title = robust_clean_name(post_title.strip() if post_title else "post")[:40]
|
||||
filename_to_save_in_main_path = f"{cleaned_post_title}_{name_hash}{ext}"
|
||||
was_original_name_kept_flag = False
|
||||
else:
|
||||
filename_to_save_in_main_path = cleaned_original_api_filename
|
||||
was_original_name_kept_flag = True
|
||||
|
||||
if self.remove_from_filename_words_list and filename_to_save_in_main_path:
|
||||
base_name_for_removal, ext_for_removal = os.path.splitext(filename_to_save_in_main_path)
|
||||
@@ -854,9 +871,7 @@ class PostProcessorWorker:
|
||||
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_RETRYABLE_LATER, details_for_failure
|
||||
|
||||
def process(self):
|
||||
# --- START: REFACTORED PROCESS METHOD ---
|
||||
|
||||
# 1. DATA MAPPING: Map Discord Message or Creator Post fields to a consistent set of variables.
|
||||
if self.service == 'discord':
|
||||
# For Discord, self.post is a MESSAGE object from the API.
|
||||
post_title = self.post.get('content', '') or f"Message {self.post.get('id', 'N/A')}"
|
||||
@@ -885,19 +900,26 @@ class PostProcessorWorker:
|
||||
)
|
||||
|
||||
if content_is_needed and self.post.get('content') is None and self.service != 'discord':
|
||||
|
||||
self.logger(f" Post {post_id} is missing 'content' field, fetching full data...")
|
||||
parsed_url = urlparse(self.api_url_input)
|
||||
api_domain = parsed_url.netloc
|
||||
headers = {'User-Agent': 'Mozilla/5.0'}
|
||||
creator_page_url = f"https://{api_domain}/{self.service}/user/{self.user_id}"
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||
'Referer': creator_page_url,
|
||||
'Accept': 'text/css'
|
||||
|
||||
}
|
||||
|
||||
|
||||
cookies = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger, target_domain=api_domain)
|
||||
|
||||
full_post_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
|
||||
|
||||
if full_post_data:
|
||||
self.logger(" ✅ Full post data fetched successfully.")
|
||||
# Update the worker's post object with the complete data
|
||||
self.post = full_post_data
|
||||
# Re-initialize local variables from the new, complete post data
|
||||
post_title = self.post.get('title', '') or 'untitled_post'
|
||||
post_main_file_info = self.post.get('file')
|
||||
post_attachments = self.post.get('attachments', [])
|
||||
@@ -905,9 +927,7 @@ class PostProcessorWorker:
|
||||
post_data = self.post
|
||||
else:
|
||||
self.logger(f" ⚠️ Failed to fetch full content for post {post_id}. Content-dependent features may not work for this post.")
|
||||
# --- END FIX ---
|
||||
|
||||
# 2. SHARED PROCESSING LOGIC: The rest of the function now uses the consistent variables from above.
|
||||
result_tuple = (0, 0, [], [], [], None, None)
|
||||
total_downloaded_this_post = 0
|
||||
total_skipped_this_post = 0
|
||||
@@ -936,7 +956,11 @@ class PostProcessorWorker:
|
||||
else:
|
||||
post_page_url = f"https://{parsed_api_url.netloc}/{self.service}/user/{self.user_id}/post/{post_id}"
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': post_page_url, 'Accept': '*/*'}
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
|
||||
'Referer': post_page_url,
|
||||
'Accept': 'text/css'
|
||||
}
|
||||
link_pattern = re.compile(r"""<a\s+.*?href=["'](https?://[^"']+)["'][^>]*>(.*?)</a>""", re.IGNORECASE | re.DOTALL)
|
||||
|
||||
effective_unwanted_keywords_for_folder_naming = self.unwanted_keywords.copy()
|
||||
|
||||
Reference in New Issue
Block a user