mirror of
https://github.com/Yuvi9587/Kemono-Downloader.git
synced 2025-12-29 16:14:44 +00:00
Commit
This commit is contained in:
1
src/utils/__init__.py
Normal file
1
src/utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# ...existing code...
|
||||
142
src/utils/file_utils.py
Normal file
142
src/utils/file_utils.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# --- Standard Library Imports ---
|
||||
import os
|
||||
import re
|
||||
|
||||
# --- Module Constants ---

# This will be populated at runtime by the main application,
# but is defined here as it's conceptually related to file/folder naming.
KNOWN_NAMES = []

# Hard cap on the length of a single generated path component
# (used by clean_folder_name / clean_filename below).
MAX_FILENAME_COMPONENT_LENGTH = 150

# Sets of file extensions for quick type checking.
# All entries are lowercase with a leading dot; callers compare
# against ext.lower() (see is_image / is_video / is_archive / is_audio).
IMAGE_EXTENSIONS = {
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif', '.webp',
    '.heic', '.heif', '.svg', '.ico', '.jfif', '.pjpeg', '.pjp', '.avif'
}
VIDEO_EXTENSIONS = {
    '.mp4', '.mov', '.mkv', '.webm', '.avi', '.wmv', '.flv', '.mpeg',
    '.mpg', '.m4v', '.3gp', '.ogv', '.ts', '.vob'
}
ARCHIVE_EXTENSIONS = {
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'
}
AUDIO_EXTENSIONS = {
    '.mp3', '.wav', '.aac', '.flac', '.ogg', '.wma', '.m4a', '.opus',
    '.aiff', '.ape', '.mid', '.midi'
}

# Words to ignore when trying to generate a folder name from a title.
# Compared against lowercased tokens, so all entries are lowercase.
FOLDER_NAME_STOP_WORDS = {
    "a", "alone", "am", "an", "and", "at", "be", "blues", "but", "by", "com",
    "for", "grown", "hard", "he", "her", "his", "hitting", "i", "im", "in", "is", "it", "its",
    "me", "much", "my", "net", "not", "of", "on", "or", "org", "our", "please",
    "right", "s", "she", "so", "technically", "tell", "the", "their", "they", "this",
    "to", "ve", "was", "we", "well", "were", "with", "www", "year", "you", "your",
}
|
||||
|
||||
# --- File and Folder Name Utilities ---
|
||||
|
||||
def clean_folder_name(name):
    """
    Sanitize a string into a valid folder name.

    Strips characters that are invalid on common filesystems, collapses
    whitespace runs, caps the length at MAX_FILENAME_COMPONENT_LENGTH,
    and removes trailing dots/spaces (problematic on Windows).

    Args:
        name (str): The input string (coerced to str if needed).

    Returns:
        str: A sanitized, non-empty folder name; "untitled_folder" if
             nothing usable remains after cleaning.
    """
    text = name if isinstance(name, str) else str(name)

    # Drop characters invalid in folder names on most operating systems,
    # then normalize surrounding and internal whitespace.
    text = re.sub(r'[<>:"/\\|?*]', '', text).strip()
    text = re.sub(r'\s+', ' ', text)

    if not text:
        return "untitled_folder"

    # Enforce the length cap, then drop trailing dots/spaces.
    text = text[:MAX_FILENAME_COMPONENT_LENGTH].rstrip('. ')
    return text or "untitled_folder"
|
||||
|
||||
|
||||
def clean_filename(name):
    """
    Sanitize a string into a valid file name.

    Invalid characters are replaced with underscores (unlike
    clean_folder_name, which removes them) so the filename keeps its
    original length/shape. The stem is truncated so the whole name fits
    within MAX_FILENAME_COMPONENT_LENGTH, preserving the extension when
    possible.

    Args:
        name (str): The input string (coerced to str if needed).

    Returns:
        str: A sanitized, non-empty file name; "untitled_file" if the
             input cleans down to nothing.
    """
    text = name if isinstance(name, str) else str(name)
    text = re.sub(r'[<>:"/\\|?*]', '_', text).strip()

    if not text:
        return "untitled_file"

    stem, extension = os.path.splitext(text)
    allowed_stem_len = MAX_FILENAME_COMPONENT_LENGTH - len(extension)

    if len(stem) <= allowed_stem_len:
        return stem + extension
    if allowed_stem_len > 0:
        # Keep the extension intact; shorten only the stem.
        return stem[:allowed_stem_len] + extension
    # The extension alone exceeds the cap: hard-truncate the whole name.
    return text[:MAX_FILENAME_COMPONENT_LENGTH]
|
||||
|
||||
|
||||
# --- File Type Identification Functions ---
|
||||
|
||||
def is_image(filename):
    """Return True when the filename carries a known image extension."""
    if not filename:
        return False
    extension = os.path.splitext(filename)[1]
    return extension.lower() in IMAGE_EXTENSIONS
|
||||
|
||||
def is_video(filename):
    """Return True when the filename carries a known video extension."""
    if not filename:
        return False
    extension = os.path.splitext(filename)[1]
    return extension.lower() in VIDEO_EXTENSIONS
|
||||
|
||||
def is_zip(filename):
    """Return True when the filename ends with '.zip' (case-insensitive)."""
    if not filename:
        return False
    return filename.lower().endswith('.zip')
|
||||
|
||||
def is_rar(filename):
    """Return True when the filename ends with '.rar' (case-insensitive)."""
    if not filename:
        return False
    return filename.lower().endswith('.rar')
|
||||
|
||||
def is_archive(filename):
    """Return True when the filename carries a known archive extension."""
    if not filename:
        return False
    extension = os.path.splitext(filename)[1]
    return extension.lower() in ARCHIVE_EXTENSIONS
|
||||
|
||||
def is_audio(filename):
    """Return True when the filename carries a known audio extension."""
    if not filename:
        return False
    extension = os.path.splitext(filename)[1]
    return extension.lower() in AUDIO_EXTENSIONS
|
||||
208
src/utils/network_utils.py
Normal file
208
src/utils/network_utils.py
Normal file
@@ -0,0 +1,208 @@
|
||||
# --- Standard Library Imports ---
|
||||
import os
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# --- Third-Party Library Imports ---
|
||||
# This module might not require third-party libraries directly,
|
||||
# but 'requests' is a common dependency for network operations.
|
||||
# import requests
|
||||
|
||||
|
||||
def parse_cookie_string(cookie_string):
    """
    Parse a 'name=value; name2=value2' cookie string into a dictionary.

    Items without an '=' separator or with an empty name are skipped.

    Args:
        cookie_string (str): The cookie string from browser tools.

    Returns:
        dict or None: Mapping of cookie names to values, or None when
                      no valid cookies were found.
    """
    if not cookie_string:
        return None

    parsed = {}
    for chunk in cookie_string.split(';'):
        # partition() splits on the first '=' only, so values may
        # themselves contain '=' characters.
        name, sep, value = chunk.partition('=')
        key = name.strip()
        if sep and key:
            parsed[key] = value.strip()
    return parsed or None
|
||||
|
||||
|
||||
def load_cookies_from_netscape_file(filepath, logger_func, target_domain_filter=None):
    """
    Loads cookies from a Netscape-formatted cookies.txt file.

    If a target_domain_filter is provided, only cookies whose domain
    matches that host (exactly, or as a parent domain of it) are
    returned, following RFC 6265 domain-matching: a cookie stored for
    '.example.com' or 'example.com' matches both 'example.com' and any
    subdomain such as 'www.example.com'.

    Args:
        filepath (str): The full path to the cookies.txt file.
        logger_func (callable): Function to use for logging.
        target_domain_filter (str, optional): The domain to filter cookies for.

    Returns:
        dict or None: A dictionary of cookie names and values, or None if none are loaded.
    """
    cookies = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # BUG FIX: browser exports mark HttpOnly cookies with a
                # '#HttpOnly_' prefix; they are real cookies, not comments.
                if line.startswith('#HttpOnly_'):
                    line = line[len('#HttpOnly_'):]
                elif not line or line.startswith('#'):
                    continue

                parts = line.split('\t')
                if len(parts) == 7:
                    cookie_domain = parts[0]
                    name = parts[5]
                    value = parts[6]

                    if not name:
                        continue

                    if target_domain_filter:
                        host = target_domain_filter.lower()
                        # BUG FIX: the previous check required the host to end
                        # with the dotted domain, so '.example.com' never
                        # matched the bare host 'example.com'. Normalize the
                        # leading dot away and apply RFC 6265 domain-matching.
                        dom = cookie_domain.lower().lstrip('.')
                        if host == dom or host.endswith('.' + dom):
                            cookies[name] = value
                    else:
                        cookies[name] = value

        logger_func(f" 🍪 Loaded {len(cookies)} cookies from '{os.path.basename(filepath)}' for domain '{target_domain_filter or 'any'}'.")
        return cookies if cookies else None
    except FileNotFoundError:
        logger_func(f" 🍪 Cookie file '{os.path.basename(filepath)}' not found.")
        return None
    except Exception as e:
        logger_func(f" 🍪 Error parsing cookie file '{os.path.basename(filepath)}': {e}")
        return None
|
||||
|
||||
|
||||
def prepare_cookies_for_request(use_cookie_flag, cookie_text_input, selected_cookie_file_path, app_base_dir, logger_func, target_domain=None):
    """
    Prepares a cookie dictionary from various sources based on user settings.

    Sources are tried in order and the first one that yields any cookies
    wins; later sources are not consulted.
    Priority:
    1. UI-selected file path.
    2. Domain-specific file in the app directory.
    3. Default `cookies.txt` in the app directory.
    4. Manually entered cookie text.

    Args:
        use_cookie_flag (bool): Whether cookies are enabled in the UI.
        cookie_text_input (str): The raw text from the cookie input field.
        selected_cookie_file_path (str): The path to a user-browsed cookie file.
        app_base_dir (str): The base directory of the application.
        logger_func (callable): Function for logging.
        target_domain (str, optional): The domain for which cookies are needed.

    Returns:
        dict or None: A dictionary of cookies for the request, or None.
    """
    # Cookies disabled in the UI: short-circuit without logging.
    if not use_cookie_flag:
        return None

    # Priority 1: Use the specifically browsed file first
    if selected_cookie_file_path and os.path.exists(selected_cookie_file_path):
        cookies = load_cookies_from_netscape_file(selected_cookie_file_path, logger_func, target_domain)
        if cookies:
            return cookies

    # Priority 2: Look for a domain-specific cookie file
    # NOTE(review): this looks under "<app>/data/", while the generic
    # fallback below uses "<app>/appdata/" — confirm both directory names
    # are intentional and not a typo for the same location.
    if app_base_dir and target_domain:
        domain_specific_path = os.path.join(app_base_dir, "data", f"{target_domain}_cookies.txt")
        if os.path.exists(domain_specific_path):
            cookies = load_cookies_from_netscape_file(domain_specific_path, logger_func, target_domain)
            if cookies:
                return cookies

    # Priority 3: Look for a generic cookies.txt
    if app_base_dir:
        default_path = os.path.join(app_base_dir, "appdata", "cookies.txt")
        if os.path.exists(default_path):
            cookies = load_cookies_from_netscape_file(default_path, logger_func, target_domain)
            if cookies:
                return cookies

    # Priority 4: Fall back to manually entered text
    if cookie_text_input:
        cookies = parse_cookie_string(cookie_text_input)
        if cookies:
            return cookies

    # All sources exhausted: cookies were requested but none were usable.
    logger_func(f" 🍪 Cookie usage enabled for '{target_domain or 'any'}', but no valid cookies found.")
    return None
|
||||
|
||||
|
||||
def extract_post_info(url_string):
    """
    Extract the service, user ID, and post ID from a URL.

    Understands both the site's standard path layout and its API layout:
      /<service>/user/<user_id>[/post/<post_id>]
      /api/v1/<service>/user/<user_id>[/post/<post_id>]

    Args:
        url_string (str): The URL to parse.

    Returns:
        tuple: (service, user_id, post_id); any element may be None, and
               all three are None for unrecognized or empty input.
    """
    if not isinstance(url_string, str) or not url_string.strip():
        return None, None, None

    try:
        path = urlparse(url_string.strip()).path
        segments = [seg for seg in path.strip('/').split('/') if seg]

        # Standard format: /<service>/user/<user_id>[/post/<post_id>]
        if len(segments) >= 3 and segments[1].lower() == 'user':
            has_post = len(segments) >= 5 and segments[3].lower() == 'post'
            return segments[0], segments[2], (segments[4] if has_post else None)

        # API format: /api/v1/<service>/user/<user_id>[/post/<post_id>]
        if len(segments) >= 5 and segments[0:2] == ['api', 'v1'] and segments[3].lower() == 'user':
            has_post = len(segments) >= 7 and segments[5].lower() == 'post'
            return segments[2], segments[4], (segments[6] if has_post else None)

    except Exception as e:
        print(f"Debug: Exception during URL parsing for '{url_string}': {e}")

    return None, None, None
|
||||
|
||||
|
||||
def get_link_platform(url):
    """
    Identifies the platform of a given URL based on its domain.

    Args:
        url (str): The URL to identify.

    Returns:
        str: The platform name (e.g. 'mega', 'google drive'), the
             second-level domain as a generic fallback for other hosts,
             'external' for bare hosts, or 'unknown' when parsing fails.
    """
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if 'drive.google.com' in domain: return 'google drive'
        if 'mega.nz' in domain or 'mega.io' in domain: return 'mega'
        if 'dropbox.com' in domain: return 'dropbox'
        if 'patreon.com' in domain: return 'patreon'
        if 'gofile.io' in domain: return 'gofile'
        if 'instagram.com' in domain: return 'instagram'
        if 'twitter.com' in domain or 'x.com' in domain: return 'twitter/x'
        # BUG FIX: `domain` is the netloc only, so the old check
        # "'discord.com/invite' in domain" could never be true; invite
        # links on discord.com must be detected via the URL path.
        if 'discord.gg' in domain or ('discord.com' in domain and parsed.path.lower().startswith('/invite')):
            return 'discord invite'
        if 'pixiv.net' in domain: return 'pixiv'
        if 'kemono.su' in domain or 'kemono.party' in domain: return 'kemono'
        if 'coomer.su' in domain or 'coomer.party' in domain: return 'coomer'

        # Fallback to a generic name for other domains
        parts = domain.split('.')
        if len(parts) >= 2:
            return parts[-2]
        return 'external'
    except Exception:
        return 'unknown'
|
||||
207
src/utils/text_utils.py
Normal file
207
src/utils/text_utils.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# --- Standard Library Imports ---
|
||||
import re
|
||||
import html
|
||||
|
||||
# --- Local Application Imports ---
|
||||
# Import from file_utils within the same package
|
||||
from .file_utils import clean_folder_name, FOLDER_NAME_STOP_WORDS
|
||||
|
||||
# --- Module Constants ---

# Regular expression patterns for cleaning up titles before matching against Known.txt.
# Each pattern is applied case-insensitively and its matches are replaced
# with a space (see match_folders_from_title), so these tags never take
# part in name matching.
KNOWN_TXT_MATCH_CLEANUP_PATTERNS = [
    r'\bcum\b',
    r'\bnsfw\b',
    r'\bsfw\b',
    r'\bweb\b',
    r'\bhd\b',
    r'\bhi\s*res\b',
    r'\bhigh\s*res\b',
    r'\b\d+p\b',   # resolution markers, e.g. "1080p"
    r'\b\d+k\b',   # resolution markers, e.g. "4k"
    r'\[OC\]',
    r'\[Request(?:s)?\]',
    r'\bCommission\b',
    r'\bComm\b',
    r'\bPreview\b',
]
|
||||
|
||||
# --- Text Matching and Manipulation Utilities ---
|
||||
|
||||
def is_title_match_for_character(post_title, character_name_filter):
    """
    Check whether a post title contains a character name as a whole word.

    Matching is case-insensitive and anchored on word boundaries, so
    'cloud' matches 'Cloud art' but not 'Clouds'.

    Args:
        post_title (str): The title of the post.
        character_name_filter (str): The character name to search for.

    Returns:
        bool: True if the name occurs as a whole word, False otherwise.
    """
    if not post_title or not character_name_filter:
        return False

    target = str(character_name_filter).strip()
    # (?i) makes the search case-insensitive; re.escape guards against
    # regex metacharacters in the character name.
    word_pattern = r"(?i)\b" + re.escape(target) + r"\b"
    return re.search(word_pattern, post_title) is not None
|
||||
|
||||
|
||||
def is_filename_match_for_character(filename, character_name_filter):
    """
    Check whether a filename contains a character name.

    This is a plain case-insensitive substring test (no word boundaries),
    unlike the stricter title matching in is_title_match_for_character.

    Args:
        filename (str): The name of the file.
        character_name_filter (str): The character name to search for.

    Returns:
        bool: True if the substring is found, False otherwise.
    """
    if not filename or not character_name_filter:
        return False

    needle = str(character_name_filter).strip().lower()
    return needle in filename.lower()
|
||||
|
||||
|
||||
def strip_html_tags(html_text):
    """
    Remove HTML tags from a string and normalize the remaining whitespace.

    Entities are unescaped first (so '&amp;' becomes '&'); note this also
    means an escaped tag like '&lt;b&gt;' turns into a real tag and is
    removed too.

    Args:
        html_text (str): The input string containing HTML.

    Returns:
        str: Plain text with tags stripped and whitespace collapsed;
             "" for falsy input.
    """
    if not html_text:
        return ""

    unescaped = html.unescape(str(html_text))
    # Replace each tag with a space so adjacent words don't fuse together.
    without_tags = re.sub(r'<[^>]+>', ' ', unescaped)
    return re.sub(r'\s+', ' ', without_tags).strip()
|
||||
|
||||
|
||||
def extract_folder_name_from_title(title, unwanted_keywords):
    """
    Derive a plausible folder name from a post title.

    Scans the title's words left-to-right and returns the first sanitized
    word that is not in *unwanted_keywords*; failing that, falls back to
    the sanitized full title, and finally to 'Uncategorized'.

    Args:
        title (str): The post title.
        unwanted_keywords (set): Lowercase words to ignore.

    Returns:
        str: The extracted folder name, or 'Uncategorized'.
    """
    if not title:
        return 'Uncategorized'

    # Tokenize into whole words (hyphenated words stay intact).
    for word in re.findall(r'\b[\w\-]+\b', title.lower()):
        candidate = clean_folder_name(word)
        if candidate and candidate.lower() not in unwanted_keywords:
            return candidate

    # No significant single word found: sanitize the entire title instead.
    fallback = clean_folder_name(title)
    return fallback if fallback else 'Uncategorized'
|
||||
|
||||
|
||||
def match_folders_from_title(title, names_to_match, unwanted_keywords):
    """
    Match known-name entries against a post title.

    Each entry in *names_to_match* is a dict of the form
    {'name': 'PrimaryName', 'aliases': ['alias1', ...]}; an entry matches
    when any of its aliases appears in the (noise-stripped) title as a
    whole word, case-insensitively.

    Args:
        title (str): The post title to check.
        names_to_match (list): Known name dictionaries.
        unwanted_keywords (set): Folder names to ignore.

    Returns:
        list: A sorted list of matched, sanitized primary folder names.
    """
    if not title or not names_to_match:
        return []

    # Strip noise tags ([OC], [HD], resolution markers, ...) so they
    # can't interfere with alias matching.
    stripped = title
    for pattern in KNOWN_TXT_MATCH_CLEANUP_PATTERNS:
        stripped = re.sub(pattern, ' ', stripped, flags=re.IGNORECASE)
    haystack = re.sub(r'\s+', ' ', stripped).strip().lower()

    results = set()

    # Longer primary names first, so "Cloud Strife" wins over "Cloud".
    ordered_entries = sorted(names_to_match, key=lambda entry: len(entry.get("name", "")), reverse=True)

    for entry in ordered_entries:
        primary = entry.get("name")
        aliases = entry.get("aliases", [])
        if not primary or not aliases:
            continue

        for alias in aliases:
            needle = alias.lower()
            if not needle:
                continue
            # Whole-word match only.
            if re.search(r'\b' + re.escape(needle) + r'\b', haystack):
                sanitized = clean_folder_name(primary)
                if sanitized.lower() not in unwanted_keywords:
                    results.add(sanitized)
                break  # One alias hit is enough for this entry.

    return sorted(results)
|
||||
|
||||
|
||||
def match_folders_from_filename_enhanced(filename, names_to_match, unwanted_keywords):
    """
    Match known-name entries against a filename by alias prefix.

    Aliases are tried longest-first so more specific aliases are matched
    before shorter ones; a hit requires the filename to *start with* the
    alias (case-insensitive).

    Args:
        filename (str): The filename to check.
        names_to_match (list): Known name dictionaries
            ({'name': ..., 'aliases': [...]}).
        unwanted_keywords (set): Folder names to ignore.

    Returns:
        list: A sorted list of matched, sanitized primary folder names.
    """
    if not filename or not names_to_match:
        return []

    target = filename.lower()

    # Flatten entries into (alias_lower, sanitized_primary) pairs,
    # dropping entries whose primary name is empty or unwanted.
    pairs = []
    for entry in names_to_match:
        raw_primary = entry.get("name")
        if not raw_primary:
            continue
        primary = clean_folder_name(raw_primary)
        if not primary or primary.lower() in unwanted_keywords:
            continue
        for alias in entry.get("aliases", []):
            if alias.lower():
                pairs.append((alias.lower(), primary))

    # Longest aliases first so the most specific prefix is considered first.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    hits = {primary for alias, primary in pairs if target.startswith(alias)}
    return sorted(hits)
|
||||
Reference in New Issue
Block a user