3 Commits

Author      SHA1         Message                 Date
Yuvi9587    d9364f4f91   commit                  2025-08-14 09:48:55 -07:00
Yuvi9587    9cd48bb63a   Update main_window.py   2025-08-13 19:49:10 -07:00
Yuvi9587    d0f11c4a06   Commit                  2025-08-13 19:38:33 -07:00
3 changed files with 110 additions and 27 deletions

View File

@@ -41,9 +41,14 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
         try:
             response = requests.get(paginated_url, headers=headers, timeout=(15, 60), cookies=cookies_dict)
             response.raise_for_status()
+            response.encoding = 'utf-8'
             return response.json()
         except requests.exceptions.RequestException as e:
+            if e.response is not None and e.response.status_code == 400:
+                logger(f" ✅ Reached end of posts (API returned 400 Bad Request for offset {offset}).")
+                return []
             logger(f" ⚠️ Retryable network error on page fetch (Attempt {attempt + 1}): {e}")
             if attempt < max_retries - 1:
                 delay = retry_delay * (2 ** attempt)
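The hunk above folds two behaviors into the retry loop: a 400 response now ends pagination cleanly instead of counting as a failure, and transient errors back off exponentially. A self-contained sketch of the same pattern (the function name and defaults are illustrative, not the project's API):

```python
import time
import requests

def get_json_with_backoff(url, headers=None, cookies=None, max_retries=3, retry_delay=5):
    """Fetch JSON, retrying transient errors with exponential backoff.

    A 400 response is treated as 'no more pages' and returns an empty
    list, mirroring the end-of-pagination handling in the diff above.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, cookies=cookies, timeout=(15, 60))
            response.raise_for_status()
            response.encoding = 'utf-8'
            return response.json()
        except requests.exceptions.RequestException as e:
            if e.response is not None and e.response.status_code == 400:
                return []  # the API signals the end of posts with 400 Bad Request
            if attempt < max_retries - 1:
                time.sleep(retry_delay * (2 ** attempt))  # 5 s, 10 s, 20 s, ...
            else:
                raise
```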
@@ -81,8 +86,11 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
                 response_body += chunk
             full_post_data = json.loads(response_body)
             if isinstance(full_post_data, list) and full_post_data:
                 return full_post_data[0]
+            if isinstance(full_post_data, dict) and 'post' in full_post_data:
+                return full_post_data['post']
             return full_post_data
         except Exception as e:
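The new dict branch handles an alternate response shape from the single-post endpoint, which may wrap the post in a `{'post': ...}` object rather than returning a bare list. The normalization rule, distilled into a standalone helper (hypothetical name):

```python
def unwrap_post(payload):
    """Normalize the response shapes seen from the single-post endpoint."""
    if isinstance(payload, list) and payload:
        return payload[0]           # older shape: a one-element list
    if isinstance(payload, dict) and 'post' in payload:
        return payload['post']      # newer shape: {'post': {...}, ...}
    return payload                  # already a bare post object
```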
@@ -101,6 +109,7 @@ def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger,
     try:
         response = requests.get(comments_api_url, headers=headers, timeout=(10, 30), cookies=cookies_dict)
         response.raise_for_status()
+        response.encoding = 'utf-8'
         return response.json()
     except requests.exceptions.RequestException as e:
         raise RuntimeError(f"Error fetching comments for post {post_id}: {e}")
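The same `response.encoding = 'utf-8'` line is added at every JSON call site in this commit. The likely motivation: when a server's Content-Type header omits or mislabels the charset, `requests` can decode the body with a wrong fallback and garble non-ASCII post titles and comments. A reusable version of the idiom (hypothetical helper, not part of the commit):

```python
import requests

def get_json_utf8(url, **kwargs):
    """Fetch JSON while pinning the body encoding to UTF-8,
    regardless of what the Content-Type header claims."""
    response = requests.get(url, **kwargs)
    response.raise_for_status()
    response.encoding = 'utf-8'   # must be set before .json()/.text decodes the body
    return response.json()
```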
@@ -141,12 +150,9 @@ def download_from_api(
     parsed_input_url_for_domain = urlparse(api_url_input)
     api_domain = parsed_input_url_for_domain.netloc
-    # --- START: MODIFIED LOGIC ---
-    # This list is updated to include the new .cr and .st mirrors for validation.
     if not any(d in api_domain.lower() for d in ['kemono.su', 'kemono.party', 'kemono.cr', 'coomer.su', 'coomer.party', 'coomer.st']):
         logger(f"⚠️ Unrecognized domain '{api_domain}' from input URL. Defaulting to kemono.su for API calls.")
         api_domain = "kemono.su"
-    # --- END: MODIFIED LOGIC ---
     cookies_for_api = None
     if use_cookie and app_base_dir:
@@ -160,6 +166,7 @@ def download_from_api(
     try:
         direct_response = requests.get(direct_post_api_url, headers=headers, timeout=(10, 30), cookies=cookies_for_api)
         direct_response.raise_for_status()
+        direct_response.encoding = 'utf-8'
         direct_post_data = direct_response.json()
         if isinstance(direct_post_data, list) and direct_post_data:
             direct_post_data = direct_post_data[0]
@@ -185,7 +192,7 @@ def download_from_api(
     is_manga_mode_fetch_all_and_sort_oldest_first = manga_mode and (manga_filename_style_for_sort_check != STYLE_DATE_POST_TITLE) and not target_post_id
     should_fetch_all = fetch_all_first or is_manga_mode_fetch_all_and_sort_oldest_first
-    api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}"
+    api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/posts"
     page_size = 50
     if is_manga_mode_fetch_all_and_sort_oldest_first:
         logger(f" Manga Mode (Style: {manga_filename_style_for_sort_check if manga_filename_style_for_sort_check else 'Default'} - Oldest First Sort Active): Fetching all posts to sort by date...")

View File

@@ -37,7 +37,7 @@ try:
 except ImportError:
     Document = None
 from PyQt5.QtCore import Qt, QThread, pyqtSignal, QMutex, QMutexLocker, QObject, QTimer, QSettings, QStandardPaths, QCoreApplication, QUrl, QSize, QProcess
-from .api_client import download_from_api, fetch_post_comments
+from .api_client import download_from_api, fetch_post_comments, fetch_single_post_data
 from ..services.multipart_downloader import download_file_in_parts, MULTIPART_DOWNLOADER_AVAILABLE
 from ..services.drive_downloader import (
     download_mega_file, download_gdrive_file, download_dropbox_file
@@ -124,7 +124,8 @@ class PostProcessorWorker:
                  processed_post_ids=None,
                  multipart_scope='both',
                  multipart_parts_count=4,
-                 multipart_min_size_mb=100
+                 multipart_min_size_mb=100,
+                 skip_file_size_mb=None
                  ):
         self.post = post_data
         self.download_root = download_root
@@ -189,6 +190,7 @@
         self.multipart_scope = multipart_scope
         self.multipart_parts_count = multipart_parts_count
         self.multipart_min_size_mb = multipart_min_size_mb
+        self.skip_file_size_mb = skip_file_size_mb
         if self.compress_images and Image is None:
             self.logger("⚠️ Image compression disabled: Pillow library not found.")
             self.compress_images = False
@@ -277,6 +279,24 @@
         if self.use_cookie:
             cookies_to_use_for_file = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger)
+        if self.skip_file_size_mb is not None:
+            api_original_filename_for_size_check = file_info.get('_original_name_for_log', file_info.get('name'))
+            try:
+                # Use a HEAD request to read the headers without downloading the body
+                with requests.head(file_url, headers=file_download_headers, timeout=15, cookies=cookies_to_use_for_file, allow_redirects=True) as head_response:
+                    head_response.raise_for_status()
+                    content_length = head_response.headers.get('Content-Length')
+                    if content_length:
+                        file_size_bytes = int(content_length)
+                        file_size_mb = file_size_bytes / (1024 * 1024)
+                        if file_size_mb < self.skip_file_size_mb:
+                            self.logger(f" -> Skip File (Size): '{api_original_filename_for_size_check}' is {file_size_mb:.2f} MB, which is smaller than the {self.skip_file_size_mb} MB limit.")
+                            return 0, 1, api_original_filename_for_size_check, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
+                    else:
+                        self.logger(f" ⚠️ Could not determine file size for '{api_original_filename_for_size_check}' to check against size limit. Proceeding with download.")
+            except requests.RequestException as e:
+                self.logger(f" ⚠️ Could not fetch file headers to check size for '{api_original_filename_for_size_check}': {e}. Proceeding with download.")
         api_original_filename = file_info.get('_original_name_for_log', file_info.get('name'))
         filename_to_save_in_main_path = ""
         if forced_filename_override:
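One caveat with the HEAD-based pre-check above: some file hosts reject HEAD outright (405) while still reporting Content-Length on GET. A sketch of a probe that falls back to a streamed GET and never reads the body (hypothetical helper, not part of this commit):

```python
import requests

def probe_content_length(url, headers=None, cookies=None):
    """Return the remote file size in bytes, or None if the server
    does not report one."""
    response = requests.head(url, headers=headers, cookies=cookies,
                             timeout=15, allow_redirects=True)
    try:
        if response.status_code == 405:  # HEAD not allowed: retry as a streamed GET
            response.close()
            response = requests.get(url, headers=headers, cookies=cookies,
                                    timeout=15, allow_redirects=True, stream=True)
        response.raise_for_status()
        length = response.headers.get('Content-Length')
        return int(length) if length is not None else None
    finally:
        response.close()  # headers only; the body is never consumed
```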
@@ -488,19 +508,18 @@
         except requests.RequestException as e:
             self.logger(f" ⚠️ Could not verify size of existing file '{filename_to_save_in_main_path}': {e}. Proceeding with download.")
         max_retries = 3
         retry_delay = 5
         downloaded_size_bytes = 0
         calculated_file_hash = None
         downloaded_part_file_path = None
         total_size_bytes = 0
         download_successful_flag = False
         last_exception_for_retry_later = None
         is_permanent_error = False
         data_to_write_io = None
-        response_for_this_attempt = None
         for attempt_num_single_stream in range(max_retries + 1):
-            response_for_this_attempt = None
+            response = None
             if self._check_pause(f"File download attempt for '{api_original_filename}'"): break
             if self.check_cancel() or (skip_event and skip_event.is_set()): break
             try:
@@ -519,12 +538,24 @@
                 new_url = self._find_valid_subdomain(current_url_to_try)
                 if new_url != current_url_to_try:
                     self.logger(f" Retrying with new URL: {new_url}")
-                    file_url = new_url  # Update the main file_url for subsequent retries
+                    file_url = new_url
+                    response.close()  # Close the old response
                     response = requests.get(new_url, headers=file_download_headers, timeout=(30, 300), stream=True, cookies=cookies_to_use_for_file)
                     response.raise_for_status()
+                # --- REVISED AND MOVED SIZE CHECK LOGIC ---
                 total_size_bytes = int(response.headers.get('Content-Length', 0))
+                if self.skip_file_size_mb is not None:
+                    if total_size_bytes > 0:
+                        file_size_mb = total_size_bytes / (1024 * 1024)
+                        if file_size_mb < self.skip_file_size_mb:
+                            self.logger(f" -> Skip File (Size): '{api_original_filename}' is {file_size_mb:.2f} MB, which is smaller than the {self.skip_file_size_mb} MB limit.")
+                            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
+                    # If Content-Length is missing, we can't check, so we no longer log a warning here and just proceed.
+                # --- END OF REVISED LOGIC ---
                 num_parts_for_file = min(self.multipart_parts_count, MAX_PARTS_FOR_MULTIPART_DOWNLOAD)
                 file_is_eligible_by_scope = False
@@ -548,9 +579,7 @@
                 if self._check_pause(f"Multipart decision for '{api_original_filename}'"): break
                 if attempt_multipart:
-                    if response_for_this_attempt:
-                        response_for_this_attempt.close()
-                        response_for_this_attempt = None
+                    response.close()  # Close the initial connection before starting multipart
                     mp_save_path_for_unique_part_stem_arg = os.path.join(target_folder_path, f"{unique_part_file_stem_on_disk}{temp_file_ext_for_unique_part}")
                     mp_success, mp_bytes, mp_hash, mp_file_handle = download_file_in_parts(
                         file_url, mp_save_path_for_unique_part_stem_arg, total_size_bytes, num_parts_for_file, file_download_headers, api_original_filename,
@@ -576,7 +605,6 @@
                 current_attempt_downloaded_bytes = 0
                 md5_hasher = hashlib.md5()
                 last_progress_time = time.time()
-                single_stream_exception = None
                 try:
                     with open(current_single_stream_part_path, 'wb') as f_part:
                         for chunk in response.iter_content(chunk_size=1 * 1024 * 1024):
@@ -643,8 +671,8 @@
                 is_permanent_error = True
                 break
             finally:
-                if response_for_this_attempt:
-                    response_for_this_attempt.close()
+                if response:
+                    response.close()
                 self._emit_signal('file_download_status', False)
         final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
@@ -848,6 +876,37 @@
         post_data = self.post  # Reference to the post object
         log_prefix = "Post"
+        # --- FIX: FETCH FULL POST DATA IF CONTENT IS MISSING BUT NEEDED ---
+        content_is_needed = (
+            self.show_external_links or
+            self.extract_links_only or
+            self.scan_content_for_images or
+            (self.filter_mode == 'text_only' and self.text_only_scope == 'content')
+        )
+        if content_is_needed and self.post.get('content') is None and self.service != 'discord':
+            self.logger(f" Post {post_id} is missing 'content' field, fetching full data...")
+            parsed_url = urlparse(self.api_url_input)
+            api_domain = parsed_url.netloc
+            headers = {'User-Agent': 'Mozilla/5.0'}
+            cookies = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger, target_domain=api_domain)
+            full_post_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
+            if full_post_data:
+                self.logger(" ✅ Full post data fetched successfully.")
+                # Update the worker's post object with the complete data
+                self.post = full_post_data
+                # Re-initialize local variables from the new, complete post data
+                post_title = self.post.get('title', '') or 'untitled_post'
+                post_main_file_info = self.post.get('file')
+                post_attachments = self.post.get('attachments', [])
+                post_content_html = self.post.get('content', '')
+                post_data = self.post
+            else:
+                self.logger(f" ⚠️ Failed to fetch full content for post {post_id}. Content-dependent features may not work for this post.")
+        # --- END FIX ---
         # 2. SHARED PROCESSING LOGIC: The rest of the function now uses the consistent variables from above.
         result_tuple = (0, 0, [], [], [], None, None)
         total_downloaded_this_post = 0
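The fix above is a lazy-fetch pattern: list endpoints may omit heavy fields like `content`, so the worker pays for the extra single-post request only when a content-dependent feature is actually enabled. The core of it, distilled under hypothetical names:

```python
def ensure_full_post(post, fetch_full, content_needed):
    """Return a post dict that carries 'content' whenever it is needed.

    fetch_full(post_id) hits the single-post endpoint; it is skipped
    entirely when no enabled feature reads the content field.
    """
    if not content_needed or post.get('content') is not None:
        return post
    full = fetch_full(post['id'])
    return full if full else post  # fall back to the partial post on failure
```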
@@ -1258,7 +1317,6 @@
             parsed_url = urlparse(self.api_url_input)
             api_domain = parsed_url.netloc
             cookies = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger, target_domain=api_domain)
-            from .api_client import fetch_single_post_data
             full_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
             if full_data:
                 final_post_data = full_data
@@ -1935,7 +1993,9 @@
                  project_root_dir=None,
                  processed_post_ids=None,
                  start_offset=0,
-                 fetch_first=False):
+                 fetch_first=False,
+                 skip_file_size_mb=None
+                 ):
         super().__init__()
         self.api_url_input = api_url_input
         self.output_dir = output_dir
@@ -2002,6 +2062,7 @@
         self.processed_post_ids_set = set(processed_post_ids) if processed_post_ids is not None else set()
         self.start_offset = start_offset
         self.fetch_first = fetch_first
+        self.skip_file_size_mb = skip_file_size_mb
         if self.compress_images and Image is None:
             self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
@@ -2122,6 +2183,7 @@
             'single_pdf_mode': self.single_pdf_mode,
             'multipart_parts_count': self.multipart_parts_count,
             'multipart_min_size_mb': self.multipart_min_size_mb,
+            'skip_file_size_mb': self.skip_file_size_mb,
             'project_root_dir': self.project_root_dir,
         }

View File

@@ -281,7 +281,7 @@ class DownloaderApp(QWidget):
         self.download_location_label_widget = None
         self.remove_from_filename_label_widget = None
         self.skip_words_label_widget = None
-        self.setWindowTitle("Kemono Downloader v6.3.1")
+        self.setWindowTitle("Kemono Downloader v6.4.2")
         setup_ui(self)
         self._connect_signals()
         self.log_signal.emit(" Local API server functionality has been removed.")
@@ -3354,7 +3354,8 @@
             'pause_event': self.pause_event, 'cancellation_event': self.cancellation_event,
             'downloaded_files': self.downloaded_files, 'downloaded_file_hashes': self.downloaded_file_hashes,
             'downloaded_files_lock': self.downloaded_files_lock, 'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
-            'skip_words_list': [word.strip().lower() for word in self.skip_words_input.text().strip().split(',') if word.strip()],
+            'skip_words_list': [part.strip().lower() for part in self.skip_words_input.text().strip().split(',') if part.strip() and not part.strip().startswith('[')],
+            'skip_file_size_mb': next((int(re.search(r'\[(\d+)\]', part).group(1)) for part in self.skip_words_input.text().strip().split(',') if re.fullmatch(r'\[\d+\]', part.strip())), None),
             'skip_words_scope': self.get_skip_words_scope(), 'char_filter_scope': self.get_char_filter_scope(),
             'remove_from_filename_words_list': [word.strip() for word in self.remove_from_filename_input.text().strip().split(',') if word.strip()],
             'scan_content_for_images': self.scan_content_images_checkbox.isChecked(),
@@ -3523,8 +3524,19 @@
             self.thread_count_input.selectAll()
             return False
-        raw_skip_words = self.skip_words_input.text().strip()
-        skip_words_list = [word.strip().lower() for word in raw_skip_words.split(',') if word.strip()]
+        raw_skip_words_text = self.skip_words_input.text().strip()
+        skip_words_parts = [part.strip() for part in raw_skip_words_text.split(',') if part.strip()]
+        skip_words_list = []
+        skip_file_size_mb = None
+        size_pattern = re.compile(r'\[(\d+)\]')
+        for part in skip_words_parts:
+            match = size_pattern.fullmatch(part)
+            if match:
+                skip_file_size_mb = int(match.group(1))
+                self.log_signal.emit(f" File size skip rule found: Will skip files smaller than {skip_file_size_mb} MB.")
+            else:
+                skip_words_list.append(part.lower())
         raw_remove_filename_words = self.remove_from_filename_input.text().strip() if hasattr(self, 'remove_from_filename_input') else ""
         allow_multipart = self.allow_multipart_download_setting
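The validation path above gives the skip-words field a small grammar: comma-separated words, plus an optional `[N]` token meaning "skip files smaller than N MB" (if several size tokens appear, the last one wins). A standalone version of the parser:

```python
import re

SIZE_TOKEN = re.compile(r'\[(\d+)\]')

def parse_skip_words(raw_text):
    """Split the skip-words field into (words, size_limit_mb).

    >>> parse_skip_words('psd, [50], wip')
    (['psd', 'wip'], 50)
    """
    skip_words, size_mb = [], None
    for part in (p.strip() for p in raw_text.split(',')):
        if not part:
            continue
        match = SIZE_TOKEN.fullmatch(part)
        if match:
            size_mb = int(match.group(1))   # last size token wins
        else:
            skip_words.append(part.lower())
    return skip_words, size_mb
```

The inline `next(...)` expressions in the two settings dictionaries apply the same rule in one line; note that there a token which starts with `[` but is not a pure `[N]` match (say, `[50 MB]`) is dropped from the word list without becoming a size rule.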
@@ -3891,6 +3903,7 @@
             'downloaded_file_hashes': self.downloaded_file_hashes,
             'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
             'skip_words_list': skip_words_list,
+            'skip_file_size_mb': skip_file_size_mb,
             'skip_words_scope': current_skip_words_scope,
             'remove_from_filename_words_list': remove_from_filename_words_list,
             'char_filter_scope': current_char_filter_scope,
@@ -5484,7 +5497,8 @@
             'downloaded_files_lock': self.downloaded_files_lock,
             'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
             'dynamic_character_filter_holder': self.dynamic_character_filter_holder,
-            'skip_words_list': [word.strip().lower() for word in self.skip_words_input.text().strip().split(',') if word.strip()],
+            'skip_words_list': [part.strip().lower() for part in self.skip_words_input.text().strip().split(',') if part.strip() and not part.strip().startswith('[')],
+            'skip_file_size_mb': next((int(re.search(r'\[(\d+)\]', part).group(1)) for part in self.skip_words_input.text().strip().split(',') if re.fullmatch(r'\[\d+\]', part.strip())), None),
             'skip_words_scope': self.get_skip_words_scope(),
             'show_external_links': self.external_links_checkbox.isChecked(),
             'extract_links_only': self.radio_only_links.isChecked(),