Update readme.md

2025-12-29 16:14:44 +00:00 · 2025-08-11 09:31:53 -07:00
4 changed files with 39 additions and 143 deletions
--- a/readme.md
+++ b/readme.md
@@ -99,7 +99,7 @@ Built with PyQt5, this tool is designed for users who want deep filtering capabi
 ### Install Dependencies
 ```bash
-pip install PyQt5 requests Pillow mega.py fpdf2 python-docx
+pip install PyQt5 requests Pillow mega.py fpdf python-docx
 ```
 ### Running the Application
--- a/src/core/api_client.py
+++ b/src/core/api_client.py
@@ -12,6 +12,7 @@ from ..config.constants import (
 def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_event=None, pause_event=None, cookies_dict=None):
    """
    Fetches a single page of posts from the API with robust retry logic.
    NEW: Requests only essential fields to keep the response size small and reliable.
    """
    if cancellation_event and cancellation_event.is_set():
        raise RuntimeError("Fetch operation cancelled by user.")
@@ -32,7 +33,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
        if cancellation_event and cancellation_event.is_set():
            raise RuntimeError("Fetch operation cancelled by user during retry loop.")
-        log_message = f"   Fetching post list: {paginated_url} (Page approx. {offset // 50 + 1})"
+        log_message = f"   Fetching post list: {api_url_base}?o={offset} (Page approx. {offset // 50 + 1})"
        if attempt > 0:
            log_message += f" (Attempt {attempt + 1}/{max_retries})"
        logger(log_message)
@@ -40,23 +41,9 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
        try:
            response = requests.get(paginated_url, headers=headers, timeout=(15, 60), cookies=cookies_dict)
            response.raise_for_status()
            response.encoding = 'utf-8'  
            return response.json()
        except requests.exceptions.RequestException as e:
            # Handle 403 error on the FIRST page as a rate limit/block
            if e.response is not None and e.response.status_code == 403 and offset == 0:
                logger("   ❌ Access Denied (403 Forbidden) on the first page.")
                logger("      This is likely a rate limit or a Cloudflare block.")
                logger("      💡 SOLUTION: Wait a while, use a VPN, or provide a valid session cookie.")
                return [] # Stop the process gracefully
            # Handle 400 error as the end of pages
            if e.response is not None and e.response.status_code == 400:
                logger(f"   ✅ Reached end of posts (API returned 400 Bad Request for offset {offset}).")
                return []
            # Handle all other network errors with a retry
            logger(f"   ⚠️ Retryable network error on page fetch (Attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                delay = retry_delay * (2 ** attempt)
@@ -78,6 +65,7 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
    raise RuntimeError(f"Failed to fetch page {paginated_url} after all attempts.")
 def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logger, cookies_dict=None):
    """
    --- NEW FUNCTION ---
@@ -93,11 +81,8 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
                response_body += chunk
            full_post_data = json.loads(response_body)
            if isinstance(full_post_data, list) and full_post_data:
                return full_post_data[0]
            if isinstance(full_post_data, dict) and 'post' in full_post_data:
                return full_post_data['post'] 
            return full_post_data
    except Exception as e:
@@ -116,7 +101,6 @@ def fetch_post_comments(api_domain, service, user_id, post_id, headers, logger,
    try:
        response = requests.get(comments_api_url, headers=headers, timeout=(10, 30), cookies=cookies_dict)
        response.raise_for_status()
        response.encoding = 'utf-8'          
        return response.json()
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Error fetching comments for post {post_id}: {e}")
@@ -139,16 +123,10 @@ def download_from_api(
    processed_post_ids=None,
    fetch_all_first=False  
 ):
    # FIX: Define api_domain FIRST, before it is used in the headers
    parsed_input_url_for_domain = urlparse(api_url_input)
    api_domain = parsed_input_url_for_domain.netloc
    headers = {
-        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+        'User-Agent': 'Mozilla/5.0',
-        'Referer': f'https://{api_domain}/',
+        'Accept': 'application/json'
        'Accept': 'text/css'
    }
    if processed_post_ids is None:
        processed_post_ids = set()
    else:
@@ -160,11 +138,15 @@ def download_from_api(
        logger("   Download_from_api cancelled at start.")
        return
-    # The code that defined api_domain was moved from here to the top of the function
+    parsed_input_url_for_domain = urlparse(api_url_input)
    api_domain = parsed_input_url_for_domain.netloc
    # --- START: MODIFIED LOGIC ---
    # This list is updated to include the new .cr and .st mirrors for validation.
    if not any(d in api_domain.lower() for d in ['kemono.su', 'kemono.party', 'kemono.cr', 'coomer.su', 'coomer.party', 'coomer.st']):
        logger(f"⚠️ Unrecognized domain '{api_domain}' from input URL. Defaulting to kemono.su for API calls.")
        api_domain = "kemono.su"
    # --- END: MODIFIED LOGIC ---
    cookies_for_api = None
    if use_cookie and app_base_dir:
@@ -178,7 +160,6 @@ def download_from_api(
        try:
            direct_response = requests.get(direct_post_api_url, headers=headers, timeout=(10, 30), cookies=cookies_for_api)
            direct_response.raise_for_status()
            direct_response.encoding = 'utf-8' 
            direct_post_data = direct_response.json()
            if isinstance(direct_post_data, list) and direct_post_data:
                direct_post_data = direct_post_data[0]
@@ -204,7 +185,7 @@ def download_from_api(
    is_manga_mode_fetch_all_and_sort_oldest_first = manga_mode and (manga_filename_style_for_sort_check != STYLE_DATE_POST_TITLE) and not target_post_id
    should_fetch_all = fetch_all_first or is_manga_mode_fetch_all_and_sort_oldest_first  
-    api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/posts"
+    api_base_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}"
    page_size = 50
    if is_manga_mode_fetch_all_and_sort_oldest_first:
        logger(f"   Manga Mode (Style: {manga_filename_style_for_sort_check if manga_filename_style_for_sort_check else 'Default'} - Oldest First Sort Active): Fetching all posts to sort by date...")
@@ -375,4 +356,3 @@ def download_from_api(
        time.sleep(0.6)
    if target_post_id and not processed_target_post_flag and not (cancellation_event and cancellation_event.is_set()):
        logger(f"❌ Target post {target_post_id} could not be found after checking all relevant pages (final check after loop).")
--- a/src/core/workers.py
+++ b/src/core/workers.py
@@ -37,7 +37,7 @@ try:
 except ImportError:
    Document = None
 from PyQt5 .QtCore import Qt ,QThread ,pyqtSignal ,QMutex ,QMutexLocker ,QObject ,QTimer ,QSettings ,QStandardPaths ,QCoreApplication ,QUrl ,QSize ,QProcess 
-from .api_client import download_from_api, fetch_post_comments, fetch_single_post_data
+from .api_client import download_from_api, fetch_post_comments
 from ..services.multipart_downloader import download_file_in_parts, MULTIPART_DOWNLOADER_AVAILABLE
 from ..services.drive_downloader import (
    download_mega_file, download_gdrive_file, download_dropbox_file
@@ -124,8 +124,7 @@ class PostProcessorWorker:
                 processed_post_ids=None,
                 multipart_scope='both', 
                 multipart_parts_count=4, 
-                 multipart_min_size_mb=100,
+                 multipart_min_size_mb=100 
                 skip_file_size_mb=None 
                 ):
        self.post = post_data
        self.download_root = download_root
@@ -190,7 +189,6 @@ class PostProcessorWorker:
        self.multipart_scope = multipart_scope 
        self.multipart_parts_count = multipart_parts_count 
        self.multipart_min_size_mb = multipart_min_size_mb 
        self.skip_file_size_mb = skip_file_size_mb
        if self.compress_images and Image is None:
            self.logger("⚠️ Image compression disabled: Pillow library not found.")
            self.compress_images = False
@@ -270,7 +268,7 @@ class PostProcessorWorker:
            return 0, 1, "", False, FILE_DOWNLOAD_STATUS_SKIPPED, None
        file_download_headers = {
-            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            'Referer': post_page_url
        }
@@ -279,24 +277,6 @@ class PostProcessorWorker:
        if self.use_cookie:
            cookies_to_use_for_file = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger)
        if self.skip_file_size_mb is not None:
                api_original_filename_for_size_check = file_info.get('_original_name_for_log', file_info.get('name'))
                try:
                        # Use a HEAD request (following redirects) to read the response headers without downloading the body
                        with requests.head(file_url, headers=file_download_headers, timeout=15, cookies=cookies_to_use_for_file, allow_redirects=True) as head_response:
                                head_response.raise_for_status()
                                content_length = head_response.headers.get('Content-Length')
                                if content_length:
                                        file_size_bytes = int(content_length)
                                        file_size_mb = file_size_bytes / (1024 * 1024)
                                        if file_size_mb < self.skip_file_size_mb:
                                                self.logger(f"   -> Skip File (Size): '{api_original_filename_for_size_check}' is {file_size_mb:.2f} MB, which is smaller than the {self.skip_file_size_mb} MB limit.")
                                                return 0, 1, api_original_filename_for_size_check, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
                                else:
                                        self.logger(f"   ⚠️ Could not determine file size for '{api_original_filename_for_size_check}' to check against size limit. Proceeding with download.")
                except requests.RequestException as e:
                        self.logger(f"   ⚠️ Could not fetch file headers to check size for '{api_original_filename_for_size_check}': {e}. Proceeding with download.")
        api_original_filename = file_info.get('_original_name_for_log', file_info.get('name'))
        filename_to_save_in_main_path = ""
        if forced_filename_override:
@@ -508,18 +488,19 @@ class PostProcessorWorker:
                except requests.RequestException as e:
                    self.logger(f"   ⚠️ Could not verify size of existing file '{filename_to_save_in_main_path}': {e}. Proceeding with download.")
        max_retries = 3
        retry_delay = 5
        downloaded_size_bytes = 0
        calculated_file_hash = None
        downloaded_part_file_path = None
        total_size_bytes = 0
        download_successful_flag = False
        last_exception_for_retry_later = None
        is_permanent_error = False
        data_to_write_io = None
        response_for_this_attempt = None
        for attempt_num_single_stream in range(max_retries + 1):
-            response = None
+            response_for_this_attempt = None
            if self._check_pause(f"File download attempt for '{api_original_filename}'"): break
            if self.check_cancel() or (skip_event and skip_event.is_set()): break
            try:
@@ -538,24 +519,12 @@ class PostProcessorWorker:
                    new_url = self._find_valid_subdomain(current_url_to_try)
                    if new_url != current_url_to_try:
                        self.logger(f"   Retrying with new URL: {new_url}")
-                        file_url = new_url
+                        file_url = new_url # Update the main file_url for subsequent retries
                        response.close() # Close the old response
                        response = requests.get(new_url, headers=file_download_headers, timeout=(30, 300), stream=True, cookies=cookies_to_use_for_file)
                response.raise_for_status()
                # --- REVISED AND MOVED SIZE CHECK LOGIC ---
                total_size_bytes = int(response.headers.get('Content-Length', 0))
                if self.skip_file_size_mb is not None:
                    if total_size_bytes > 0:
                        file_size_mb = total_size_bytes / (1024 * 1024)
                        if file_size_mb < self.skip_file_size_mb:
                            self.logger(f"   -> Skip File (Size): '{api_original_filename}' is {file_size_mb:.2f} MB, which is smaller than the {self.skip_file_size_mb} MB limit.")
                            return 0, 1, api_original_filename, False, FILE_DOWNLOAD_STATUS_SKIPPED, None
                    # If Content-Length is missing, we can't check, so we no longer log a warning here and just proceed.
                # --- END OF REVISED LOGIC ---
                num_parts_for_file = min(self.multipart_parts_count, MAX_PARTS_FOR_MULTIPART_DOWNLOAD)
                file_is_eligible_by_scope = False
@@ -579,7 +548,9 @@ class PostProcessorWorker:
                if self._check_pause(f"Multipart decision for '{api_original_filename}'"): break
                if attempt_multipart:
-                    response.close() # Close the initial connection before starting multipart
+                    if response_for_this_attempt:
                        response_for_this_attempt.close()
                        response_for_this_attempt = None
                    mp_save_path_for_unique_part_stem_arg = os.path.join(target_folder_path, f"{unique_part_file_stem_on_disk}{temp_file_ext_for_unique_part}")
                    mp_success, mp_bytes, mp_hash, mp_file_handle = download_file_in_parts(
                        file_url, mp_save_path_for_unique_part_stem_arg, total_size_bytes, num_parts_for_file, file_download_headers, api_original_filename,
@@ -605,6 +576,7 @@ class PostProcessorWorker:
                    current_attempt_downloaded_bytes = 0
                    md5_hasher = hashlib.md5()
                    last_progress_time = time.time()
                    single_stream_exception = None
                    try:
                        with open(current_single_stream_part_path, 'wb') as f_part:
                            for chunk in response.iter_content(chunk_size=1 * 1024 * 1024):
@@ -671,8 +643,8 @@ class PostProcessorWorker:
                is_permanent_error = True                
                break
            finally:
-                if response:
+                if response_for_this_attempt:
-                    response.close()
+                    response_for_this_attempt.close()
                self._emit_signal('file_download_status', False)
        final_total_for_progress = total_size_bytes if download_successful_flag and total_size_bytes > 0 else downloaded_size_bytes
@@ -876,37 +848,6 @@ class PostProcessorWorker:
            post_data = self.post  # Reference to the post object
            log_prefix = "Post"
        # --- FIX: FETCH FULL POST DATA IF CONTENT IS MISSING BUT NEEDED ---
        content_is_needed = (
            self.show_external_links or
            self.extract_links_only or
            self.scan_content_for_images or
            (self.filter_mode == 'text_only' and self.text_only_scope == 'content')
        )
        if content_is_needed and self.post.get('content') is None and self.service != 'discord':
            self.logger(f"   Post {post_id} is missing 'content' field, fetching full data...")
            parsed_url = urlparse(self.api_url_input)
            api_domain = parsed_url.netloc
            headers = {'User-Agent': 'Mozilla/5.0'}
            cookies = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger, target_domain=api_domain)
            full_post_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
            if full_post_data:
                self.logger("   ✅ Full post data fetched successfully.")
                # Update the worker's post object with the complete data
                self.post = full_post_data
                # Re-initialize local variables from the new, complete post data
                post_title = self.post.get('title', '') or 'untitled_post'
                post_main_file_info = self.post.get('file')
                post_attachments = self.post.get('attachments', [])
                post_content_html = self.post.get('content', '')
                post_data = self.post
            else:
                self.logger(f"   ⚠️ Failed to fetch full content for post {post_id}. Content-dependent features may not work for this post.")
        # --- END FIX ---
        # 2. SHARED PROCESSING LOGIC: The rest of the function now uses the consistent variables from above.
        result_tuple = (0, 0, [], [], [], None, None)
        total_downloaded_this_post = 0
@@ -1317,6 +1258,7 @@ class PostProcessorWorker:
                    parsed_url = urlparse(self.api_url_input)
                    api_domain = parsed_url.netloc
                    cookies = prepare_cookies_for_request(self.use_cookie, self.cookie_text, self.selected_cookie_file, self.app_base_dir, self.logger, target_domain=api_domain)
                    from .api_client import fetch_single_post_data
                    full_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
                    if full_data:
                        final_post_data = full_data
@@ -1993,9 +1935,7 @@ class DownloadThread(QThread):
                 project_root_dir=None,
                 processed_post_ids=None,
                 start_offset=0,
-                 fetch_first=False,
+                 fetch_first=False): 
                 skip_file_size_mb=None
                 ): 
        super().__init__()
        self.api_url_input = api_url_input
        self.output_dir = output_dir
@@ -2062,7 +2002,6 @@ class DownloadThread(QThread):
        self.processed_post_ids_set = set(processed_post_ids) if processed_post_ids is not None else set() 
        self.start_offset = start_offset 
        self.fetch_first = fetch_first
        self.skip_file_size_mb = skip_file_size_mb
        if self.compress_images and Image is None:
            self.logger("⚠️ Image compression disabled: Pillow library not found (DownloadThread).")
@@ -2183,7 +2122,6 @@ class DownloadThread(QThread):
                        'single_pdf_mode': self.single_pdf_mode,
                        'multipart_parts_count': self.multipart_parts_count, 
                        'multipart_min_size_mb': self.multipart_min_size_mb, 
                        'skip_file_size_mb': self.skip_file_size_mb, 
                        'project_root_dir': self.project_root_dir,
                    }
--- a/src/ui/main_window.py
+++ b/src/ui/main_window.py
@@ -281,7 +281,7 @@ class DownloaderApp (QWidget ):
        self.download_location_label_widget = None
        self.remove_from_filename_label_widget = None
        self.skip_words_label_widget = None
-        self.setWindowTitle("Kemono Downloader v6.4.3")
+        self.setWindowTitle("Kemono Downloader v6.3.1")
        setup_ui(self)
        self._connect_signals()
        self.log_signal.emit("ℹ️ Local API server functionality has been removed.")
@@ -688,12 +688,8 @@ class DownloaderApp (QWidget ):
            return
        self.fetched_posts_for_download = fetched_posts
-        self.is_ready_to_download_fetched = True
+        self.is_ready_to_download_fetched = True  # <-- ADD THIS LINE
        self.log_signal.emit(f"✅ Fetch complete. Found {len(self.fetched_posts_for_download)} posts.")
        self.log_signal.emit("=" * 40)
        self.log_signal.emit("✅ Stage 1 complete. All post data has been fetched.")
        self.log_signal.emit("   💡 You can now disconnect your VPN (if used) before starting the download.")
        self.log_signal.emit("   Press the 'Start Download' button to begin Stage 2: Downloading files.")
        self.progress_label.setText(f"Found {len(self.fetched_posts_for_download)} posts. Ready to download.")
        self._update_button_states_and_connections()
@@ -704,9 +700,7 @@ class DownloaderApp (QWidget ):
        Initiates the download of the posts that were previously fetched.
        """
        self.is_ready_to_download_fetched = False  # Reset the state flag
-        self.log_signal.emit("=" * 40)
+        self.log_signal.emit(f"🚀 Starting download of {len(self.fetched_posts_for_download)} fetched posts...")
        self.log_signal.emit(f"🚀 Starting Stage 2: Downloading files for {len(self.fetched_posts_for_download)} fetched posts.")
        self.log_signal.emit("   💡 If you disconnected your VPN, downloads will now use your regular connection.")
        # Manually set the UI to a "downloading" state for reliability
        self.set_ui_enabled(False)
@@ -3360,8 +3354,7 @@ class DownloaderApp (QWidget ):
                            'pause_event': self.pause_event, 'cancellation_event': self.cancellation_event,
                            'downloaded_files': self.downloaded_files, 'downloaded_file_hashes': self.downloaded_file_hashes,
                            'downloaded_files_lock': self.downloaded_files_lock, 'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
-                            'skip_words_list': [part.strip().lower() for part in self.skip_words_input.text().strip().split(',') if part.strip() and not part.strip().startswith('[')],
+                            'skip_words_list': [word.strip().lower() for word in self.skip_words_input.text().strip().split(',') if word.strip()],
                            'skip_file_size_mb': next((int(re.search(r'\[(\d+)\]', part).group(1)) for part in self.skip_words_input.text().strip().split(',') if re.fullmatch(r'\[\d+\]', part.strip())), None),
                            'skip_words_scope': self.get_skip_words_scope(), 'char_filter_scope': self.get_char_filter_scope(),
                            'remove_from_filename_words_list': [word.strip() for word in self.remove_from_filename_input.text().strip().split(',') if word.strip()],
                            'scan_content_for_images': self.scan_content_images_checkbox.isChecked(),
@@ -3530,19 +3523,8 @@ class DownloaderApp (QWidget ):
                    self.thread_count_input.selectAll()
                    return False
-        raw_skip_words_text = self.skip_words_input.text().strip()
+        raw_skip_words = self.skip_words_input.text().strip()
-        skip_words_parts = [part.strip() for part in raw_skip_words_text.split(',') if part.strip()]
+        skip_words_list = [word.strip().lower() for word in raw_skip_words.split(',') if word.strip()]
        skip_words_list = []
        skip_file_size_mb = None
        size_pattern = re.compile(r'\[(\d+)\]')
        for part in skip_words_parts:
            match = size_pattern.fullmatch(part)
            if match:
                skip_file_size_mb = int(match.group(1))
                self.log_signal.emit(f"ℹ️ File size skip rule found: Will skip files smaller than {skip_file_size_mb} MB.")
            else:
                skip_words_list.append(part.lower())
        raw_remove_filename_words = self.remove_from_filename_input.text().strip() if hasattr(self, 'remove_from_filename_input') else ""
        allow_multipart = self.allow_multipart_download_setting
@@ -3909,7 +3891,6 @@ class DownloaderApp (QWidget ):
            'downloaded_file_hashes': self.downloaded_file_hashes,
            'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
            'skip_words_list': skip_words_list,
            'skip_file_size_mb': skip_file_size_mb,
            'skip_words_scope': current_skip_words_scope,
            'remove_from_filename_words_list': remove_from_filename_words_list,
            'char_filter_scope': current_char_filter_scope,
@@ -3960,9 +3941,7 @@ class DownloaderApp (QWidget ):
        self.last_start_download_args = args_template.copy()
        if fetch_first_enabled and not post_id_from_url:
-            self.log_signal.emit("=" * 40)
+            self.log_signal.emit("🚀 Starting Stage 1: Fetching all pages...")
            self.log_signal.emit("🚀 'Fetch First' mode is active. Starting Stage 1: Fetching all post data.")
            self.log_signal.emit("   💡 If you are using a VPN for this stage, ensure it is connected now.")
            self.is_fetching_only = True
            self.set_ui_enabled(False)
            self._update_button_states_and_connections()
@@ -5505,8 +5484,7 @@ class DownloaderApp (QWidget ):
            'downloaded_files_lock': self.downloaded_files_lock,
            'downloaded_file_hashes_lock': self.downloaded_file_hashes_lock,
            'dynamic_character_filter_holder': self.dynamic_character_filter_holder,
-            'skip_words_list': [part.strip().lower() for part in self.skip_words_input.text().strip().split(',') if part.strip() and not part.strip().startswith('[')],
+            'skip_words_list': [word.strip().lower() for word in self.skip_words_input.text().strip().split(',') if word.strip()],
            'skip_file_size_mb': next((int(re.search(r'\[(\d+)\]', part).group(1)) for part in self.skip_words_input.text().strip().split(',') if re.fullmatch(r'\[\d+\]', part.strip())), None),
            'skip_words_scope': self.get_skip_words_scope(), 
            'show_external_links': self.external_links_checkbox.isChecked(),
            'extract_links_only': self.radio_only_links.isChecked(),