mirror of https://github.com/Yuvi9587/Kemono-Downloader.git
@@ -3,8 +3,6 @@ import traceback
from urllib.parse import urlparse
import json # Ensure json is imported
import requests

# (Keep the rest of your imports)
from ..utils.network_utils import extract_post_info, prepare_cookies_for_request
from ..config.constants import (
    STYLE_DATE_POST_TITLE
@@ -25,9 +23,6 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
            raise RuntimeError("Fetch operation cancelled by user while paused.")
        time.sleep(0.5)
    logger("   Post fetching resumed.")

    # --- MODIFICATION: Added `fields` to the URL to request only metadata ---
    # This prevents the large 'content' field from being included in the list, avoiding timeouts.
    fields_to_request = "id,user,service,title,shared_file,added,published,edited,file,attachments,tags"
    paginated_url = f'{api_url_base}?o={offset}&fields={fields_to_request}'

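For illustration, a minimal standalone sketch of the metadata-only pagination this hunk introduces. It is not part of the commit; it assumes the endpoint honors the `o` (offset) and `fields` query parameters exactly as the comment above describes, and the helper name is invented.

import requests

def fetch_post_metadata_page(api_url_base, offset=0):
    """Fetch one page of posts, requesting metadata fields only.

    Omitting the large 'content' field keeps the response small,
    which is the timeout fix this commit is making.
    """
    fields = "id,user,service,title,shared_file,added,published,edited,file,attachments,tags"
    url = f"{api_url_base}?o={offset}&fields={fields}"
    response = requests.get(url, timeout=(15, 60))
    response.raise_for_status()
    return response.json()
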
@@ -44,7 +39,6 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
    logger(log_message)

    try:
        # We can now remove the streaming logic as the response will be small and fast.
        response = requests.get(paginated_url, headers=headers, timeout=(15, 60), cookies=cookies_dict)
        response.raise_for_status()
        return response.json()
@@ -80,7 +74,6 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
    post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{post_id}"
    logger(f"   Fetching full content for post ID {post_id}...")
    try:
        # Use streaming here as a precaution for single posts that are still very large.
        with requests.get(post_api_url, headers=headers, timeout=(15, 300), cookies=cookies_dict, stream=True) as response:
            response.raise_for_status()
            response_body = b""
@@ -88,7 +81,6 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
                response_body += chunk

            full_post_data = json.loads(response_body)
            # The API sometimes wraps the post in a list, handle that.
            if isinstance(full_post_data, list) and full_post_data:
                return full_post_data[0]
            return full_post_data
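As an aside, the streaming pattern used above can be shown in a compact, self-contained form. This is illustrative only; the chunk size and helper name are assumptions, not the project's API.

import json
import requests

def read_large_json(url, headers=None, cookies=None):
    # Stream the body in chunks so one very large post does not stall a
    # single blocking read; assemble the bytes before decoding as JSON.
    with requests.get(url, headers=headers, cookies=cookies,
                      timeout=(15, 300), stream=True) as response:
        response.raise_for_status()
        body = b""
        for chunk in response.iter_content(chunk_size=65536):
            if chunk:
                body += chunk
    return json.loads(body)
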
@@ -134,14 +126,10 @@ def download_from_api(
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/json'
    }

    # --- ADD THIS BLOCK ---
    # Ensure processed_post_ids is a set for fast lookups
    if processed_post_ids is None:
        processed_post_ids = set()
    else:
        processed_post_ids = set(processed_post_ids)
    # --- END OF ADDITION ---

    service, user_id, target_post_id = extract_post_info(api_url_input)

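The set conversion matters because membership tests against a list are O(n) per lookup while a set averages O(1), and the check runs once per fetched post. A quick illustrative timing (sizes are hypothetical):

import timeit

ids_list = [str(i) for i in range(50_000)]
ids_set = set(ids_list)

# Worst-case membership check, repeated 1000 times each.
print(timeit.timeit(lambda: "49999" in ids_list, number=1000))  # linear scan
print(timeit.timeit(lambda: "49999" in ids_set, number=1000))   # hash lookup
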
@@ -158,11 +146,9 @@ def download_from_api(
    if use_cookie and app_base_dir:
        cookies_for_api = prepare_cookies_for_request(use_cookie, cookie_text, selected_cookie_file, app_base_dir, logger, target_domain=api_domain)
    if target_post_id:
        # --- ADD THIS CHECK FOR RESTORE ---
        if target_post_id in processed_post_ids:
            logger(f"   Skipping already processed target post ID: {target_post_id}")
            return
        # --- END OF ADDITION ---
        direct_post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{target_post_id}"
        logger(f"   Attempting direct fetch for target post: {direct_post_api_url}")
        try:
@@ -248,14 +234,12 @@ def download_from_api(
            break
        if cancellation_event and cancellation_event.is_set(): return
    if all_posts_for_manga_mode:
        # --- ADD THIS BLOCK TO FILTER POSTS IN MANGA MODE ---
        if processed_post_ids:
            original_count = len(all_posts_for_manga_mode)
            all_posts_for_manga_mode = [post for post in all_posts_for_manga_mode if post.get('id') not in processed_post_ids]
            skipped_count = original_count - len(all_posts_for_manga_mode)
            if skipped_count > 0:
                logger(f"   Manga Mode: Skipped {skipped_count} already processed post(s) before sorting.")
        # --- END OF ADDITION ---

        logger(f"   Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Sorting by publication date (oldest first)...")
        def sort_key_tuple(post):
@@ -326,15 +310,12 @@ def download_from_api(
            logger(f"❌ Unexpected error fetching page {current_page_num} (offset {current_offset}): {e}")
            traceback.print_exc()
            break

        # --- ADD THIS BLOCK TO FILTER POSTS IN STANDARD MODE ---
        if processed_post_ids:
            original_count = len(posts_batch)
            posts_batch = [post for post in posts_batch if post.get('id') not in processed_post_ids]
            skipped_count = original_count - len(posts_batch)
            if skipped_count > 0:
                logger(f"   Skipped {skipped_count} already processed post(s) from page {current_page_num}.")
        # --- END OF ADDITION ---

        if not posts_batch:
            if target_post_id and not processed_target_post_flag:

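Both the manga-mode and standard-mode hunks apply the same resume filter. Factored out, the pattern looks roughly like this (the helper name, label, and logger are illustrative, not part of the commit):

def filter_new_posts(posts, processed_post_ids, logger=print, label=""):
    """Drop posts whose IDs were already handled in a previous session."""
    if not processed_post_ids:
        return posts
    fresh = [post for post in posts if post.get('id') not in processed_post_ids]
    skipped = len(posts) - len(fresh)
    if skipped > 0:
        logger(f"   {label}Skipped {skipped} already processed post(s).")
    return fresh
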
@@ -1,13 +1,9 @@
# --- Standard Library Imports ---
import threading
import time
import os
import json
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed, Future

# --- Local Application Imports ---
# These imports reflect the new, organized project structure.
from .api_client import download_from_api
from .workers import PostProcessorWorker, DownloadThread
from ..config.constants import (
@@ -36,8 +32,6 @@ class DownloadManager:
        self.progress_queue = progress_queue
        self.thread_pool = None
        self.active_futures = []

        # --- Session State ---
        self.cancellation_event = threading.Event()
        self.pause_event = threading.Event()
        self.is_running = False
@@ -64,8 +58,6 @@ class DownloadManager:
        if self.is_running:
            self._log("❌ Cannot start a new session: A session is already in progress.")
            return

        # --- Reset state for the new session ---
        self.is_running = True
        self.cancellation_event.clear()
        self.pause_event.clear()
@@ -75,8 +67,6 @@ class DownloadManager:
        self.total_downloads = 0
        self.total_skips = 0
        self.all_kept_original_filenames = []

        # --- Decide execution strategy (multi-threaded vs. single-threaded) ---
        is_single_post = bool(config.get('target_post_id_from_initial_url'))
        use_multithreading = config.get('use_multithreading', True)
        is_manga_sequential = config.get('manga_mode_active') and config.get('manga_filename_style') in [STYLE_DATE_BASED, STYLE_POST_TITLE_GLOBAL_NUMBERING]
@@ -84,7 +74,6 @@ class DownloadManager:
        should_use_multithreading_for_posts = use_multithreading and not is_single_post and not is_manga_sequential

        if should_use_multithreading_for_posts:
            # Start a separate thread to manage fetching and queuing to the thread pool
            fetcher_thread = threading.Thread(
                target=self._fetch_and_queue_posts_for_pool,
                args=(config, restore_data),
@@ -92,16 +81,11 @@
            )
            fetcher_thread.start()
        else:
            # For single posts or sequential manga mode, use a single worker thread
            # which is simpler and ensures order.
            self._start_single_threaded_session(config)

    def _start_single_threaded_session(self, config):
        """Handles downloads that are best processed by a single worker thread."""
        self._log("ℹ️ Initializing single-threaded download process...")

        # The original DownloadThread is now a pure Python thread, not a QThread.
        # We run its `run` method in a standard Python thread.
        self.worker_thread = threading.Thread(
            target=self._run_single_worker,
            args=(config,),
@@ -112,7 +96,6 @@
    def _run_single_worker(self, config):
        """Target function for the single-worker thread."""
        try:
            # Pass the queue directly to the worker for it to send updates
            worker = DownloadThread(config, self.progress_queue)
            worker.run() # This is the main blocking call for this thread
        except Exception as e:
@@ -129,9 +112,6 @@
        try:
            num_workers = min(config.get('num_threads', 4), MAX_THREADS)
            self.thread_pool = ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix='PostWorker_')

            # Fetch posts
            # In a real implementation, this would call `api_client.download_from_api`
            if restore_data:
                all_posts = restore_data['all_posts_data']
                processed_ids = set(restore_data['processed_post_ids'])
@@ -149,12 +129,9 @@
            if not posts_to_process:
                self._log("✅ No new posts to process.")
                return

            # Submit tasks to the pool
            for post_data in posts_to_process:
                if self.cancellation_event.is_set():
                    break
                # Each PostProcessorWorker gets the queue to send its own updates
                worker = PostProcessorWorker(post_data, config, self.progress_queue)
                future = self.thread_pool.submit(worker.process)
                future.add_done_callback(self._handle_future_result)
@@ -164,27 +141,32 @@
            self._log(f"❌ CRITICAL ERROR in post fetcher thread: {e}")
            self._log(traceback.format_exc())
        finally:
            # Wait for all submitted tasks to complete before shutting down
            if self.thread_pool:
                self.thread_pool.shutdown(wait=True)
            self.is_running = False
            self._log("🏁 All processing tasks have completed.")
            # Emit final signal
            self.progress_queue.put({
                'type': 'finished',
                'payload': (self.total_downloads, self.total_skips, self.cancellation_event.is_set(), self.all_kept_original_filenames)
            })


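The fetch/submit/callback lifecycle above follows a standard ThreadPoolExecutor pattern: submit tasks, attach a done-callback per future, then block on shutdown and publish a final event. A self-contained sketch of just that shape (all names here are illustrative, not the project's API):

import queue
import threading
from concurrent.futures import ThreadPoolExecutor

progress_queue = queue.Queue()
cancel = threading.Event()

def process(item):
    return f"done: {item}"

def on_done(future):
    # Runs in whichever worker thread finished the task.
    progress_queue.put({'type': 'result', 'payload': future.result()})

pool = ThreadPoolExecutor(max_workers=4, thread_name_prefix='PostWorker_')
for item in ["a", "b", "c"]:
    if cancel.is_set():
        break
    pool.submit(process, item).add_done_callback(on_done)

pool.shutdown(wait=True)           # block until all submitted tasks finish
progress_queue.put({'type': 'finished'})
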
    def _get_all_posts(self, config):
        """Helper to fetch all posts using the API client."""
        all_posts = []
        # This generator yields batches of posts
        post_generator = download_from_api(
            api_url_input=config['api_url'],
            logger=self._log,
            # ... pass other relevant config keys ...
            start_page=config.get('start_page'),
            end_page=config.get('end_page'),
            manga_mode=config.get('manga_mode_active', False),
            cancellation_event=self.cancellation_event,
            pause_event=self.pause_event
            pause_event=self.pause_event,
            use_cookie=config.get('use_cookie', False),
            cookie_text=config.get('cookie_text', ''),
            selected_cookie_file=config.get('selected_cookie_file'),
            app_base_dir=config.get('app_base_dir'),
            manga_filename_style_for_sort_check=config.get('manga_filename_style'),
            processed_post_ids=config.get('processed_post_ids', [])
        )
        for batch in post_generator:
            all_posts.extend(batch)
@@ -203,14 +185,11 @@ class DownloadManager:
                self.total_skips += 1
            else:
                result = future.result()
                # Unpack result tuple from the worker
                (dl_count, skip_count, kept_originals,
                 retryable, permanent, history) = result
                self.total_downloads += dl_count
                self.total_skips += skip_count
                self.all_kept_original_filenames.extend(kept_originals)

                # Queue up results for UI to handle
                if retryable:
                    self.progress_queue.put({'type': 'retryable_failure', 'payload': (retryable,)})
                if permanent:
@@ -221,8 +200,6 @@
        except Exception as e:
            self._log(f"❌ Worker task resulted in an exception: {e}")
            self.total_skips += 1 # Count errored posts as skipped

        # Update overall progress
        self.progress_queue.put({'type': 'overall_progress', 'payload': (self.total_posts, self.processed_posts)})

    def cancel_session(self):
@@ -231,11 +208,7 @@
            return
        self._log("⚠️ Cancellation requested by user...")
        self.cancellation_event.set()

        # For single thread mode, the worker checks the event
        # For multi-thread mode, shut down the pool
        if self.thread_pool:
            # Don't wait, just cancel pending futures and let the fetcher thread exit
            self.thread_pool.shutdown(wait=False, cancel_futures=True)

        self.is_running = False

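cancel_session combines a cooperative flag (running tasks poll the event) with a non-blocking pool shutdown (queued tasks are dropped). In isolation, the idea looks like this sketch, which assumes Python 3.9+ for the cancel_futures parameter:

import threading
import time
from concurrent.futures import ThreadPoolExecutor

cancel = threading.Event()
pool = ThreadPoolExecutor(max_workers=2)

def worker(steps):
    for _ in range(steps):
        if cancel.is_set():        # running tasks poll the event...
            return "cancelled"
        time.sleep(0.1)
    return "finished"

futures = [pool.submit(worker, 50) for _ in range(8)]
cancel.set()                                     # ...and stop themselves
pool.shutdown(wait=False, cancel_futures=True)   # queued tasks never start
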
@@ -1,4 +1,3 @@
# --- Standard Library Imports ---
import os
import queue
import re
@@ -15,15 +14,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError,
from io import BytesIO
from urllib.parse import urlparse
import requests
# --- Third-Party Library Imports ---
try:
    from PIL import Image
except ImportError:
    Image = None
#
try:
    from fpdf import FPDF
    # Add a simple class to handle the header/footer for stories
    class PDF(FPDF):
        def header(self):
            pass # No header
@@ -39,16 +35,12 @@ try:
    from docx import Document
except ImportError:
    Document = None

# --- PyQt5 Imports ---
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QMutex, QMutexLocker, QObject, QTimer, QSettings, QStandardPaths, QCoreApplication, QUrl, QSize, QProcess
# --- Local Application Imports ---
from .api_client import download_from_api, fetch_post_comments
from ..services.multipart_downloader import download_file_in_parts, MULTIPART_DOWNLOADER_AVAILABLE
from ..services.drive_downloader import (
    download_mega_file, download_gdrive_file, download_dropbox_file
)
# Corrected Imports:
from ..utils.file_utils import (
    is_image, is_video, is_zip, is_rar, is_archive, is_audio, KNOWN_NAMES,
    clean_filename, clean_folder_name
@@ -567,10 +559,8 @@ class PostProcessorWorker:
        with self.downloaded_hash_counts_lock:
            current_count = self.downloaded_hash_counts.get(calculated_file_hash, 0)

            # Default to not skipping
            decision_to_skip = False

            # Apply logic based on mode
            if self.keep_duplicates_mode == DUPLICATE_HANDLING_HASH:
                if current_count >= 1:
                    decision_to_skip = True
@@ -581,12 +571,10 @@
                    decision_to_skip = True
                    self.logger(f"   -> Skip (Duplicate Limit Reached): Limit of {self.keep_duplicates_limit} for this file content has been met. Discarding.")

            # If we are NOT skipping this file, we MUST increment the count.
            if not decision_to_skip:
                self.downloaded_hash_counts[calculated_file_hash] = current_count + 1

            should_skip = decision_to_skip
            # --- End of Final Corrected Logic ---

        if should_skip:
            if downloaded_part_file_path and os.path.exists(downloaded_part_file_path):
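The bookkeeping above reads the count, decides, and increments only when the file is kept, all under one lock so concurrent workers cannot double-count a hash. Reduced to its essentials (a sketch; the function name and default limit are illustrative):

import threading

hash_counts = {}
hash_counts_lock = threading.Lock()

def should_keep(file_hash, limit=1):
    """Return True if this content hash is still under its duplicate limit."""
    with hash_counts_lock:
        count = hash_counts.get(file_hash, 0)
        if count >= limit:
            return False                     # over the limit: skip, do NOT increment
        hash_counts[file_hash] = count + 1   # keeping it, so record this copy
        return True
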
@@ -678,9 +666,14 @@ class PostProcessorWorker:
        else:
            self.logger(f"->>Download Fail for '{api_original_filename}' (Post ID: {original_post_id_for_log}). No successful download after retries.")
            details_for_failure = {
                'file_info': file_info, 'target_folder_path': target_folder_path, 'headers': headers,
                'original_post_id_for_log': original_post_id_for_log, 'post_title': post_title,
                'file_index_in_post': file_index_in_post, 'num_files_in_this_post': num_files_in_this_post
                'file_info': file_info,
                'target_folder_path': target_folder_path,
                'headers': headers,
                'original_post_id_for_log': original_post_id_for_log,
                'post_title': post_title,
                'file_index_in_post': file_index_in_post,
                'num_files_in_this_post': num_files_in_this_post,
                'forced_filename_override': filename_to_save_in_main_path
            }
            if is_permanent_error:
                return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION, details_for_failure
@@ -1040,7 +1033,9 @@ class PostProcessorWorker:
            return result_tuple

        raw_text_content = ""
        comments_data = []
        final_post_data = post_data

        if self.text_only_scope == 'content' and 'content' not in final_post_data:
            self.logger(f"   Post {post_id} is missing 'content' field, fetching full data...")
            parsed_url = urlparse(self.api_url_input)
@@ -1050,6 +1045,7 @@
            full_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
            if full_data:
                final_post_data = full_data

        if self.text_only_scope == 'content':
            raw_text_content = final_post_data.get('content', '')
        elif self.text_only_scope == 'comments':
@@ -1060,46 +1056,46 @@
                if comments_data:
                    comment_texts = []
                    for comment in comments_data:
                        user = comment.get('user', {}).get('name', 'Unknown User')
                        timestamp = comment.get('updated', 'No Date')
                        user = comment.get('commenter_name', 'Unknown User')
                        timestamp = comment.get('published', 'No Date')
                        body = strip_html_tags(comment.get('content', ''))
                        comment_texts.append(f"--- Comment by {user} on {timestamp} ---\n{body}\n")
                    raw_text_content = "\n".join(comment_texts)
                else:
                    raw_text_content = ""
            except Exception as e:
                self.logger(f"   ❌ Error fetching comments for text-only mode: {e}")
        if not raw_text_content or not raw_text_content.strip():
        cleaned_text = ""
        if self.text_only_scope == 'content':
            if not raw_text_content:
                cleaned_text = ""
            else:
                text_with_newlines = re.sub(r'(?i)</p>|<br\s*/?>', '\n', raw_text_content)
                just_text = re.sub(r'<.*?>', '', text_with_newlines)
                cleaned_text = html.unescape(just_text).strip()
        else:
            cleaned_text = raw_text_content

        cleaned_text = cleaned_text.replace('…', '...')

        if not cleaned_text.strip():
            self.logger("   -> Skip Saving Text: No content/comments found or fetched.")
            result_tuple = (0, num_potential_files_in_post, [], [], [], None, None)
            return result_tuple

        paragraph_pattern = re.compile(r'<p.*?>(.*?)</p>', re.IGNORECASE | re.DOTALL)
        html_paragraphs = paragraph_pattern.findall(raw_text_content)
        cleaned_text = ""
        if not html_paragraphs:
            self.logger("   ⚠️ No <p> tags found. Falling back to basic HTML cleaning for the whole block.")
            text_with_br = re.sub(r'<br\s*/?>', '\n', raw_text_content, flags=re.IGNORECASE)
            cleaned_text = re.sub(r'<.*?>', '', text_with_br)
        else:
            cleaned_paragraphs_list = []
            for p_content in html_paragraphs:
                p_with_br = re.sub(r'<br\s*/?>', '\n', p_content, flags=re.IGNORECASE)
                p_cleaned = re.sub(r'<.*?>', '', p_with_br)
                p_final = html.unescape(p_cleaned).strip()
                if p_final:
                    cleaned_paragraphs_list.append(p_final)
            cleaned_text = '\n\n'.join(cleaned_paragraphs_list)
        cleaned_text = cleaned_text.replace('…', '...')

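To see what the new regex-based cleaning does, here is a quick illustrative run on a made-up HTML snippet (not data from the project):

import html
import re

raw = "<p>Hello &amp; welcome.<br/>Line two.</p><p><em>Second</em> paragraph\u2026</p>"

# Same approach as the hunk above: </p> and <br> become newlines,
# remaining tags are stripped, entities are unescaped.
text = re.sub(r'(?i)</p>|<br\s*/?>', '\n', raw)
text = re.sub(r'<.*?>', '', text)
text = html.unescape(text).strip().replace('…', '...')
print(text)
# Hello & welcome.
# Line two.
# Second paragraph...
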
        if self.single_pdf_mode:
            if not cleaned_text:
                result_tuple = (0, 0, [], [], [], None, None)
                return result_tuple
            content_data = {
                'title': post_title,
                'content': cleaned_text,
                'published': self.post.get('published') or self.post.get('added')
            }
            if self.text_only_scope == 'comments':
                if not comments_data: return (0, 0, [], [], [], None, None)
                content_data['comments'] = comments_data
            else:
                if not cleaned_text.strip(): return (0, 0, [], [], [], None, None)
                content_data['content'] = cleaned_text

            temp_dir = os.path.join(self.app_base_dir, "appdata")
            os.makedirs(temp_dir, exist_ok=True)
            temp_filename = f"tmp_{post_id}_{uuid.uuid4().hex[:8]}.json"
@@ -1107,13 +1103,11 @@
            try:
                with open(temp_filepath, 'w', encoding='utf-8') as f:
                    json.dump(content_data, f, indent=2)
                self.logger(f"   Saved temporary text for '{post_title}' for single PDF compilation.")
                result_tuple = (0, 0, [], [], [], None, temp_filepath)
                return result_tuple
                self.logger(f"   Saved temporary data for '{post_title}' for single PDF compilation.")
                return (0, 0, [], [], [], None, temp_filepath)
            except Exception as e:
                self.logger(f"   ❌ Failed to write temporary file for single PDF: {e}")
                result_tuple = (0, 0, [], [], [], None, None)
                return result_tuple
                return (0, 0, [], [], [], None, None)
        else:
            file_extension = self.text_export_format
            txt_filename = clean_filename(post_title) + f".{file_extension}"
@@ -1125,27 +1119,63 @@
            while os.path.exists(final_save_path):
                final_save_path = f"{base}_{counter}{ext}"
                counter += 1

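The collision loop above guarantees a unique path by suffixing _1, _2, and so on before the extension. As a standalone helper (name and shape are illustrative):

import os

def unique_path(path):
    """Append _1, _2, ... before the extension until the path is unused."""
    base, ext = os.path.splitext(path)
    counter, candidate = 1, path
    while os.path.exists(candidate):
        candidate = f"{base}_{counter}{ext}"
        counter += 1
    return candidate
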
            if file_extension == 'pdf':
                if FPDF:
                    self.logger(f"   Converting to PDF...")
                    self.logger(f"   Creating formatted PDF for {'comments' if self.text_only_scope == 'comments' else 'content'}...")
                    pdf = PDF()
                    font_path = ""
                    bold_font_path = ""
                    if self.project_root_dir:
                        font_path = os.path.join(self.project_root_dir, 'data', 'dejavu-sans', 'DejaVuSans.ttf')
                        bold_font_path = os.path.join(self.project_root_dir, 'data', 'dejavu-sans', 'DejaVuSans-Bold.ttf')

                    try:
                        if not os.path.exists(font_path): raise RuntimeError(f"Font file not found: {font_path}")
                        if not os.path.exists(bold_font_path): raise RuntimeError(f"Bold font file not found: {bold_font_path}")
                        pdf.add_font('DejaVu', '', font_path, uni=True)
                        pdf.set_font('DejaVu', '', 12)
                        pdf.add_font('DejaVu', 'B', bold_font_path, uni=True)
                        default_font_family = 'DejaVu'
                    except Exception as font_error:
                        self.logger(f"   ⚠️ Could not load DejaVu font: {font_error}. Falling back to Arial.")
                        pdf.set_font('Arial', '', 12)
                        default_font_family = 'Arial'

                    pdf.add_page()
                    pdf.multi_cell(0, 5, cleaned_text)
                    pdf.set_font(default_font_family, 'B', 16)
                    pdf.multi_cell(0, 10, post_title)
                    pdf.ln(10)

                    if self.text_only_scope == 'comments':
                        if not comments_data:
                            self.logger("   -> Skip PDF Creation: No comments to process.")
                            return (0, num_potential_files_in_post, [], [], [], None, None)
                        for i, comment in enumerate(comments_data):
                            user = comment.get('commenter_name', 'Unknown User')
                            timestamp = comment.get('published', 'No Date')
                            body = strip_html_tags(comment.get('content', ''))
                            pdf.set_font(default_font_family, '', 10)
                            pdf.write(8, "Comment by: ")
                            pdf.set_font(default_font_family, 'B', 10)
                            pdf.write(8, user)
                            pdf.set_font(default_font_family, '', 10)
                            pdf.write(8, f" on {timestamp}")
                            pdf.ln(10)
                            pdf.set_font(default_font_family, '', 11)
                            pdf.multi_cell(0, 7, body)
                            if i < len(comments_data) - 1:
                                pdf.ln(5)
                                pdf.cell(0, 0, '', border='T')
                                pdf.ln(5)
                    else:
                        pdf.set_font(default_font_family, '', 12)
                        pdf.multi_cell(0, 7, cleaned_text)

                    pdf.output(final_save_path)
                else:
                    self.logger(f"   ⚠️ Cannot create PDF: 'fpdf2' library not installed. Saving as .txt.")
                    final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
                    with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)

            elif file_extension == 'docx':
                if Document:
                    self.logger(f"   Converting to DOCX...")
@@ -1156,12 +1186,15 @@
                    self.logger(f"   ⚠️ Cannot create DOCX: 'python-docx' library not installed. Saving as .txt.")
                    final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
                    with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)
            else:

            else: # TXT file
                with open(final_save_path, 'w', encoding='utf-8') as f:
                    f.write(cleaned_text)

            self.logger(f"✅ Saved Text: '{os.path.basename(final_save_path)}' in '{os.path.basename(determined_post_save_path_for_history)}'")
            result_tuple = (1, num_potential_files_in_post, [], [], [], history_data_for_this_post, None)
            return result_tuple

        except Exception as e:
            self.logger(f"   ❌ Critical error saving text file '{txt_filename}': {e}")
            result_tuple = (0, num_potential_files_in_post, [], [], [], None, None)
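The font handling in the PDF hunk is the usual fpdf2 recipe for Unicode text: register a TTF, fall back to a core font if the file is missing. A minimal sketch of that recipe (the font path is an assumption; uni=True is deprecated in recent fpdf2 but matches the commit's usage):

from fpdf import FPDF

pdf = FPDF()
try:
    # A Unicode TTF is required for text outside Latin-1, e.g. '…' or CJK.
    pdf.add_font('DejaVu', '', 'data/dejavu-sans/DejaVuSans.ttf', uni=True)
    family = 'DejaVu'
except Exception:
    family = 'Arial'   # core font, Latin-1 only

pdf.add_page()
pdf.set_font(family, 'B', 16)
pdf.multi_cell(0, 10, "Post title")
pdf.set_font(family, '', 12)
pdf.multi_cell(0, 7, "Body text...")
pdf.output("example.pdf")
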
@@ -1263,7 +1296,6 @@ class PostProcessorWorker:
        if self.keep_duplicates_mode == DUPLICATE_HANDLING_HASH:
            unique_files_by_url = {}
            for file_info in all_files_from_post_api:
                # Use the file URL as a unique key to avoid processing the same file multiple times
                file_url = file_info.get('url')
                if file_url and file_url not in unique_files_by_url:
                    unique_files_by_url[file_url] = file_info
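Keyed on the URL, this keeps the first file_info seen for each distinct file while preserving order (dicts preserve insertion order in Python 3.7+). With illustrative data:

files = [
    {'url': 'https://example.com/a.jpg', 'name': 'a.jpg'},
    {'url': 'https://example.com/a.jpg', 'name': 'a_copy.jpg'},
    {'url': 'https://example.com/b.jpg', 'name': 'b.jpg'},
]
unique = {}
for info in files:
    url = info.get('url')
    if url and url not in unique:
        unique[url] = info
print(list(unique.values()))  # keeps a.jpg and b.jpg; drops a_copy.jpg
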
@@ -1734,7 +1766,6 @@ class DownloadThread(QThread):

        worker_signals_obj = PostProcessorSignals()
        try:
            # Connect signals
            worker_signals_obj.progress_signal.connect(self.progress_signal)
            worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal)
            worker_signals_obj.file_progress_signal.connect(self.file_progress_signal)
@@ -1771,8 +1802,6 @@
                    was_process_cancelled = True
                    break

                # --- START OF FIX: Explicitly build the arguments dictionary ---
                # This robustly maps all thread attributes to the correct worker parameters.
                worker_args = {
                    'post_data': individual_post_data,
                    'emitter': worker_signals_obj,
@@ -1833,7 +1862,6 @@
                    'single_pdf_mode': self.single_pdf_mode,
                    'project_root_dir': self.project_root_dir,
                }
                # --- END OF FIX ---

                post_processing_worker = PostProcessorWorker(**worker_args)

@@ -1860,6 +1888,7 @@
            if not was_process_cancelled and not self.isInterruptionRequested():
                self.logger("✅ All posts processed or end of content reached by DownloadThread.")


        except Exception as main_thread_err:
            self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
            traceback.print_exc()

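The "explicit arguments dictionary" fix works because ** unpacking fails loudly on any misspelled or missing parameter, making the thread-to-worker mapping auditable in one place. The shape of the pattern, with invented names:

class Worker:
    def __init__(self, post_data, emitter, single_pdf_mode=False):
        self.post_data = post_data
        self.emitter = emitter
        self.single_pdf_mode = single_pdf_mode

worker_args = {
    'post_data': {'id': '123'},
    'emitter': object(),          # stand-in for the signals object
    'single_pdf_mode': True,
}
# A typo such as 'single_pdf_modes' would raise TypeError here,
# instead of silently passing the wrong configuration through.
worker = Worker(**worker_args)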