This commit is contained in:
Yuvi9587
2025-07-19 03:28:32 -07:00
parent 33133eb275
commit fbdae61b80
15 changed files with 194 additions and 376 deletions

View File

@@ -3,8 +3,6 @@ import traceback
from urllib.parse import urlparse
import json # Ensure json is imported
import requests
# (Keep the rest of your imports)
from ..utils.network_utils import extract_post_info, prepare_cookies_for_request
from ..config.constants import (
STYLE_DATE_POST_TITLE
@@ -25,9 +23,6 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
raise RuntimeError("Fetch operation cancelled by user while paused.")
time.sleep(0.5)
logger(" Post fetching resumed.")
# --- MODIFICATION: Added `fields` to the URL to request only metadata ---
# This prevents the large 'content' field from being included in the list, avoiding timeouts.
fields_to_request = "id,user,service,title,shared_file,added,published,edited,file,attachments,tags"
paginated_url = f'{api_url_base}?o={offset}&fields={fields_to_request}'
@@ -44,7 +39,6 @@ def fetch_posts_paginated(api_url_base, headers, offset, logger, cancellation_ev
logger(log_message)
try:
# We can now remove the streaming logic as the response will be small and fast.
response = requests.get(paginated_url, headers=headers, timeout=(15, 60), cookies=cookies_dict)
response.raise_for_status()
return response.json()
@@ -80,7 +74,6 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{post_id}"
logger(f" Fetching full content for post ID {post_id}...")
try:
# Use streaming here as a precaution for single posts that are still very large.
with requests.get(post_api_url, headers=headers, timeout=(15, 300), cookies=cookies_dict, stream=True) as response:
response.raise_for_status()
response_body = b""
@@ -88,7 +81,6 @@ def fetch_single_post_data(api_domain, service, user_id, post_id, headers, logge
response_body += chunk
full_post_data = json.loads(response_body)
# The API sometimes wraps the post in a list, handle that.
if isinstance(full_post_data, list) and full_post_data:
return full_post_data[0]
return full_post_data
@@ -134,14 +126,10 @@ def download_from_api(
'User-Agent': 'Mozilla/5.0',
'Accept': 'application/json'
}
# --- ADD THIS BLOCK ---
# Ensure processed_post_ids is a set for fast lookups
if processed_post_ids is None:
processed_post_ids = set()
else:
processed_post_ids = set(processed_post_ids)
# --- END OF ADDITION ---
service, user_id, target_post_id = extract_post_info(api_url_input)
@@ -158,11 +146,9 @@ def download_from_api(
if use_cookie and app_base_dir:
cookies_for_api = prepare_cookies_for_request(use_cookie, cookie_text, selected_cookie_file, app_base_dir, logger, target_domain=api_domain)
if target_post_id:
# --- ADD THIS CHECK FOR RESTORE ---
if target_post_id in processed_post_ids:
logger(f" Skipping already processed target post ID: {target_post_id}")
return
# --- END OF ADDITION ---
direct_post_api_url = f"https://{api_domain}/api/v1/{service}/user/{user_id}/post/{target_post_id}"
logger(f" Attempting direct fetch for target post: {direct_post_api_url}")
try:
@@ -248,14 +234,12 @@ def download_from_api(
break
if cancellation_event and cancellation_event.is_set(): return
if all_posts_for_manga_mode:
# --- ADD THIS BLOCK TO FILTER POSTS IN MANGA MODE ---
if processed_post_ids:
original_count = len(all_posts_for_manga_mode)
all_posts_for_manga_mode = [post for post in all_posts_for_manga_mode if post.get('id') not in processed_post_ids]
skipped_count = original_count - len(all_posts_for_manga_mode)
if skipped_count > 0:
logger(f" Manga Mode: Skipped {skipped_count} already processed post(s) before sorting.")
# --- END OF ADDITION ---
logger(f" Manga Mode: Fetched {len(all_posts_for_manga_mode)} total posts. Sorting by publication date (oldest first)...")
def sort_key_tuple(post):
@@ -326,15 +310,12 @@ def download_from_api(
logger(f"❌ Unexpected error fetching page {current_page_num} (offset {current_offset}): {e}")
traceback.print_exc()
break
# --- ADD THIS BLOCK TO FILTER POSTS IN STANDARD MODE ---
if processed_post_ids:
original_count = len(posts_batch)
posts_batch = [post for post in posts_batch if post.get('id') not in processed_post_ids]
skipped_count = original_count - len(posts_batch)
if skipped_count > 0:
logger(f" Skipped {skipped_count} already processed post(s) from page {current_page_num}.")
# --- END OF ADDITION ---
if not posts_batch:
if target_post_id and not processed_target_post_flag:

View File

@@ -1,13 +1,9 @@
# --- Standard Library Imports ---
import threading
import time
import os
import json
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed, Future
# --- Local Application Imports ---
# These imports reflect the new, organized project structure.
from .api_client import download_from_api
from .workers import PostProcessorWorker, DownloadThread
from ..config.constants import (
@@ -36,8 +32,6 @@ class DownloadManager:
self.progress_queue = progress_queue
self.thread_pool = None
self.active_futures = []
# --- Session State ---
self.cancellation_event = threading.Event()
self.pause_event = threading.Event()
self.is_running = False
@@ -64,8 +58,6 @@ class DownloadManager:
if self.is_running:
self._log("❌ Cannot start a new session: A session is already in progress.")
return
# --- Reset state for the new session ---
self.is_running = True
self.cancellation_event.clear()
self.pause_event.clear()
@@ -75,8 +67,6 @@ class DownloadManager:
self.total_downloads = 0
self.total_skips = 0
self.all_kept_original_filenames = []
# --- Decide execution strategy (multi-threaded vs. single-threaded) ---
is_single_post = bool(config.get('target_post_id_from_initial_url'))
use_multithreading = config.get('use_multithreading', True)
is_manga_sequential = config.get('manga_mode_active') and config.get('manga_filename_style') in [STYLE_DATE_BASED, STYLE_POST_TITLE_GLOBAL_NUMBERING]
@@ -84,7 +74,6 @@ class DownloadManager:
should_use_multithreading_for_posts = use_multithreading and not is_single_post and not is_manga_sequential
if should_use_multithreading_for_posts:
# Start a separate thread to manage fetching and queuing to the thread pool
fetcher_thread = threading.Thread(
target=self._fetch_and_queue_posts_for_pool,
args=(config, restore_data),
@@ -92,16 +81,11 @@ class DownloadManager:
)
fetcher_thread.start()
else:
# For single posts or sequential manga mode, use a single worker thread
# which is simpler and ensures order.
self._start_single_threaded_session(config)
def _start_single_threaded_session(self, config):
"""Handles downloads that are best processed by a single worker thread."""
self._log(" Initializing single-threaded download process...")
# The original DownloadThread is now a pure Python thread, not a QThread.
# We run its `run` method in a standard Python thread.
self.worker_thread = threading.Thread(
target=self._run_single_worker,
args=(config,),
@@ -112,7 +96,6 @@ class DownloadManager:
def _run_single_worker(self, config):
"""Target function for the single-worker thread."""
try:
# Pass the queue directly to the worker for it to send updates
worker = DownloadThread(config, self.progress_queue)
worker.run() # This is the main blocking call for this thread
except Exception as e:
@@ -129,9 +112,6 @@ class DownloadManager:
try:
num_workers = min(config.get('num_threads', 4), MAX_THREADS)
self.thread_pool = ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix='PostWorker_')
# Fetch posts
# In a real implementation, this would call `api_client.download_from_api`
if restore_data:
all_posts = restore_data['all_posts_data']
processed_ids = set(restore_data['processed_post_ids'])
@@ -149,12 +129,9 @@ class DownloadManager:
if not posts_to_process:
self._log("✅ No new posts to process.")
return
# Submit tasks to the pool
for post_data in posts_to_process:
if self.cancellation_event.is_set():
break
# Each PostProcessorWorker gets the queue to send its own updates
worker = PostProcessorWorker(post_data, config, self.progress_queue)
future = self.thread_pool.submit(worker.process)
future.add_done_callback(self._handle_future_result)
@@ -164,27 +141,32 @@ class DownloadManager:
self._log(f"❌ CRITICAL ERROR in post fetcher thread: {e}")
self._log(traceback.format_exc())
finally:
# Wait for all submitted tasks to complete before shutting down
if self.thread_pool:
self.thread_pool.shutdown(wait=True)
self.is_running = False
self._log("🏁 All processing tasks have completed.")
# Emit final signal
self.progress_queue.put({
'type': 'finished',
'payload': (self.total_downloads, self.total_skips, self.cancellation_event.is_set(), self.all_kept_original_filenames)
})
def _get_all_posts(self, config):
"""Helper to fetch all posts using the API client."""
all_posts = []
# This generator yields batches of posts
post_generator = download_from_api(
api_url_input=config['api_url'],
logger=self._log,
# ... pass other relevant config keys ...
start_page=config.get('start_page'),
end_page=config.get('end_page'),
manga_mode=config.get('manga_mode_active', False),
cancellation_event=self.cancellation_event,
pause_event=self.pause_event
pause_event=self.pause_event,
use_cookie=config.get('use_cookie', False),
cookie_text=config.get('cookie_text', ''),
selected_cookie_file=config.get('selected_cookie_file'),
app_base_dir=config.get('app_base_dir'),
manga_filename_style_for_sort_check=config.get('manga_filename_style'),
processed_post_ids=config.get('processed_post_ids', [])
)
for batch in post_generator:
all_posts.extend(batch)
@@ -203,14 +185,11 @@ class DownloadManager:
self.total_skips += 1
else:
result = future.result()
# Unpack result tuple from the worker
(dl_count, skip_count, kept_originals,
retryable, permanent, history) = result
self.total_downloads += dl_count
self.total_skips += skip_count
self.all_kept_original_filenames.extend(kept_originals)
# Queue up results for UI to handle
if retryable:
self.progress_queue.put({'type': 'retryable_failure', 'payload': (retryable,)})
if permanent:
@@ -221,8 +200,6 @@ class DownloadManager:
except Exception as e:
self._log(f"❌ Worker task resulted in an exception: {e}")
self.total_skips += 1 # Count errored posts as skipped
# Update overall progress
self.progress_queue.put({'type': 'overall_progress', 'payload': (self.total_posts, self.processed_posts)})
def cancel_session(self):
@@ -231,11 +208,7 @@ class DownloadManager:
return
self._log("⚠️ Cancellation requested by user...")
self.cancellation_event.set()
# For single thread mode, the worker checks the event
# For multi-thread mode, shut down the pool
if self.thread_pool:
# Don't wait, just cancel pending futures and let the fetcher thread exit
self.thread_pool.shutdown(wait=False, cancel_futures=True)
self.is_running = False

View File

@@ -1,4 +1,3 @@
# --- Standard Library Imports ---
import os
import queue
import re
@@ -15,15 +14,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError,
from io import BytesIO
from urllib .parse import urlparse
import requests
# --- Third-Party Library Imports ---
try:
from PIL import Image
except ImportError:
Image = None
#
try:
from fpdf import FPDF
# Add a simple class to handle the header/footer for stories
class PDF(FPDF):
def header(self):
pass # No header
@@ -39,16 +35,12 @@ try:
from docx import Document
except ImportError:
Document = None
# --- PyQt5 Imports ---
from PyQt5 .QtCore import Qt ,QThread ,pyqtSignal ,QMutex ,QMutexLocker ,QObject ,QTimer ,QSettings ,QStandardPaths ,QCoreApplication ,QUrl ,QSize ,QProcess
# --- Local Application Imports ---
from .api_client import download_from_api, fetch_post_comments
from ..services.multipart_downloader import download_file_in_parts, MULTIPART_DOWNLOADER_AVAILABLE
from ..services.drive_downloader import (
download_mega_file, download_gdrive_file, download_dropbox_file
)
# Corrected Imports:
from ..utils.file_utils import (
is_image, is_video, is_zip, is_rar, is_archive, is_audio, KNOWN_NAMES,
clean_filename, clean_folder_name
@@ -567,10 +559,8 @@ class PostProcessorWorker:
with self.downloaded_hash_counts_lock:
current_count = self.downloaded_hash_counts.get(calculated_file_hash, 0)
# Default to not skipping
decision_to_skip = False
# Apply logic based on mode
if self.keep_duplicates_mode == DUPLICATE_HANDLING_HASH:
if current_count >= 1:
decision_to_skip = True
@@ -581,12 +571,10 @@ class PostProcessorWorker:
decision_to_skip = True
self.logger(f" -> Skip (Duplicate Limit Reached): Limit of {self.keep_duplicates_limit} for this file content has been met. Discarding.")
# If we are NOT skipping this file, we MUST increment the count.
if not decision_to_skip:
self.downloaded_hash_counts[calculated_file_hash] = current_count + 1
should_skip = decision_to_skip
# --- End of Final Corrected Logic ---
if should_skip:
if downloaded_part_file_path and os.path.exists(downloaded_part_file_path):
@@ -678,9 +666,14 @@ class PostProcessorWorker:
else:
self.logger(f"->>Download Fail for '{api_original_filename}' (Post ID: {original_post_id_for_log}). No successful download after retries.")
details_for_failure = {
'file_info': file_info, 'target_folder_path': target_folder_path, 'headers': headers,
'original_post_id_for_log': original_post_id_for_log, 'post_title': post_title,
'file_index_in_post': file_index_in_post, 'num_files_in_this_post': num_files_in_this_post
'file_info': file_info,
'target_folder_path': target_folder_path,
'headers': headers,
'original_post_id_for_log': original_post_id_for_log,
'post_title': post_title,
'file_index_in_post': file_index_in_post,
'num_files_in_this_post': num_files_in_this_post,
'forced_filename_override': filename_to_save_in_main_path
}
if is_permanent_error:
return 0, 1, filename_to_save_in_main_path, was_original_name_kept_flag, FILE_DOWNLOAD_STATUS_FAILED_PERMANENTLY_THIS_SESSION, details_for_failure
@@ -1040,7 +1033,9 @@ class PostProcessorWorker:
return result_tuple
raw_text_content = ""
comments_data = []
final_post_data = post_data
if self.text_only_scope == 'content' and 'content' not in final_post_data:
self.logger(f" Post {post_id} is missing 'content' field, fetching full data...")
parsed_url = urlparse(self.api_url_input)
@@ -1050,6 +1045,7 @@ class PostProcessorWorker:
full_data = fetch_single_post_data(api_domain, self.service, self.user_id, post_id, headers, self.logger, cookies_dict=cookies)
if full_data:
final_post_data = full_data
if self.text_only_scope == 'content':
raw_text_content = final_post_data.get('content', '')
elif self.text_only_scope == 'comments':
@@ -1060,46 +1056,46 @@ class PostProcessorWorker:
if comments_data:
comment_texts = []
for comment in comments_data:
user = comment.get('user', {}).get('name', 'Unknown User')
timestamp = comment.get('updated', 'No Date')
user = comment.get('commenter_name', 'Unknown User')
timestamp = comment.get('published', 'No Date')
body = strip_html_tags(comment.get('content', ''))
comment_texts.append(f"--- Comment by {user} on {timestamp} ---\n{body}\n")
raw_text_content = "\n".join(comment_texts)
else:
raw_text_content = ""
except Exception as e:
self.logger(f" ❌ Error fetching comments for text-only mode: {e}")
if not raw_text_content or not raw_text_content.strip():
cleaned_text = ""
if self.text_only_scope == 'content':
if not raw_text_content:
cleaned_text = ""
else:
text_with_newlines = re.sub(r'(?i)</p>|<br\s*/?>', '\n', raw_text_content)
just_text = re.sub(r'<.*?>', '', text_with_newlines)
cleaned_text = html.unescape(just_text).strip()
else:
cleaned_text = raw_text_content
cleaned_text = cleaned_text.replace('', '...')
if not cleaned_text.strip():
self.logger(" -> Skip Saving Text: No content/comments found or fetched.")
result_tuple = (0, num_potential_files_in_post, [], [], [], None, None)
return result_tuple
paragraph_pattern = re.compile(r'<p.*?>(.*?)</p>', re.IGNORECASE | re.DOTALL)
html_paragraphs = paragraph_pattern.findall(raw_text_content)
cleaned_text = ""
if not html_paragraphs:
self.logger(" ⚠️ No <p> tags found. Falling back to basic HTML cleaning for the whole block.")
text_with_br = re.sub(r'<br\s*/?>', '\n', raw_text_content, flags=re.IGNORECASE)
cleaned_text = re.sub(r'<.*?>', '', text_with_br)
else:
cleaned_paragraphs_list = []
for p_content in html_paragraphs:
p_with_br = re.sub(r'<br\s*/?>', '\n', p_content, flags=re.IGNORECASE)
p_cleaned = re.sub(r'<.*?>', '', p_with_br)
p_final = html.unescape(p_cleaned).strip()
if p_final:
cleaned_paragraphs_list.append(p_final)
cleaned_text = '\n\n'.join(cleaned_paragraphs_list)
cleaned_text = cleaned_text.replace('', '...')
if self.single_pdf_mode:
if not cleaned_text:
result_tuple = (0, 0, [], [], [], None, None)
return result_tuple
content_data = {
'title': post_title,
'content': cleaned_text,
'published': self.post.get('published') or self.post.get('added')
}
if self.text_only_scope == 'comments':
if not comments_data: return (0, 0, [], [], [], None, None)
content_data['comments'] = comments_data
else:
if not cleaned_text.strip(): return (0, 0, [], [], [], None, None)
content_data['content'] = cleaned_text
temp_dir = os.path.join(self.app_base_dir, "appdata")
os.makedirs(temp_dir, exist_ok=True)
temp_filename = f"tmp_{post_id}_{uuid.uuid4().hex[:8]}.json"
@@ -1107,13 +1103,11 @@ class PostProcessorWorker:
try:
with open(temp_filepath, 'w', encoding='utf-8') as f:
json.dump(content_data, f, indent=2)
self.logger(f" Saved temporary text for '{post_title}' for single PDF compilation.")
result_tuple = (0, 0, [], [], [], None, temp_filepath)
return result_tuple
self.logger(f" Saved temporary data for '{post_title}' for single PDF compilation.")
return (0, 0, [], [], [], None, temp_filepath)
except Exception as e:
self.logger(f" ❌ Failed to write temporary file for single PDF: {e}")
result_tuple = (0, 0, [], [], [], None, None)
return result_tuple
return (0, 0, [], [], [], None, None)
else:
file_extension = self.text_export_format
txt_filename = clean_filename(post_title) + f".{file_extension}"
@@ -1125,27 +1119,63 @@ class PostProcessorWorker:
while os.path.exists(final_save_path):
final_save_path = f"{base}_{counter}{ext}"
counter += 1
if file_extension == 'pdf':
if FPDF:
self.logger(f" Converting to PDF...")
self.logger(f" Creating formatted PDF for {'comments' if self.text_only_scope == 'comments' else 'content'}...")
pdf = PDF()
font_path = ""
bold_font_path = ""
if self.project_root_dir:
font_path = os.path.join(self.project_root_dir, 'data', 'dejavu-sans', 'DejaVuSans.ttf')
bold_font_path = os.path.join(self.project_root_dir, 'data', 'dejavu-sans', 'DejaVuSans-Bold.ttf')
try:
if not os.path.exists(font_path): raise RuntimeError(f"Font file not found: {font_path}")
if not os.path.exists(bold_font_path): raise RuntimeError(f"Bold font file not found: {bold_font_path}")
pdf.add_font('DejaVu', '', font_path, uni=True)
pdf.set_font('DejaVu', '', 12)
pdf.add_font('DejaVu', 'B', bold_font_path, uni=True)
default_font_family = 'DejaVu'
except Exception as font_error:
self.logger(f" ⚠️ Could not load DejaVu font: {font_error}. Falling back to Arial.")
pdf.set_font('Arial', '', 12)
default_font_family = 'Arial'
pdf.add_page()
pdf.multi_cell(0, 5, cleaned_text)
pdf.set_font(default_font_family, 'B', 16)
pdf.multi_cell(0, 10, post_title)
pdf.ln(10)
if self.text_only_scope == 'comments':
if not comments_data:
self.logger(" -> Skip PDF Creation: No comments to process.")
return (0, num_potential_files_in_post, [], [], [], None, None)
for i, comment in enumerate(comments_data):
user = comment.get('commenter_name', 'Unknown User')
timestamp = comment.get('published', 'No Date')
body = strip_html_tags(comment.get('content', ''))
pdf.set_font(default_font_family, '', 10)
pdf.write(8, "Comment by: ")
pdf.set_font(default_font_family, 'B', 10)
pdf.write(8, user)
pdf.set_font(default_font_family, '', 10)
pdf.write(8, f" on {timestamp}")
pdf.ln(10)
pdf.set_font(default_font_family, '', 11)
pdf.multi_cell(0, 7, body)
if i < len(comments_data) - 1:
pdf.ln(5)
pdf.cell(0, 0, '', border='T')
pdf.ln(5)
else:
pdf.set_font(default_font_family, '', 12)
pdf.multi_cell(0, 7, cleaned_text)
pdf.output(final_save_path)
else:
self.logger(f" ⚠️ Cannot create PDF: 'fpdf2' library not installed. Saving as .txt.")
final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)
elif file_extension == 'docx':
if Document:
self.logger(f" Converting to DOCX...")
@@ -1156,12 +1186,15 @@ class PostProcessorWorker:
self.logger(f" ⚠️ Cannot create DOCX: 'python-docx' library not installed. Saving as .txt.")
final_save_path = os.path.splitext(final_save_path)[0] + ".txt"
with open(final_save_path, 'w', encoding='utf-8') as f: f.write(cleaned_text)
else:
else: # TXT file
with open(final_save_path, 'w', encoding='utf-8') as f:
f.write(cleaned_text)
self.logger(f"✅ Saved Text: '{os.path.basename(final_save_path)}' in '{os.path.basename(determined_post_save_path_for_history)}'")
result_tuple = (1, num_potential_files_in_post, [], [], [], history_data_for_this_post, None)
return result_tuple
except Exception as e:
self.logger(f" ❌ Critical error saving text file '{txt_filename}': {e}")
result_tuple = (0, num_potential_files_in_post, [], [], [], None, None)
@@ -1263,7 +1296,6 @@ class PostProcessorWorker:
if self.keep_duplicates_mode == DUPLICATE_HANDLING_HASH:
unique_files_by_url = {}
for file_info in all_files_from_post_api:
# Use the file URL as a unique key to avoid processing the same file multiple times
file_url = file_info.get('url')
if file_url and file_url not in unique_files_by_url:
unique_files_by_url[file_url] = file_info
@@ -1734,7 +1766,6 @@ class DownloadThread(QThread):
worker_signals_obj = PostProcessorSignals()
try:
# Connect signals
worker_signals_obj.progress_signal.connect(self.progress_signal)
worker_signals_obj.file_download_status_signal.connect(self.file_download_status_signal)
worker_signals_obj.file_progress_signal.connect(self.file_progress_signal)
@@ -1771,8 +1802,6 @@ class DownloadThread(QThread):
was_process_cancelled = True
break
# --- START OF FIX: Explicitly build the arguments dictionary ---
# This robustly maps all thread attributes to the correct worker parameters.
worker_args = {
'post_data': individual_post_data,
'emitter': worker_signals_obj,
@@ -1833,7 +1862,6 @@ class DownloadThread(QThread):
'single_pdf_mode': self.single_pdf_mode,
'project_root_dir': self.project_root_dir,
}
# --- END OF FIX ---
post_processing_worker = PostProcessorWorker(**worker_args)
@@ -1860,6 +1888,7 @@ class DownloadThread(QThread):
if not was_process_cancelled and not self.isInterruptionRequested():
self.logger("✅ All posts processed or end of content reached by DownloadThread.")
except Exception as main_thread_err:
self.logger(f"\n❌ Critical error within DownloadThread run loop: {main_thread_err}")
traceback.print_exc()