From f93795e37031968c5547f8aa25226d0f59ac3f0f Mon Sep 17 00:00:00 2001
From: Yuvi9587 <114073886+Yuvi9587@users.noreply.github.com>
Date: Sun, 1 Jun 2025 08:22:29 +0100
Subject: [PATCH] Commit

---
 downloader_utils.py     |  7 ----
 main.py                 |  4 ++-
 multipart_downloader.py | 78 ++++++++++++++++-------------------
 3 files changed, 38 insertions(+), 51 deletions(-)

diff --git a/downloader_utils.py b/downloader_utils.py
index 703b947..e767d70 100644
--- a/downloader_utils.py
+++ b/downloader_utils.py
@@ -154,10 +154,7 @@ def clean_filename(name):
 def strip_html_tags(html_text):
     if not html_text: return ""
     text = html.unescape(str(html_text)) # Ensure input is a string
-    # Replace HTML tags with a single space
     text_after_tag_removal = re.sub(r'<[^>]+>', ' ', text)
-    # Replace multiple whitespace characters (including newlines, tabs, etc. that are now spaces)
-    # with a single space. Also, strip leading/trailing whitespace from the final result.
     cleaned_text = re.sub(r'\s+', ' ', text_after_tag_removal).strip()
     return cleaned_text
 def extract_folder_name_from_title(title, unwanted_keywords):
@@ -1214,7 +1211,6 @@ class PostProcessorWorker:
         if (self.show_external_links or self.extract_links_only) and post_content_html: # type: ignore
             if self._check_pause(f"External link extraction for post {post_id}"): return 0, num_potential_files_in_post, [], []
             try:
-                # Regex for typical Mega decryption keys (43 or 22 chars, alphanumeric + hyphen/underscore)
                 mega_key_pattern = re.compile(r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')
                 unique_links_data = {}
                 for match in link_pattern.finditer(post_content_html):
@@ -1234,19 +1230,16 @@ class PostProcessorWorker:
                     platform = get_link_platform(link_url)
                     decryption_key_found = ""
                     if platform == 'mega':
-                        # 1. Check if key is in the URL fragment
                         parsed_mega_url = urlparse(link_url)
                         if parsed_mega_url.fragment:
                             potential_key_from_fragment = parsed_mega_url.fragment.split('!')[-1] # Handle cases like #!key or #key
                             if mega_key_pattern.fullmatch(potential_key_from_fragment):
                                 decryption_key_found = potential_key_from_fragment
-                        # 2. If not in fragment, search in link text
                         if not decryption_key_found and link_text:
                             key_match_in_text = mega_key_pattern.search(link_text)
                             if key_match_in_text:
                                 decryption_key_found = key_match_in_text.group(1)
-                        # 3. If still not found, search the whole post content (if extracting links only, as it's more critical)
                         if not decryption_key_found and self.extract_links_only and post_content_html:
                             key_match_in_content = mega_key_pattern.search(strip_html_tags(post_content_html)) # Search cleaned content
                             if key_match_in_content:
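The deleted comments above documented a three-stage Mega decryption-key search: the URL fragment first, then the visible link text, then the tag-stripped post body. For reference, a minimal sketch of that lookup order; `find_mega_key` and its parameters are illustrative stand-ins, not names from the codebase, and the pattern mirrors the 43- or 22-character alphanumeric-plus-hyphen/underscore keys the patched code matches:

```python
import re
from urllib.parse import urlparse

# Typical Mega keys: 43 or 22 chars, alphanumeric plus hyphen/underscore.
MEGA_KEY_PATTERN = re.compile(r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')

def find_mega_key(link_url, link_text, post_text):
    # 1. URL fragment, handling both '#!key' and '#key' forms.
    fragment = urlparse(link_url).fragment
    if fragment:
        candidate = fragment.split('!')[-1]
        if MEGA_KEY_PATTERN.fullmatch(candidate):
            return candidate
    # 2. The visible link text.
    match = MEGA_KEY_PATTERN.search(link_text or '')
    if match:
        return match.group(1)
    # 3. Last resort: the whole tag-stripped post body.
    match = MEGA_KEY_PATTERN.search(post_text or '')
    return match.group(1) if match else ''
```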
diff --git a/main.py b/main.py
index 95a1268..c77dca5 100644
--- a/main.py
+++ b/main.py
@@ -3357,7 +3357,9 @@ class DownloaderApp(QWidget):
                 self.file_progress_label.setText(progress_text)
 
         elif isinstance(progress_info, tuple) and len(progress_info) == 2:
-            if not filename and total_bytes == 0 and downloaded_bytes == 0:
+            downloaded_bytes, total_bytes = progress_info # Unpack the tuple
+
+            if not filename and total_bytes == 0 and downloaded_bytes == 0:
                 self.file_progress_label.setText("")
                 return
 
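The main.py hunk fixes a real bug: in the two-tuple branch, `downloaded_bytes` and `total_bytes` were tested before ever being unpacked from `progress_info`. A minimal sketch of the corrected unpack-before-use pattern; `format_progress` is a hypothetical helper, not the application's actual slot:

```python
def format_progress(filename, progress_info):
    # Hypothetical helper mirroring the fixed branch: unpack first, then test.
    if isinstance(progress_info, tuple) and len(progress_info) == 2:
        downloaded_bytes, total_bytes = progress_info
        if not filename and total_bytes == 0 and downloaded_bytes == 0:
            return ""  # nothing meaningful to display yet
        return f"{downloaded_bytes} / {total_bytes} bytes"
    return None
```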
diff --git a/multipart_downloader.py b/multipart_downloader.py
index d5161da..7e207c5 100644
--- a/multipart_downloader.py
+++ b/multipart_downloader.py
@@ -5,21 +5,20 @@ import hashlib
 import http.client
 import traceback
 import threading
-import queue # Import the missing 'queue' module
+import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-CHUNK_DOWNLOAD_RETRY_DELAY = 2 # Slightly reduced for faster retries if needed
-MAX_CHUNK_DOWNLOAD_RETRIES = 1 # Further reduced for quicker fallback if a chunk is problematic
-DOWNLOAD_CHUNK_SIZE_ITER = 1024 * 256 # 256KB for iter_content within a chunk download
+CHUNK_DOWNLOAD_RETRY_DELAY = 2
+MAX_CHUNK_DOWNLOAD_RETRIES = 1
+DOWNLOAD_CHUNK_SIZE_ITER = 1024 * 256
 
 
 def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte, headers,
-                               part_num, total_parts, progress_data, cancellation_event, skip_event, pause_event, global_emit_time_ref, cookies_for_chunk, # Added cookies_for_chunk
-                               logger_func, emitter=None, api_original_filename=None): # Renamed logger, signals to emitter
-    """Downloads a single chunk of a file and writes it to the temp file."""
+                               part_num, total_parts, progress_data, cancellation_event, skip_event, pause_event, global_emit_time_ref, cookies_for_chunk,
+                               logger_func, emitter=None, api_original_filename=None):
     if cancellation_event and cancellation_event.is_set():
         logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download cancelled before start.")
-        return 0, False # bytes_downloaded, success
+        return 0, False
     if skip_event and skip_event.is_set():
         logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Skip event triggered before start.")
         return 0, False
@@ -30,16 +29,15 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
             if cancellation_event and cancellation_event.is_set():
                 logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download cancelled while paused.")
                 return 0, False
-            time.sleep(0.2) # Shorter sleep for responsive resume
+            time.sleep(0.2)
         logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download resumed.")
 
     chunk_headers = headers.copy()
-    if end_byte != -1 : # For 0-byte files, end_byte might be -1, Range header should not be set or be 0-0
+    if end_byte != -1 :
         chunk_headers['Range'] = f"bytes={start_byte}-{end_byte}"
-    elif start_byte == 0 and end_byte == -1: # Specifically for 0-byte files
+    elif start_byte == 0 and end_byte == -1:
         pass
-
     bytes_this_chunk = 0
     last_speed_calc_time = time.time()
     bytes_at_last_speed_calc = 0
@@ -65,7 +63,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
                 logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Retrying download (Attempt {attempt}/{MAX_CHUNK_DOWNLOAD_RETRIES})...")
                 time.sleep(CHUNK_DOWNLOAD_RETRY_DELAY * (2 ** (attempt - 1)))
                 last_speed_calc_time = time.time()
-                bytes_at_last_speed_calc = bytes_this_chunk # Current progress of this chunk
+                bytes_at_last_speed_calc = bytes_this_chunk
             log_msg = f"   🚀 [Chunk {part_num + 1}/{total_parts}] Starting download: bytes {start_byte}-{end_byte if end_byte != -1 else 'EOF'}"
             logger_func(log_msg)
             response = requests.get(chunk_url, headers=chunk_headers, timeout=(10, 120), stream=True, cookies=cookies_for_chunk)
@@ -77,7 +75,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
                     progress_data['chunks_status'][part_num]['speed_bps'] = 0
                 return 0, True
 
-            with open(temp_file_path, 'r+b') as f: # Open in read-write binary
+            with open(temp_file_path, 'r+b') as f:
                 f.seek(start_byte)
                 for data_segment in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE_ITER):
                     if cancellation_event and cancellation_event.is_set():
@@ -105,18 +103,18 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
                     current_time = time.time()
                     time_delta_speed = current_time - last_speed_calc_time
-                    if time_delta_speed > 0.5: # Calculate speed every 0.5 seconds
+                    if time_delta_speed > 0.5:
                         bytes_delta = bytes_this_chunk - bytes_at_last_speed_calc
                         current_speed_bps = (bytes_delta * 8) / time_delta_speed if time_delta_speed > 0 else 0
                         progress_data['chunks_status'][part_num]['speed_bps'] = current_speed_bps
                         last_speed_calc_time = current_time
                         bytes_at_last_speed_calc = bytes_this_chunk
-                    if emitter and (current_time - global_emit_time_ref[0] > 0.25): # Max ~4Hz for the whole file
-                        global_emit_time_ref[0] = current_time # Update shared last emit time
-                        status_list_copy = [dict(s) for s in progress_data['chunks_status']] # Make a deep enough copy
+                    if emitter and (current_time - global_emit_time_ref[0] > 0.25):
+                        global_emit_time_ref[0] = current_time
+                        status_list_copy = [dict(s) for s in progress_data['chunks_status']]
                         if isinstance(emitter, queue.Queue):
                             emitter.put({'type': 'file_progress', 'payload': (api_original_filename, status_list_copy)})
-                        elif hasattr(emitter, 'file_progress_signal'): # PostProcessorSignals-like
+                        elif hasattr(emitter, 'file_progress_signal'):
                             emitter.file_progress_signal.emit(api_original_filename, status_list_copy)
 
             return bytes_this_chunk, True
 
@@ -128,9 +126,9 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
             if attempt == MAX_CHUNK_DOWNLOAD_RETRIES:
                 logger_func(f"   ❌ [Chunk {part_num + 1}/{total_parts}] Failed after {MAX_CHUNK_DOWNLOAD_RETRIES} retries.")
                 return bytes_this_chunk, False
-        except requests.exceptions.RequestException as e: # Includes 4xx/5xx errors after raise_for_status
+        except requests.exceptions.RequestException as e:
             logger_func(f"   ❌ [Chunk {part_num + 1}/{total_parts}] Non-retryable error: {e}")
-            if ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)): # More general check
+            if ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)):
                 logger_func("      💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN.")
             return bytes_this_chunk, False
         except Exception as e:
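The hunks above keep `_download_individual_chunk`'s core mechanics intact: a ranged GET per chunk, writes via `seek()` into a shared pre-allocated file, and retries with exponential backoff (`CHUNK_DOWNLOAD_RETRY_DELAY * 2 ** (attempt - 1)`). A stripped-down sketch of that request/retry core, with the progress, pause, and cancellation plumbing omitted; the timeout and backoff constants mirror the module's values, everything else is illustrative:

```python
import time
import requests

def fetch_range(url, start, end, retries=1, base_delay=2):
    """Fetch bytes [start, end] inclusive, retrying with exponential backoff."""
    headers = {'Range': f'bytes={start}-{end}'}
    for attempt in range(retries + 1):
        if attempt:
            time.sleep(base_delay * (2 ** (attempt - 1)))  # 2s, 4s, 8s, ...
        try:
            resp = requests.get(url, headers=headers, timeout=(10, 120), stream=True)
            resp.raise_for_status()  # expect 206 Partial Content
            return b''.join(resp.iter_content(chunk_size=1024 * 256))
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if attempt == retries:
                raise  # exhausted retries; surface the error to the caller
```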
@@ -140,25 +138,19 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
     with progress_data['lock']:
         progress_data['chunks_status'][part_num]['active'] = False
         progress_data['chunks_status'][part_num]['speed_bps'] = 0
-    return bytes_this_chunk, False # Should be unreachable
+    return bytes_this_chunk, False
 
 
 def download_file_in_parts(file_url, save_path, total_size, num_parts, headers, api_original_filename,
-                           emitter_for_multipart, cookies_for_chunk_session, # Added cookies_for_chunk_session
+                           emitter_for_multipart, cookies_for_chunk_session,
                            cancellation_event, skip_event, logger_func, pause_event):
-    """
-    Downloads a file in multiple parts concurrently.
-    Returns: (download_successful_flag, downloaded_bytes, calculated_file_hash, temp_file_handle_or_None)
-             The temp_file_handle will be an open read-binary file handle to the .part file if successful, otherwise None.
-             It is the responsibility of the caller to close this handle and rename/delete the .part file.
-    """
     logger_func(f"⬇️ Initializing Multi-part Download ({num_parts} parts) for: '{api_original_filename}' (Size: {total_size / (1024*1024):.2f} MB)")
     temp_file_path = save_path + ".part"
 
     try:
         with open(temp_file_path, 'wb') as f_temp:
             if total_size > 0:
-                f_temp.truncate(total_size) # Pre-allocate space
+                f_temp.truncate(total_size)
     except IOError as e:
         logger_func(f"   ❌ Error creating/truncating temp file '{temp_file_path}': {e}")
         return False, 0, None, None
@@ -168,14 +160,14 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
     for i in range(num_parts):
         start = i * chunk_size_calc
         end = start + chunk_size_calc - 1 if i < num_parts - 1 else total_size - 1
-        if start <= end: # Valid range
+        if start <= end:
             chunks_ranges.append((start, end))
-        elif total_size == 0 and i == 0: # Special case for 0-byte file
-            chunks_ranges.append((0, -1)) # Indicates 0-byte file, download 0 bytes from offset 0
+        elif total_size == 0 and i == 0:
+            chunks_ranges.append((0, -1))
 
-    chunk_actual_sizes = [] 
+    chunk_actual_sizes = []
     for start, end in chunks_ranges:
-        if end == -1 and start == 0: # 0-byte file
+        if end == -1 and start == 0:
             chunk_actual_sizes.append(0)
         else:
             chunk_actual_sizes.append(end - start + 1)
@@ -186,19 +178,19 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
         return False, 0, None, None
 
     progress_data = {
-        'total_file_size': total_size, # Overall file size for reference
-        'total_downloaded_so_far': 0, # New key for overall progress
-        'chunks_status': [ # Status for each chunk
+        'total_file_size': total_size,
+        'total_downloaded_so_far': 0,
+        'chunks_status': [
             {'id': i, 'downloaded': 0, 'total': chunk_actual_sizes[i] if i < len(chunk_actual_sizes) else 0, 'active': False, 'speed_bps': 0.0}
             for i in range(num_parts)
         ],
         'lock': threading.Lock(),
-        'last_global_emit_time': [time.time()] # Shared mutable for global throttling timestamp
+        'last_global_emit_time': [time.time()]
     }
 
     chunk_futures = []
     all_chunks_successful = True
-    total_bytes_from_chunks = 0 # Still useful to verify total downloaded against file size
+    total_bytes_from_chunks = 0
 
     with ThreadPoolExecutor(max_workers=num_parts, thread_name_prefix=f"MPChunk_{api_original_filename[:10]}_") as chunk_pool:
         for i, (start, end) in enumerate(chunks_ranges):
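The range math above splits `total_size` into `num_parts` inclusive `(start, end)` byte ranges, with `(0, -1)` as the sentinel for a zero-byte file. A self-contained sketch of that split, assuming `chunk_size_calc` is the floor of `total_size / num_parts` (its definition sits outside this hunk's context):

```python
def split_into_ranges(total_size, num_parts):
    """Return inclusive (start, end) ranges; (0, -1) marks a 0-byte file."""
    chunk_size_calc = total_size // num_parts
    ranges = []
    for i in range(num_parts):
        start = i * chunk_size_calc
        # The last part absorbs any remainder from the integer division.
        end = start + chunk_size_calc - 1 if i < num_parts - 1 else total_size - 1
        if start <= end:
            ranges.append((start, end))
        elif total_size == 0 and i == 0:
            ranges.append((0, -1))
    return ranges

print(split_into_ranges(10, 4))  # [(0, 1), (2, 3), (4, 5), (6, 9)]
```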
@@ -226,14 +218,14 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
                 status_list_copy = [dict(s) for s in progress_data['chunks_status']]
                 if isinstance(emitter_for_multipart, queue.Queue):
                     emitter_for_multipart.put({'type': 'file_progress', 'payload': (api_original_filename, status_list_copy)})
-                elif hasattr(emitter_for_multipart, 'file_progress_signal'): # PostProcessorSignals-like
+                elif hasattr(emitter_for_multipart, 'file_progress_signal'):
                     emitter_for_multipart.file_progress_signal.emit(api_original_filename, status_list_copy)
 
     if all_chunks_successful and (total_bytes_from_chunks == total_size or total_size == 0):
         logger_func(f"   ✅ Multi-part download successful for '{api_original_filename}'. Total bytes: {total_bytes_from_chunks}")
         md5_hasher = hashlib.md5()
         with open(temp_file_path, 'rb') as f_hash:
-            for buf in iter(lambda: f_hash.read(4096*10), b''): # Read in larger buffers for hashing
+            for buf in iter(lambda: f_hash.read(4096*10), b''):
                 md5_hasher.update(buf)
         calculated_hash = md5_hasher.hexdigest()
         return True, total_bytes_from_chunks, calculated_hash, open(temp_file_path, 'rb')
@@ -242,4 +234,4 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
         if os.path.exists(temp_file_path):
             try: os.remove(temp_file_path)
             except OSError as e: logger_func(f"   Failed to remove temp part file '{temp_file_path}': {e}")
-    return False, total_bytes_from_chunks, None, None
+    return False, total_bytes_from_chunks, None, None
\ No newline at end of file
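Both the per-chunk loop and this final flush dispatch on the emitter's type: a `queue.Queue` receives a dict payload via `put()`, while anything exposing a `file_progress_signal` attribute is treated as a Qt-style signal holder (PostProcessorSignals-like). A minimal sketch of that duck-typed dispatch; `emit_progress` is an illustrative helper name, not a function in the module:

```python
import queue

def emit_progress(emitter, filename, chunks_status):
    # Snapshot each chunk dict so consumers never see the live, mutating list.
    snapshot = [dict(s) for s in chunks_status]
    if isinstance(emitter, queue.Queue):
        emitter.put({'type': 'file_progress', 'payload': (filename, snapshot)})
    elif hasattr(emitter, 'file_progress_signal'):  # PostProcessorSignals-like
        emitter.file_progress_signal.emit(filename, snapshot)
```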