Commit

2025-12-29 16:14:44 +00:00 · 2025-06-01 08:22:29 +01:00
parent 7d4e785ca1
commit f93795e370
3 changed files with 38 additions and 51 deletions
--- a/downloader_utils.py
+++ b/downloader_utils.py
@@ -154,10 +154,7 @@ def clean_filename(name):
 def strip_html_tags(html_text):
    if not html_text: return ""
    text = html.unescape(str(html_text)) # Ensure input is a string
-    # Replace HTML tags with a single space
    text_after_tag_removal = re.sub(r'<[^>]+>', ' ', text)
-    # Replace multiple whitespace characters (including newlines, tabs, etc. that are now spaces)
-    # with a single space. Also, strip leading/trailing whitespace from the final result.
    cleaned_text = re.sub(r'\s+', ' ', text_after_tag_removal).strip()
    return cleaned_text
 def extract_folder_name_from_title(title, unwanted_keywords):
@@ -1214,7 +1211,6 @@ class PostProcessorWorker:
        if (self.show_external_links or self.extract_links_only) and post_content_html: # type: ignore
            if self._check_pause(f"External link extraction for post {post_id}"): return 0, num_potential_files_in_post, [], []
            try:
-                # Regex for typical Mega decryption keys (43 or 22 chars, alphanumeric + hyphen/underscore)
                mega_key_pattern = re.compile(r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')
                unique_links_data = {} 
                for match in link_pattern.finditer(post_content_html):
@@ -1234,19 +1230,16 @@ class PostProcessorWorker:
                    platform = get_link_platform(link_url)
                    decryption_key_found = ""
                    if platform == 'mega':
-                        # 1. Check if key is in the URL fragment
                        parsed_mega_url = urlparse(link_url)
                        if parsed_mega_url.fragment:
                            potential_key_from_fragment = parsed_mega_url.fragment.split('!')[-1] # Handle cases like #!key or #key
                            if mega_key_pattern.fullmatch(potential_key_from_fragment):
                                decryption_key_found = potential_key_from_fragment

-                        # 2. If not in fragment, search in link text
                        if not decryption_key_found and link_text:
                            key_match_in_text = mega_key_pattern.search(link_text)
                            if key_match_in_text:
                                decryption_key_found = key_match_in_text.group(1)
-                        # 3. If still not found, search the whole post content (if extracting links only, as it's more critical)
                        if not decryption_key_found and self.extract_links_only and post_content_html:
                            key_match_in_content = mega_key_pattern.search(strip_html_tags(post_content_html)) # Search cleaned content
                            if key_match_in_content:
--- a/main.py
+++ b/main.py
@@ -3357,6 +3357,8 @@ class DownloaderApp(QWidget):
            self.file_progress_label.setText(progress_text)

        elif isinstance(progress_info, tuple) and len(progress_info) == 2: 
+            downloaded_bytes, total_bytes = progress_info # Unpack the tuple
+            
            if not filename and total_bytes == 0 and downloaded_bytes == 0:
                self.file_progress_label.setText("")
                return
--- a/multipart_downloader.py
+++ b/multipart_downloader.py
@@ -5,21 +5,20 @@ import hashlib
 import http.client
 import traceback
 import threading
-import queue # Import the missing 'queue' module
+import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed

-CHUNK_DOWNLOAD_RETRY_DELAY = 2 # Slightly reduced for faster retries if needed
-MAX_CHUNK_DOWNLOAD_RETRIES = 1 # Further reduced for quicker fallback if a chunk is problematic
-DOWNLOAD_CHUNK_SIZE_ITER = 1024 * 256  # 256KB for iter_content within a chunk download
+CHUNK_DOWNLOAD_RETRY_DELAY = 2
+MAX_CHUNK_DOWNLOAD_RETRIES = 1
+DOWNLOAD_CHUNK_SIZE_ITER = 1024 * 256  


 def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte, headers,
-                               part_num, total_parts, progress_data, cancellation_event, skip_event, pause_event, global_emit_time_ref, cookies_for_chunk, # Added cookies_for_chunk
-                               logger_func, emitter=None, api_original_filename=None): # Renamed logger, signals to emitter
-    """Downloads a single chunk of a file and writes it to the temp file."""
+                               part_num, total_parts, progress_data, cancellation_event, skip_event, pause_event, global_emit_time_ref, cookies_for_chunk,
+                               logger_func, emitter=None, api_original_filename=None):
    if cancellation_event and cancellation_event.is_set():
        logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download cancelled before start.")
-        return 0, False  # bytes_downloaded, success
+        return 0, False
    if skip_event and skip_event.is_set():
        logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Skip event triggered before start.")
        return 0, False
@@ -30,16 +29,15 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
            if cancellation_event and cancellation_event.is_set():
                logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download cancelled while paused.")
                return 0, False
-            time.sleep(0.2) # Shorter sleep for responsive resume
+            time.sleep(0.2)
        logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Download resumed.")

    chunk_headers = headers.copy()
-    if end_byte != -1 : # For 0-byte files, end_byte might be -1, Range header should not be set or be 0-0
+    if end_byte != -1 :
        chunk_headers['Range'] = f"bytes={start_byte}-{end_byte}"
-    elif start_byte == 0 and end_byte == -1: # Specifically for 0-byte files
+    elif start_byte == 0 and end_byte == -1:
        pass

-
    bytes_this_chunk = 0
    last_speed_calc_time = time.time()
    bytes_at_last_speed_calc = 0
@@ -65,7 +63,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
                logger_func(f"   [Chunk {part_num + 1}/{total_parts}] Retrying download (Attempt {attempt}/{MAX_CHUNK_DOWNLOAD_RETRIES})...")
                time.sleep(CHUNK_DOWNLOAD_RETRY_DELAY * (2 ** (attempt - 1)))
                last_speed_calc_time = time.time()
-                bytes_at_last_speed_calc = bytes_this_chunk # Current progress of this chunk
+                bytes_at_last_speed_calc = bytes_this_chunk
            log_msg = f"   🚀 [Chunk {part_num + 1}/{total_parts}] Starting download: bytes {start_byte}-{end_byte if end_byte != -1 else 'EOF'}"
            logger_func(log_msg)
            response = requests.get(chunk_url, headers=chunk_headers, timeout=(10, 120), stream=True, cookies=cookies_for_chunk)
@@ -77,7 +75,7 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
                    progress_data['chunks_status'][part_num]['speed_bps'] = 0
                return 0, True

-            with open(temp_file_path, 'r+b') as f:  # Open in read-write binary
+            with open(temp_file_path, 'r+b') as f:
                f.seek(start_byte)
                for data_segment in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE_ITER):
                    if cancellation_event and cancellation_event.is_set():
@@ -105,18 +103,18 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,

                            current_time = time.time()
                            time_delta_speed = current_time - last_speed_calc_time
-                            if time_delta_speed > 0.5: # Calculate speed every 0.5 seconds
+                            if time_delta_speed > 0.5:
                                bytes_delta = bytes_this_chunk - bytes_at_last_speed_calc
                                current_speed_bps = (bytes_delta * 8) / time_delta_speed if time_delta_speed > 0 else 0
                                progress_data['chunks_status'][part_num]['speed_bps'] = current_speed_bps
                                last_speed_calc_time = current_time
                                bytes_at_last_speed_calc = bytes_this_chunk                            
-                            if emitter and (current_time - global_emit_time_ref[0] > 0.25): # Max ~4Hz for the whole file
-                                global_emit_time_ref[0] = current_time # Update shared last emit time
-                                status_list_copy = [dict(s) for s in progress_data['chunks_status']] # Make a deep enough copy
+                            if emitter and (current_time - global_emit_time_ref[0] > 0.25):
+                                global_emit_time_ref[0] = current_time
+                                status_list_copy = [dict(s) for s in progress_data['chunks_status']]
                                if isinstance(emitter, queue.Queue):
                                    emitter.put({'type': 'file_progress', 'payload': (api_original_filename, status_list_copy)})
-                                elif hasattr(emitter, 'file_progress_signal'): # PostProcessorSignals-like
+                                elif hasattr(emitter, 'file_progress_signal'):
                                    emitter.file_progress_signal.emit(api_original_filename, status_list_copy)
            return bytes_this_chunk, True

@@ -128,9 +126,9 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
            if attempt == MAX_CHUNK_DOWNLOAD_RETRIES:
                logger_func(f"   ❌ [Chunk {part_num + 1}/{total_parts}] Failed after {MAX_CHUNK_DOWNLOAD_RETRIES} retries.")
                return bytes_this_chunk, False
-        except requests.exceptions.RequestException as e: # Includes 4xx/5xx errors after raise_for_status
+        except requests.exceptions.RequestException as e:
            logger_func(f"   ❌ [Chunk {part_num + 1}/{total_parts}] Non-retryable error: {e}")
-            if ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)): # More general check
+            if ("Failed to resolve" in str(e) or "NameResolutionError" in str(e)):
                logger_func("   💡 This looks like a DNS resolution problem. Please check your internet connection, DNS settings, or VPN.")
            return bytes_this_chunk, False
        except Exception as e:
@@ -140,25 +138,19 @@ def _download_individual_chunk(chunk_url, temp_file_path, start_byte, end_byte,
    with progress_data['lock']:
        progress_data['chunks_status'][part_num]['active'] = False
        progress_data['chunks_status'][part_num]['speed_bps'] = 0
-    return bytes_this_chunk, False # Should be unreachable
+    return bytes_this_chunk, False


 def download_file_in_parts(file_url, save_path, total_size, num_parts, headers, api_original_filename,
-                           emitter_for_multipart, cookies_for_chunk_session, # Added cookies_for_chunk_session
+                           emitter_for_multipart, cookies_for_chunk_session,
                           cancellation_event, skip_event, logger_func, pause_event):
-    """
-    Downloads a file in multiple parts concurrently.
-    Returns: (download_successful_flag, downloaded_bytes, calculated_file_hash, temp_file_handle_or_None)
-    The temp_file_handle will be an open read-binary file handle to the .part file if successful, otherwise None.
-    It is the responsibility of the caller to close this handle and rename/delete the .part file.
-    """
    logger_func(f"⬇️ Initializing Multi-part Download ({num_parts} parts) for: '{api_original_filename}' (Size: {total_size / (1024*1024):.2f} MB)")
    temp_file_path = save_path + ".part"

    try:
        with open(temp_file_path, 'wb') as f_temp:
            if total_size > 0:
-                f_temp.truncate(total_size) # Pre-allocate space
+                f_temp.truncate(total_size)
    except IOError as e:
        logger_func(f"   ❌ Error creating/truncating temp file '{temp_file_path}': {e}")
        return False, 0, None, None
@@ -168,14 +160,14 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
    for i in range(num_parts):
        start = i * chunk_size_calc
        end = start + chunk_size_calc - 1 if i < num_parts - 1 else total_size - 1
-        if start <= end: # Valid range
+        if start <= end:
            chunks_ranges.append((start, end))
-        elif total_size == 0 and i == 0: # Special case for 0-byte file
-            chunks_ranges.append((0, -1)) # Indicates 0-byte file, download 0 bytes from offset 0
+        elif total_size == 0 and i == 0:
+            chunks_ranges.append((0, -1))

    chunk_actual_sizes = []
    for start, end in chunks_ranges:
-        if end == -1 and start == 0: # 0-byte file
+        if end == -1 and start == 0:
            chunk_actual_sizes.append(0)
        else:
            chunk_actual_sizes.append(end - start + 1)
@@ -186,19 +178,19 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
        return False, 0, None, None

    progress_data = {
-        'total_file_size': total_size, # Overall file size for reference
-        'total_downloaded_so_far': 0,  # New key for overall progress
-        'chunks_status': [ # Status for each chunk
+        'total_file_size': total_size,
+        'total_downloaded_so_far': 0,
+        'chunks_status': [
            {'id': i, 'downloaded': 0, 'total': chunk_actual_sizes[i] if i < len(chunk_actual_sizes) else 0, 'active': False, 'speed_bps': 0.0}
            for i in range(num_parts)
        ],
        'lock': threading.Lock(),
-        'last_global_emit_time': [time.time()] # Shared mutable for global throttling timestamp
+        'last_global_emit_time': [time.time()]
    }

    chunk_futures = []
    all_chunks_successful = True
-    total_bytes_from_chunks = 0 # Still useful to verify total downloaded against file size
+    total_bytes_from_chunks = 0

    with ThreadPoolExecutor(max_workers=num_parts, thread_name_prefix=f"MPChunk_{api_original_filename[:10]}_") as chunk_pool:
        for i, (start, end) in enumerate(chunks_ranges):
@@ -226,14 +218,14 @@ def download_file_in_parts(file_url, save_path, total_size, num_parts, headers,
            status_list_copy = [dict(s) for s in progress_data['chunks_status']]
            if isinstance(emitter_for_multipart, queue.Queue):
                emitter_for_multipart.put({'type': 'file_progress', 'payload': (api_original_filename, status_list_copy)})
-            elif hasattr(emitter_for_multipart, 'file_progress_signal'): # PostProcessorSignals-like
+            elif hasattr(emitter_for_multipart, 'file_progress_signal'):
                emitter_for_multipart.file_progress_signal.emit(api_original_filename, status_list_copy)

    if all_chunks_successful and (total_bytes_from_chunks == total_size or total_size == 0):
        logger_func(f"   ✅ Multi-part download successful for '{api_original_filename}'. Total bytes: {total_bytes_from_chunks}")
        md5_hasher = hashlib.md5()
        with open(temp_file_path, 'rb') as f_hash:
-            for buf in iter(lambda: f_hash.read(4096*10), b''): # Read in larger buffers for hashing
+            for buf in iter(lambda: f_hash.read(4096*10), b''):
                md5_hasher.update(buf)
        calculated_hash = md5_hasher.hexdigest()
        return True, total_bytes_from_chunks, calculated_hash, open(temp_file_path, 'rb')