This commit is contained in:
Yuvi9587
2025-06-01 08:22:29 +01:00
parent 7d4e785ca1
commit f93795e370
3 changed files with 38 additions and 51 deletions

View File

@@ -154,10 +154,7 @@ def clean_filename(name):
def strip_html_tags(html_text):
    """Return the plain-text content of an HTML snippet.

    Tags are replaced by a space, HTML entities are decoded, and every run
    of whitespace is collapsed to a single space with both ends trimmed.

    Args:
        html_text: HTML source; a non-string value is converted with str().

    Returns:
        The cleaned text, or "" when html_text is falsy.
    """
    if not html_text:
        return ""
    # Remove tags *before* decoding entities: decoding first would turn
    # escaped literals such as "&lt;" / "&gt;" into real angle brackets, and
    # the tag regex would then delete the text between them.
    without_tags = re.sub(r'<[^>]+>', ' ', str(html_text))
    decoded = html.unescape(without_tags)
    # Collapse whitespace after decoding so entities such as "&nbsp;" are
    # folded into a single space as well.
    return re.sub(r'\s+', ' ', decoded).strip()
def extract_folder_name_from_title(title, unwanted_keywords):
@@ -1214,7 +1211,6 @@ class PostProcessorWorker:
if (self.show_external_links or self.extract_links_only) and post_content_html: # type: ignore
if self._check_pause(f"External link extraction for post {post_id}"): return 0, num_potential_files_in_post, [], []
try:
# Regex for typical Mega decryption keys (43 or 22 chars, alphanumeric + hyphen/underscore)
mega_key_pattern = re.compile(r'\b([a-zA-Z0-9_-]{43}|[a-zA-Z0-9_-]{22})\b')
unique_links_data = {}
for match in link_pattern.finditer(post_content_html):
@@ -1234,19 +1230,16 @@ class PostProcessorWorker:
platform = get_link_platform(link_url)
decryption_key_found = ""
if platform == 'mega':
# 1. Check if key is in the URL fragment
parsed_mega_url = urlparse(link_url)
if parsed_mega_url.fragment:
potential_key_from_fragment = parsed_mega_url.fragment.split('!')[-1] # Handle cases like #!key or #key
if mega_key_pattern.fullmatch(potential_key_from_fragment):
decryption_key_found = potential_key_from_fragment
# 2. If not in fragment, search in link text
if not decryption_key_found and link_text:
key_match_in_text = mega_key_pattern.search(link_text)
if key_match_in_text:
decryption_key_found = key_match_in_text.group(1)
# 3. If still not found, search the whole post content (if extracting links only, as it's more critical)
if not decryption_key_found and self.extract_links_only and post_content_html:
key_match_in_content = mega_key_pattern.search(strip_html_tags(post_content_html)) # Search cleaned content
if key_match_in_content: