From f7c0f1a96421bbbebf67292cd1daab41b2ae6028 Mon Sep 17 00:00:00 2001 From: Felipe <41008398+StrawberryMaster@users.noreply.github.com> Date: Tue, 4 Nov 2025 23:18:04 +0000 Subject: [PATCH] Better support for .php, .asp, and other files when using --local see #37 --- lib/wayback_machine_downloader.rb | 8 +- lib/wayback_machine_downloader/url_rewrite.rb | 143 ++++++++++-------- 2 files changed, 82 insertions(+), 69 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index fe5721c..12b6999 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -127,6 +127,7 @@ class WaybackMachineDownloader include ArchiveAPI include SubdomainProcessor + include URLRewrite VERSION = "2.4.4" DEFAULT_TIMEOUT = 30 @@ -648,7 +649,8 @@ class WaybackMachineDownloader begin content = File.binread(file_path) - if file_ext == '.html' || file_ext == '.htm' + # detect encoding for HTML files + if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp' encoding = content.match(/]+)/i)&.captures&.first || 'UTF-8' content.force_encoding(encoding) rescue content.force_encoding('UTF-8') else @@ -664,13 +666,13 @@ class WaybackMachineDownloader # URLs in JavaScript content = rewrite_js_urls(content) - # for URLs in HTML attributes that start with a single slash + # for URLs that start with a single slash, make them relative content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do prefix, path, suffix = $1, $2, $3 "#{prefix}./#{path}#{suffix}" end - # for URLs in CSS that start with a single slash + # for URLs in CSS that start with a single slash, make them relative content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do path = $1 "url(\"./#{path}\")" diff --git a/lib/wayback_machine_downloader/url_rewrite.rb b/lib/wayback_machine_downloader/url_rewrite.rb index 5d56626..f6bd476 100644 --- a/lib/wayback_machine_downloader/url_rewrite.rb +++ 
# frozen_string_literal: true

# Helpers that rewrite URLs inside downloaded HTML, CSS and JavaScript so a
# mirrored site can be browsed from the local filesystem.
#
# Every public method mutates the +content+ string in place and returns it.
#
# NOTE(review): the "absolute URL" passes rewrite *any* http(s) URL, not only
# URLs on the mirrored site's own domain, because this module has no knowledge
# of which domain is being mirrored — confirm that is the intended behaviour.
module URLRewrite
  # server-side extensions that should work locally
  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze

  # Wayback Machine snapshot prefix, e.g. https://web.archive.org/web/2020id_/
  WAYBACK_PREFIX = 'https?://web\.archive\.org/web/\d+(?:id_)?/'
  # Scheme + host. The host stops at the first slash OR quote so a match can
  # never run past the end of the attribute / string literal it started in.
  ORIGIN = %q{https?://[^/"']+}
  # Same as ORIGIN, but also stops at ")" because CSS url() may be unquoted.
  CSS_ORIGIN = %q{https?://[^/"')]+}
  # Opening part of an HTML attribute that carries a URL.
  HTML_ATTR_OPEN = %q{\s(?:href|src|action|data-src|data-url)=["']}

  # Each list is ordered: archive.org-wrapped URLs first, then plain absolute
  # URLs. The first pass produces "./..." paths the second pass cannot match.
  HTML_ATTR_PATTERNS = [
    /(#{HTML_ATTR_OPEN})#{WAYBACK_PREFIX}#{ORIGIN}([^"']*)(["'])/i,
    /(#{HTML_ATTR_OPEN})#{ORIGIN}([^"']*)(["'])/i
  ].freeze

  CSS_URL_PATTERNS = [
    /url\(\s*["']?#{WAYBACK_PREFIX}#{CSS_ORIGIN}([^"'\)]*?)["']?\s*\)/i,
    /url\(\s*["']?#{CSS_ORIGIN}([^"'\)]*?)["']?\s*\)/i
  ].freeze

  JS_STRING_PATTERNS = [
    /(["'])#{WAYBACK_PREFIX}#{ORIGIN}([^"']*)(["'])/i,
    /(["'])#{ORIGIN}([^"']*)(["'])/i
  ].freeze

  private_constant :WAYBACK_PREFIX, :ORIGIN, :CSS_ORIGIN, :HTML_ATTR_OPEN,
                   :HTML_ATTR_PATTERNS, :CSS_URL_PATTERNS, :JS_STRING_PATTERNS

  # Rewrites URLs found in href/src/action/data-src/data-url attributes.
  #
  # @param content [String] HTML source; modified in place
  # @return [String] the same +content+ object
  def rewrite_html_attr_urls(content)
    HTML_ATTR_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        m = Regexp.last_match
        "#{m[1]}#{normalize_path_for_local(m[2])}#{m[3]}"
      end
    end
    content
  end

  # Rewrites URLs found inside CSS +url(...)+ expressions. The replacement is
  # always emitted in the double-quoted form +url("...")+.
  #
  # @param content [String] CSS (or HTML with inline CSS); modified in place
  # @return [String] the same +content+ object
  def rewrite_css_urls(content)
    CSS_URL_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        "url(\"#{normalize_path_for_local(Regexp.last_match(1))}\")"
      end
    end
    content
  end

  # Rewrites URLs found inside quoted JavaScript string literals.
  #
  # @param content [String] JavaScript (or HTML with inline JS); modified in place
  # @return [String] the same +content+ object
  def rewrite_js_urls(content)
    JS_STRING_PATTERNS.each do |pattern|
      content.gsub!(pattern) do
        m = Regexp.last_match
        "#{m[1]}#{normalize_path_for_local(m[2])}#{m[3]}"
      end
    end
    content
  end

  private

  # Converts the path portion of a URL into a "./"-relative local path.
  #
  # * the query string is dropped
  # * a "#fragment" is preserved and re-appended after normalisation
  # * leading slashes are replaced by "./" so the result is never
  #   root-absolute or protocol-relative
  # * a path that ends in "/" or whose last component has no "." is treated as
  #   a directory and gets "/index.html" appended, unless its extension is one
  #   of SERVER_SIDE_EXTS
  #
  # @param path [String] everything after the host (may be empty)
  # @return [String] local relative path, e.g. "./about/index.html#team"
  def normalize_path_for_local(path)
    # partition never returns nil, so inputs such as "" or "?" are safe
    base, marker, fragment = path.to_s.partition('#')
    base = base.partition('?').first

    # strip any run of leading "/" or "./" and anchor at the current directory
    local = "./#{base.sub(%r{\A(?:\./|/)+}, '')}"

    unless SERVER_SIDE_EXTS.include?(File.extname(local).downcase)
      # only the last component decides, so dotted directories ("v1.2/docs")
      # are still recognised as directories
      if local.end_with?('/') || !File.basename(local).include?('.')
        local = "#{local.chomp('/')}/index.html"
      end
    end

    "#{local}#{marker}#{fragment}"
  end
end