Better support for .php, .asp, and other files when using --local

see #37
2025-12-29 16:16:06 +00:00 · 2025-11-04 23:18:04 +00:00 · 2025-11-04 23:18:04 +00:00 · f7c0f1a964
commit f7c0f1a964
parent 99da3ca48e
2 changed files with 82 additions and 69 deletions
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@ -127,6 +127,7 @@ class WaybackMachineDownloader

  include ArchiveAPI
  include SubdomainProcessor
+  include URLRewrite

  VERSION = "2.4.4"
  DEFAULT_TIMEOUT = 30
@ -648,7 +649,8 @@ class WaybackMachineDownloader
    begin
      content = File.binread(file_path)

-      if file_ext == '.html' || file_ext == '.htm'
+      # detect the declared charset for HTML-like files (.html, .htm, .php, .asp)
+      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
      else
@ -664,13 +666,13 @@ class WaybackMachineDownloader
      # URLs in JavaScript
      content = rewrite_js_urls(content)
      
-      # for URLs in HTML attributes that start with a single slash
+      # for URLs in HTML attributes that start with a single slash, make them relative
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
        prefix, path, suffix = $1, $2, $3
        "#{prefix}./#{path}#{suffix}"
      end
      
-      # for URLs in CSS that start with a single slash
+      # for URLs in CSS that start with a single slash, make them relative
      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
        path = $1
        "url(\"./#{path}\")"
--- a/lib/wayback_machine_downloader/url_rewrite.rb
+++ b/lib/wayback_machine_downloader/url_rewrite.rb
@ -1,74 +1,85 @@
 # frozen_string_literal: true

-# URLs in HTML attributes
-def rewrite_html_attr_urls(content)
+module URLRewrite
+  # server-side extensions that should work locally
+  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze

-  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    prefix, url, suffix = $1, $2, $3
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
+  def rewrite_html_attr_urls(content)
+    # rewrite web.archive.org snapshot URLs in HTML attributes to local paths
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
      "#{prefix}#{path}#{suffix}"
-      rescue
-        "#{prefix}#{url}#{suffix}"
-      end
-    elsif url.start_with?('/')
-      "#{prefix}./#{url[1..-1]}#{suffix}"
-    else
-      "#{prefix}#{url}#{suffix}"
    end
+
+    # rewrite any remaining absolute http(s) URL in these attributes to a local path, whatever its host
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
    end
+
    content
-end
+  end

-# URLs in CSS
-def rewrite_css_urls(content)
-
-  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-    url = $1
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
+  def rewrite_css_urls(content)
+    # rewrite web.archive.org snapshot URLs inside CSS url(...) to local paths
+    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
      "url(\"#{path}\")"
-      rescue
-        "url(\"#{url}\")"
-      end
-    elsif url.start_with?('/')
-      "url(\"./#{url[1..-1]}\")"
-    else
-      "url(\"#{url}\")"
    end
+
+    # rewrite absolute URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
    end
+
    content
-end
+  end

-# URLs in JavaScript
-def rewrite_js_urls(content)
-  
-  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    quote_start, url, quote_end = $1, $2, $3
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
+  def rewrite_js_urls(content)
+    # rewrite archive.org URLs in JavaScript strings
+    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      path = normalize_path_for_local(path)
      "#{quote_start}#{path}#{quote_end}"
-      rescue
-        "#{quote_start}#{url}#{quote_end}"
-      end
-    elsif url.start_with?('/')
-      "#{quote_start}./#{url[1..-1]}#{quote_end}"
-    else
-      "#{quote_start}#{url}#{quote_end}"
    end
+
+    # rewrite absolute URLs in JavaScript
+    content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
    end

    content
+  end
+
+  private
+
+  def normalize_path_for_local(path)
+    return "./index.html" if path.empty? || path == "/"
+    
+    # drop the query string: keep only the part of the path before the first '?'
+    path = path.split('?').first if path.include?('?')
+    
+    # check if this is a server-side script
+    ext = File.extname(path).downcase
+    if SERVER_SIDE_EXTS.include?(ext)
+      # keep the path as-is; prepend ./ only when it does not already start with ./ or /
+      path = "./#{path}" unless path.start_with?('./', '/')
+    else
+      # regular file handling
+      path = "./#{path}" unless path.start_with?('./', '/')
+      
+      # if it looks like a directory, add index.html
+      if path.end_with?('/') || !path.include?('.')
+        path = "#{path.chomp('/')}/index.html"
+      end
+    end
+    
+    path
+  end
 end