mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
Better support for .php, .asp, and other files when using --local
see #37
This commit is contained in:
parent
99da3ca48e
commit
f7c0f1a964
@ -127,6 +127,7 @@ class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
include SubdomainProcessor
|
||||
include URLRewrite
|
||||
|
||||
VERSION = "2.4.4"
|
||||
DEFAULT_TIMEOUT = 30
|
||||
@ -648,7 +649,8 @@ class WaybackMachineDownloader
|
||||
begin
|
||||
content = File.binread(file_path)
|
||||
|
||||
if file_ext == '.html' || file_ext == '.htm'
|
||||
# detect encoding for HTML files
|
||||
if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
|
||||
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
|
||||
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
|
||||
else
|
||||
@ -664,13 +666,13 @@ class WaybackMachineDownloader
|
||||
# URLs in JavaScript
|
||||
content = rewrite_js_urls(content)
|
||||
|
||||
# for URLs in HTML attributes that start with a single slash
|
||||
# for URLs that start with a single slash, make them relative
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
||||
prefix, path, suffix = $1, $2, $3
|
||||
"#{prefix}./#{path}#{suffix}"
|
||||
end
|
||||
|
||||
# for URLs in CSS that start with a single slash
|
||||
# for URLs in CSS that start with a single slash, make them relative
|
||||
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
|
||||
path = $1
|
||||
"url(\"./#{path}\")"
|
||||
|
||||
@ -1,74 +1,85 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# URLs in HTML attributes
|
||||
def rewrite_html_attr_urls(content)
|
||||
module URLRewrite
|
||||
# server-side extensions that should work locally
|
||||
SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
|
||||
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
||||
prefix, url, suffix = $1, $2, $3
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
def rewrite_html_attr_urls(content)
|
||||
# rewrite URLs to relative paths
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
|
||||
prefix, path, suffix = $1, $2, $3
|
||||
path = normalize_path_for_local(path)
|
||||
"#{prefix}#{path}#{suffix}"
|
||||
rescue
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"#{prefix}./#{url[1..-1]}#{suffix}"
|
||||
else
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
|
||||
# rewrite absolute URLs to same domain as relative
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
|
||||
prefix, path, suffix = $1, $2, $3
|
||||
path = normalize_path_for_local(path)
|
||||
"#{prefix}#{path}#{suffix}"
|
||||
end
|
||||
|
||||
content
|
||||
end
|
||||
end
|
||||
|
||||
# URLs in CSS
|
||||
def rewrite_css_urls(content)
|
||||
|
||||
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
|
||||
url = $1
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
def rewrite_css_urls(content)
|
||||
# rewrite URLs in CSS
|
||||
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
|
||||
path = normalize_path_for_local($1)
|
||||
"url(\"#{path}\")"
|
||||
rescue
|
||||
"url(\"#{url}\")"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"url(\"./#{url[1..-1]}\")"
|
||||
else
|
||||
"url(\"#{url}\")"
|
||||
end
|
||||
|
||||
# rewrite absolute URLs in CSS
|
||||
content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
|
||||
path = normalize_path_for_local($1)
|
||||
"url(\"#{path}\")"
|
||||
end
|
||||
|
||||
content
|
||||
end
|
||||
end
|
||||
|
||||
# URLs in JavaScript
|
||||
def rewrite_js_urls(content)
|
||||
|
||||
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
||||
quote_start, url, quote_end = $1, $2, $3
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
def rewrite_js_urls(content)
|
||||
# rewrite archive.org URLs in JavaScript strings
|
||||
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
|
||||
quote_start, path, quote_end = $1, $2, $3
|
||||
path = normalize_path_for_local(path)
|
||||
"#{quote_start}#{path}#{quote_end}"
|
||||
rescue
|
||||
"#{quote_start}#{url}#{quote_end}"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"#{quote_start}./#{url[1..-1]}#{quote_end}"
|
||||
else
|
||||
"#{quote_start}#{url}#{quote_end}"
|
||||
end
|
||||
|
||||
# rewrite absolute URLs in JavaScript
|
||||
content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
|
||||
quote_start, path, quote_end = $1, $2, $3
|
||||
next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
|
||||
path = normalize_path_for_local(path)
|
||||
"#{quote_start}#{path}#{quote_end}"
|
||||
end
|
||||
|
||||
content
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Converts the path portion of a rewritten URL into a relative path that
# can be opened from the local copy.
#
# - the query string is dropped (the rewritten link points at the bare path)
# - a trailing "#fragment" is preserved and re-attached at the very end
# - leading "/" or "./" is normalized so the result always starts with "./"
# - server-side scripts (SERVER_SIDE_EXTS) keep their file name as-is
# - anything else that looks like a directory (trailing slash, or a last
#   segment without a dot) gets "/index.html" appended
#
# @param path [String, nil] path part of a URL, e.g. "/blog/post.php?id=1#top"
# @return [String] relative local path, e.g. "./blog/post.php#top"
def normalize_path_for_local(path)
  raw = path.to_s

  # split the fragment off first so it can't leak into the extension check
  # or end up in the middle of the path ("/about#team/index.html")
  hash_at = raw.index('#')
  fragment = hash_at ? raw[hash_at..-1] : ''
  local = hash_at ? raw[0...hash_at] : raw

  # drop the query string
  query_at = local.index('?')
  local = local[0...query_at] if query_at

  # root-relative ("/x") and already-relative ("./x") inputs are both
  # reduced to "x" here; the "./" prefix is added back once, below
  local = local.sub(%r{\A(?:\./|/)+}, '')
  return "./index.html#{fragment}" if local.empty?

  ext = File.extname(local).downcase
  unless SERVER_SIDE_EXTS.include?(ext)
    # if it looks like a directory, add index.html; only the last segment
    # is inspected so dotted directory names ("v1.2/docs") still count
    if local.end_with?('/') || !File.basename(local).include?('.')
      local = "#{local.chomp('/')}/index.html"
    end
  end

  "./#{local}#{fragment}"
end
|
||||
end
|
||||
Loading…
x
Reference in New Issue
Block a user