mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d572207122 | ||
|
|
b2fc748c2c | ||
|
|
8632050c45 | ||
|
|
2aa694eed0 | ||
|
|
4d2513eca8 | ||
|
|
67685b781e | ||
|
|
f7c0f1a964 | ||
|
|
99da3ca48e |
@@ -67,7 +67,7 @@ docker run -it --rm wayback_machine_downloader [options] URL
|
|||||||
As an example of how this works without cloning this repo, this command fetches smallrockets.com until the year 2013:
|
As an example of how this works without cloning this repo, this command fetches smallrockets.com until the year 2013:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
|
docker run -v .:/build/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
|
||||||
```
|
```
|
||||||
|
|
||||||
### 🐳 Using Docker Compose
|
### 🐳 Using Docker Compose
|
||||||
|
|||||||
@@ -86,6 +86,10 @@ option_parser = OptionParser.new do |opts|
|
|||||||
options[:subdomain_depth] = t
|
options[:subdomain_depth] = t
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
|
||||||
|
options[:page_requisites] = true
|
||||||
|
end
|
||||||
|
|
||||||
opts.on("-v", "--version", "Display version") do |t|
|
opts.on("-v", "--version", "Display version") do |t|
|
||||||
options[:version] = t
|
options[:version] = t
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ require 'digest'
|
|||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
require_relative 'wayback_machine_downloader/to_regex'
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
require_relative 'wayback_machine_downloader/archive_api'
|
require_relative 'wayback_machine_downloader/archive_api'
|
||||||
|
require_relative 'wayback_machine_downloader/page_requisites'
|
||||||
require_relative 'wayback_machine_downloader/subdom_processor'
|
require_relative 'wayback_machine_downloader/subdom_processor'
|
||||||
require_relative 'wayback_machine_downloader/url_rewrite'
|
require_relative 'wayback_machine_downloader/url_rewrite'
|
||||||
|
|
||||||
@@ -127,8 +128,9 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
include SubdomainProcessor
|
include SubdomainProcessor
|
||||||
|
include URLRewrite
|
||||||
|
|
||||||
VERSION = "2.4.4"
|
VERSION = "2.4.5"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -142,7 +144,7 @@ class WaybackMachineDownloader
|
|||||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
||||||
:snapshot_at
|
:snapshot_at, :page_requisites
|
||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
validate_params(params)
|
validate_params(params)
|
||||||
@@ -175,6 +177,8 @@ class WaybackMachineDownloader
|
|||||||
@subdomain_depth = params[:subdomain_depth] || 1
|
@subdomain_depth = params[:subdomain_depth] || 1
|
||||||
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
|
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
|
||||||
@max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
|
@max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
|
||||||
|
@page_requisites = params[:page_requisites] || false
|
||||||
|
@pending_jobs = Concurrent::AtomicFixnum.new(0)
|
||||||
|
|
||||||
# URL for rejecting invalid/unencoded wayback urls
|
# URL for rejecting invalid/unencoded wayback urls
|
||||||
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
|
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
|
||||||
@@ -183,18 +187,29 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Derives the name of the local backup directory from @base_url.
#
# The host part of the URL is used when the URL contains '//', otherwise the
# whole (wildcard-stripped) value is used. The result is sanitized so it can
# be used as a directory name, and falls back to 'site' when nothing usable
# remains.
#
# @return [String] a non-empty, filesystem-safe name
def backup_name
  # to_s makes a nil @base_url behave like an empty one instead of raising
  url_to_process = @base_url.to_s
  url_to_process = url_to_process.chomp('/*') if url_to_process.end_with?('/*')

  # "scheme://host/..." -> host; split('/')[2] is nil for e.g. "http://"
  raw = url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process
  raw = raw.to_s

  # if it looks like a wildcard pattern, normalize to a safe host-ish name
  raw = raw.sub(/\A\*\./, 'all-') if raw.start_with?('*.')

  # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
  if Gem.win_platform?
    raw = raw.gsub(/[:*?"<>|]/, '_')
    # trailing spaces/dots are stripped on Windows only, as in the original
    raw = raw.gsub(/[ .]+\z/, '')
  else
    # still good practice to strip path separators (and maybe '*' for POSIX too)
    raw = raw.gsub(/[\/:*?"<>|]/, '_')
  end

  raw.empty? ? 'site' : raw
end
|
||||||
@@ -570,6 +585,12 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
# Load IDs of already downloaded files
|
# Load IDs of already downloaded files
|
||||||
downloaded_ids = load_downloaded_ids
|
downloaded_ids = load_downloaded_ids
|
||||||
|
|
||||||
|
# We use a thread-safe Set to track what we have queued/downloaded in this session
|
||||||
|
# to avoid infinite loops with page requisites
|
||||||
|
@session_downloaded_ids = Concurrent::Set.new
|
||||||
|
downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
|
||||||
|
|
||||||
files_to_process = files_to_download.reject do |file_info|
|
files_to_process = files_to_download.reject do |file_info|
|
||||||
downloaded_ids.include?(file_info[:file_id])
|
downloaded_ids.include?(file_info[:file_id])
|
||||||
end
|
end
|
||||||
@@ -581,7 +602,7 @@ class WaybackMachineDownloader
|
|||||||
puts "Found #{skipped_count} previously downloaded files, skipping them."
|
puts "Found #{skipped_count} previously downloaded files, skipping them."
|
||||||
end
|
end
|
||||||
|
|
||||||
if remaining_count == 0
|
if remaining_count == 0 && !@page_requisites
|
||||||
puts "All matching files have already been downloaded."
|
puts "All matching files have already been downloaded."
|
||||||
cleanup
|
cleanup
|
||||||
return
|
return
|
||||||
@@ -594,12 +615,22 @@ class WaybackMachineDownloader
|
|||||||
@download_mutex = Mutex.new
|
@download_mutex = Mutex.new
|
||||||
|
|
||||||
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
||||||
pool = Concurrent::FixedThreadPool.new(thread_count)
|
@worker_pool = Concurrent::FixedThreadPool.new(thread_count)
|
||||||
|
|
||||||
processing_files(pool, files_to_process)
|
# initial batch
|
||||||
|
files_to_process.each do |file_remote_info|
|
||||||
|
@session_downloaded_ids.add(file_remote_info[:file_id])
|
||||||
|
submit_download_job(file_remote_info)
|
||||||
|
end
|
||||||
|
|
||||||
pool.shutdown
|
# wait for all jobs to finish
|
||||||
pool.wait_for_termination
|
loop do
|
||||||
|
sleep 0.5
|
||||||
|
break if @pending_jobs.value == 0
|
||||||
|
end
|
||||||
|
|
||||||
|
@worker_pool.shutdown
|
||||||
|
@worker_pool.wait_for_termination
|
||||||
|
|
||||||
end_time = Time.now
|
end_time = Time.now
|
||||||
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
|
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
|
||||||
@@ -617,6 +648,138 @@ class WaybackMachineDownloader
|
|||||||
cleanup
|
cleanup
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# helper to submit jobs and increment the counter
|
||||||
|
# Queues one download on @worker_pool and tracks it in @pending_jobs.
#
# The counter is incremented before the job is posted and decremented when
# the job finishes (successfully or not). If the pool refuses the job, either
# by returning false or by raising, the counter is rolled back here so that a
# caller waiting for @pending_jobs to reach zero cannot wait forever.
#
# @param file_remote_info [Hash] description of the file, passed through to
#   process_single_file
# @return [Boolean] whatever @worker_pool.post returned (true when queued)
def submit_download_job(file_remote_info)
  @pending_jobs.increment
  accepted = false

  begin
    accepted = @worker_pool.post do
      begin
        process_single_file(file_remote_info)
      ensure
        @pending_jobs.decrement
      end
    end
  ensure
    # the job block will never run, so undo the increment ourselves
    @pending_jobs.decrement unless accepted
  end

  accepted
end
|
||||||
|
|
||||||
|
# Downloads a single file through the connection pool, reports the result,
# records successful downloads and, when page requisites are enabled, scans
# downloaded page-like files for further assets.
#
# A download counts as successful when the message returned by download_file
# contains ' -> '. Any StandardError raised along the way is logged through
# @logger and not re-raised.
#
# @param file_remote_info [Hash] must provide :file_id and :file_url
def process_single_file(file_remote_info)
  download_success = false
  downloaded_path = nil

  @connection_pool.with_connection do |connection|
    result_message, downloaded_path = download_file(file_remote_info, connection)
    download_success = result_message ? result_message.include?(' -> ') : false

    @download_mutex.synchronize do
      @processed_file_count += 1 if @processed_file_count < @total_to_download
      # only print if it's a "User" file or a requisite we found
      puts result_message if result_message
    end
  end

  return unless download_success

  append_to_db(file_remote_info[:file_id])

  # \A...\z anchors the whole extension; `$` would also accept a trailing newline
  page_like = downloaded_path &&
              File.extname(downloaded_path).match?(/\A\.(html?|php|asp|aspx|jsp)\z/i)
  process_page_requisites(downloaded_path, file_remote_info) if @page_requisites && page_like
rescue => e
  @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
end
|
||||||
|
|
||||||
|
# Scans a downloaded page for assets (via PageRequisites.extract) and queues
# a download job for every asset not yet seen in this session.
#
# Relative asset URLs are resolved against the parent's :file_url. Assets
# whose path is already a Wayback "/web/<timestamp>/<original url>" embed are
# unwrapped and use the embedded timestamp; all others inherit the parent's
# timestamp. Links without an extension or with a page extension are skipped.
#
# @param file_path [String] local path of the downloaded page
# @param parent_remote_info [Hash] must provide :file_url and :timestamp
def process_page_requisites(file_path, parent_remote_info)
  return unless File.exist?(file_path)

  content = File.read(file_path).force_encoding('UTF-8').scrub
  assets = PageRequisites.extract(content)

  # prepare base URI for resolving relative paths
  parent_raw = parent_remote_info[:file_url]
  parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)

  begin
    base_uri = URI(parent_raw)
    # calculate the "root" host of the site we are downloading to compare later
    current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
  rescue URI::InvalidURIError
    return
  end

  parent_timestamp = parent_remote_info[:timestamp]

  assets.each do |asset_rel_url|
    begin
      # resolve full URL (handles relative paths like "../img/logo.png")
      resolved_uri = base_uri + asset_rel_url

      # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed
      asset_timestamp = parent_timestamp
      if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
        embedded_ts = $1
        begin
          resolved_uri = URI($2)
          asset_timestamp = embedded_ts.to_i
        rescue URI::InvalidURIError
          # fall back to original resolved_uri and parent timestamp
        end
      end

      # filter out navigation links (pages) vs assets:
      # skip if extension is empty or looks like an HTML page
      ext = File.extname(resolved_uri.path).downcase
      next if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)

      # construct the URL for the Wayback API
      asset_wbm_url = resolved_uri.host + resolved_uri.path
      asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query

      # construct the local file ID:
      # same domain      -> path without the leading slash (e.g. static/css/style.css)
      # different domain -> keep the domain folder (e.g. cdn.google.com/jquery.js)
      asset_file_id = if resolved_uri.host == current_project_host
        resolved_uri.path.delete_prefix('/')
      else
        asset_wbm_url
      end
    rescue StandardError
      # covers URI::InvalidURIError as well as e.g. a nil host; skip this asset
      next
    end

    # sanitize and queue
    asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)

    # Test-and-set under the mutex: several worker threads may discover the
    # same asset at once, and a separate include?/add pair would let more
    # than one of them queue it.
    newly_queued = @download_mutex.synchronize do
      if @session_downloaded_ids.include?(asset_id)
        false
      else
        @session_downloaded_ids.add(asset_id)
        @total_to_download += 1
        puts "Queued requisite: #{asset_file_id}"
        true
      end
    end
    next unless newly_queued

    submit_download_job(
      file_url: asset_wbm_url,
      timestamp: asset_timestamp,
      file_id: asset_id
    )
  end
end
|
||||||
|
|
||||||
def structure_dir_path dir_path
|
def structure_dir_path dir_path
|
||||||
begin
|
begin
|
||||||
FileUtils::mkdir_p dir_path unless File.exist? dir_path
|
FileUtils::mkdir_p dir_path unless File.exist? dir_path
|
||||||
@@ -648,7 +811,8 @@ class WaybackMachineDownloader
|
|||||||
begin
|
begin
|
||||||
content = File.binread(file_path)
|
content = File.binread(file_path)
|
||||||
|
|
||||||
if file_ext == '.html' || file_ext == '.htm'
|
# detect encoding for HTML files
|
||||||
|
if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
|
||||||
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
|
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
|
||||||
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
|
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
|
||||||
else
|
else
|
||||||
@@ -664,13 +828,13 @@ class WaybackMachineDownloader
|
|||||||
# URLs in JavaScript
|
# URLs in JavaScript
|
||||||
content = rewrite_js_urls(content)
|
content = rewrite_js_urls(content)
|
||||||
|
|
||||||
# for URLs in HTML attributes that start with a single slash
|
# for URLs that start with a single slash, make them relative
|
||||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
||||||
prefix, path, suffix = $1, $2, $3
|
prefix, path, suffix = $1, $2, $3
|
||||||
"#{prefix}./#{path}#{suffix}"
|
"#{prefix}./#{path}#{suffix}"
|
||||||
end
|
end
|
||||||
|
|
||||||
# for URLs in CSS that start with a single slash
|
# for URLs in CSS that start with a single slash, make them relative
|
||||||
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
|
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
|
||||||
path = $1
|
path = $1
|
||||||
"url(\"./#{path}\")"
|
"url(\"./#{path}\")"
|
||||||
@@ -723,7 +887,7 @@ class WaybackMachineDownloader
|
|||||||
# check existence *before* download attempt
|
# check existence *before* download attempt
|
||||||
# this handles cases where a file was created manually or by a previous partial run without a .db entry
|
# this handles cases where a file was created manually or by a previous partial run without a .db entry
|
||||||
if File.exist? file_path
|
if File.exist? file_path
|
||||||
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
|
return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
|
||||||
end
|
end
|
||||||
|
|
||||||
begin
|
begin
|
||||||
@@ -735,13 +899,13 @@ class WaybackMachineDownloader
|
|||||||
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
||||||
rewrite_urls_to_relative(file_path)
|
rewrite_urls_to_relative(file_path)
|
||||||
end
|
end
|
||||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
|
||||||
when :skipped_not_found
|
when :skipped_not_found
|
||||||
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
|
||||||
else
|
else
|
||||||
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
||||||
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
||||||
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
|
||||||
end
|
end
|
||||||
rescue StandardError => e
|
rescue StandardError => e
|
||||||
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
@@ -749,7 +913,7 @@ class WaybackMachineDownloader
|
|||||||
File.delete(file_path)
|
File.delete(file_path)
|
||||||
msg += "\n#{file_path} was empty and was removed."
|
msg += "\n#{file_path} was empty and was removed."
|
||||||
end
|
end
|
||||||
msg
|
return [msg, nil]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,10 @@ module ArchiveAPI
|
|||||||
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
||||||
request_url.query = URI.encode_www_form(params)
|
request_url.query = URI.encode_www_form(params)
|
||||||
|
|
||||||
|
retries = 0
|
||||||
|
max_retries = (@max_retries || 3)
|
||||||
|
delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
|
||||||
|
|
||||||
begin
|
begin
|
||||||
response = http.get(request_url)
|
response = http.get(request_url)
|
||||||
body = response.body.to_s.strip
|
body = response.body.to_s.strip
|
||||||
@@ -26,7 +30,21 @@ module ArchiveAPI
|
|||||||
json.shift if json.first == ["timestamp", "original"]
|
json.shift if json.first == ["timestamp", "original"]
|
||||||
json
|
json
|
||||||
rescue JSON::ParserError => e
|
rescue JSON::ParserError => e
|
||||||
warn "Failed to fetch data from API: #{e.message}"
|
warn "Failed to parse JSON from API for #{url}: #{e.message}"
|
||||||
|
[]
|
||||||
|
rescue Net::ReadTimeout, Net::OpenTimeout => e
|
||||||
|
if retries < max_retries
|
||||||
|
retries += 1
|
||||||
|
warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
|
||||||
|
sleep(delay * retries)
|
||||||
|
retry
|
||||||
|
else
|
||||||
|
warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
|
||||||
|
[]
|
||||||
|
end
|
||||||
|
rescue StandardError => e
|
||||||
|
# treat any other transient-ish error similarly, though without retries for now
|
||||||
|
warn "Error fetching CDX data for #{url}: #{e.message}"
|
||||||
[]
|
[]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
33
lib/wayback_machine_downloader/page_requisites.rb
Normal file
33
lib/wayback_machine_downloader/page_requisites.rb
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Finds the URLs of page requisites (images, stylesheets, scripts, ...)
# referenced from a piece of HTML.
module PageRequisites
  # regex to find links in href, src, url(), and srcset
  # Capture groups, in order:
  #   1. value of an href / src / data-src / data-url attribute
  #   2. target of a CSS url(...) reference
  #   3. value of a srcset attribute (comma-separated candidate list)
  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i

  # prefixes of references that are not downloadable assets
  IGNORED_PREFIXES = ['data:', 'mailto:', '#', 'javascript:'].freeze

  # Returns the unique asset URLs found in +html_content+, in document order.
  #
  # srcset values are recognised by the capture group that matched rather
  # than by guessing from their descriptors, so every candidate is extracted
  # regardless of its width/density descriptor ("480w", "1.5x", none at all).
  # Candidates are split on commas, so a data: URI containing a comma inside
  # a srcset is not handled specially.
  #
  # @param html_content [String]
  # @return [Array<String>]
  def self.extract(html_content)
    assets = []

    html_content.scan(ASSET_REGEX) do |attr_url, css_url, srcset|
      if srcset
        # e.g. "image.jpg 1x, image2.jpg 2x" -> image.jpg, image2.jpg
        srcset.split(',').each do |candidate|
          src_url = candidate.strip.split(' ').first
          assets << src_url if valid_asset?(src_url)
        end
      else
        # url( foo.png ) captures surrounding whitespace; drop it
        url = (attr_url || css_url)&.strip
        assets << url if valid_asset?(url)
      end
    end

    assets.uniq
  end

  # Tells whether +url+ is worth queuing: present, non-blank, and not a
  # data:/mailto:/javascript: reference or a bare anchor.
  #
  # @param url [String, nil]
  # @return [Boolean]
  def self.valid_asset?(url)
    return false if url.nil?

    candidate = url.strip
    return false if candidate.empty?

    !candidate.start_with?(*IGNORED_PREFIXES)
  end
end
|
||||||
@@ -1,74 +1,85 @@
|
|||||||
# frozen_string_literal: true
|
# frozen_string_literal: true
|
||||||
|
|
||||||
# URLs in HTML attributes
|
# Rewrites absolute and web.archive.org URLs inside downloaded HTML, CSS and
# JavaScript into local paths. All methods modify +content+ in place and
# return it.
#
# In every pattern the host part excludes the characters that terminate the
# URL (quotes, and ")" for CSS). Without that restriction a URL with no path,
# e.g. "http://example.com", lets the host match run past its closing
# delimiter up to the next "/" anywhere in the document, and the replacement
# then destroys everything in between.
module URLRewrite
  # server-side extensions that should work locally
  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze

  # Rewrites URLs in href/src/action/data-src/data-url attributes.
  #
  # @param content [String] HTML, modified in place
  # @return [String] the same +content+ object
  def rewrite_html_attr_urls(content)
    # rewrite archived (web.archive.org/web/<timestamp>/...) URLs to local paths
    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/"']+([^"']*)(["'])/i) do
      prefix, path, suffix = $1, $2, $3
      "#{prefix}#{normalize_path_for_local(path)}#{suffix}"
    end

    # rewrite remaining absolute URLs; note the host is not compared against
    # the site being downloaded, so any absolute URL is made local
    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/"']+([^"']*)(["'])/i) do
      prefix, path, suffix = $1, $2, $3
      "#{prefix}#{normalize_path_for_local(path)}#{suffix}"
    end

    content
  end

  # Rewrites URLs inside CSS url(...) references. The result is always
  # emitted in the form url("path").
  #
  # @param content [String] CSS (or HTML with inline CSS), modified in place
  # @return [String] the same +content+ object
  def rewrite_css_urls(content)
    # rewrite archived URLs in CSS
    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/"'\)]+([^"'\)]*?)["']?\s*\)/i) do
      "url(\"#{normalize_path_for_local($1)}\")"
    end

    # rewrite absolute URLs in CSS
    content.gsub!(/url\(\s*["']?https?:\/\/[^\/"'\)]+([^"'\)]*?)["']?\s*\)/i) do
      "url(\"#{normalize_path_for_local($1)}\")"
    end

    content
  end

  # Rewrites URLs that appear as quoted string literals in JavaScript.
  #
  # @param content [String] JavaScript (or HTML with inline scripts), modified in place
  # @return [String] the same +content+ object
  def rewrite_js_urls(content)
    # rewrite archive.org URLs in JavaScript strings
    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/"']+([^"']*)(["'])/i) do
      quote_start, path, quote_end = $1, $2, $3
      "#{quote_start}#{normalize_path_for_local(path)}#{quote_end}"
    end

    # rewrite absolute URLs in JavaScript
    content.gsub!(/(["'])https?:\/\/[^\/"']+([^"']*)(["'])/i) do
      quote_start, path, quote_end = $1, $2, $3
      # defensive guard kept from the original implementation
      next "#{quote_start}http#{path}#{quote_end}" if path.start_with?('s://', '://')
      "#{quote_start}#{normalize_path_for_local(path)}#{quote_end}"
    end

    content
  end

  private

  # Turns the path part of a URL into the path used for the local copy.
  #
  # - an empty path or "/" becomes "./index.html"
  # - the query string is dropped
  # - a path that does not start with "./" or "/" is prefixed with "./"
  # - unless the extension is a server-side one, a path ending in "/" or
  #   containing no "." at all is treated as a directory and gets
  #   "/index.html" appended
  #
  # @param path [String]
  # @return [String]
  def normalize_path_for_local(path)
    return "./index.html" if path.empty? || path == "/"

    # handle query strings - they're already part of the filename
    # (to_s: splitting a path made only of "?" yields no elements)
    path = path.split('?').first.to_s if path.include?('?')

    # server-side scripts keep their path as-is apart from the "./" prefix
    server_side = SERVER_SIDE_EXTS.include?(File.extname(path).downcase)

    path = "./#{path}" unless path.start_with?('./', '/')

    # if it looks like a directory, add index.html
    if !server_side && (path.end_with?('/') || !path.include?('.'))
      path = "#{path.chomp('/')}/index.html"
    end

    path
  end
end
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.4.4"
|
s.version = "2.4.5"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
s.authors = ["strawberrymaster"]
|
s.authors = ["strawberrymaster"]
|
||||||
s.email = "strawberrymaster@vivaldi.net"
|
s.email = "strawberrymaster@vivaldi.net"
|
||||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
|
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/page_requisites.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
|
||||||
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
||||||
s.license = "MIT"
|
s.license = "MIT"
|
||||||
s.required_ruby_version = ">= 3.4.3"
|
s.required_ruby_version = ">= 3.4.3"
|
||||||
|
|||||||
Reference in New Issue
Block a user