mirror of https://github.com/StrawberryMaster/wayback-machine-downloader.git (synced 2025-12-17 09:46:05 +00:00)
1142 lines · 37 KiB · Ruby
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'time'
require 'concurrent-ruby'
require 'logger'
require 'zlib'
require 'stringio'
require 'digest'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
require_relative 'wayback_machine_downloader/page_requisites'
require_relative 'wayback_machine_downloader/subdom_processor'
require_relative 'wayback_machine_downloader/url_rewrite'

class ConnectionPool
  MAX_AGE = 300
  CLEANUP_INTERVAL = 60
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3

  def initialize(size)
    @pool = SizedQueue.new(size)
    size.times { @pool << build_connection_entry }
    @cleanup_thread = schedule_cleanup
  end

  def with_connection
    entry = acquire_connection
    begin
      yield entry[:http]
    ensure
      release_connection(entry)
    end
  end

  def shutdown
    @cleanup_thread&.exit
    drain_pool { |entry| safe_finish(entry[:http]) }
  end

  private

  def acquire_connection
    entry = @pool.pop
    if stale?(entry)
      safe_finish(entry[:http])
      entry = build_connection_entry
    end
    entry
  end

  def release_connection(entry)
    if stale?(entry)
      safe_finish(entry[:http])
      entry = build_connection_entry
    end
    @pool << entry
  end

  def stale?(entry)
    http = entry[:http]
    !http.started? || (Time.now - entry[:created_at] > MAX_AGE)
  end

  def build_connection_entry
    { http: create_connection, created_at: Time.now }
  end

  def safe_finish(http)
    http.finish if http&.started?
  rescue StandardError
    nil
  end

  def drain_pool
    loop do
      entry = begin
        @pool.pop(true)
      rescue ThreadError
        break
      end
      yield(entry)
    end
  end

  def cleanup_old_connections
    entry = begin
      @pool.pop(true)
    rescue ThreadError
      return
    end
    if stale?(entry)
      safe_finish(entry[:http])
      entry = build_connection_entry
    end
    @pool << entry
  end

  def schedule_cleanup
    Thread.new do
      loop do
        cleanup_old_connections
        sleep CLEANUP_INTERVAL
      end
    end
  end

  def create_connection
    http = Net::HTTP.new("web.archive.org", 443)
    http.use_ssl = true
    http.read_timeout = DEFAULT_TIMEOUT
    http.open_timeout = DEFAULT_TIMEOUT
    http.keep_alive_timeout = 30
    http.max_retries = MAX_RETRIES
    http.start
    http
  end
end
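
# Usage sketch (illustrative only, not executed here): the downloader below
# builds one pool and checks connections out around each request.
#
#   pool = ConnectionPool.new(4)
#   pool.with_connection do |http|
#     uri = URI("https://web.archive.org/web/20230101000000id_/http://example.com/")
#     puts http.request(Net::HTTP::Get.new(uri)).code
#   end
#   pool.shutdown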

class WaybackMachineDownloader

  include ArchiveAPI
  include SubdomainProcessor
  include URLRewrite

  VERSION = "2.4.5"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
  RATE_LIMIT = 0.25 # Delay between requests in seconds
  CONNECTION_POOL_SIZE = 10
  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
  STATE_CDX_FILENAME = ".cdx.json"
  STATE_DB_FILENAME = ".downloaded.txt"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
    :snapshot_at, :page_requisites

  def initialize params
    validate_params(params)
    @base_url = params[:base_url]&.tidy_bytes
    @exact_url = params[:exact_url]
    if params[:directory]
      sanitized_dir = params[:directory].tidy_bytes
      @directory = File.expand_path(sanitized_dir)
    else
      @directory = nil
    end
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = [params[:threads_count].to_i, 1].max
    @rewritten = params[:rewritten]
    @reset = params[:reset]
    @keep = params[:keep]
    @timeout = params[:timeout] || DEFAULT_TIMEOUT
    @logger = setup_logger
    @failed_downloads = Concurrent::Array.new
    @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
    @db_mutex = Mutex.new
    @rewrite = params[:rewrite] || false
    @recursive_subdomains = params[:recursive_subdomains] || false
    @subdomain_depth = params[:subdomain_depth] || 1
    @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
    @page_requisites = params[:page_requisites] || false
    @pending_jobs = Concurrent::AtomicFixnum.new(0)

    # Regexp for rejecting invalid/unencoded wayback URLs
    @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/

    handle_reset
  end
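
  # Construction sketch (illustrative values; the keys mirror the params read
  # above and "https://example.com" is a placeholder):
  #
  #   wmd = WaybackMachineDownloader.new(
  #     base_url: "https://example.com",
  #     threads_count: 4,
  #     maximum_pages: 10
  #   )
  #   wmd.download_files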

  def backup_name
    url_to_process = @base_url
    url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')

    raw = if url_to_process.include?('//')
      url_to_process.split('/')[2]
    else
      url_to_process
    end

    # if it looks like a wildcard pattern, normalize to a safe host-ish name
    if raw&.start_with?('*.')
      raw = raw.sub(/\A\*\./, 'all-')
    end

    # sanitize for Windows (and safe cross-platform use) to avoid ENOTDIR on mkdir (e.g. the colon in host:port)
    if Gem.win_platform?
      raw = raw.gsub(/[:*?"<>|]/, '_')
      raw = raw.gsub(/[ .]+\z/, '')
    else
      # strip path separators and other unsafe characters on POSIX as well
      raw = raw.gsub(/[\/:*?"<>|]/, '_')
    end

    raw = 'site' if raw.nil? || raw.empty?
    raw
  end
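
  # Naming sketch (illustrative): a base_url of "https://example.com:8080/blog/*"
  # is reduced to its host part "example.com:8080" and sanitized to
  # "example.com_8080", so the default backup_path becomes
  # ./websites/example.com_8080.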

  def backup_path
    if @directory
      # @directory is already an absolute path, so use it as-is
      @directory
    else
      # ensure the default path is absolute and normalized
      cwd = Dir.pwd
      File.expand_path(File.join(cwd, 'websites', backup_name))
    end
  end

  def cdx_path
    File.join(backup_path, STATE_CDX_FILENAME)
  end

  def db_path
    File.join(backup_path, STATE_DB_FILENAME)
  end

  def handle_reset
    if @reset
      puts "Resetting download state..."
      FileUtils.rm_f(cdx_path)
      FileUtils.rm_f(db_path)
      puts "Removed state files: #{cdx_path}, #{db_path}"
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex(detect: true)
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex(detect: true)
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end
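
  # Filter sketch (illustrative): an --only value written as "/\.(png|jpe?g)$/i"
  # is expected to be detected as a regex by to_regex(detect: true) and matched
  # against the full file URL, while a plain string such as "images/" falls back
  # to a case-insensitive substring check. --exclude works the same way in reverse.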

  def get_all_snapshots_to_consider
    if File.exist?(cdx_path) && !@reset
      puts "Loading snapshot list from #{cdx_path}"
      begin
        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
        puts
        return Concurrent::Array.new(snapshot_list_to_consider)
      rescue JSON::ParserError => e
        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
        FileUtils.rm_f(cdx_path)
      rescue => e
        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
        FileUtils.rm_f(cdx_path)
      end
    end

    snapshot_list_to_consider = Concurrent::Array.new
    mutex = Mutex.new

    puts "Getting snapshot pages from Wayback Machine API..."

    # Fetch the initial set of snapshots, sequentially
    @connection_pool.with_connection do |connection|
      initial_list = get_raw_list_from_api(@base_url, nil, connection)
      initial_list ||= []
      mutex.synchronize do
        snapshot_list_to_consider.concat(initial_list)
        print "."
      end
    end

    # Fetch additional pages if the exact URL flag is not set
    unless @exact_url
      page_index = 0
      batch_size = [@threads_count, 5].min
      continue_fetching = true
      fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max)
      begin
        while continue_fetching && page_index < @maximum_pages
          # Determine the range of pages to fetch in this batch
          end_index = [page_index + batch_size, @maximum_pages].min
          current_batch = (page_index...end_index).to_a

          # Create futures for concurrent API calls
          futures = current_batch.map do |page|
            Concurrent::Future.execute(executor: fetch_pool) do
              result = nil
              @connection_pool.with_connection do |connection|
                result = get_raw_list_from_api("#{@base_url}/*", page, connection)
              end
              result ||= []
              [page, result]
            end
          end

          results = []

          futures.each do |future|
            begin
              # value! raises the underlying error if the future was rejected,
              # so failed pages are reported instead of silently inserting nil
              results << future.value!
            rescue => e
              puts "\nError fetching a snapshot page: #{e.message}"
            end
          end

          # Sort results by page number to maintain order
          results.sort_by! { |page, _| page }

          # Process results and check for empty pages
          results.each do |page, result|
            if result.nil? || result.empty?
              continue_fetching = false
              break
            else
              mutex.synchronize do
                snapshot_list_to_consider.concat(result)
                print "."
              end
            end
          end

          page_index = end_index

          sleep(RATE_LIMIT) if continue_fetching
        end
      ensure
        fetch_pool.shutdown
        fetch_pool.wait_for_termination
      end
    end

    puts " found #{snapshot_list_to_consider.length} snapshots."

    # Save the fetched list to the cache file
    begin
      FileUtils.mkdir_p(File.dirname(cdx_path))
      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
      puts "Saved snapshot list to #{cdx_path}"
    rescue => e
      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
    end
    puts

    snapshot_list_to_consider
  end
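
  # Cache shape sketch: judging by how the list is consumed below, each entry is
  # a [timestamp, original_url] pair, e.g. (illustrative values)
  #   [["20230101000000", "http://example.com/index.html"], ...]
  # and .cdx.json is simply that array serialized with JSON.pretty_generate.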

  # Get a composite snapshot file list for a specific timestamp
  def get_composite_snapshot_file_list(target_timestamp)
    file_versions = {}
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      next if file_timestamp.to_i > target_timestamp

      raw_tail = file_url.split('/')[3..-1]&.join('/')
      file_id = sanitize_and_prepare_id(raw_tail, file_url)
      next if file_id.nil?
      next if match_exclude_filter(file_url)
      next unless match_only_filter(file_url)

      if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
        file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
      end
    end
    file_versions.values
  end

  # Returns a list of files for the composite snapshot
  def get_file_list_composite_snapshot(target_timestamp)
    file_list = get_composite_snapshot_file_list(target_timestamp)
    # entries already carry :file_id, so only the newest-first ordering is needed here
    file_list.sort_by { |info| info[:timestamp].to_s }.reverse
  end

  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')

      raw_tail = file_url.split('/')[3..-1]&.join('/')
      file_id = sanitize_and_prepare_id(raw_tail, file_url)
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
        next
      end

      if file_id.include?('<') || file_id.include?('>')
        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif !match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
          end
        else
          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
        end
      end
    end
    file_list_curated
  end
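
  # Shape sketch: get_file_list_curated returns a Hash keyed by sanitized file id,
  # each value holding the newest matching capture, e.g. (illustrative values)
  #   { "index.html" => { file_url: "example.com/index.html", timestamp: "20230101000000" } }
  # get_file_list_by_timestamp later folds the key back in as :file_id.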

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')

      raw_tail = file_url.split('/')[3..-1]&.join('/')
      file_id = sanitize_and_prepare_id(raw_tail, file_url)
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
        next
      end

      file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
      if file_id_and_timestamp.nil?
        puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
        next
      end

      if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif !match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          # duplicate combo, ignore silently (verbose flag not shown here)
        else
          file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end

  def get_file_list_by_timestamp
    if @snapshot_at
      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
    elsif @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own progress output, so route it to stderr while we emit JSON
    @orig_stdout = $stdout
    $stdout = $stderr
    files = get_file_list_by_timestamp
    $stdout = @orig_stdout
    puts "["
    unless files.empty?
      files[0...-1].each do |file|
        puts file.to_json + ","
      end
      puts files[-1].to_json
    end
    puts "]"
  end

  def load_downloaded_ids
    downloaded_ids = Set.new
    if File.exist?(db_path) && !@reset
      puts "Loading list of already downloaded files from #{db_path}"
      begin
        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
      rescue => e
        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
        downloaded_ids.clear
      end
    end
    downloaded_ids
  end

  def append_to_db(file_id)
    @db_mutex.synchronize do
      begin
        FileUtils.mkdir_p(File.dirname(db_path))
        File.open(db_path, 'a') { |f| f.puts(file_id) }
      rescue => e
        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
      end
    end
  end
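
  # State file sketch: .downloaded.txt is an append-only list of sanitized file
  # ids, one per line, e.g. (illustrative paths)
  #   index.html
  #   assets/css/style.css
  # load_downloaded_ids re-reads it on startup so completed work is skipped.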

  def processing_files(pool, files_to_process)
    files_to_process.each do |file_remote_info|
      pool.post do
        download_success = false
        begin
          @connection_pool.with_connection do |connection|
            result_message, _path = download_file(file_remote_info, connection)
            # assume download success if the result message contains ' -> '
            if result_message && result_message.include?(' -> ')
              download_success = true
            end
            @download_mutex.synchronize do
              @processed_file_count += 1
              # adjust progress message to reflect remaining files
              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
              puts progress_message if progress_message
            end
          end
          # append to DB only after a successful download, outside the connection block
          if download_success
            append_to_db(file_remote_info[:file_id])
          end
        rescue => e
          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
          @download_mutex.synchronize do
            @processed_file_count += 1
          end
        end
        sleep(RATE_LIMIT)
      end
    end
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."

    FileUtils.mkdir_p(backup_path)

    # Load the list of files to potentially download
    files_to_download = file_list_by_timestamp

    if files_to_download.empty?
      puts "No files found matching criteria."
      cleanup
      return
    end

    total_files = files_to_download.count
    puts "#{total_files} files found matching criteria."

    # Load IDs of already downloaded files
    downloaded_ids = load_downloaded_ids

    # We use a thread-safe Set to track what we have queued/downloaded in this session
    # to avoid infinite loops with page requisites
    @session_downloaded_ids = Concurrent::Set.new
    downloaded_ids.each { |id| @session_downloaded_ids.add(id) }

    files_to_process = files_to_download.reject do |file_info|
      downloaded_ids.include?(file_info[:file_id])
    end

    remaining_count = files_to_process.count
    skipped_count = total_files - remaining_count

    if skipped_count > 0
      puts "Found #{skipped_count} previously downloaded files, skipping them."
    end

    if remaining_count == 0 && !@page_requisites
      puts "All matching files have already been downloaded."
      cleanup
      return
    end

    puts "#{remaining_count} files to download:"

    @processed_file_count = 0
    @total_to_download = remaining_count
    @download_mutex = Mutex.new

    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
    @worker_pool = Concurrent::FixedThreadPool.new(thread_count)

    # initial batch
    files_to_process.each do |file_remote_info|
      @session_downloaded_ids.add(file_remote_info[:file_id])
      submit_download_job(file_remote_info)
    end

    # wait for all jobs to finish
    loop do
      sleep 0.5
      break if @pending_jobs.value == 0
    end

    @worker_pool.shutdown
    @worker_pool.wait_for_termination

    end_time = Time.now
    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."

    # process subdomains if enabled
    if @recursive_subdomains
      subdomain_start_time = Time.now
      process_subdomains
      subdomain_end_time = Time.now
      subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
      puts "Subdomain processing finished in #{subdomain_time}s."
    end

    puts "Results saved in #{backup_path}"
    cleanup
  end

  # helper to submit jobs and increment the counter
  def submit_download_job(file_remote_info)
    @pending_jobs.increment
    @worker_pool.post do
      begin
        process_single_file(file_remote_info)
      ensure
        @pending_jobs.decrement
      end
    end
  end
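
  # Why a counter instead of relying on pool shutdown alone: page requisites can
  # enqueue extra jobs while earlier ones are still running, so download_files
  # polls @pending_jobs until it reaches zero before shutting the pool down.
  # The same pattern in isolation (do_work is a hypothetical placeholder):
  #
  #   pending = Concurrent::AtomicFixnum.new(0)
  #   pool = Concurrent::FixedThreadPool.new(4)
  #   pending.increment
  #   pool.post { begin; do_work; ensure; pending.decrement; end }
  #   sleep 0.1 until pending.value.zero?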

  def process_single_file(file_remote_info)
    download_success = false
    downloaded_path = nil

    @connection_pool.with_connection do |connection|
      result_message, path = download_file(file_remote_info, connection)
      downloaded_path = path

      if result_message && result_message.include?(' -> ')
        download_success = true
      end

      @download_mutex.synchronize do
        @processed_file_count += 1 if @processed_file_count < @total_to_download
        # print progress for both user-requested files and discovered requisites
        puts result_message if result_message
      end
    end

    if download_success
      append_to_db(file_remote_info[:file_id])

      if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
        process_page_requisites(downloaded_path, file_remote_info)
      end
    end
  rescue => e
    @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
  end

  def process_page_requisites(file_path, parent_remote_info)
    return unless File.exist?(file_path)

    content = File.read(file_path)
    content = content.force_encoding('UTF-8').scrub

    assets = PageRequisites.extract(content)

    # prepare base URI for resolving relative paths
    parent_raw = parent_remote_info[:file_url]
    parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)

    begin
      base_uri = URI(parent_raw)
      # calculate the "root" host of the site we are downloading to compare later
      current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
    rescue URI::InvalidURIError
      return
    end

    parent_timestamp = parent_remote_info[:timestamp]

    assets.each do |asset_rel_url|
      begin
        # resolve full URL (handles relative paths like "../img/logo.png")
        resolved_uri = base_uri + asset_rel_url

        # filter out navigation links (pages) vs assets
        # skip if extension is empty or looks like an HTML page
        path = resolved_uri.path
        ext = File.extname(path).downcase
        if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
          next
        end

        # construct the URL for the Wayback API
        asset_wbm_url = resolved_uri.host + resolved_uri.path
        asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query

        # construct the local file ID
        # if the asset is on the SAME domain, strip the domain from the folder path
        # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
        if resolved_uri.host == current_project_host
          # e.g. /static/css/style.css
          asset_file_id = resolved_uri.path
          asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
        else
          # e.g. cdn.google.com/jquery.js
          asset_file_id = asset_wbm_url
        end

      rescue URI::InvalidURIError, StandardError
        next
      end

      # sanitize and queue
      asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)

      unless @session_downloaded_ids.include?(asset_id)
        @session_downloaded_ids.add(asset_id)

        new_file_info = {
          file_url: asset_wbm_url,
          timestamp: parent_timestamp,
          file_id: asset_id
        }

        @download_mutex.synchronize do
          @total_to_download += 1
          puts "Queued requisite: #{asset_file_id}"
        end

        submit_download_job(new_file_info)
      end
    end
  end
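
  # Resolution sketch (illustrative values): with a parent file_url of
  # "example.com/blog/post.html" and an asset reference of "../img/logo.png",
  # URI addition resolves to "http://example.com/img/logo.png", so the asset is
  # queued with file_url "example.com/img/logo.png" and file_id "img/logo.png"
  # (same host, leading slash stripped) at the parent's timestamp.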

  def structure_dir_path dir_path
    begin
      FileUtils.mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils.mv file_already_existing, file_already_existing_temporary
      FileUtils.mkdir_p file_already_existing
      FileUtils.mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def rewrite_urls_to_relative(file_path)
    return unless File.exist?(file_path)

    file_ext = File.extname(file_path).downcase

    begin
      content = File.binread(file_path)

      # detect encoding for HTML files
      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
      else
        content.force_encoding('UTF-8')
      end

      # URLs in HTML attributes
      content = rewrite_html_attr_urls(content)

      # URLs in CSS
      content = rewrite_css_urls(content)

      # URLs in JavaScript
      content = rewrite_js_urls(content)

      # for URLs that start with a single slash, make them relative
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
        prefix, path, suffix = $1, $2, $3
        "#{prefix}./#{path}#{suffix}"
      end

      # for URLs in CSS that start with a single slash, make them relative
      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
        path = $1
        "url(\"./#{path}\")"
      end

      # save the modified content back to the file
      File.binwrite(file_path, content)
      puts "Rewrote URLs in #{file_path} to be relative."
    rescue Errno::ENOENT => e
      @logger.warn("Error reading file #{file_path}: #{e.message}")
    end
  end
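
  # Before/after sketch (illustrative HTML): the root-relative reference
  #   <img src="/images/logo.png">
  # becomes
  #   <img src="./images/logo.png">
  # so a snapshot can be browsed directly from the local directory without a
  # web server rooted at the site's original document root.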

  def download_file(file_remote_info, http)
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]

    # sanitize file_id to ensure it is a valid path component
    raw_path_elements = file_id.split('/')

    sanitized_path_elements = raw_path_elements.map do |element|
      if Gem.win_platform?
        # for Windows, we need to sanitize path components to avoid invalid characters
        # this prevents issues with file names that contain characters not allowed in
        # Windows file systems. See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
        element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
      else
        element
      end
    end

    current_backup_path = backup_path

    if file_id == ""
      dir_path = current_backup_path
      file_path = File.join(dir_path, 'index.html')
    elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
      # if file_id is a directory, we treat it as such
      dir_path = File.join(current_backup_path, *sanitized_path_elements)
      file_path = File.join(dir_path, 'index.html')
    else
      # if file_id is a file, we treat it as such
      filename = sanitized_path_elements.pop
      dir_path = File.join(current_backup_path, *sanitized_path_elements)
      file_path = File.join(dir_path, filename)
    end

    # check existence *before* the download attempt
    # this handles cases where a file was created manually or by a previous partial run without a .db entry
    if File.exist? file_path
      return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
    end

    begin
      structure_dir_path dir_path
      status = download_with_retry(file_path, file_url, file_timestamp, http)

      case status
      when :saved
        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
          rewrite_urls_to_relative(file_path)
        end
        return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
      when :skipped_not_found
        return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
      else
        # ideally, this case should not be reached if download_with_retry behaves as expected
        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
        return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
      end
    rescue StandardError => e
      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
      if File.exist?(file_path) && File.size(file_path) == 0
        File.delete(file_path)
        msg += "\n#{file_path} was empty and was removed."
      end
      return [msg, nil]
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    if @snapshot_at
      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
    elsif @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  private

  def validate_params(params)
    raise ArgumentError, "Base URL is required" unless params[:base_url]
    raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
  end

  def setup_logger
    logger = Logger.new(STDOUT)
    logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
    logger.formatter = proc do |severity, datetime, progname, msg|
      "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
    end
    logger
  end

  # safely sanitize a file id (or id+timestamp)
  def sanitize_and_prepare_id(raw, file_url)
    return nil if raw.nil?
    return "" if raw.empty?
    original = raw.dup
    begin
      # work on a binary copy to avoid premature encoding errors
      raw = raw.dup.force_encoding(Encoding::BINARY)

      # percent-decode (repeat until stable in case of double-encoding)
      loop do
        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
        break if decoded == raw
        raw = decoded
      end

      # try tidy_bytes
      begin
        raw = raw.tidy_bytes
      rescue StandardError
        # fallback: scrub to UTF-8
        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      end

      # ensure UTF-8 and scrub again
      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      end

      # strip HTML/comment artifacts & control chars
      raw.gsub!(/<!--+/, '')
      raw.gsub!(/[\x00-\x1F]/, '')

      # split query; hash it for stable short name
      path_part, query_part = raw.split('?', 2)
      if query_part && !query_part.empty?
        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
        if path_part.include?('.')
          pre, _sep, post = path_part.rpartition('.')
          path_part = "#{pre}__q#{q_digest}.#{post}"
        else
          path_part = "#{path_part}__q#{q_digest}"
        end
      end
      raw = path_part

      # collapse slashes & trim leading slash
      raw.gsub!(%r{/+}, '/')
      raw.sub!(%r{\A/}, '')

      # segment-wise sanitation
      raw = raw.split('/').map do |segment|
        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
        seg.empty? ? '_' : seg
      end.join('/')

      # remove any remaining angle brackets
      raw.tr!('<>', '')

      # final fallback if empty
      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?

      raw
    rescue => e
      @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
      # deterministic fallback – never return nil so caller won’t mark malformed
      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
    end
  end
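
  # Mapping sketch (illustrative inputs): "blog/post.php?id=7&page=2" becomes
  # "blog/post__q<12-char-sha256-prefix>.php" (the query string is hashed into a
  # stable suffix), and a segment containing a colon such as "a:b" is
  # percent-escaped to "a%3Ab". The exact digest depends on the query string.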

  # wrap URL in parentheses if it contains characters that commonly break unquoted
  # Windows CMD usage (e.g., &). This is only for display; user still must quote
  # when invoking manually.
  def safe_display_url(url)
    return url unless url && url.match?(/[&]/)
    "(#{url})"
  end

  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
    retries = 0
    begin
      wayback_url = if @rewritten
        "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
      else
        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
      end

      # Escape square brackets because they are not valid in URI()
      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')

      # reject invalid/unencoded wayback_url, behaving as if the resource weren't found
      unless @url_regexp.match?(wayback_url)
        @logger.warn("Skipped #{file_url}: invalid URL")
        return :skipped_not_found
      end

      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
      request["Accept-Encoding"] = "gzip, deflate"

      response = connection.request(request)

      save_response_body = lambda do
        File.open(file_path, "wb") do |file|
          body = response.body
          if response['content-encoding'] == 'gzip' && body && !body.empty?
            begin
              gz = Zlib::GzipReader.new(StringIO.new(body))
              decompressed_body = gz.read
              gz.close
              file.write(decompressed_body)
            rescue Zlib::GzipFile::Error => e
              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
              file.write(body)
            end
          else
            file.write(body) if body
          end
        end
      end

      if @all
        case response
        when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
          save_response_body.call
          if response.is_a?(Net::HTTPRedirection)
            @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
          elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
            @logger.info("Saved error page for #{file_url} (status #{response.code}).")
          end
          return :saved
        else
          # for any other response type when --all is true, treat as an error to be retried or failed
          raise "Unhandled HTTP response: #{response.code} #{response.message}"
        end
      else # not @all (our default behavior)
        case response
        when Net::HTTPSuccess
          save_response_body.call
          return :saved
        when Net::HTTPRedirection
          raise "Too many redirects for #{file_url}" if redirect_count >= 2
          location = response['location']
          @logger.warn("Redirect found for #{file_url} -> #{location}")
          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
        when Net::HTTPTooManyRequests
          sleep(RATE_LIMIT * 2)
          raise "Rate limited, retrying..."
        when Net::HTTPNotFound
          @logger.warn("File not found, skipping: #{file_url}")
          return :skipped_not_found
        else
          raise "HTTP Error: #{response.code} #{response.message}"
        end
      end

    rescue StandardError => e
      if retries < @max_retries
        retries += 1
        @logger.warn("Retry #{retries}/#{@max_retries} for #{file_url}: #{e.message}")
        sleep(RETRY_DELAY * retries)
        retry
      else
        @failed_downloads << {url: file_url, error: e.message}
        raise e
      end
    end
  end
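
  # URL form sketch: with the rewritten option the request targets
  #   https://web.archive.org/web/<timestamp>/<original-url>
  # (Wayback-rewritten markup); otherwise the "id_" flag is appended to the
  # timestamp so the original, unmodified capture is fetched:
  #   https://web.archive.org/web/<timestamp>id_/<original-url>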

  def cleanup
    @connection_pool.shutdown

    if @failed_downloads.any?
      @logger.error("Download completed with errors.")
      @logger.error("Failed downloads summary:")
      @failed_downloads.each do |failure|
        @logger.error(" #{failure[:url]} - #{failure[:error]}")
      end
      unless @reset
        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
        return
      end
    end

    if !@keep || @reset
      puts "Cleaning up state files..." unless @keep && !@reset
      FileUtils.rm_f(cdx_path)
      FileUtils.rm_f(db_path)
    elsif @keep
      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
    end
  end
end