wayback-machine-downloader/lib/wayback_machine_downloader.rb
Felipe fc8d8a9441 Added retry command
fixes [Feature request] Retry flag
Fixes StrawberryMaster/wayback-machine-downloader#31
2025-08-20 01:21:29 +00:00

# encoding: UTF-8
require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'time'
require 'concurrent-ruby'
require 'logger'
require 'zlib'
require 'stringio'
require 'set' # Set is used by load_downloaded_ids; the explicit require keeps older Rubies happy
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
require_relative 'wayback_machine_downloader/subdom_processor'
require_relative 'wayback_machine_downloader/url_rewrite'
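# ConnectionPool keeps one persistent HTTPS connection to web.archive.org per
# thread, recycles connections older than MAX_AGE seconds, and runs a
# background thread that periodically sweeps stale entries.
#
# Illustrative use (a minimal sketch, not part of the original file):
#
#   pool = ConnectionPool.new(10)
#   pool.with_connection do |http|
#     http.request(Net::HTTP::Get.new(URI("https://web.archive.org/")))
#   end
#   pool.shutdown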
class ConnectionPool
MAX_AGE = 300
CLEANUP_INTERVAL = 60
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
def initialize(size)
@size = size
@pool = Concurrent::Map.new
@creation_times = Concurrent::Map.new
@cleanup_thread = schedule_cleanup
end
def with_connection(&block)
conn = acquire_connection
begin
yield conn
ensure
release_connection(conn)
end
end
def shutdown
@cleanup_thread&.exit
@pool.each_value { |conn| conn.finish if conn&.started? }
@pool.clear
@creation_times.clear
end
private
def acquire_connection
thread_id = Thread.current.object_id
conn = @pool[thread_id]
if should_create_new?(conn)
conn&.finish if conn&.started?
conn = create_connection
@pool[thread_id] = conn
@creation_times[thread_id] = Time.now
end
conn
end
def release_connection(conn)
return unless conn
if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
conn.finish
@pool.delete(Thread.current.object_id)
@creation_times.delete(Thread.current.object_id)
end
end
def should_create_new?(conn)
return true if conn.nil?
return true unless conn.started?
return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
false
end
def create_connection
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true
http.read_timeout = DEFAULT_TIMEOUT
http.open_timeout = DEFAULT_TIMEOUT
http.keep_alive_timeout = 30
http.max_retries = MAX_RETRIES
http.start
http
end
def schedule_cleanup
Thread.new do
loop do
cleanup_old_connections
sleep CLEANUP_INTERVAL
end
end
end
def cleanup_old_connections
current_time = Time.now
@creation_times.each do |thread_id, creation_time|
if current_time - creation_time > MAX_AGE
conn = @pool[thread_id]
conn&.finish if conn&.started?
@pool.delete(thread_id)
@creation_times.delete(thread_id)
end
end
end
end
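# WaybackMachineDownloader drives the whole backup: it queries the Wayback
# Machine CDX API for snapshots, builds a de-duplicated file list, and
# downloads the files concurrently while tracking progress on disk so runs
# can be resumed.
#
# Illustrative programmatic use (a sketch; option names mirror the params
# read in #initialize below):
#
#   wmd = WaybackMachineDownloader.new(
#     base_url: "https://example.com",
#     threads_count: 4
#   )
#   wmd.download_files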
class WaybackMachineDownloader
include ArchiveAPI
include SubdomainProcessor
VERSION = "2.4.0"
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
RETRY_DELAY = 2
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
STATE_CDX_FILENAME = ".cdx.json"
STATE_DB_FILENAME = ".downloaded.txt"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
:snapshot_at
def initialize params
validate_params(params)
@base_url = params[:base_url]&.tidy_bytes
@exact_url = params[:exact_url]
if params[:directory]
sanitized_dir = params[:directory].tidy_bytes
@directory = File.expand_path(sanitized_dir)
else
@directory = nil
end
@all_timestamps = params[:all_timestamps]
@from_timestamp = params[:from_timestamp].to_i
@to_timestamp = params[:to_timestamp].to_i
@only_filter = params[:only_filter]
@exclude_filter = params[:exclude_filter]
@all = params[:all]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@reset = params[:reset]
@keep = params[:keep]
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@failed_downloads = Concurrent::Array.new
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
@db_mutex = Mutex.new
@rewrite = params[:rewrite] || false
@recursive_subdomains = params[:recursive_subdomains] || false
@subdomain_depth = params[:subdomain_depth] || 1
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
# Regexp used to reject invalid/unencoded Wayback URLs before requesting them
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
handle_reset
end
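# Derives the backup folder name from the host part of the base URL,
# e.g. "https://example.com/*" becomes "example.com".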
def backup_name
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
if url_to_process.include? '//'
url_to_process.split('/')[2]
else
url_to_process
end
end
def backup_path
if @directory
# @directory was already expanded to an absolute path in initialize, so use it as-is
@directory
else
# ensure the default path is absolute and normalized
File.expand_path(File.join('websites', backup_name))
end
end
def cdx_path
File.join(backup_path, STATE_CDX_FILENAME)
end
def db_path
File.join(backup_path, STATE_DB_FILENAME)
end
def handle_reset
if @reset
puts "Resetting download state..."
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
puts "Removed state files: #{cdx_path}, #{db_path}"
end
end
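# The --only and --exclude filters accept either a regex-style string or a
# plain substring; to_regex(detect: true) yields a Regexp for the former and
# nil for the latter, in which case a case-insensitive substring match is used.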
def match_only_filter file_url
if @only_filter
only_filter_regex = @only_filter.to_regex(detect: true)
if only_filter_regex
only_filter_regex =~ file_url
else
file_url.downcase.include? @only_filter.downcase
end
else
true
end
end
def match_exclude_filter file_url
if @exclude_filter
exclude_filter_regex = @exclude_filter.to_regex(detect: true)
if exclude_filter_regex
exclude_filter_regex =~ file_url
else
file_url.downcase.include? @exclude_filter.downcase
end
else
false
end
end
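# Fetches the snapshot list from the Wayback Machine CDX API (first the base
# URL, then additional "/*" pages in small concurrent batches unless
# --exact-url was given), caches the result in .cdx.json, and reuses that
# cache on later runs unless --reset was passed.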
def get_all_snapshots_to_consider
if File.exist?(cdx_path) && !@reset
puts "Loading snapshot list from #{cdx_path}"
begin
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
puts
return Concurrent::Array.new(snapshot_list_to_consider)
rescue JSON::ParserError => e
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
rescue => e
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
end
end
snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
puts "Getting snapshot pages from Wayback Machine API..."
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
initial_list ||= []
mutex.synchronize do
snapshot_list_to_consider.concat(initial_list)
print "."
end
end
# Fetch additional pages if the exact URL flag is not set
unless @exact_url
page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
result = nil
@connection_pool.with_connection do |connection|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
end
result ||= []
[page, result]
end
end
results = []
futures.each_with_index do |future, index|
begin
results << future.value
rescue => e
puts "\nError fetching page #{current_batch[index]}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.nil? || result.empty?
continue_fetching = false
break
else
mutex.synchronize do
snapshot_list_to_consider.concat(result)
print "."
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
puts " found #{snapshot_list_to_consider.length} snapshots."
# Save the fetched list to the cache file
begin
FileUtils.mkdir_p(File.dirname(cdx_path))
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
puts "Saved snapshot list to #{cdx_path}"
rescue => e
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
end
puts
snapshot_list_to_consider
end
# Get a composite snapshot file list for a specific timestamp
def get_composite_snapshot_file_list(target_timestamp)
file_versions = {}
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
next if file_timestamp.to_i > target_timestamp
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
next if file_id.nil?
next if match_exclude_filter(file_url)
next unless match_only_filter(file_url)
# Select the most recent version <= target_timestamp
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
end
end
file_versions.values
end
# Returns a list of files for the composite snapshot
def get_file_list_composite_snapshot(target_timestamp)
file_list = get_composite_snapshot_file_list(target_timestamp)
# entries from get_composite_snapshot_file_list already carry :file_id,
# so they only need to be ordered newest-first
file_list.sort_by { |info| info[:timestamp].to_s }.reverse
end
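# Builds a map of file_id => newest matching snapshot ({file_url:, timestamp:}),
# applying the only/exclude filters along the way.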
def get_file_list_curated
file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
file_list_curated
end
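# Like get_file_list_curated, but keeps every version of every file, keyed by
# "timestamp/file_id", so each snapshot of a file is downloaded separately.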
def get_file_list_all_timestamps
file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id_and_timestamp = [file_timestamp, file_id].join('/')
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id_and_timestamp]
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
else
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
puts "file_list_curated: " + file_list_curated.count.to_s
file_list_curated
end
def get_file_list_by_timestamp
if @snapshot_at
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
elsif @all_timestamps
file_list_curated = get_file_list_all_timestamps
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
else
file_list_curated = get_file_list_curated
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
end
end
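# Prints the resolved file list as a JSON array on stdout; progress output
# produced while fetching snapshots is redirected to stderr so stdout stays
# machine-readable.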
def list_files
# retrieval produces its own output
@orig_stdout = $stdout
$stdout = $stderr
files = get_file_list_by_timestamp
$stdout = @orig_stdout
puts "["
files[0...-1].each do |file|
puts file.to_json + ","
end
puts files[-1].to_json
puts "]"
end
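# Resume support: .downloaded.txt stores one file_id per line. It is read
# once at startup and appended to (under a mutex) after each successful
# download.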
def load_downloaded_ids
downloaded_ids = Set.new
if File.exist?(db_path) && !@reset
puts "Loading list of already downloaded files from #{db_path}"
begin
File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
rescue => e
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
downloaded_ids.clear
end
end
downloaded_ids
end
def append_to_db(file_id)
@db_mutex.synchronize do
begin
FileUtils.mkdir_p(File.dirname(db_path))
File.open(db_path, 'a') { |f| f.puts(file_id) }
rescue => e
@logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
end
end
end
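# Posts each pending file to the thread pool: every job borrows a pooled
# connection, downloads the file, updates the shared progress counter, and
# records the file_id in the state file on success. RATE_LIMIT throttles
# each worker between files.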
def processing_files(pool, files_to_process)
files_to_process.each do |file_remote_info|
pool.post do
download_success = false
begin
@connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection)
# assume download success if the result message contains ' -> '
if result_message && result_message.include?(' -> ')
download_success = true
end
@download_mutex.synchronize do
@processed_file_count += 1
# adjust progress message to reflect remaining files
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
puts progress_message if progress_message
end
end
# append to the state DB only after a successful download, outside the connection block
if download_success
append_to_db(file_remote_info[:file_id])
end
rescue => e
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
@download_mutex.synchronize do
@processed_file_count += 1
end
end
sleep(RATE_LIMIT)
end
end
end
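# Top-level entry point: builds the file list, skips entries recorded in the
# state file, downloads the rest with a fixed-size thread pool, optionally
# recurses into subdomains, and finally cleans up.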
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
FileUtils.mkdir_p(backup_path)
# Load the list of files to potentially download
files_to_download = file_list_by_timestamp
if files_to_download.empty?
puts "No files found matching criteria."
cleanup
return
end
total_files = files_to_download.count
puts "#{total_files} files found matching criteria."
# Load IDs of already downloaded files
downloaded_ids = load_downloaded_ids
files_to_process = files_to_download.reject do |file_info|
downloaded_ids.include?(file_info[:file_id])
end
remaining_count = files_to_process.count
skipped_count = total_files - remaining_count
if skipped_count > 0
puts "Found #{skipped_count} previously downloaded files, skipping them."
end
if remaining_count == 0
puts "All matching files have already been downloaded."
cleanup
return
end
puts "#{remaining_count} files to download:"
@processed_file_count = 0
@total_to_download = remaining_count
@download_mutex = Mutex.new
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
processing_files(pool, files_to_process)
pool.shutdown
pool.wait_for_termination
end_time = Time.now
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
# process subdomains if enabled
if @recursive_subdomains
subdomain_start_time = Time.now
process_subdomains
subdomain_end_time = Time.now
subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
puts "Subdomain processing finished in #{subdomain_time}s."
end
puts "Results saved in #{backup_path}"
cleanup
end
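# Creates the directory tree for a file, handling the case where a path that
# was previously saved as a plain file now needs to become a directory: the
# existing file is moved to <path>/index.html and the directory is created
# in its place.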
def structure_dir_path dir_path
begin
FileUtils::mkdir_p dir_path unless File.exist? dir_path
rescue Errno::EEXIST => e
error_to_string = e.to_s
puts "# #{error_to_string}"
if error_to_string.include? "File exists @ dir_s_mkdir - "
file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
elsif error_to_string.include? "File exists - "
file_already_existing = error_to_string.split("File exists - ")[-1]
else
raise "Unhandled directory restructure error # #{error_to_string}"
end
file_already_existing_temporary = file_already_existing + '.temp'
file_already_existing_permanent = file_already_existing + '/index.html'
FileUtils::mv file_already_existing, file_already_existing_temporary
FileUtils::mkdir_p file_already_existing
FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
puts "#{file_already_existing} -> #{file_already_existing_permanent}"
structure_dir_path dir_path
end
end
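# When --rewrite is enabled, post-processes downloaded HTML/CSS/JS so that
# absolute and root-relative URLs become relative, which lets the mirror be
# browsed directly from disk. The rewrite_* helpers are provided by the
# url_rewrite module required above.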
def rewrite_urls_to_relative(file_path)
return unless File.exist?(file_path)
file_ext = File.extname(file_path).downcase
begin
content = File.binread(file_path)
if file_ext == '.html' || file_ext == '.htm'
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
else
content.force_encoding('UTF-8')
end
# URLs in HTML attributes
rewrite_html_attr_urls(content)
# URLs in CSS
rewrite_css_urls(content)
# URLs in JavaScript
rewrite_js_urls(content)
# for URLs in HTML attributes that start with a single slash
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
prefix, path, suffix = $1, $2, $3
"#{prefix}./#{path}#{suffix}"
end
# for URLs in CSS that start with a single slash
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
path = $1
"url(\"./#{path}\")"
end
# save the modified content back to the file
File.binwrite(file_path, content)
puts "Rewrote URLs in #{file_path} to be relative."
rescue Errno::ENOENT => e
@logger.warn("Error reading file #{file_path}: #{e.message}")
end
end
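# Maps a snapshot entry to a local path (directory-like URLs become
# .../index.html, characters invalid on Windows are percent-encoded), skips
# files that already exist on disk, delegates the transfer to
# download_with_retry, and returns a human-readable status line.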
def download_file (file_remote_info, http)
current_encoding = "".encoding
file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id]
file_timestamp = file_remote_info[:timestamp]
# sanitize file_id to ensure it is a valid path component
raw_path_elements = file_id.split('/')
sanitized_path_elements = raw_path_elements.map do |element|
if Gem.win_platform?
# on Windows, percent-encode path characters that are not allowed in
# file names; see
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
else
element
end
end
current_backup_path = backup_path
if file_id == ""
dir_path = current_backup_path
file_path = File.join(dir_path, 'index.html')
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
# if file_id is a directory, we treat it as such
dir_path = File.join(current_backup_path, *sanitized_path_elements)
file_path = File.join(dir_path, 'index.html')
else
# if file_id is a file, we treat it as such
filename = sanitized_path_elements.pop
dir_path = File.join(current_backup_path, *sanitized_path_elements)
file_path = File.join(dir_path, filename)
end
# check existence *before* download attempt
# this handles cases where a file was created manually or by a previous partial run without a state-file entry
if File.exist? file_path
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
end
begin
structure_dir_path dir_path
status = download_with_retry(file_path, file_url, file_timestamp, http)
case status
when :saved
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
rewrite_urls_to_relative(file_path)
end
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
when :skipped_not_found
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
else
# ideally, this case should not be reached if download_with_retry behaves as expected.
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
end
rescue StandardError => e
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
if File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
msg
end
end
def file_queue
@file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
end
def file_list_by_timestamp
if @snapshot_at
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
elsif @all_timestamps
file_list_curated = get_file_list_all_timestamps
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
else
file_list_curated = get_file_list_curated
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
end
end
private
def validate_params(params)
raise ArgumentError, "Base URL is required" unless params[:base_url]
raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
end
def setup_logger
logger = Logger.new(STDOUT)
logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
logger.formatter = proc do |severity, datetime, progname, msg|
"#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
end
logger
end
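# Fetches one snapshot: requests the raw ("id_") capture unless --rewritten
# was given, follows up to two redirects, decompresses gzip bodies, and, with
# --all, also saves redirect/error pages. Transient failures are retried up
# to MAX_RETRIES times with a linearly growing delay. Returns :saved or
# :skipped_not_found.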
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
retries = 0
begin
wayback_url = if @rewritten
"https://web.archive.org/web/#{file_timestamp}/#{file_url}"
else
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
end
# Escape square brackets because they are not valid in URI()
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
# reject invalid/unencoded wayback_url, behaving as if the resource weren't found
if not @url_regexp.match?(wayback_url)
@logger.warn("Skipped #{file_url}: invalid URL")
return :skipped_not_found
end
request = Net::HTTP::Get.new(URI(wayback_url))
request["Connection"] = "keep-alive"
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
request["Accept-Encoding"] = "gzip, deflate"
response = connection.request(request)
save_response_body = lambda do
File.open(file_path, "wb") do |file|
body = response.body
if response['content-encoding'] == 'gzip' && body && !body.empty?
begin
gz = Zlib::GzipReader.new(StringIO.new(body))
decompressed_body = gz.read
gz.close
file.write(decompressed_body)
rescue Zlib::GzipFile::Error => e
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
file.write(body)
end
else
file.write(body) if body
end
end
end
if @all
case response
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
save_response_body.call
if response.is_a?(Net::HTTPRedirection)
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
end
return :saved
else
# for any other response type when --all is true, treat as an error to be retried or failed
raise "Unhandled HTTP response: #{response.code} #{response.message}"
end
else # not @all (our default behavior)
case response
when Net::HTTPSuccess
save_response_body.call
return :saved
when Net::HTTPRedirection
raise "Too many redirects for #{file_url}" if redirect_count >= 2
location = response['location']
@logger.warn("Redirect found for #{file_url} -> #{location}")
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
when Net::HTTPTooManyRequests
sleep(RATE_LIMIT * 2)
raise "Rate limited, retrying..."
when Net::HTTPNotFound
@logger.warn("File not found, skipping: #{file_url}")
return :skipped_not_found
else
raise "HTTP Error: #{response.code} #{response.message}"
end
end
rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
sleep(RETRY_DELAY * retries)
retry
else
@failed_downloads << {url: file_url, error: e.message}
raise e
end
end
end
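# Shuts the connection pool down, reports any failed downloads, and removes
# the .cdx.json / .downloaded.txt state files unless --keep was given; when
# downloads failed (and --reset was not passed) the state files are kept so a
# later run can resume.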
def cleanup
@connection_pool.shutdown
if @failed_downloads.any?
@logger.error("Download completed with errors.")
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
unless @reset
puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
return
end
end
if !@keep || @reset
puts "Cleaning up state files..." unless @keep && !@reset
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
elsif @keep
puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
end
end
end