Added support for resuming incomplete downloads
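Download progress is now tracked in two state files inside the backup directory: .cdx.json caches the snapshot list fetched from the CDX API, and .downloaded.txt records the ID of each file that finished downloading. Re-running the same command skips everything already recorded; a new :reset option wipes the state files and starts fresh, while :keep preserves them even after a successful run. Also fixes the CDX API endpoint URL ("xd" -> "cdx").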

Author: Felipe
Date: 2025-04-19 13:40:14 +00:00
Committed by: GitHub
parent 27dd619aa4
commit febffe5de4
4 changed files with 216 additions and 47 deletions
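
For orientation, a minimal usage sketch of the resume flow from Ruby. Only the :reset and :keep params are confirmed by this diff; the require name, the :base_url param, and the default websites/<host> backup layout are assumptions based on the surrounding gem:

require 'wayback_machine_downloader'

# First run: fetches the snapshot list, caches it to .cdx.json, and
# appends each completed file ID to .downloaded.txt as it goes.
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',   # assumed param name
  threads_count: 4,
  keep: true  # keep the state files even after a clean finish
)
downloader.download_files

# If the run is interrupted, calling download_files again skips every
# ID already recorded; pass reset: true to discard the state instead.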


@@ -120,10 +120,12 @@ class WaybackMachineDownloader
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
STATE_CDX_FILENAME = ".cdx.json"
STATE_DB_FILENAME = ".downloaded.txt"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count, :logger
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
def initialize params
validate_params(params)
@@ -139,10 +141,15 @@ class WaybackMachineDownloader
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@reset = params[:reset]
@keep = params[:keep]
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@failed_downloads = Concurrent::Array.new
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
@db_mutex = Mutex.new
handle_reset
end
def backup_name
@@ -165,6 +172,23 @@ class WaybackMachineDownloader
end
end
def cdx_path
File.join(backup_path, STATE_CDX_FILENAME)
end
def db_path
File.join(backup_path, STATE_DB_FILENAME)
end
def handle_reset
if @reset
puts "Resetting download state..."
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
puts "Removed state files: #{cdx_path}, #{db_path}"
end
end
def match_only_filter file_url
if @only_filter
only_filter_regex = @only_filter.to_regex
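
Deleting the two state files by hand has the same effect as handle_reset; a sketch, assuming the gem's default websites/<host> backup directory:

require 'fileutils'

FileUtils.rm_f('websites/example.com/.cdx.json')        # snapshot cache
FileUtils.rm_f('websites/example.com/.downloaded.txt')  # download log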
@@ -192,10 +216,26 @@ class WaybackMachineDownloader
end
def get_all_snapshots_to_consider
if File.exist?(cdx_path) && !@reset
puts "Loading snapshot list from #{cdx_path}"
begin
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
puts
return Concurrent::Array.new(snapshot_list_to_consider)
rescue JSON::ParserError => e
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
rescue => e
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
end
end
snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
puts "Getting snapshot pages"
puts "Getting snapshot pages from Wayback Machine API..."
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection|
@@ -211,12 +251,12 @@ class WaybackMachineDownloader
page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
@@ -227,9 +267,9 @@ class WaybackMachineDownloader
[page, result]
end
end
results = []
futures.each do |future|
begin
results << future.value
@@ -237,10 +277,10 @@ class WaybackMachineDownloader
puts "\nError fetching page #{future}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.empty?
@@ -253,14 +293,23 @@ class WaybackMachineDownloader
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
puts " found #{snapshot_list_to_consider.length} snapshots to consider."
puts " found #{snapshot_list_to_consider.length} snapshots."
# Save the fetched list to the cache file
begin
FileUtils.mkdir_p(File.dirname(cdx_path))
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
puts "Saved snapshot list to #{cdx_path}"
rescue => e
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
end
puts
snapshot_list_to_consider
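
Because the cache is plain JSON, it can be inspected (or pruned) between runs; a minimal sketch, assuming the default backup layout and the array written by JSON.pretty_generate above:

require 'json'

snapshots = JSON.parse(File.read('websites/example.com/.cdx.json'))
puts "#{snapshots.length} cached snapshots"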
@@ -350,32 +399,103 @@ class WaybackMachineDownloader
puts "]"
end
def load_downloaded_ids
downloaded_ids = Set.new
if File.exist?(db_path) && !@reset
puts "Loading list of already downloaded files from #{db_path}"
begin
File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
rescue => e
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
downloaded_ids.clear
end
end
downloaded_ids
end
def append_to_db(file_id)
@db_mutex.synchronize do
begin
FileUtils.mkdir_p(File.dirname(db_path))
File.open(db_path, 'a') { |f| f.puts(file_id) }
rescue => e
@logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
end
end
end
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
if file_list_by_timestamp.empty?
puts "No files to download."
FileUtils.mkdir_p(backup_path)
# Load the list of files to potentially download
files_to_download = file_list_by_timestamp
if files_to_download.empty?
puts "No files found matching criteria."
cleanup
return
end
total_files = file_list_by_timestamp.count
puts "#{total_files} files to download:"
total_files = files_to_download.count
puts "#{total_files} files found matching criteria."
# Load IDs of already downloaded files
downloaded_ids = load_downloaded_ids
files_to_process = files_to_download.reject do |file_info|
downloaded_ids.include?(file_info[:file_id])
end
remaining_count = files_to_process.count
skipped_count = total_files - remaining_count
if skipped_count > 0
puts "Found #{skipped_count} previously downloaded files, skipping them."
end
if remaining_count == 0
puts "All matching files have already been downloaded."
cleanup
return
end
puts "#{remaining_count} files to download:"
@processed_file_count = 0
@total_to_download = remaining_count
@download_mutex = Mutex.new
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
file_list_by_timestamp.each do |file_remote_info|
files_to_process.each do |file_remote_info|
pool.post do
@connection_pool.with_connection do |connection|
result = download_file(file_remote_info, connection)
@download_mutex.synchronize do
@processed_file_count += 1
puts result if result
download_success = false
begin
@connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection)
# for now, assume success if no exception and message doesn't indicate error/skip
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
download_success = true
end
@download_mutex.synchronize do
@processed_file_count += 1
# adjust progress message to reflect remaining files
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
puts progress_message if progress_message
end
end
# append to DB only after a successful download, outside the connection block
if download_success
append_to_db(file_remote_info[:file_id])
end
rescue => e
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
@download_mutex.synchronize do
@processed_file_count += 1
end
end
sleep(RATE_LIMIT)
end
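
Two things worth noting in the loop above. The resume filter is an ordinary Set difference keyed on :file_id, and success detection is string-based for now (as the inline comment says), so any result whose message mentions an error, failure, or skip is left out of .downloaded.txt and will be retried on the next run. The filter pattern in isolation, where all_files is a hypothetical array of { file_id:, file_url:, timestamp: } hashes:

require 'set'

downloaded_ids = Set.new(File.foreach('.downloaded.txt').map(&:strip))
files_to_process = all_files.reject { |f| downloaded_ids.include?(f[:file_id]) }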
@@ -385,7 +505,8 @@ class WaybackMachineDownloader
pool.wait_for_termination
end_time = Time.now
puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
puts "Results saved in #{backup_path}"
cleanup
end
@@ -417,9 +538,10 @@ class WaybackMachineDownloader
file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id]
file_timestamp = file_remote_info[:timestamp]
file_path_elements = file_id.split('/')
original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
file_path_elements = original_file_id.split('/')
if file_id == ""
if original_file_id == ""
dir_path = backup_path
file_path = backup_path + 'index.html'
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -433,21 +555,24 @@ class WaybackMachineDownloader
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
end
unless File.exist? file_path
begin
structure_dir_path dir_path
download_with_retry(file_path, file_url, file_timestamp, http)
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
rescue StandardError => e
msg = "#{file_url} # #{e}"
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
msg
# check existence *before* download attempt
# this handles cases where a file was created manually or by a previous partial run without a .db entry
if File.exist? file_path
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
end
begin
structure_dir_path dir_path
download_with_retry(file_path, file_url, file_timestamp, http)
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
rescue StandardError => e
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
else
"#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
msg
end
end
@@ -523,7 +648,7 @@ class WaybackMachineDownloader
else
raise "HTTP Error: #{response.code} #{response.message}"
end
rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@@ -539,12 +664,25 @@ class WaybackMachineDownloader
def cleanup
@connection_pool.shutdown
if @failed_downloads.any?
@logger.error("Download completed with errors.")
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
unless @reset
puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
return
end
end
if !@keep || @reset
puts "Cleaning up state files..." unless @keep && !@reset
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
elsif @keep
puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
end
end
end
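
Net effect of cleanup: after a run with failures the state files are kept (unless :reset was given) so the next invocation can resume, and after a clean run they are removed unless :keep asks for them to be preserved.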


@@ -4,7 +4,7 @@ require 'uri'
module ArchiveAPI
def get_raw_list_from_api(url, page_index, http)
request_url = URI("https://web.archive.org/cdx/search/xd")
request_url = URI("https://web.archive.org/cdx/search/cdx")
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
request_url.query = URI.encode_www_form(params)
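
The one-character change above points requests at the real CDX endpoint. A self-contained sketch of the URL the corrected code builds, leaving out parameters_for_api since its output is not shown in this diff:

require 'uri'

request_url = URI('https://web.archive.org/cdx/search/cdx')
request_url.query = URI.encode_www_form([['output', 'json'], ['url', 'example.com']])
puts request_url
# => https://web.archive.org/cdx/search/cdx?output=json&url=example.com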