mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Added support for resuming incomplete downloads
This commit is contained in:
@@ -120,10 +120,12 @@ class WaybackMachineDownloader
|
||||
RATE_LIMIT = 0.25 # Delay between requests in seconds
|
||||
CONNECTION_POOL_SIZE = 10
|
||||
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
|
||||
STATE_CDX_FILENAME = ".cdx.json"
|
||||
STATE_DB_FILENAME = ".downloaded.txt"
|
||||
|
||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||
:all, :maximum_pages, :threads_count, :logger
|
||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
|
||||
|
||||
def initialize params
|
||||
validate_params(params)
|
||||
@@ -139,10 +141,15 @@ class WaybackMachineDownloader
|
||||
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
||||
@threads_count = [params[:threads_count].to_i, 1].max
|
||||
@rewritten = params[:rewritten]
|
||||
@reset = params[:reset]
|
||||
@keep = params[:keep]
|
||||
@timeout = params[:timeout] || DEFAULT_TIMEOUT
|
||||
@logger = setup_logger
|
||||
@failed_downloads = Concurrent::Array.new
|
||||
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
|
||||
@db_mutex = Mutex.new
|
||||
|
||||
handle_reset
|
||||
end
|
||||
|
||||
def backup_name
|
||||
@@ -165,6 +172,23 @@ class WaybackMachineDownloader
|
||||
end
|
||||
end
|
||||
|
||||
# Location of the cached snapshot list (the CDX state file) inside the
# backup directory.
#
# @return [String] backup_path joined with STATE_CDX_FILENAME
def cdx_path
  File.join(backup_path, STATE_CDX_FILENAME)
end
|
||||
|
||||
# Location of the plain-text list of already downloaded file IDs inside the
# backup directory.
#
# @return [String] backup_path joined with STATE_DB_FILENAME
def db_path
  File.join(backup_path, STATE_DB_FILENAME)
end
|
||||
|
||||
# Deletes both state files (snapshot cache and downloaded-ID list) when a
# reset was requested, so the run starts from scratch. Does nothing when
# @reset is not set.
#
# @return [nil]
def handle_reset
  return unless @reset

  puts "Resetting download state..."
  # rm_f ignores missing files, so a reset on a fresh directory is harmless.
  FileUtils.rm_f(cdx_path)
  FileUtils.rm_f(db_path)
  puts "Removed state files: #{cdx_path}, #{db_path}"
end
|
||||
|
||||
def match_only_filter file_url
|
||||
if @only_filter
|
||||
only_filter_regex = @only_filter.to_regex
|
||||
@@ -192,10 +216,26 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def get_all_snapshots_to_consider
|
||||
if File.exist?(cdx_path) && !@reset
|
||||
puts "Loading snapshot list from #{cdx_path}"
|
||||
begin
|
||||
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
|
||||
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
|
||||
puts
|
||||
return Concurrent::Array.new(snapshot_list_to_consider)
|
||||
rescue JSON::ParserError => e
|
||||
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
|
||||
FileUtils.rm_f(cdx_path)
|
||||
rescue => e
|
||||
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
|
||||
FileUtils.rm_f(cdx_path)
|
||||
end
|
||||
end
|
||||
|
||||
snapshot_list_to_consider = Concurrent::Array.new
|
||||
mutex = Mutex.new
|
||||
|
||||
puts "Getting snapshot pages"
|
||||
|
||||
puts "Getting snapshot pages from Wayback Machine API..."
|
||||
|
||||
# Fetch the initial set of snapshots, sequentially
|
||||
@connection_pool.with_connection do |connection|
|
||||
@@ -211,12 +251,12 @@ class WaybackMachineDownloader
|
||||
page_index = 0
|
||||
batch_size = [@threads_count, 5].min
|
||||
continue_fetching = true
|
||||
|
||||
|
||||
while continue_fetching && page_index < @maximum_pages
|
||||
# Determine the range of pages to fetch in this batch
|
||||
end_index = [page_index + batch_size, @maximum_pages].min
|
||||
current_batch = (page_index...end_index).to_a
|
||||
|
||||
|
||||
# Create futures for concurrent API calls
|
||||
futures = current_batch.map do |page|
|
||||
Concurrent::Future.execute do
|
||||
@@ -227,9 +267,9 @@ class WaybackMachineDownloader
|
||||
[page, result]
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
results = []
|
||||
|
||||
|
||||
futures.each do |future|
|
||||
begin
|
||||
results << future.value
|
||||
@@ -237,10 +277,10 @@ class WaybackMachineDownloader
|
||||
puts "\nError fetching page #{future}: #{e.message}"
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
# Sort results by page number to maintain order
|
||||
results.sort_by! { |page, _| page }
|
||||
|
||||
|
||||
# Process results and check for empty pages
|
||||
results.each do |page, result|
|
||||
if result.empty?
|
||||
@@ -253,14 +293,23 @@ class WaybackMachineDownloader
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
page_index = end_index
|
||||
|
||||
|
||||
sleep(RATE_LIMIT) if continue_fetching
|
||||
end
|
||||
end
|
||||
|
||||
puts " found #{snapshot_list_to_consider.length} snapshots to consider."
|
||||
puts " found #{snapshot_list_to_consider.length} snapshots."
|
||||
|
||||
# Save the fetched list to the cache file
|
||||
begin
|
||||
FileUtils.mkdir_p(File.dirname(cdx_path))
|
||||
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
|
||||
puts "Saved snapshot list to #{cdx_path}"
|
||||
rescue => e
|
||||
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
|
||||
end
|
||||
puts
|
||||
|
||||
snapshot_list_to_consider
|
||||
@@ -350,32 +399,103 @@ class WaybackMachineDownloader
|
||||
puts "]"
|
||||
end
|
||||
|
||||
# Reads the downloaded-ID state file into a Set, one ID per line.
#
# Returns an empty Set when a reset was requested, when the state file does
# not exist, or when reading it fails (the failure is reported on stdout).
#
# @return [Set<String>] IDs recorded by previous runs
def load_downloaded_ids
  downloaded_ids = Set.new
  return downloaded_ids if @reset || !File.exist?(db_path)

  puts "Loading list of already downloaded files from #{db_path}"
  begin
    # chomp (not strip) removes only the line terminator, so IDs that start
    # or end with whitespace still match what append_to_db wrote. An empty
    # line is kept as the empty-string ID.
    File.foreach(db_path) { |line| downloaded_ids.add(line.chomp) }
  rescue => e
    puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
    # Discard a partially read list rather than trusting it.
    downloaded_ids.clear
  end
  downloaded_ids
end
|
||||
|
||||
# Appends one file ID as its own line to the downloaded-ID state file.
#
# The write runs under @db_mutex so that concurrent callers do not
# interleave their lines. Failures are logged through @logger and not
# raised.
#
# @param file_id [String] ID of the file that was downloaded
def append_to_db(file_id)
  @db_mutex.synchronize do
    begin
      # Create the parent directory first in case it does not exist yet.
      FileUtils.mkdir_p(File.dirname(db_path))
      File.open(db_path, 'a') { |f| f.puts(file_id) }
    rescue => e
      @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
    end
  end
end
|
||||
|
||||
def download_files
|
||||
start_time = Time.now
|
||||
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
||||
|
||||
if file_list_by_timestamp.empty?
|
||||
puts "No files to download."
|
||||
|
||||
FileUtils.mkdir_p(backup_path)
|
||||
|
||||
# Load the list of files to potentially download
|
||||
files_to_download = file_list_by_timestamp
|
||||
|
||||
if files_to_download.empty?
|
||||
puts "No files found matching criteria."
|
||||
cleanup
|
||||
return
|
||||
end
|
||||
|
||||
total_files = file_list_by_timestamp.count
|
||||
puts "#{total_files} files to download:"
|
||||
|
||||
total_files = files_to_download.count
|
||||
puts "#{total_files} files found matching criteria."
|
||||
|
||||
# Load IDs of already downloaded files
|
||||
downloaded_ids = load_downloaded_ids
|
||||
files_to_process = files_to_download.reject do |file_info|
|
||||
downloaded_ids.include?(file_info[:file_id])
|
||||
end
|
||||
|
||||
remaining_count = files_to_process.count
|
||||
skipped_count = total_files - remaining_count
|
||||
|
||||
if skipped_count > 0
|
||||
puts "Found #{skipped_count} previously downloaded files, skipping them."
|
||||
end
|
||||
|
||||
if remaining_count == 0
|
||||
puts "All matching files have already been downloaded."
|
||||
cleanup
|
||||
return
|
||||
end
|
||||
|
||||
puts "#{remaining_count} files to download:"
|
||||
|
||||
@processed_file_count = 0
|
||||
@total_to_download = remaining_count
|
||||
@download_mutex = Mutex.new
|
||||
|
||||
|
||||
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
||||
pool = Concurrent::FixedThreadPool.new(thread_count)
|
||||
|
||||
file_list_by_timestamp.each do |file_remote_info|
|
||||
|
||||
files_to_process.each do |file_remote_info|
|
||||
pool.post do
|
||||
@connection_pool.with_connection do |connection|
|
||||
result = download_file(file_remote_info, connection)
|
||||
@download_mutex.synchronize do
|
||||
@processed_file_count += 1
|
||||
puts result if result
|
||||
download_success = false
|
||||
begin
|
||||
@connection_pool.with_connection do |connection|
|
||||
result_message = download_file(file_remote_info, connection)
|
||||
# for now, assume success if no exception and message doesn't indicate error/skip
|
||||
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
|
||||
download_success = true
|
||||
end
|
||||
@download_mutex.synchronize do
|
||||
@processed_file_count += 1
|
||||
# adjust progress message to reflect remaining files
|
||||
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
|
||||
puts progress_message if progress_message
|
||||
end
|
||||
end
|
||||
# append to DB only after a successful download, outside the connection block
|
||||
if download_success
|
||||
append_to_db(file_remote_info[:file_id])
|
||||
end
|
||||
rescue => e
|
||||
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
|
||||
@download_mutex.synchronize do
|
||||
@processed_file_count += 1
|
||||
end
|
||||
end
|
||||
sleep(RATE_LIMIT)
|
||||
end
|
||||
@@ -385,7 +505,8 @@ class WaybackMachineDownloader
|
||||
pool.wait_for_termination
|
||||
|
||||
end_time = Time.now
|
||||
puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
|
||||
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
|
||||
puts "Results saved in #{backup_path}"
|
||||
cleanup
|
||||
end
|
||||
|
||||
@@ -417,9 +538,10 @@ class WaybackMachineDownloader
|
||||
file_url = file_remote_info[:file_url].encode(current_encoding)
|
||||
file_id = file_remote_info[:file_id]
|
||||
file_timestamp = file_remote_info[:timestamp]
|
||||
file_path_elements = file_id.split('/')
|
||||
original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
|
||||
file_path_elements = original_file_id.split('/')
|
||||
|
||||
if file_id == ""
|
||||
if original_file_id == ""
|
||||
dir_path = backup_path
|
||||
file_path = backup_path + 'index.html'
|
||||
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
||||
@@ -433,21 +555,24 @@ class WaybackMachineDownloader
|
||||
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
||||
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
||||
end
|
||||
unless File.exist? file_path
|
||||
begin
|
||||
structure_dir_path dir_path
|
||||
download_with_retry(file_path, file_url, file_timestamp, http)
|
||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
|
||||
rescue StandardError => e
|
||||
msg = "#{file_url} # #{e}"
|
||||
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
||||
File.delete(file_path)
|
||||
msg += "\n#{file_path} was empty and was removed."
|
||||
end
|
||||
msg
|
||||
|
||||
# check existence *before* download attempt
|
||||
# this handles cases where a file was created manually or by a previous partial run without a .db entry
|
||||
if File.exist? file_path
|
||||
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
end
|
||||
|
||||
begin
|
||||
structure_dir_path dir_path
|
||||
download_with_retry(file_path, file_url, file_timestamp, http)
|
||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
rescue StandardError => e
|
||||
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
||||
File.delete(file_path)
|
||||
msg += "\n#{file_path} was empty and was removed."
|
||||
end
|
||||
else
|
||||
"#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
|
||||
msg
|
||||
end
|
||||
end
|
||||
|
||||
@@ -523,7 +648,7 @@ class WaybackMachineDownloader
|
||||
else
|
||||
raise "HTTP Error: #{response.code} #{response.message}"
|
||||
end
|
||||
|
||||
|
||||
rescue StandardError => e
|
||||
if retries < MAX_RETRIES
|
||||
retries += 1
|
||||
@@ -539,12 +664,25 @@ class WaybackMachineDownloader
|
||||
|
||||
# Shuts down the connection pool, reports failed downloads and decides what
# happens to the state files:
#
# * failures and no reset -> state files are kept and the method returns early
# * keep without reset    -> state files are kept
# * otherwise             -> state files are deleted
def cleanup
  @connection_pool.shutdown

  if @failed_downloads.any?
    @logger.error("Download completed with errors.")
    @logger.error("Failed downloads summary:")
    @failed_downloads.each do |failure|
      @logger.error(" #{failure[:url]} - #{failure[:error]}")
    end
    unless @reset
      puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
      return
    end
  end

  # Equivalent to the former `if !@keep || @reset ... elsif @keep`: the
  # message guard in the delete branch was always true and the `elsif`
  # condition always held, so the two cases are written out directly.
  if @keep && !@reset
    puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
  else
    puts "Cleaning up state files..."
    FileUtils.rm_f(cdx_path)
    FileUtils.rm_f(db_path)
  end
end
|
||||
end
|
||||
|
||||
@@ -4,7 +4,7 @@ require 'uri'
|
||||
module ArchiveAPI
|
||||
|
||||
def get_raw_list_from_api(url, page_index, http)
|
||||
request_url = URI("https://web.archive.org/cdx/search/xd")
|
||||
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
||||
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
||||
request_url.query = URI.encode_www_form(params)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user