|
|
|
|
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|
|
|
|
|
|
|
|
|
include ArchiveAPI
|
|
|
|
|
|
|
|
|
|
VERSION = "2.3.6"
|
|
|
|
|
VERSION = "2.3.10"
|
|
|
|
|
DEFAULT_TIMEOUT = 30
|
|
|
|
|
MAX_RETRIES = 3
|
|
|
|
|
RETRY_DELAY = 2
|
|
|
|
|
@@ -131,7 +131,11 @@ class WaybackMachineDownloader
|
|
|
|
|
validate_params(params)
|
|
|
|
|
@base_url = params[:base_url]
|
|
|
|
|
@exact_url = params[:exact_url]
|
|
|
|
|
@directory = params[:directory]
|
|
|
|
|
if params[:directory]
|
|
|
|
|
@directory = File.expand_path(params[:directory])
|
|
|
|
|
else
|
|
|
|
|
@directory = nil
|
|
|
|
|
end
|
|
|
|
|
@all_timestamps = params[:all_timestamps]
|
|
|
|
|
@from_timestamp = params[:from_timestamp].to_i
|
|
|
|
|
@to_timestamp = params[:to_timestamp].to_i
|
|
|
|
|
@@ -154,22 +158,22 @@ class WaybackMachineDownloader
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
def backup_name
|
|
|
|
|
if @base_url.include? '//'
|
|
|
|
|
@base_url.split('/')[2]
|
|
|
|
|
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
|
|
|
|
|
|
|
|
|
if url_to_process.include? '//'
|
|
|
|
|
url_to_process.split('/')[2]
|
|
|
|
|
else
|
|
|
|
|
@base_url
|
|
|
|
|
url_to_process
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
def backup_path
|
|
|
|
|
if @directory
|
|
|
|
|
if @directory[-1] == '/'
|
|
|
|
|
@directory
|
|
|
|
|
else
|
|
|
|
|
@directory + '/'
|
|
|
|
|
end
|
|
|
|
|
# because @directory is already an absolute path, we just ensure it exists
|
|
|
|
|
@directory
|
|
|
|
|
else
|
|
|
|
|
'websites/' + backup_name + '/'
|
|
|
|
|
# ensure the default path is absolute and normalized
|
|
|
|
|
File.expand_path(File.join('websites', backup_name))
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
@@ -241,6 +245,7 @@ class WaybackMachineDownloader
|
|
|
|
|
# Fetch the initial set of snapshots, sequentially
|
|
|
|
|
@connection_pool.with_connection do |connection|
|
|
|
|
|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
|
|
|
|
initial_list ||= []
|
|
|
|
|
mutex.synchronize do
|
|
|
|
|
snapshot_list_to_consider.concat(initial_list)
|
|
|
|
|
print "."
|
|
|
|
|
@@ -265,6 +270,7 @@ class WaybackMachineDownloader
|
|
|
|
|
@connection_pool.with_connection do |connection|
|
|
|
|
|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
|
|
|
|
end
|
|
|
|
|
result ||= []
|
|
|
|
|
[page, result]
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
@@ -284,7 +290,7 @@ class WaybackMachineDownloader
|
|
|
|
|
|
|
|
|
|
# Process results and check for empty pages
|
|
|
|
|
results.each do |page, result|
|
|
|
|
|
if result.empty?
|
|
|
|
|
if result.nil? || result.empty?
|
|
|
|
|
continue_fetching = false
|
|
|
|
|
break
|
|
|
|
|
else
|
|
|
|
|
@@ -378,7 +384,7 @@ class WaybackMachineDownloader
|
|
|
|
|
end
|
|
|
|
|
else
|
|
|
|
|
file_list_curated = get_file_list_curated
|
|
|
|
|
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
|
|
|
|
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
|
|
|
|
|
file_list_curated.map do |file_remote_info|
|
|
|
|
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
|
|
|
|
file_remote_info[1]
|
|
|
|
|
@@ -477,8 +483,8 @@ class WaybackMachineDownloader
|
|
|
|
|
begin
|
|
|
|
|
@connection_pool.with_connection do |connection|
|
|
|
|
|
result_message = download_file(file_remote_info, connection)
|
|
|
|
|
# for now, assume success if no exception and message doesn't indicate error/skip
|
|
|
|
|
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
|
|
|
|
|
# assume download success if the result message contains ' -> '
|
|
|
|
|
if result_message && result_message.include?(' -> ')
|
|
|
|
|
download_success = true
|
|
|
|
|
end
|
|
|
|
|
@download_mutex.synchronize do
|
|
|
|
|
@@ -634,21 +640,35 @@ class WaybackMachineDownloader
|
|
|
|
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
|
|
|
|
file_id = file_remote_info[:file_id]
|
|
|
|
|
file_timestamp = file_remote_info[:timestamp]
|
|
|
|
|
file_path_elements = file_id.split('/')
|
|
|
|
|
|
|
|
|
|
# sanitize file_id to ensure it is a valid path component
|
|
|
|
|
raw_path_elements = file_id.split('/')
|
|
|
|
|
|
|
|
|
|
sanitized_path_elements = raw_path_elements.map do |element|
|
|
|
|
|
if Gem.win_platform?
|
|
|
|
|
# for Windows, we need to sanitize path components to avoid invalid characters
|
|
|
|
|
# this prevents issues with file names that contain characters not allowed in
|
|
|
|
|
# Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
|
|
|
|
|
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
|
|
|
|
|
else
|
|
|
|
|
element
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
current_backup_path = backup_path
|
|
|
|
|
|
|
|
|
|
if file_id == ""
|
|
|
|
|
dir_path = backup_path
|
|
|
|
|
file_path = backup_path + 'index.html'
|
|
|
|
|
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
|
|
|
|
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
|
|
|
|
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
|
|
|
|
dir_path = current_backup_path
|
|
|
|
|
file_path = File.join(dir_path, 'index.html')
|
|
|
|
|
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
|
|
|
|
|
# if file_id is a directory, we treat it as such
|
|
|
|
|
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
|
|
|
|
file_path = File.join(dir_path, 'index.html')
|
|
|
|
|
else
|
|
|
|
|
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
|
|
|
|
file_path = backup_path + file_path_elements[0..-1].join('/')
|
|
|
|
|
end
|
|
|
|
|
if Gem.win_platform?
|
|
|
|
|
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
|
|
|
|
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
|
|
|
|
# if file_id is a file, we treat it as such
|
|
|
|
|
filename = sanitized_path_elements.pop
|
|
|
|
|
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
|
|
|
|
file_path = File.join(dir_path, filename)
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
# check existence *before* download attempt
|
|
|
|
|
@@ -659,11 +679,21 @@ class WaybackMachineDownloader
|
|
|
|
|
|
|
|
|
|
begin
|
|
|
|
|
structure_dir_path dir_path
|
|
|
|
|
download_with_retry(file_path, file_url, file_timestamp, http)
|
|
|
|
|
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
|
|
|
|
rewrite_urls_to_relative(file_path)
|
|
|
|
|
status = download_with_retry(file_path, file_url, file_timestamp, http)
|
|
|
|
|
|
|
|
|
|
case status
|
|
|
|
|
when :saved
|
|
|
|
|
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
|
|
|
|
rewrite_urls_to_relative(file_path)
|
|
|
|
|
end
|
|
|
|
|
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
|
|
|
when :skipped_not_found
|
|
|
|
|
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
|
|
|
else
|
|
|
|
|
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
|
|
|
|
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
|
|
|
|
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
|
|
|
end
|
|
|
|
|
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
|
|
|
rescue StandardError => e
|
|
|
|
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
|
|
|
if File.exist?(file_path) and File.size(file_path) == 0
|
|
|
|
|
@@ -707,6 +737,9 @@ class WaybackMachineDownloader
|
|
|
|
|
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
# Escape square brackets because they are not valid in URI()
|
|
|
|
|
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
|
|
|
|
|
|
|
|
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
|
|
|
|
request["Connection"] = "keep-alive"
|
|
|
|
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
|
|
|
|
@@ -714,8 +747,7 @@ class WaybackMachineDownloader
|
|
|
|
|
|
|
|
|
|
response = connection.request(request)
|
|
|
|
|
|
|
|
|
|
case response
|
|
|
|
|
when Net::HTTPSuccess
|
|
|
|
|
save_response_body = lambda do
|
|
|
|
|
File.open(file_path, "wb") do |file|
|
|
|
|
|
body = response.body
|
|
|
|
|
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
|
|
|
|
@@ -725,26 +757,48 @@ class WaybackMachineDownloader
|
|
|
|
|
gz.close
|
|
|
|
|
file.write(decompressed_body)
|
|
|
|
|
rescue Zlib::GzipFile::Error => e
|
|
|
|
|
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
|
|
|
|
|
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
|
|
|
|
|
file.write(body)
|
|
|
|
|
end
|
|
|
|
|
else
|
|
|
|
|
file.write(body) if body
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
when Net::HTTPRedirection
|
|
|
|
|
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
|
|
|
|
location = response['location']
|
|
|
|
|
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
|
|
|
|
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
|
|
|
|
when Net::HTTPTooManyRequests
|
|
|
|
|
sleep(RATE_LIMIT * 2)
|
|
|
|
|
raise "Rate limited, retrying..."
|
|
|
|
|
when Net::HTTPNotFound
|
|
|
|
|
@logger.warn("File not found, skipping: #{file_url}")
|
|
|
|
|
return
|
|
|
|
|
else
|
|
|
|
|
raise "HTTP Error: #{response.code} #{response.message}"
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
if @all
|
|
|
|
|
case response
|
|
|
|
|
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
|
|
|
|
|
save_response_body.call
|
|
|
|
|
if response.is_a?(Net::HTTPRedirection)
|
|
|
|
|
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
|
|
|
|
|
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
|
|
|
|
|
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
|
|
|
|
|
end
|
|
|
|
|
return :saved
|
|
|
|
|
else
|
|
|
|
|
# for any other response type when --all is true, treat as an error to be retried or failed
|
|
|
|
|
raise "Unhandled HTTP response: #{response.code} #{response.message}"
|
|
|
|
|
end
|
|
|
|
|
else # not @all (our default behavior)
|
|
|
|
|
case response
|
|
|
|
|
when Net::HTTPSuccess
|
|
|
|
|
save_response_body.call
|
|
|
|
|
return :saved
|
|
|
|
|
when Net::HTTPRedirection
|
|
|
|
|
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
|
|
|
|
location = response['location']
|
|
|
|
|
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
|
|
|
|
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
|
|
|
|
when Net::HTTPTooManyRequests
|
|
|
|
|
sleep(RATE_LIMIT * 2)
|
|
|
|
|
raise "Rate limited, retrying..."
|
|
|
|
|
when Net::HTTPNotFound
|
|
|
|
|
@logger.warn("File not found, skipping: #{file_url}")
|
|
|
|
|
return :skipped_not_found
|
|
|
|
|
else
|
|
|
|
|
raise "HTTP Error: #{response.code} #{response.message}"
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
rescue StandardError => e
|
|
|
|
|
|