Added ability to download rewritten Wayback Machine files

This commit is contained in:
Felipe 2025-01-02 12:17:20 +00:00
parent 3fff7daf35
commit 9283f04a57
2 changed files with 12 additions and 1 deletion

View File

@ -58,6 +58,10 @@ option_parser = OptionParser.new do |opts|
options[:list] = true options[:list] = true
end end
opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
options[:rewritten] = t
end
opts.on("-v", "--version", "Display version") do |t| opts.on("-v", "--version", "Display version") do |t|
options[:version] = t options[:version] = t
end end

View File

@ -136,6 +136,7 @@ class WaybackMachineDownloader
@all = params[:all] @all = params[:all]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max @threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@timeout = params[:timeout] || DEFAULT_TIMEOUT @timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger @logger = setup_logger
@failed_downloads = Concurrent::Array.new @failed_downloads = Concurrent::Array.new
@ -428,7 +429,13 @@ class WaybackMachineDownloader
def download_with_retry(file_path, file_url, file_timestamp, connection) def download_with_retry(file_path, file_url, file_timestamp, connection)
retries = 0 retries = 0
begin begin
request = Net::HTTP::Get.new(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) wayback_url = if @rewritten
"https://web.archive.org/web/#{file_timestamp}/#{file_url}"
else
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
end
request = Net::HTTP::Get.new(URI(wayback_url))
request["Connection"] = "keep-alive" request["Connection"] = "keep-alive"
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}" request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"