From 9283f04a57f62addbdc0974096cbba9a2f931cce Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Thu, 2 Jan 2025 12:17:20 +0000
Subject: [PATCH] Added ability to download rewritten Wayback Archive files

---
 bin/wayback_machine_downloader    | 4 ++++
 lib/wayback_machine_downloader.rb | 9 ++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 4fb6d3d..00b2fd9 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -58,6 +58,10 @@ option_parser = OptionParser.new do |opts|
     options[:list] = true
   end
 
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index c13cf38..91a1e04 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -136,6 +136,7 @@ class WaybackMachineDownloader
     @all = params[:all]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
+    @rewritten = params[:rewritten]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
@@ -428,7 +429,13 @@ class WaybackMachineDownloader
   def download_with_retry(file_path, file_url, file_timestamp, connection)
     retries = 0
     begin
-      request = Net::HTTP::Get.new(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"))
+      wayback_url = if @rewritten
+        "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
+      else
+        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
+      end
+
+      request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
 