Using Net::HTTP and decompressing gzip content

see https://github.com/ShiftaDeband/wayback-machine-downloader and bf6e33c2fe
2025-12-29 16:16:06 +00:00 · 2024-06-26 16:54:55 +00:00 · 2024-06-26 16:54:55 +00:00 · cff30f529e
commit cff30f529e
parent 466228fee4
1 changed files with 49 additions and 21 deletions
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@ -6,6 +6,8 @@ require 'open-uri'
 require 'fileutils'
 require 'cgi'
 require 'json'
 require 'zlib'
 require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@ -14,7 +16,7 @@ class WaybackMachineDownloader
  include ArchiveAPI
-  VERSION = "2.3.1"
+  VERSION = "2.3.2"
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
@ -81,22 +83,33 @@ class WaybackMachineDownloader
  end
  def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots,
+    http = Net::HTTP.new("web.archive.org", 443)
-    # but from a less fresh index
+    http.use_ssl = true
-    print "Getting snapshot pages"
+
    snapshot_list_to_consider = []
-    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+
    http.start do
      puts "Getting snapshot pages"
      # Fetch the initial set of snapshots
      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
      print "."
      # Fetch additional pages if the exact URL flag is not set
      unless @exact_url
        @maximum_pages.times do |page_index|
-        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+          snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, http)
          break if snapshot_list.empty?
          snapshot_list_to_consider += snapshot_list
          print "."
        end
      end
-    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
+    end
    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
    puts
    snapshot_list_to_consider
  end
@ -206,11 +219,15 @@ class WaybackMachineDownloader
    @processed_file_count = 0
    @threads_count = 1 unless @threads_count != 0
    @threads_count.times do
      http = Net::HTTP.new("web.archive.org", 443)
      http.use_ssl = true
      http.start()
      threads << Thread.new do
        until file_queue.empty?
          file_remote_info = file_queue.pop(true) rescue nil
-          download_file(file_remote_info) if file_remote_info
+          download_file(file_remote_info, http) if file_remote_info
        end
        http.finish()
      end
    end
@ -243,7 +260,7 @@ class WaybackMachineDownloader
    end
  end
-  def download_file file_remote_info
+  def download_file (file_remote_info, http)
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
@ -268,8 +285,19 @@ class WaybackMachineDownloader
        structure_dir_path dir_path
        open(file_path, "wb") do |file|
          begin
-            URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
+            http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
-              file.write(uri.read)
+              file.write(body)
              if file_path.include? '.gz'
                file_path_temp = file_path + '.temp'
                File.rename(file_path, file_path_temp)
                Zlib::GzipReader.open(file_path_temp) do |gz|
                  File.open(file_path, 'wb') do |f|
                    f.write gz.read
                  end
                end
                File.delete(file_path_temp)
              end
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"