Significant refactoring

Including extra config settings, a proper rate limit, and a logger. Fixes #307, #291, #281, #269, and probably others too.
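
For context, here is a minimal usage sketch of the settings this commit touches. It assumes the gem is loaded via require 'wayback_machine_downloader' and that download_files remains the public entry point; the URL, directory, and option values are illustrative placeholders, not part of this commit.

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',   # required; validate_params raises ArgumentError without it
  directory: './websites/example',   # placeholder output directory
  threads_count: 4,                  # clamped to at least 1; worker pool is capped at CONNECTION_POOL_SIZE (10)
  timeout: 30,                       # new setting; falls back to DEFAULT_TIMEOUT (30) when omitted
  maximum_pages: 50                  # must be positive, otherwise validate_params raises
)

# Each worker sleeps RATE_LIMIT (0.25 s) between requests; set DEBUG=1 in the
# environment to switch the new logger to Logger::DEBUG.
downloader.download_files
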
Felipe 2024-12-03 00:23:47 +00:00
parent eaff48fbdb
commit 45fa2be573
2 changed files with 153 additions and 69 deletions

View File

@@ -1,3 +1,4 @@
 source "https://rubygems.org"
 gemspec
+gem 'concurrent-ruby'

View File

@@ -7,6 +7,8 @@ require 'fileutils'
 require 'cgi'
 require 'json'
 require 'time'
+require 'concurrent'
+require 'logger'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -16,12 +18,20 @@ class WaybackMachineDownloader
   include ArchiveAPI
   VERSION = "2.3.2"
+  DEFAULT_TIMEOUT = 30
+  MAX_RETRIES = 3
+  RETRY_DELAY = 2
+  RATE_LIMIT = 0.25 # Delay between requests in seconds
+  CONNECTION_POOL_SIZE = 10
+  HTTP_CACHE_SIZE = 1000
+  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count
+    :all, :maximum_pages, :threads_count, :logger
   def initialize params
+    validate_params(params)
     @base_url = params[:base_url]
     @exact_url = params[:exact_url]
     @directory = params[:directory]
@@ -32,7 +42,11 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
-    @threads_count = params[:threads_count].to_i
+    @threads_count = [params[:threads_count].to_i, 1].max # Ensure a minimum of 1 thread
+    @timeout = params[:timeout] || DEFAULT_TIMEOUT
+    @logger = setup_logger
+    @http_cache = Concurrent::Map.new
+    @failed_downloads = Concurrent::Array.new
   end
   def backup_name
@@ -82,28 +96,30 @@ class WaybackMachineDownloader
   end
   def get_all_snapshots_to_consider
-    http = Net::HTTP.new("web.archive.org", 443)
-    http.use_ssl = true
+    http = setup_http_client
     snapshot_list_to_consider = []
-    http.start do
-      puts "Getting snapshot pages"
+    begin
+      http.start do |connection|
+        puts "Getting snapshot pages"
         # Fetch the initial set of snapshots
-        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
+        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
         print "."
         # Fetch additional pages if the exact URL flag is not set
         unless @exact_url
           @maximum_pages.times do |page_index|
-            snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, http)
+            snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
             break if snapshot_list.empty?
             snapshot_list_to_consider += snapshot_list
            print "."
          end
        end
+      end
+    ensure
+      http.finish if http.started?
     end
     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
@@ -199,46 +215,49 @@ class WaybackMachineDownloader
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-    puts
     if file_list_by_timestamp.empty?
       puts "No files to download."
-      puts "Possible reasons:"
-      puts "\t* Site is not in Wayback Machine Archive."
-      puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
-      puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
-      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
-      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
-    puts "#{file_list_by_timestamp.count} files to download:"
-    threads = []
-    mutex = Mutex.new
+    total_files = file_list_by_timestamp.count
+    puts "#{total_files} files to download:"
     @processed_file_count = 0
-    @threads_count = 1 unless @threads_count != 0
-    @threads_count.times do
-      threads << Thread.new do
-        http = Net::HTTP.new("web.archive.org", 443)
-        http.use_ssl = true
+    @download_mutex = Mutex.new
+    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
+    pool = Concurrent::FixedThreadPool.new(thread_count)
+    semaphore = Concurrent::Semaphore.new(CONNECTION_POOL_SIZE)
+    file_list_by_timestamp.each do |file_remote_info|
+      pool.post do
+        semaphore.acquire
+        http = nil
        begin
-          until file_queue.empty?
-            file_remote_info = nil
-            mutex.synchronize { file_remote_info = file_queue.pop(true) rescue nil }
-            download_file(file_remote_info, http) if file_remote_info
+          http = setup_http_client
+          http.start do |connection|
+            result = download_file(file_remote_info, connection)
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              puts result if result
+            end
          end
        ensure
-          http.finish if http.started?
+          semaphore.release
+          http&.finish if http&.started?
+          sleep(RATE_LIMIT)
        end
      end
    end
-    threads.each(&:join)
+    pool.shutdown
+    pool.wait_for_termination
     end_time = Time.now
-    puts
-    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
+    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    cleanup
   end
   def structure_dir_path dir_path
@@ -288,38 +307,18 @@
     unless File.exist? file_path
       begin
         structure_dir_path dir_path
-        open(file_path, "wb") do |file|
-          begin
-            http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
-              file.write(body)
-            end
-          rescue OpenURI::HTTPError => e
-            puts "#{file_url} # #{e}"
-            if @all
-              file.write(e.io.read)
-              puts "#{file_path} saved anyway."
-            end
-          rescue StandardError => e
-            puts "#{file_url} # #{e}"
-          end
-        end
+        download_with_retry(file_path, file_url, file_timestamp, http)
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
      rescue StandardError => e
-        puts "#{file_url} # #{e}"
-      ensure
+        msg = "#{file_url} # #{e}"
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
-          puts "#{file_path} was empty and was removed."
+          msg += "\n#{file_path} was empty and was removed."
        end
-      end
-      semaphore.synchronize do
-        @processed_file_count += 1
-        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+        msg
      end
    else
-      semaphore.synchronize do
-        @processed_file_count += 1
-        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
-      end
+      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
    end
  end
@@ -334,4 +333,88 @@
   def semaphore
     @semaphore ||= Mutex.new
   end
+  private
+  def validate_params(params)
+    raise ArgumentError, "Base URL is required" unless params[:base_url]
+    raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
+    # threads_count validation removed, since it is now forced to be positive
+  end
+  def setup_logger
+    logger = Logger.new(STDOUT)
+    logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
+    logger.formatter = proc do |severity, datetime, progname, msg|
+      "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
+    end
+    logger
+  end
+  def setup_http_client
+    cached_client = @http_cache[Thread.current.object_id]
+    return cached_client if cached_client&.active?
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = @timeout
+    http.open_timeout = @timeout
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    @http_cache[Thread.current.object_id] = http
+    http
+  end
+  def download_with_retry(file_path, file_url, file_timestamp, connection)
+    retries = 0
+    begin
+      request = Net::HTTP::Get.new(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"))
+      request["Connection"] = "keep-alive"
+      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
+      response = connection.request(request)
+      case response
+      when Net::HTTPSuccess
+        File.open(file_path, "wb") do |file|
+          if block_given?
+            yield(response, file)
+          else
+            file.write(response.body)
+          end
+        end
+      when Net::HTTPTooManyRequests
+        sleep(RATE_LIMIT * 2)
+        raise "Rate limited, retrying..."
+      else
+        raise "HTTP Error: #{response.code} #{response.message}"
+      end
+    rescue StandardError => e
+      if retries < MAX_RETRIES
+        retries += 1
+        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        sleep(RETRY_DELAY * retries)
+        retry
+      else
+        @failed_downloads << {url: file_url, error: e.message}
+        raise e
+      end
+    end
+  end
+  def cleanup
+    @http_cache.each_value do |client|
+      client.finish if client&.started?
+    end
+    @http_cache.clear
+    if @failed_downloads.any?
+      @logger.error("Failed downloads summary:")
+      @failed_downloads.each do |failure|
+        @logger.error(" #{failure[:url]} - #{failure[:error]}")
+      end
+    end
+  end
 end