wayback-machine-downloader/lib/wayback_machine_downloader.rb
Felipe fc8d8a9441 Added retry command
fixes [Feature request] Retry flag
Fixes StrawberryMaster/wayback-machine-downloader#31
2025-08-20 01:21:29 +00:00

# encoding: UTF-8
require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'time'
require 'concurrent-ruby'
require 'logger'
require 'zlib'
require 'stringio'
require 'set' # Set is used by load_downloaded_ids; the explicit require keeps older Rubies happy
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
require_relative 'wayback_machine_downloader/subdom_processor'
require_relative 'wayback_machine_downloader/url_rewrite'
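# ConnectionPool keeps one persistent HTTPS connection to web.archive.org per
# thread, recycles connections older than MAX_AGE seconds, and runs a
# background thread that periodically sweeps stale entries.
#
# Illustrative use (a minimal sketch, not part of the original file):
#
#   pool = ConnectionPool.new(10)
#   pool.with_connection do |http|
#     http.request(Net::HTTP::Get.new(URI("https://web.archive.org/")))
#   end
#   pool.shutdown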
class ConnectionPool
MAX_AGE = 300
CLEANUP_INTERVAL = 60
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
def initialize(size)
@size = size
@pool = Concurrent::Map.new
@creation_times = Concurrent::Map.new
@cleanup_thread = schedule_cleanup
end
def with_connection(&block)
conn = acquire_connection
begin
yield conn
ensure
release_connection(conn)
end
end
def shutdown
@cleanup_thread&.exit
@pool.each_value { |conn| conn.finish if conn&.started? }
@pool.clear
@creation_times.clear
end
private
def acquire_connection
thread_id = Thread.current.object_id
conn = @pool[thread_id]
if should_create_new?(conn)
conn&.finish if conn&.started?
conn = create_connection
@pool[thread_id] = conn
@creation_times[thread_id] = Time.now
end
conn
end
def release_connection(conn)
return unless conn
if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
conn.finish
@pool.delete(Thread.current.object_id)
@creation_times.delete(Thread.current.object_id)
end
end
def should_create_new?(conn)
return true if conn.nil?
return true unless conn.started?
return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
false
end
def create_connection
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true
http.read_timeout = DEFAULT_TIMEOUT
http.open_timeout = DEFAULT_TIMEOUT
http.keep_alive_timeout = 30
http.max_retries = MAX_RETRIES
http.start
http
end
def schedule_cleanup
Thread.new do
loop do
cleanup_old_connections
sleep CLEANUP_INTERVAL
end
end
end
def cleanup_old_connections
current_time = Time.now
@creation_times.each do |thread_id, creation_time|
if current_time - creation_time > MAX_AGE
conn = @pool[thread_id]
conn&.finish if conn&.started?
@pool.delete(thread_id)
@creation_times.delete(thread_id)
end
end
end
end
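# WaybackMachineDownloader drives the whole backup: it queries the Wayback
# Machine CDX API for snapshots, builds a de-duplicated file list, and
# downloads the files concurrently while tracking progress on disk so runs
# can be resumed.
#
# Illustrative programmatic use (a sketch; option names mirror the params
# read in #initialize below):
#
#   wmd = WaybackMachineDownloader.new(
#     base_url: "https://example.com",
#     threads_count: 4
#   )
#   wmd.download_files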
class WaybackMachineDownloader
include ArchiveAPI
include SubdomainProcessor
VERSION = "2.4.0"
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
RETRY_DELAY = 2
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
STATE_CDX_FILENAME = ".cdx.json"
STATE_DB_FILENAME = ".downloaded.txt"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
:snapshot_at
def initialize params
validate_params(params)
@base_url = params[:base_url]&.tidy_bytes
@exact_url = params[:exact_url]
if params[:directory]
sanitized_dir = params[:directory].tidy_bytes
@directory = File.expand_path(sanitized_dir)
else
@directory = nil
end
@all_timestamps = params[:all_timestamps]
@from_timestamp = params[:from_timestamp].to_i
@to_timestamp = params[:to_timestamp].to_i
@only_filter = params[:only_filter]
@exclude_filter = params[:exclude_filter]
@all = params[:all]
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@reset = params[:reset]
@keep = params[:keep]
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@failed_downloads = Concurrent::Array.new
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
@db_mutex = Mutex.new
@rewrite = params[:rewrite] || false
@recursive_subdomains = params[:recursive_subdomains] || false
@subdomain_depth = params[:subdomain_depth] || 1
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
# Regexp used to reject invalid/unencoded Wayback URLs before requesting them
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
handle_reset
end
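# Derives the backup folder name from the host part of the base URL,
# e.g. "https://example.com/*" becomes "example.com".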
def backup_name
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
if url_to_process.include? '//'
url_to_process.split('/')[2]
else
url_to_process
end
end
def backup_path
if @directory
# @directory was already expanded to an absolute path in initialize, so use it as-is
@directory
else
# ensure the default path is absolute and normalized
File.expand_path(File.join('websites', backup_name))
end
end
def cdx_path
File.join(backup_path, STATE_CDX_FILENAME)
end
def db_path
File.join(backup_path, STATE_DB_FILENAME)
end
def handle_reset
if @reset
puts "Resetting download state..."
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
puts "Removed state files: #{cdx_path}, #{db_path}"
end
end
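# The --only and --exclude filters accept either a regex-style string or a
# plain substring; to_regex(detect: true) yields a Regexp for the former and
# nil for the latter, in which case a case-insensitive substring match is used.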
def match_only_filter file_url
if @only_filter
only_filter_regex = @only_filter.to_regex(detect: true)
if only_filter_regex
only_filter_regex =~ file_url
else
file_url.downcase.include? @only_filter.downcase
end
else
true
end
end
def match_exclude_filter file_url
if @exclude_filter
exclude_filter_regex = @exclude_filter.to_regex(detect: true)
if exclude_filter_regex
exclude_filter_regex =~ file_url
else
file_url.downcase.include? @exclude_filter.downcase
end
else
false
end
end
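# Fetches the snapshot list from the Wayback Machine CDX API (first the base
# URL, then additional "/*" pages in small concurrent batches unless
# --exact-url was given), caches the result in .cdx.json, and reuses that
# cache on later runs unless --reset was passed.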
def get_all_snapshots_to_consider
if File.exist?(cdx_path) && !@reset
puts "Loading snapshot list from #{cdx_path}"
begin
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
puts
return Concurrent::Array.new(snapshot_list_to_consider)
rescue JSON::ParserError => e
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
rescue => e
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
end
end
snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
puts "Getting snapshot pages from Wayback Machine API..."
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
initial_list ||= []
mutex.synchronize do
snapshot_list_to_consider.concat(initial_list)
print "."
end
end
# Fetch additional pages if the exact URL flag is not set
unless @exact_url
page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
result = nil
@connection_pool.with_connection do |connection|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
end
result ||= []
[page, result]
end
end
results = []
futures.each_with_index do |future, index|
begin
results << future.value
rescue => e
puts "\nError fetching page #{current_batch[index]}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.nil? || result.empty?
continue_fetching = false
break
else
mutex.synchronize do
snapshot_list_to_consider.concat(result)
print "."
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
puts " found #{snapshot_list_to_consider.length} snapshots."
# Save the fetched list to the cache file
begin
FileUtils.mkdir_p(File.dirname(cdx_path))
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
puts "Saved snapshot list to #{cdx_path}"
rescue => e
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
end
puts
snapshot_list_to_consider
end
# Get a composite snapshot file list for a specific timestamp
def get_composite_snapshot_file_list(target_timestamp)
file_versions = {}
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
next if file_timestamp.to_i > target_timestamp
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
next if file_id.nil?
next if match_exclude_filter(file_url)
next unless match_only_filter(file_url)
# Select the most recent version <= target_timestamp
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
end
end
file_versions.values
end
# Returns a list of files for the composite snapshot
def get_file_list_composite_snapshot(target_timestamp)
file_list = get_composite_snapshot_file_list(target_timestamp)
# entries from get_composite_snapshot_file_list already carry :file_id,
# so they only need to be ordered newest-first
file_list.sort_by { |info| info[:timestamp].to_s }.reverse
end
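# Builds a map of file_id => newest matching snapshot ({file_url:, timestamp:}),
# applying the only/exclude filters along the way.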
def get_file_list_curated
file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
file_list_curated
end
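# Like get_file_list_curated, but keeps every version of every file, keyed by
# "timestamp/file_id", so each snapshot of a file is downloaded separately.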
def get_file_list_all_timestamps
file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id_and_timestamp = [file_timestamp, file_id].join('/')
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id_and_timestamp]
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
else
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
puts "file_list_curated: " + file_list_curated.count.to_s
file_list_curated
end
def get_file_list_by_timestamp
if @snapshot_at
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
elsif @all_timestamps
file_list_curated = get_file_list_all_timestamps
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
else
file_list_curated = get_file_list_curated
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
end
end
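# Prints the resolved file list as a JSON array on stdout; progress output
# produced while fetching snapshots is redirected to stderr so stdout stays
# machine-readable.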
def list_files
# retrieval produces its own output
@orig_stdout = $stdout
$stdout = $stderr
files = get_file_list_by_timestamp
$stdout = @orig_stdout
puts "["
files[0...-1].each do |file|
puts file.to_json + ","
end
puts files[-1].to_json
puts "]"
end
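# Resume support: .downloaded.txt stores one file_id per line. It is read
# once at startup and appended to (under a mutex) after each successful
# download.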
def load_downloaded_ids
downloaded_ids = Set.new
if File.exist?(db_path) && !@reset
puts "Loading list of already downloaded files from #{db_path}"
begin
File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
rescue => e
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
downloaded_ids.clear
end
end
downloaded_ids
end
def append_to_db(file_id)
@db_mutex.synchronize do
begin
FileUtils.mkdir_p(File.dirname(db_path))
File.open(db_path, 'a') { |f| f.puts(file_id) }
rescue => e
@logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
end
end
end
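# Posts each pending file to the thread pool: every job borrows a pooled
# connection, downloads the file, updates the shared progress counter, and
# records the file_id in the state file on success. RATE_LIMIT throttles
# each worker between files.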
def processing_files(pool, files_to_process)
files_to_process.each do |file_remote_info|
pool.post do
download_success = false
begin
@connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection)
# assume download success if the result message contains ' -> '
if result_message && result_message.include?(' -> ')
download_success = true
end
@download_mutex.synchronize do
@processed_file_count += 1
# adjust progress message to reflect remaining files
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
puts progress_message if progress_message
end
end
# append to the state DB only after a successful download, outside the connection block
if download_success
append_to_db(file_remote_info[:file_id])
end
rescue => e
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
@download_mutex.synchronize do
@processed_file_count += 1
end
end
sleep(RATE_LIMIT)
end
end
end
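# Top-level entry point: builds the file list, skips entries recorded in the
# state file, downloads the rest with a fixed-size thread pool, optionally
# recurses into subdomains, and finally cleans up.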
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
FileUtils.mkdir_p(backup_path)
# Load the list of files to potentially download
files_to_download = file_list_by_timestamp
if files_to_download.empty?
puts "No files found matching criteria."
cleanup
return
end
total_files = files_to_download.count
puts "#{total_files} files found matching criteria."
# Load IDs of already downloaded files
downloaded_ids = load_downloaded_ids
files_to_process = files_to_download.reject do |file_info|
downloaded_ids.include?(file_info[:file_id])
end
remaining_count = files_to_process.count
skipped_count = total_files - remaining_count
if skipped_count > 0
puts "Found #{skipped_count} previously downloaded files, skipping them."
end
if remaining_count == 0
puts "All matching files have already been downloaded."
cleanup
return
end
puts "#{remaining_count} files to download:"
@processed_file_count = 0
@total_to_download = remaining_count
@download_mutex = Mutex.new
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
processing_files(pool, files_to_process)
pool.shutdown
pool.wait_for_termination
end_time = Time.now
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
# process subdomains if enabled
if @recursive_subdomains
subdomain_start_time = Time.now
process_subdomains
subdomain_end_time = Time.now
subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
puts "Subdomain processing finished in #{subdomain_time}s."
end
puts "Results saved in #{backup_path}"
cleanup
end
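# Creates the directory tree for a file, handling the case where a path that
# was previously saved as a plain file now needs to become a directory: the
# existing file is moved to <path>/index.html and the directory is created
# in its place.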
def structure_dir_path dir_path
begin
FileUtils::mkdir_p dir_path unless File.exist? dir_path
rescue Errno::EEXIST => e
error_to_string = e.to_s
puts "# #{error_to_string}"
if error_to_string.include? "File exists @ dir_s_mkdir - "
file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
elsif error_to_string.include? "File exists - "
file_already_existing = error_to_string.split("File exists - ")[-1]
else
raise "Unhandled directory restructure error # #{error_to_string}"
end
file_already_existing_temporary = file_already_existing + '.temp'
file_already_existing_permanent = file_already_existing + '/index.html'
FileUtils::mv file_already_existing, file_already_existing_temporary
FileUtils::mkdir_p file_already_existing
FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
puts "#{file_already_existing} -> #{file_already_existing_permanent}"
structure_dir_path dir_path
end
end
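# When --rewrite is enabled, post-processes downloaded HTML/CSS/JS so that
# absolute and root-relative URLs become relative, which lets the mirror be
# browsed directly from disk. The rewrite_* helpers are provided by the
# url_rewrite module required above.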
def rewrite_urls_to_relative(file_path)
return unless File.exist?(file_path)
file_ext = File.extname(file_path).downcase
begin
content = File.binread(file_path)
if file_ext == '.html' || file_ext == '.htm'
encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
else
content.force_encoding('UTF-8')
end
# URLs in HTML attributes
rewrite_html_attr_urls(content)
# URLs in CSS
rewrite_css_urls(content)
# URLs in JavaScript
rewrite_js_urls(content)
# for URLs in HTML attributes that start with a single slash
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
prefix, path, suffix = $1, $2, $3
"#{prefix}./#{path}#{suffix}"
end
# for URLs in CSS that start with a single slash
content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
path = $1
"url(\"./#{path}\")"
end
# save the modified content back to the file
File.binwrite(file_path, content)
puts "Rewrote URLs in #{file_path} to be relative."
rescue Errno::ENOENT => e
@logger.warn("Error reading file #{file_path}: #{e.message}")
end
end
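# Maps a snapshot entry to a local path (directory-like URLs become
# .../index.html, characters invalid on Windows are percent-encoded), skips
# files that already exist on disk, delegates the transfer to
# download_with_retry, and returns a human-readable status line.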
def download_file (file_remote_info, http)
current_encoding = "".encoding
file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id]
file_timestamp = file_remote_info[:timestamp]
# sanitize file_id to ensure it is a valid path component
raw_path_elements = file_id.split('/')
sanitized_path_elements = raw_path_elements.map do |element|
if Gem.win_platform?
# on Windows, percent-encode path characters that are not allowed in
# file names; see
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
else
element
end
end
current_backup_path = backup_path
if file_id == ""
dir_path = current_backup_path
file_path = File.join(dir_path, 'index.html')
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
# if file_id is a directory, we treat it as such
dir_path = File.join(current_backup_path, *sanitized_path_elements)
file_path = File.join(dir_path, 'index.html')
else
# if file_id is a file, we treat it as such
filename = sanitized_path_elements.pop
dir_path = File.join(current_backup_path, *sanitized_path_elements)
file_path = File.join(dir_path, filename)
end
# check existence *before* download attempt
# this handles cases where a file was created manually or by a previous partial run without a state-file entry
if File.exist? file_path
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
end
begin
structure_dir_path dir_path
status = download_with_retry(file_path, file_url, file_timestamp, http)
case status
when :saved
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
rewrite_urls_to_relative(file_path)
end
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
when :skipped_not_found
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
else
# ideally, this case should not be reached if download_with_retry behaves as expected.
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
end
rescue StandardError => e
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
if File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
msg
end
end
def file_queue
@file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
end
def file_list_by_timestamp
if @snapshot_at
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
elsif @all_timestamps
file_list_curated = get_file_list_all_timestamps
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
else
file_list_curated = get_file_list_curated
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
file_list_curated.map do |file_remote_info|
file_remote_info[1][:file_id] = file_remote_info[0]
file_remote_info[1]
end
end
end
private
def validate_params(params)
raise ArgumentError, "Base URL is required" unless params[:base_url]
raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
end
def setup_logger
logger = Logger.new(STDOUT)
logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
logger.formatter = proc do |severity, datetime, progname, msg|
"#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
end
logger
end
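# Fetches one snapshot: requests the raw ("id_") capture unless --rewritten
# was given, follows up to two redirects, decompresses gzip bodies, and, with
# --all, also saves redirect/error pages. Transient failures are retried up
# to MAX_RETRIES times with a linearly growing delay. Returns :saved or
# :skipped_not_found.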
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
retries = 0
begin
wayback_url = if @rewritten
"https://web.archive.org/web/#{file_timestamp}/#{file_url}"
else
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
end
# Escape square brackets because they are not valid in URI()
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
# reject invalid/unencoded wayback_url, behaving as if the resource weren't found
if not @url_regexp.match?(wayback_url)
@logger.warn("Skipped #{file_url}: invalid URL")
return :skipped_not_found
end
request = Net::HTTP::Get.new(URI(wayback_url))
request["Connection"] = "keep-alive"
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
request["Accept-Encoding"] = "gzip, deflate"
response = connection.request(request)
save_response_body = lambda do
File.open(file_path, "wb") do |file|
body = response.body
if response['content-encoding'] == 'gzip' && body && !body.empty?
begin
gz = Zlib::GzipReader.new(StringIO.new(body))
decompressed_body = gz.read
gz.close
file.write(decompressed_body)
rescue Zlib::GzipFile::Error => e
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
file.write(body)
end
else
file.write(body) if body
end
end
end
if @all
case response
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
save_response_body.call
if response.is_a?(Net::HTTPRedirection)
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
end
return :saved
else
# for any other response type when --all is true, treat as an error to be retried or failed
raise "Unhandled HTTP response: #{response.code} #{response.message}"
end
else # not @all (our default behavior)
case response
when Net::HTTPSuccess
save_response_body.call
return :saved
when Net::HTTPRedirection
raise "Too many redirects for #{file_url}" if redirect_count >= 2
location = response['location']
@logger.warn("Redirect found for #{file_url} -> #{location}")
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
when Net::HTTPTooManyRequests
sleep(RATE_LIMIT * 2)
raise "Rate limited, retrying..."
when Net::HTTPNotFound
@logger.warn("File not found, skipping: #{file_url}")
return :skipped_not_found
else
raise "HTTP Error: #{response.code} #{response.message}"
end
end
rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
sleep(RETRY_DELAY * retries)
retry
else
@failed_downloads << {url: file_url, error: e.message}
raise e
end
end
end
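# Shuts the connection pool down, reports any failed downloads, and removes
# the .cdx.json / .downloaded.txt state files unless --keep was given; when
# downloads failed (and --reset was not passed) the state files are kept so a
# later run can resume.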
def cleanup
@connection_pool.shutdown
if @failed_downloads.any?
@logger.error("Download completed with errors.")
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
unless @reset
puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
return
end
end
if !@keep || @reset
puts "Cleaning up state files..." unless @keep && !@reset
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
elsif @keep
puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
end
end
end