Significant refactoring

Including extra config settings, a proper rate limit, and a logger. Fixes #307, #291, #281, #269, and probably others too.
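
For context, here is a minimal usage sketch of the settings this commit touches. It assumes the gem is loaded via require 'wayback_machine_downloader' and that download_files remains the public entry point; the URL, directory, and option values are illustrative placeholders, not part of this commit.

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',   # required; validate_params raises ArgumentError without it
  directory: './websites/example',   # placeholder output directory
  threads_count: 4,                  # clamped to at least 1; worker pool is capped at CONNECTION_POOL_SIZE (10)
  timeout: 30,                       # new setting; falls back to DEFAULT_TIMEOUT (30) when omitted
  maximum_pages: 50                  # must be positive, otherwise validate_params raises
)

# Each worker sleeps RATE_LIMIT (0.25 s) between requests; set DEBUG=1 in the
# environment to switch the new logger to Logger::DEBUG.
downloader.download_files
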
Felipe 2024-12-03 00:23:47 +00:00
parent eaff48fbdb
commit 45fa2be573
2 changed files with 153 additions and 69 deletions

View File

@@ -1,3 +1,4 @@
 source "https://rubygems.org"
 gemspec
+gem 'concurrent-ruby'

View File

@@ -7,6 +7,8 @@ require 'fileutils'
 require 'cgi'
 require 'json'
 require 'time'
+require 'concurrent'
+require 'logger'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -16,12 +18,20 @@ class WaybackMachineDownloader
   include ArchiveAPI
   VERSION = "2.3.2"
+  DEFAULT_TIMEOUT = 30
+  MAX_RETRIES = 3
+  RETRY_DELAY = 2
+  RATE_LIMIT = 0.25 # Delay between requests in seconds
+  CONNECTION_POOL_SIZE = 10
+  HTTP_CACHE_SIZE = 1000
+  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count
+    :all, :maximum_pages, :threads_count, :logger
   def initialize params
+    validate_params(params)
     @base_url = params[:base_url]
     @exact_url = params[:exact_url]
     @directory = params[:directory]
@@ -32,7 +42,11 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
-    @threads_count = params[:threads_count].to_i
+    @threads_count = [params[:threads_count].to_i, 1].max # Ensure a minimum of 1 thread
+    @timeout = params[:timeout] || DEFAULT_TIMEOUT
+    @logger = setup_logger
+    @http_cache = Concurrent::Map.new
+    @failed_downloads = Concurrent::Array.new
   end
   def backup_name
@@ -82,28 +96,30 @@ class WaybackMachineDownloader
   end
   def get_all_snapshots_to_consider
-    http = Net::HTTP.new("web.archive.org", 443)
-    http.use_ssl = true
+    http = setup_http_client
     snapshot_list_to_consider = []
-    http.start do
-      puts "Getting snapshot pages"
+    begin
+      http.start do |connection|
+        puts "Getting snapshot pages"
         # Fetch the initial set of snapshots
-        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
+        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
         print "."
         # Fetch additional pages if the exact URL flag is not set
         unless @exact_url
           @maximum_pages.times do |page_index|
-            snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, http)
+            snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
             break if snapshot_list.empty?
             snapshot_list_to_consider += snapshot_list
            print "."
          end
        end
+      end
+    ensure
+      http.finish if http.started?
     end
     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
@@ -199,46 +215,49 @@ class WaybackMachineDownloader
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-    puts
     if file_list_by_timestamp.empty?
       puts "No files to download."
-      puts "Possible reasons:"
-      puts "\t* Site is not in Wayback Machine Archive."
-      puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
-      puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
-      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
-      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
-    puts "#{file_list_by_timestamp.count} files to download:"
-    threads = []
-    mutex = Mutex.new
+    total_files = file_list_by_timestamp.count
+    puts "#{total_files} files to download:"
     @processed_file_count = 0
-    @threads_count = 1 unless @threads_count != 0
-    @threads_count.times do
-      threads << Thread.new do
-        http = Net::HTTP.new("web.archive.org", 443)
-        http.use_ssl = true
+    @download_mutex = Mutex.new
+    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
+    pool = Concurrent::FixedThreadPool.new(thread_count)
+    semaphore = Concurrent::Semaphore.new(CONNECTION_POOL_SIZE)
+    file_list_by_timestamp.each do |file_remote_info|
+      pool.post do
+        semaphore.acquire
+        http = nil
        begin
-          until file_queue.empty?
-            file_remote_info = nil
-            mutex.synchronize { file_remote_info = file_queue.pop(true) rescue nil }
-            download_file(file_remote_info, http) if file_remote_info
+          http = setup_http_client
+          http.start do |connection|
+            result = download_file(file_remote_info, connection)
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              puts result if result
+            end
          end
        ensure
-          http.finish if http.started?
+          semaphore.release
+          http&.finish if http&.started?
+          sleep(RATE_LIMIT)
        end
      end
    end
-    threads.each(&:join)
+    pool.shutdown
+    pool.wait_for_termination
     end_time = Time.now
-    puts
-    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
+    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    cleanup
   end
   def structure_dir_path dir_path
@@ -288,38 +307,18 @@
     unless File.exist? file_path
       begin
         structure_dir_path dir_path
-        open(file_path, "wb") do |file|
-          begin
-            http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
-              file.write(body)
-            end
-          rescue OpenURI::HTTPError => e
-            puts "#{file_url} # #{e}"
-            if @all
-              file.write(e.io.read)
-              puts "#{file_path} saved anyway."
-            end
-          rescue StandardError => e
-            puts "#{file_url} # #{e}"
-          end
-        end
+        download_with_retry(file_path, file_url, file_timestamp, http)
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
      rescue StandardError => e
-        puts "#{file_url} # #{e}"
-      ensure
+        msg = "#{file_url} # #{e}"
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
-          puts "#{file_path} was empty and was removed."
+          msg += "\n#{file_path} was empty and was removed."
        end
-      end
-      semaphore.synchronize do
-        @processed_file_count += 1
-        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+        msg
      end
    else
-      semaphore.synchronize do
-        @processed_file_count += 1
-        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
-      end
+      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
    end
  end
@@ -334,4 +333,88 @@
   def semaphore
     @semaphore ||= Mutex.new
   end
+  private
+  def validate_params(params)
+    raise ArgumentError, "Base URL is required" unless params[:base_url]
+    raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
+    # threads_count validation removed, since it is now forced to be positive
+  end
+  def setup_logger
+    logger = Logger.new(STDOUT)
+    logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
+    logger.formatter = proc do |severity, datetime, progname, msg|
+      "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
+    end
+    logger
+  end
+  def setup_http_client
+    cached_client = @http_cache[Thread.current.object_id]
+    return cached_client if cached_client&.active?
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = @timeout
+    http.open_timeout = @timeout
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    @http_cache[Thread.current.object_id] = http
+    http
+  end
+  def download_with_retry(file_path, file_url, file_timestamp, connection)
+    retries = 0
+    begin
+      request = Net::HTTP::Get.new(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"))
+      request["Connection"] = "keep-alive"
+      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
+      response = connection.request(request)
+      case response
+      when Net::HTTPSuccess
+        File.open(file_path, "wb") do |file|
+          if block_given?
+            yield(response, file)
+          else
+            file.write(response.body)
+          end
+        end
+      when Net::HTTPTooManyRequests
+        sleep(RATE_LIMIT * 2)
+        raise "Rate limited, retrying..."
+      else
+        raise "HTTP Error: #{response.code} #{response.message}"
+      end
+    rescue StandardError => e
+      if retries < MAX_RETRIES
+        retries += 1
+        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        sleep(RETRY_DELAY * retries)
+        retry
+      else
+        @failed_downloads << {url: file_url, error: e.message}
+        raise e
+      end
+    end
+  end
+  def cleanup
+    @http_cache.each_value do |client|
+      client.finish if client&.started?
+    end
+    @http_cache.clear
+    if @failed_downloads.any?
+      @logger.error("Failed downloads summary:")
+      @failed_downloads.each do |failure|
+        @logger.error(" #{failure[:url]} - #{failure[:error]}")
+      end
+    end
+  end
 end