# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'time'
require 'concurrent-ruby'
require 'logger'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

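# Keeps one persistent HTTPS connection to web.archive.org per thread and
# recycles connections older than MAX_AGE seconds. A rough usage sketch
# (illustrative only, not part of the public API):
#
#   pool = ConnectionPool.new(10)
#   pool.with_connection do |http|
#     http.request(Net::HTTP::Get.new(URI("https://web.archive.org/")))
#   end
#   pool.shutdown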
class ConnectionPool
  MAX_AGE = 300
  CLEANUP_INTERVAL = 60
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3

  def initialize(size)
    @size = size
    @pool = Concurrent::Map.new
    @creation_times = Concurrent::Map.new
    @cleanup_thread = schedule_cleanup
  end

  def with_connection(&block)
    conn = acquire_connection
    begin
      yield conn
    ensure
      release_connection(conn)
    end
  end

  def shutdown
    @cleanup_thread&.exit
    @pool.each_value { |conn| conn.finish if conn&.started? }
    @pool.clear
    @creation_times.clear
  end

  private

  def acquire_connection
    thread_id = Thread.current.object_id
    conn = @pool[thread_id]

    if should_create_new?(conn)
      conn&.finish if conn&.started?
      conn = create_connection
      @pool[thread_id] = conn
      @creation_times[thread_id] = Time.now
    end

    conn
  end

  def release_connection(conn)
    return unless conn
    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
      conn.finish
      @pool.delete(Thread.current.object_id)
      @creation_times.delete(Thread.current.object_id)
    end
  end

  def should_create_new?(conn)
    return true if conn.nil?
    return true unless conn.started?
    return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
    false
  end

  def create_connection
    http = Net::HTTP.new("web.archive.org", 443)
    http.use_ssl = true
    http.read_timeout = DEFAULT_TIMEOUT
    http.open_timeout = DEFAULT_TIMEOUT
    http.keep_alive_timeout = 30
    http.max_retries = MAX_RETRIES
    http.start
    http
  end

  def schedule_cleanup
    Thread.new do
      loop do
        cleanup_old_connections
        sleep CLEANUP_INTERVAL
      end
    end
  end

  def cleanup_old_connections
    current_time = Time.now
    @creation_times.each do |thread_id, creation_time|
      if current_time - creation_time > MAX_AGE
        conn = @pool[thread_id]
        conn&.finish if conn&.started?
        @pool.delete(thread_id)
        @creation_times.delete(thread_id)
      end
    end
  end
end

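# Downloads every file the Wayback Machine holds for a site. A minimal usage
# sketch (illustrative; the option keys mirror the attr_accessor list below):
#
#   wmd = WaybackMachineDownloader.new(base_url: 'https://example.com',
#                                      threads_count: 4)
#   wmd.download_files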
class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.3.2"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
  RATE_LIMIT = 0.25 # Delay between requests in seconds
  CONNECTION_POOL_SIZE = 10
  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count, :logger

  def initialize params
    validate_params(params)
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = [params[:threads_count].to_i, 1].max
    @rewritten = params[:rewritten]
    @timeout = params[:timeout] || DEFAULT_TIMEOUT
    @logger = setup_logger
    @failed_downloads = Concurrent::Array.new
    @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

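  # Both filters accept either a plain substring or a regex-like string that
  # to_regex can parse (for example, only_filter: '/\.(gif|pdf)$/i'). The
  # example is illustrative, not an exhaustive description of to_regex.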
  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    snapshot_list_to_consider = []

    @connection_pool.with_connection do |connection|
      puts "Getting snapshot pages"

      # Fetch the initial set of snapshots
      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
      print "."

      # Fetch additional pages if the exact URL flag is not set
      unless @exact_url
        @maximum_pages.times do |page_index|
          snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
          break if snapshot_list.empty?

          snapshot_list_to_consider += snapshot_list
          print "."
        end
      end
    end

    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
    puts

    snapshot_list_to_consider
  end

  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end

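  # Returns an array of hashes, each roughly of the form (illustrative):
  #   { file_url: "http://example.com/page.html", timestamp: 20130101000000, file_id: "page.html" }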
  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k, v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output
    @orig_stdout = $stdout
    $stdout = $stderr
    files = get_file_list_by_timestamp
    $stdout = @orig_stdout
    puts "["
    files[0...-1].each do |file|
      puts file.to_json + ","
    end
    puts files[-1].to_json unless files.empty?
    puts "]"
  end

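  # Downloads run on a fixed thread pool capped at CONNECTION_POOL_SIZE so each
  # worker can hold its own pooled connection; RATE_LIMIT spaces out requests.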
  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."

    if file_list_by_timestamp.empty?
      puts "No files to download."
      return
    end

    total_files = file_list_by_timestamp.count
    puts "#{total_files} files to download:"

    @processed_file_count = 0
    @download_mutex = Mutex.new

    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
    pool = Concurrent::FixedThreadPool.new(thread_count)

    file_list_by_timestamp.each do |file_remote_info|
      pool.post do
        @connection_pool.with_connection do |connection|
          result = download_file(file_remote_info, connection)
          @download_mutex.synchronize do
            @processed_file_count += 1
            puts result if result
          end
        end
        sleep(RATE_LIMIT)
      end
    end

    pool.shutdown
    pool.wait_for_termination

    end_time = Time.now
    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
    cleanup
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file(file_remote_info, http)
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')

    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end

    if Gem.win_platform?
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) { |s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) { |s| '%' + s.ord.to_s(16) }
    end

    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        download_with_retry(file_path, file_url, file_timestamp, http)
        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
      rescue StandardError => e
        msg = "#{file_url} # #{e}"
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          msg += "\n#{file_path} was empty and was removed."
        end
        msg
      end
    else
      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  private

  def validate_params(params)
    raise ArgumentError, "Base URL is required" unless params[:base_url]
    raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
  end

  def setup_logger
    logger = Logger.new(STDOUT)
    logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
    logger.formatter = proc do |severity, datetime, progname, msg|
      "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
    end
    logger
  end

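  # Unless @rewritten is set, the snapshot URL uses the Wayback Machine's "id_"
  # modifier, which serves the originally archived bytes instead of the HTML
  # rewritten for web.archive.org playback.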
  def download_with_retry(file_path, file_url, file_timestamp, connection)
    retries = 0
    begin
      wayback_url = if @rewritten
        "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
      else
        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
      end

      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"

      response = connection.request(request)

      case response
      when Net::HTTPSuccess
        File.open(file_path, "wb") do |file|
          if block_given?
            yield(response, file)
          else
            file.write(response.body)
          end
        end
      when Net::HTTPTooManyRequests
        sleep(RATE_LIMIT * 2)
        raise "Rate limited, retrying..."
      else
        raise "HTTP Error: #{response.code} #{response.message}"
      end
    rescue StandardError => e
      if retries < MAX_RETRIES
        retries += 1
        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
        sleep(RETRY_DELAY * retries)
        retry
      else
        @failed_downloads << {url: file_url, error: e.message}
        raise e
      end
    end
  end

  def cleanup
    @connection_pool.shutdown

    if @failed_downloads.any?
      @logger.error("Failed downloads summary:")
      @failed_downloads.each do |failure|
        @logger.error(" #{failure[:url]} - #{failure[:error]}")
      end
    end
  end
end