This commit is contained in:
adampweb 2025-04-02 14:50:14 +02:00
commit f5572d6129

View File

@ -190,27 +190,74 @@ class WaybackMachineDownloader
end end
def get_all_snapshots_to_consider def get_all_snapshots_to_consider
snapshot_list_to_consider = [] snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
@connection_pool.with_connection do |connection|
puts "Getting snapshot pages" puts "Getting snapshot pages"
# Fetch the initial set of snapshots # Fetch the initial set of snapshots, sequentially
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection) @connection_pool.with_connection do |connection|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
mutex.synchronize do
snapshot_list_to_consider.concat(initial_list)
print "." print "."
end
end
# Fetch additional pages if the exact URL flag is not set # Fetch additional pages if the exact URL flag is not set
unless @exact_url unless @exact_url
@maximum_pages.times do |page_index| page_index = 0
snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection) batch_size = [@threads_count, 5].min
break if snapshot_list.empty? continue_fetching = true
snapshot_list_to_consider += snapshot_list while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
result = nil
@connection_pool.with_connection do |connection|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
end
[page, result]
end
end
results = []
futures.each do |future|
begin
results << future.value
rescue => e
puts "\nError fetching page #{future}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.empty?
continue_fetching = false
break
else
mutex.synchronize do
snapshot_list_to_consider.concat(result)
print "." print "."
end end
end end
end end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
puts " found #{snapshot_list_to_consider.length} snapshots to consider." puts " found #{snapshot_list_to_consider.length} snapshots to consider."
puts puts