Fetching API calls sequentially

Although the Wayback Machine (WM) API is particularly wonky and this change will not prevent all errors, it aligns better with what we have here.
This commit is contained in:
Felipe 2025-03-29 22:27:01 +00:00 committed by GitHub
parent c953d038e2
commit 0c701ee890
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -190,24 +190,71 @@ class WaybackMachineDownloader
end end
def get_all_snapshots_to_consider def get_all_snapshots_to_consider
snapshot_list_to_consider = [] snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
puts "Getting snapshot pages"
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection| @connection_pool.with_connection do |connection|
puts "Getting snapshot pages" initial_list = get_raw_list_from_api(@base_url, nil, connection)
mutex.synchronize do
snapshot_list_to_consider.concat(initial_list)
print "."
end
end
# Fetch the initial set of snapshots # Fetch additional pages if the exact URL flag is not set
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection) unless @exact_url
print "." page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
# Fetch additional pages if the exact URL flag is not set while continue_fetching && page_index < @maximum_pages
unless @exact_url # Determine the range of pages to fetch in this batch
@maximum_pages.times do |page_index| end_index = [page_index + batch_size, @maximum_pages].min
snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection) current_batch = (page_index...end_index).to_a
break if snapshot_list.empty?
snapshot_list_to_consider += snapshot_list # Create futures for concurrent API calls
print "." futures = current_batch.map do |page|
Concurrent::Future.execute do
result = nil
@connection_pool.with_connection do |connection|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
end
[page, result]
end
end end
results = []
futures.each do |future|
begin
results << future.value
rescue => e
puts "\nError fetching page #{future}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.empty?
continue_fetching = false
break
else
mutex.synchronize do
snapshot_list_to_consider.concat(result)
print "."
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end end
end end