mirror of https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00

Merge branch 'master' of https://github.com/adampweb/wayback-machine-downloader

Commit f5572d6129
@@ -190,27 +190,74 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
-    snapshot_list_to_consider = []
+    snapshot_list_to_consider = Concurrent::Array.new
+    mutex = Mutex.new
 
-    @connection_pool.with_connection do |connection|
-      puts "Getting snapshot pages"
+    puts "Getting snapshot pages"
 
-      # Fetch the initial set of snapshots
-      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
-      print "."
+    # Fetch the initial set of snapshots, sequentially
+    @connection_pool.with_connection do |connection|
+      initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      mutex.synchronize do
+        snapshot_list_to_consider.concat(initial_list)
+        print "."
+      end
+    end
 
-      # Fetch additional pages if the exact URL flag is not set
-      unless @exact_url
-        @maximum_pages.times do |page_index|
-          snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
-          break if snapshot_list.empty?
-          snapshot_list_to_consider += snapshot_list
-          print "."
-        end
-      end
-    end
+    # Fetch additional pages if the exact URL flag is not set
+    unless @exact_url
+      page_index = 0
+      batch_size = [@threads_count, 5].min
+      continue_fetching = true
+
+      while continue_fetching && page_index < @maximum_pages
+        # Determine the range of pages to fetch in this batch
+        end_index = [page_index + batch_size, @maximum_pages].min
+        current_batch = (page_index...end_index).to_a
+
+        # Create futures for concurrent API calls
+        futures = current_batch.map do |page|
+          Concurrent::Future.execute do
+            result = nil
+            @connection_pool.with_connection do |connection|
+              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
+            end
+            [page, result]
+          end
+        end
+
+        results = []
+
+        futures.each do |future|
+          begin
+            results << future.value
+          rescue => e
+            puts "\nError fetching page #{future}: #{e.message}"
+          end
+        end
+
+        # Sort results by page number to maintain order
+        results.sort_by! { |page, _| page }
+
+        # Process results and check for empty pages
+        results.each do |page, result|
+          if result.empty?
+            continue_fetching = false
+            break
+          else
+            mutex.synchronize do
+              snapshot_list_to_consider.concat(result)
+              print "."
+            end
+          end
+        end
+
+        page_index = end_index
+
+        sleep(RATE_LIMIT) if continue_fetching
+      end
+    end
 
     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
     puts
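
A note on the shared list, as an observation on the change rather than text from the commit: Concurrent::Array is documented as thread-safe for individual operations such as concat, and the extra Mutex groups each concat with its progress print "." so the two happen as one step per worker. A minimal standalone sketch of that pattern, with arbitrary thread and iteration counts:

require 'concurrent'

# Standalone sketch (not part of the commit): several threads appending to a
# shared Concurrent::Array. Each append is safe on its own; the Mutex is only
# needed so that the append and the progress dot happen together.
list  = Concurrent::Array.new
mutex = Mutex.new

threads = 4.times.map do |worker|
  Thread.new do
    25.times do
      mutex.synchronize do
        list << worker
        print "."
      end
    end
  end
end

threads.each(&:join)
puts "\n#{list.size} items appended"   # => 100 items appended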
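The paging loop replaces the old one-page-at-a-time @maximum_pages.times iteration with windows of at most five pages that are fetched concurrently and then rate-limited. A small standalone sketch of the windowing arithmetic, using made-up values in place of @threads_count and @maximum_pages:

# Standalone sketch; the values are arbitrary, not the downloader's defaults.
threads_count = 4
maximum_pages = 10
batch_size    = [threads_count, 5].min   # never more than 5 pages per batch

page_index = 0
while page_index < maximum_pages
  end_index     = [page_index + batch_size, maximum_pages].min
  current_batch = (page_index...end_index).to_a
  puts "fetch pages #{current_batch.inspect} concurrently, then sleep(RATE_LIMIT)"
  page_index = end_index
end
# Prints batches [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]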
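One behavioral detail worth flagging, about concurrent-ruby itself rather than anything stated in the commit: Concurrent::Future#value does not re-raise an exception from the worker block; it returns nil and records the error in #reason, so the begin/rescue around future.value above will not see a failed fetch. The raising variant is #value! (or check #rejected? and #reason). A minimal sketch of the difference:

require 'concurrent'

# Standalone sketch: a future whose block raises.
failing = Concurrent::Future.execute { raise "network error" }

p failing.value       # => nil, no exception is raised here
p failing.rejected?   # => true
p failing.reason      # => #<RuntimeError: network error>

begin
  failing.value!      # value! re-raises the stored exception
rescue => e
  puts "caught: #{e.message}"
end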