62 lines
2.2 KiB
Ruby
Raw Normal View History

require 'json'
require 'net/http'
require 'uri'
# Mixin with helpers for querying the Wayback Machine CDX search endpoint
# (https://web.archive.org/cdx/search/cdx).
#
# The methods read the following instance variables from the including
# object; every one of them may be unset (nil):
#   @exact_url      - when truthy, the URL is sent to the API unmodified
#   @max_retries    - number of retries after a timeout (falls back to 3)
#   @all            - when truthy, the "statuscode:200" filter is not sent
#   @from_timestamp - sent as "from" when set and not 0
#   @to_timestamp   - sent as "to" when set and not 0
module ArchiveAPI
  # Fetches one page of CDX results for +url+ and returns the parsed rows.
  #
  # Timeouts (Net::ReadTimeout / Net::OpenTimeout) are retried up to
  # @max_retries times (default 3), sleeping `delay * attempt` seconds between
  # attempts. Every other failure is reported with +warn+ and turned into an
  # empty result, so this method does not raise for request/parse errors.
  #
  # @param url [String, nil] URL (or bare domain) to look up
  # @param page_index [Object, nil] forwarded to #parameters_for_api
  # @param http [#get] client whose +get(uri)+ returns an object responding
  #   to +body+
  # @return [Array] the parsed JSON array with a leading
  #   ["timestamp", "original"] header row removed; [] when the body is
  #   empty, is not valid JSON, or the request ultimately fails
  def get_raw_list_from_api(url, page_index, http)
    # Automatically append /* if the URL doesn't contain a path after the domain
    # This is a workaround for an issue with the API and *some* domains.
    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
    # But don't do this when exact_url flag is set
    url = "#{url}/*" if url && !url.match?(%r{^https?://.*/}i) && !@exact_url

    request_url = URI("https://web.archive.org/cdx/search/cdx")
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)

    retries = 0
    max_retries = @max_retries || 3
    # Use the downloader's configured delay when that constant is loaded;
    # otherwise fall back to 2 seconds. An explicit defined? check avoids the
    # inline `rescue` modifier, which would hide unrelated errors.
    delay = if defined?(WaybackMachineDownloader::RETRY_DELAY)
              WaybackMachineDownloader::RETRY_DELAY
            else
              2
            end

    begin
      response = http.get(request_url)
      body = response.body.to_s.strip
      return [] if body.empty?

      json = JSON.parse(body)
      # Check if the response contains the header ["timestamp", "original"]
      json.shift if json.first == ["timestamp", "original"]
      json
    rescue JSON::ParserError => e
      warn "Failed to parse JSON from API for #{url}: #{e.message}"
      []
    rescue Net::ReadTimeout, Net::OpenTimeout => e
      if retries < max_retries
        retries += 1
        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
        # Linear backoff: the wait grows with each attempt.
        sleep(delay * retries)
        retry
      else
        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
        []
      end
    rescue StandardError => e
      # treat any other transient-ish error similarly, though without retries for now
      warn "Error fetching CDX data for #{url}: #{e.message}"
      []
    end
  end

  # Builds the query parameters shared by every CDX request.
  #
  # Always includes "fl", "collapse" and "gzip"; the remaining pairs depend on
  # the including object's @all, @from_timestamp and @to_timestamp.
  #
  # @param page_index [Object, nil] added as the "page" parameter when truthy
  # @return [Array<Array>] list of [name, value] pairs suitable for
  #   URI.encode_www_form
  def parameters_for_api(page_index)
    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
    parameters.push(["filter", "statuscode:200"]) unless @all
    # A timestamp of 0 is treated the same as "not set".
    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
    parameters.push(["page", page_index]) if page_index
    parameters
  end
end