2021-05-03 16:48:49 +08:00
|
|
|
require 'json'
|
2021-05-03 14:01:04 +08:00
|
|
|
require 'uri'
|
|
|
|
|
|
2016-09-17 13:37:13 -05:00
|
|
|
# Helpers for querying the web.archive.org CDX search endpoint.
#
# The methods read the following instance variables of the including object:
# @exact_url, @all, @from_timestamp and @to_timestamp.
module ArchiveAPI
  # Requests one page of results from the CDX search endpoint and returns the
  # parsed rows.
  #
  # @param url [String, nil] URL (or URL pattern) to look up
  # @param page_index [Integer, nil] page to request; omitted from the query when nil
  # @param http [#get] object whose #get(uri) returns something responding to #body
  # @return [Array] parsed JSON rows with the ["timestamp", "original"] header
  #   row removed; [] when the body is empty, is not valid JSON, or does not
  #   decode to an Array
  def get_raw_list_from_api(url, page_index, http)
    # Automatically append /* if the URL doesn't contain a path after the domain.
    # This is a workaround for an issue with the API and *some* domains.
    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
    # But don't do this when the exact_url flag is set.
    if url && !@exact_url
      # Drop an optional scheme first so that its "//" is not mistaken for a
      # path separator and scheme-less URLs ("example.com/page") are handled
      # the same way as "http://example.com/page".
      host_and_path = url.sub(%r{\Ahttps?://}i, '')
      # A URL that already ends in "*" is already a wildcard query; appending
      # another "/*" would produce e.g. "example.com/*/*".
      url = "#{url}/*" unless host_and_path.include?('/') || url.end_with?('*')
    end

    request_url = URI("https://web.archive.org/cdx/search/cdx")
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)

    response = http.get(request_url)
    body = response.body.to_s.strip
    return [] if body.empty?

    json = JSON.parse(body)
    # Valid JSON that is not a list of rows (null, a string, an object, ...)
    # cannot be consumed as a result list.
    unless json.is_a?(Array)
      warn "Failed to fetch data from API: unexpected response format"
      return []
    end

    # Drop the header row when the response starts with it.
    json.shift if json.first == ["timestamp", "original"]
    json
  rescue JSON::ParserError => e
    warn "Failed to fetch data from API: #{e.message}"
    []
  end

  # Builds the query parameters that are added to every CDX request.
  #
  # @param page_index [Integer, nil] page number; the "page" pair is added
  #   only when this is truthy (note that 0 is truthy in Ruby)
  # @return [Array<Array>] key/value pairs in the order they are appended
  def parameters_for_api(page_index)
    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
    # Restrict to HTTP 200 captures unless @all is set.
    parameters.push(["filter", "statuscode:200"]) unless @all
    # A timestamp of nil or 0 means "no bound".
    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
    parameters.push(["page", page_index]) if page_index
    parameters
  end
end
|