Construct the CDX API query using a URI object

This avoids problems related to URL encoding.

Obsoletes: https://github.com/hartator/wayback-machine-downloader/pull/116
This commit is contained in:
Paul Wise 2021-05-03 14:01:04 +08:00
parent e6707a9d8b
commit afab72c894
No known key found for this signature in database
GPG Key ID: 3116BA5E9FFA69A3

View File

@@ -1,28 +1,29 @@
require 'uri'
module ArchiveAPI module ArchiveAPI
def get_raw_list_from_api url, page_index def get_raw_list_from_api url, page_index
request_url = "https://web.archive.org/cdx/search/xd?url=" request_url = URI("https://web.archive.org/cdx/search/xd")
request_url += url params = [["url", url]]
request_url += parameters_for_api page_index params += parameters_for_api page_index
request_url.query = URI.encode_www_form(params)
URI.open(request_url).read URI.open(request_url).read
end end
def parameters_for_api page_index def parameters_for_api page_index
parameters = "&fl=timestamp,original&collapse=digest&gzip=false" parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
if @all if !@all
parameters += "" parameters.push(["filter", "statuscode:200"])
else
parameters += "&filter=statuscode:200"
end end
if @from_timestamp and @from_timestamp != 0 if @from_timestamp and @from_timestamp != 0
parameters += "&from=" + @from_timestamp.to_s parameters.push(["from", @from_timestamp.to_s])
end end
if @to_timestamp and @to_timestamp != 0 if @to_timestamp and @to_timestamp != 0
parameters += "&to=" + @to_timestamp.to_s parameters.push(["to", @to_timestamp.to_s])
end end
if page_index if page_index
parameters += "&page=#{page_index}" parameters.push(["page", page_index])
end end
parameters parameters
end end