From afab72c894984409090d1fb171e526cf4edfc9a6 Mon Sep 17 00:00:00 2001
From: Paul Wise
Date: Mon, 3 May 2021 14:01:04 +0800
Subject: [PATCH] Construct the cdx API query using a URI object

This avoids problems related to URL encoding.

Obsoletes: https://github.com/hartator/wayback-machine-downloader/pull/116
---
 lib/wayback_machine_downloader/archive_api.rb | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb
index 903f42b..ef8d3a0 100644
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -1,28 +1,29 @@
+require 'uri'
+
 module ArchiveAPI

   def get_raw_list_from_api url, page_index
-    request_url = "https://web.archive.org/cdx/search/xd?url="
-    request_url += url
-    request_url += parameters_for_api page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)

     URI.open(request_url).read
   end

   def parameters_for_api page_index
-    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
-    if @all
-      parameters += ""
-    else
-      parameters += "&filter=statuscode:200"
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters += "&from=" + @from_timestamp.to_s
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters += "&to=" + @to_timestamp.to_s
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters += "&page=#{page_index}"
+      parameters.push(["page", page_index])
     end
     parameters
   end