From 466228fee45ee4fe847986d28c9ca64eb1fc08c4 Mon Sep 17 00:00:00 2001 From: Felipe <41008398+StrawberryMaster@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:53:08 +0000 Subject: [PATCH] Refactoring the archive API --- lib/wayback_machine_downloader/archive_api.rb | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index 01b36d9..6861ac3 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -3,37 +3,30 @@ require 'uri' module ArchiveAPI - def get_raw_list_from_api url, page_index + def get_raw_list_from_api(url, page_index, http) request_url = URI("https://web.archive.org/cdx/search/xd") - params = [["output", "json"], ["url", url]] - params += parameters_for_api page_index + params = [["output", "json"], ["url", url]] + parameters_for_api(page_index) request_url.query = URI.encode_www_form(params) begin - json = JSON.parse(URI(request_url).open.read) - if (json[0] <=> ["timestamp","original"]) == 0 - json.shift - end + response = http.get(request_url) + json = JSON.parse(response.body) + + # Check if the response contains the header ["timestamp", "original"] + json.shift if json.first == ["timestamp", "original"] json - rescue JSON::ParserError + rescue JSON::ParserError, StandardError => e + warn "Failed to fetch data from API: #{e.message}" [] end end - def parameters_for_api page_index + def parameters_for_api(page_index) parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]] - if !@all - parameters.push(["filter", "statuscode:200"]) - end - if @from_timestamp and @from_timestamp != 0 - parameters.push(["from", @from_timestamp.to_s]) - end - if @to_timestamp and @to_timestamp != 0 - parameters.push(["to", @to_timestamp.to_s]) - end - if page_index - parameters.push(["page", page_index]) - end + parameters.push(["filter", "statuscode:200"]) unless @all + parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0 + parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0 + parameters.push(["page", page_index]) if page_index parameters end