From e6a5f5f999cbe622a7dad9545acfde49ba6b8a96 Mon Sep 17 00:00:00 2001 From: hartator Date: Sat, 30 Jul 2016 14:08:26 -0500 Subject: [PATCH] Pass parameters directly to Wayback Machine API --- lib/wayback_machine_downloader.rb | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index ae64b15..c88d5d8 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -57,18 +57,27 @@ class WaybackMachineDownloader def get_file_list_curated index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}" all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api = "&fl=timestamp,original&fastLatest=true&filter=statuscode:200&collapse=original" + if @from_timestamp and @from_timestamp != 0 + parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s + end + if @to_timestamp and @to_timestamp != 0 + parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s + end + index_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api) + all_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api) file_list_curated = Hash.new [index_file_list_raw, all_file_list_raw].each do |file| file.each_line do |line| line = line.split(' ') - file_timestamp = line[1].to_i - file_url = line[2] + file_timestamp = line[0].to_i + file_url = line[1] file_id = file_url.split('/')[3..-1].join('/') file_id = CGI::unescape file_id file_id = file_id.tidy_bytes unless file_id == "" if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" - elsif @timestamp == 0 or file_timestamp <= @timestamp + else if match_exclude_filter(file_url) puts "File url matches exclude filter, ignoring: #{file_url}" elsif not match_only_filter(file_url)