From 95eaa9171533c08889cc58450997df8621173dec Mon Sep 17 00:00:00 2001
From: hartator
Date: Sat, 17 Sep 2016 13:37:13 -0500
Subject: [PATCH] Refactor archive API calls to own module

---
 lib/wayback_machine_downloader.rb             | 17 ++++--------
 lib/wayback_machine_downloader/archive_api.rb | 26 +++++++++++++++++++
 wayback_machine_downloader.gemspec            |  2 +-
 3 files changed, 32 insertions(+), 13 deletions(-)
 create mode 100644 lib/wayback_machine_downloader/archive_api.rb

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 4540215..ffb4554 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -6,9 +6,12 @@ require 'cgi'
 require 'json'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
+require_relative 'wayback_machine_downloader/archive_api'
 
 class WaybackMachineDownloader
 
+  include ArchiveAPI
+
   VERSION = "0.5.4"
 
   attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
@@ -72,18 +75,8 @@ class WaybackMachineDownloader
   end
 
   def get_file_list_curated
-    parameters_for_wayback_machine_api = "&fl=timestamp,original&collapse=original"
-    unless @all
-      parameters_for_wayback_machine_api += "&filter=statuscode:200"
-    end
-    if @from_timestamp and @from_timestamp != 0
-      parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
-    end
-    if @to_timestamp and @to_timestamp != 0
-      parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
-    end
-    index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
-    all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
+    index_file_list_raw = get_raw_list_from_api(@base_url)
+    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
     [index_file_list_raw, all_file_list_raw].each do |file|
       file.each_line do |line|
diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb
new file mode 100644
index 0000000..8847863
--- /dev/null
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -0,0 +1,26 @@
+module ArchiveAPI
+
+  def get_raw_list_from_api url
+    request_url = "http://web.archive.org/cdx/search/xd?url="
+    request_url += url
+    request_url += parameters_for_api
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
+  end
+
+  def parameters_for_api
+    parameters = "&fl=timestamp,original&collapse=original"
+    unless @all
+      parameters += "&filter=statuscode:200"
+    end
+    if @from_timestamp and @from_timestamp != 0
+      parameters += "&from=" + @from_timestamp.to_s
+    end
+    if @to_timestamp and @to_timestamp != 0
+      parameters += "&to=" + @to_timestamp.to_s
+    end
+    parameters
+  end
+
+end
\ No newline at end of file
diff --git a/wayback_machine_downloader.gemspec b/wayback_machine_downloader.gemspec
index 3e69bcd..16cc571 100644
--- a/wayback_machine_downloader.gemspec
+++ b/wayback_machine_downloader.gemspec
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
   s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this."
   s.authors = ["hartator"]
   s.email = "hartator@gmail.com"
-  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb"]
+  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
   s.homepage = "https://github.com/hartator/wayback-machine-downloader"
   s.license = "MIT"
   s.required_ruby_version = '>= 1.9.2'