Refactor archive API calls into their own module

This commit is contained in:
hartator 2016-09-17 13:37:13 -05:00
parent 59b379b9c6
commit 95eaa91715
3 changed files with 32 additions and 13 deletions

View File

@ -6,9 +6,12 @@ require 'cgi'
require 'json' require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes' require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex' require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
class WaybackMachineDownloader class WaybackMachineDownloader
include ArchiveAPI
VERSION = "0.5.4" VERSION = "0.5.4"
attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
@ -72,18 +75,8 @@ class WaybackMachineDownloader
end end
def get_file_list_curated def get_file_list_curated
parameters_for_wayback_machine_api = "&fl=timestamp,original&collapse=original" index_file_list_raw = get_raw_list_from_api(@base_url)
unless @all all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
parameters_for_wayback_machine_api += "&filter=statuscode:200"
end
if @from_timestamp and @from_timestamp != 0
parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
end
if @to_timestamp and @to_timestamp != 0
parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
end
index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
file_list_curated = Hash.new file_list_curated = Hash.new
[index_file_list_raw, all_file_list_raw].each do |file| [index_file_list_raw, all_file_list_raw].each do |file|
file.each_line do |line| file.each_line do |line|

View File

@ -0,0 +1,26 @@
require 'net/http'
require 'uri'

# Mixin that queries the Wayback Machine CDX search endpoint
# (http://web.archive.org/cdx/search/xd).
#
# The query is shaped by three instance variables read from the including
# object: @all, @from_timestamp and @to_timestamp.
module ArchiveAPI
  # Fetches the raw CDX listing for +url+ and returns the response body.
  #
  # +url+ is form-encoded before being placed in the query string, so
  # characters such as "&", "?", "=" or spaces inside it cannot leak into
  # (or break) the CDX parameters. "*" is left as-is by the encoder, so
  # wildcard lookups like "example.com/*" keep their wildcard.
  #
  # The body is returned without inspecting the HTTP status.
  # NOTE(review): a non-2xx response body would be handed to the caller as
  # if it were a listing -- confirm whether callers should be shielded.
  def get_raw_list_from_api(url)
    encoded_url = URI.encode_www_form_component(url)
    request_uri = URI.parse(
      "http://web.archive.org/cdx/search/xd?url=#{encoded_url}#{parameters_for_api}"
    )
    Net::HTTP.get_response(request_uri).body
  end

  # Builds the query-string suffix (starting with "&") appended after the
  # "url=" parameter.
  #
  # Always requests the timestamp and original fields collapsed on original.
  # Adds a statuscode:200 filter unless @all is truthy, and adds from/to
  # bounds when the matching timestamp is set and is not 0.
  def parameters_for_api
    parameters = ["&fl=timestamp,original&collapse=original"]
    parameters << "&filter=statuscode:200" unless @all
    parameters << "&from=#{@from_timestamp}" if @from_timestamp && @from_timestamp != 0
    parameters << "&to=#{@to_timestamp}" if @to_timestamp && @to_timestamp != 0
    parameters.join
  end
end

View File

@ -8,7 +8,7 @@ Gem::Specification.new do |s|
s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this." s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this."
s.authors = ["hartator"] s.authors = ["hartator"]
s.email = "hartator@gmail.com" s.email = "hartator@gmail.com"
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb"] s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
s.homepage = "https://github.com/hartator/wayback-machine-downloader" s.homepage = "https://github.com/hartator/wayback-machine-downloader"
s.license = "MIT" s.license = "MIT"
s.required_ruby_version = '>= 1.9.2' s.required_ruby_version = '>= 1.9.2'