require 'net/http'
require 'uri'

# Helpers for querying the Wayback Machine CDX search endpoint.
#
# Integration (as introduced alongside this file):
# * lib/wayback_machine_downloader.rb loads it with
#   `require_relative 'wayback_machine_downloader/archive_api'` and mixes it
#   into WaybackMachineDownloader with `include ArchiveAPI`.
# * WaybackMachineDownloader#get_file_list_curated calls
#   `get_raw_list_from_api(@base_url)` and
#   `get_raw_list_from_api(@base_url + '/*')`, then iterates the returned
#   text with `each_line`.
# * wayback_machine_downloader.gemspec lists
#   "lib/wayback_machine_downloader/archive_api.rb" in `s.files`.
#
# The including object supplies the filter state through the instance
# variables @all, @from_timestamp and @to_timestamp.
module ArchiveAPI

  # Base address of the CDX search endpoint that every request targets.
  CDX_SEARCH_ENDPOINT = "http://web.archive.org/cdx/search/xd".freeze

  # Fetches the raw capture list for +url+ from the CDX search endpoint.
  #
  # @param url [String] site URL (or URL pattern such as "example.com/*")
  #   to look up.
  # @return [String] the response body exactly as returned by the server.
  # @raise [Net::HTTPExceptions] when the server answers with a non-2xx
  #   status (raised by Net::HTTPResponse#value).
  def get_raw_list_from_api(url)
    # The URL is user input placed inside a query string: without escaping,
    # characters such as "&", "?", "#" or spaces would either split the
    # query into unintended parameters or make URI.parse raise.
    escaped_url = URI.encode_www_form_component(url)
    request_uri = URI.parse("#{CDX_SEARCH_ENDPOINT}?url=#{escaped_url}#{parameters_for_api}")

    response = Net::HTTP.get_response(request_uri)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # caller, which would otherwise parse it as a list of captures.
    response.value
    response.body
  end

  # Builds the query-string suffix shared by all CDX requests.
  #
  # Always asks for the "timestamp" and "original" fields collapsed on
  # "original"; restricts results to status code 200 unless @all is set;
  # adds "from"/"to" bounds when the matching timestamp is set and non-zero.
  #
  # @return [String] parameters, each prefixed with "&", ready to append
  #   after the "url=" parameter.
  def parameters_for_api
    parameters = ["&fl=timestamp,original&collapse=original"]
    parameters << "&filter=statuscode:200" unless @all
    parameters << "&from=#{@from_timestamp}" if @from_timestamp && @from_timestamp != 0
    parameters << "&to=#{@to_timestamp}" if @to_timestamp && @to_timestamp != 0
    parameters.join
  end

end