mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
Refactor archive API calls to own module
This commit is contained in:
parent
59b379b9c6
commit
95eaa91715
@ -6,9 +6,12 @@ require 'cgi'
|
||||
require 'json'
|
||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||
require_relative 'wayback_machine_downloader/to_regex'
|
||||
require_relative 'wayback_machine_downloader/archive_api'
|
||||
|
||||
class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
|
||||
VERSION = "0.5.4"
|
||||
|
||||
attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
|
||||
@ -72,18 +75,8 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def get_file_list_curated
|
||||
parameters_for_wayback_machine_api = "&fl=timestamp,original&collapse=original"
|
||||
unless @all
|
||||
parameters_for_wayback_machine_api += "&filter=statuscode:200"
|
||||
end
|
||||
if @from_timestamp and @from_timestamp != 0
|
||||
parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
|
||||
end
|
||||
if @to_timestamp and @to_timestamp != 0
|
||||
parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
|
||||
end
|
||||
index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
|
||||
all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
|
||||
index_file_list_raw = get_raw_list_from_api(@base_url)
|
||||
all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
|
||||
file_list_curated = Hash.new
|
||||
[index_file_list_raw, all_file_list_raw].each do |file|
|
||||
file.each_line do |line|
|
||||
|
||||
26
lib/wayback_machine_downloader/archive_api.rb
Normal file
26
lib/wayback_machine_downloader/archive_api.rb
Normal file
@ -0,0 +1,26 @@
|
||||
require 'net/http'
require 'uri'

# Mixin that talks to the Wayback Machine CDX search endpoint.
#
# The including object may define the instance variables @all,
# @from_timestamp and @to_timestamp; they are read by #parameters_for_api
# to shape the query. Unset (nil) variables are treated as "not provided".
module ArchiveAPI

  # Fetches the raw listing for +url+ from the CDX search endpoint.
  #
  # url - the target URL (or URL pattern such as "example.com/*") to look up.
  #
  # The value is form-encoded before being placed in the query string, so
  # characters such as "&", "=", "?" or spaces inside the target URL can no
  # longer be mistaken for extra API parameters or make URI.parse raise.
  #
  # Returns the response body exactly as Net::HTTP delivers it; the HTTP
  # status is not inspected here.
  def get_raw_list_from_api(url)
    request_url = "http://web.archive.org/cdx/search/xd?url="
    request_url += URI.encode_www_form_component(url)
    request_url += parameters_for_api
    request_uri = URI.parse(request_url)
    response = Net::HTTP.get_response(request_uri)
    response.body
  end

  # Builds the query-string suffix (always starting with "&") shared by
  # every request made through #get_raw_list_from_api.
  #
  # - "&filter=statuscode:200" is appended unless @all is truthy.
  # - "&from=..." / "&to=..." are appended only when the matching timestamp
  #   is set and is not the integer 0.
  #
  # Returns the assembled String.
  def parameters_for_api
    parameters = "&fl=timestamp,original&collapse=original"
    parameters += "&filter=statuscode:200" unless @all
    if @from_timestamp && @from_timestamp != 0
      parameters += "&from=" + @from_timestamp.to_s
    end
    if @to_timestamp && @to_timestamp != 0
      parameters += "&to=" + @to_timestamp.to_s
    end
    parameters
  end

end
|
||||
@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
||||
s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this."
|
||||
s.authors = ["hartator"]
|
||||
s.email = "hartator@gmail.com"
|
||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb"]
|
||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
|
||||
s.homepage = "https://github.com/hartator/wayback-machine-downloader"
|
||||
s.license = "MIT"
|
||||
s.required_ruby_version = '>= 1.9.2'
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user