mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
Added a new option for regex acceptance, so you don't have to download the entire site when looking for a specific path.
This commit is contained in:
parent
ed7948a372
commit
432ca1d5b5
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
require_relative '../lib/wayback_machine_downloader'
|
require_relative '../lib/wayback_machine_downloader'
|
||||||
require 'optparse'
|
require 'optparse'
|
||||||
|
require 'pp'
|
||||||
|
|
||||||
options = {}
|
options = {}
|
||||||
option_parser = OptionParser.new do |opts|
|
option_parser = OptionParser.new do |opts|
|
||||||
@ -11,19 +12,26 @@ option_parser = OptionParser.new do |opts|
|
|||||||
opts.separator "Download any website from the Wayback Machine."
|
opts.separator "Download any website from the Wayback Machine."
|
||||||
|
|
||||||
opts.separator ""
|
opts.separator ""
|
||||||
opts.separator "Optional option:"
|
opts.separator "Optional options:"
|
||||||
|
|
||||||
opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
|
opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
|
||||||
options[:timestamp] = t
|
options[:timestamp] = t
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("--accept-regex [ACCEPT_REGEX]", String,"Specify a regular expression to download. If a path doesn't meet this regex, it won't get downloaded.") do |accept_regex|
|
||||||
|
options[:accept_regex] = accept_regex
|
||||||
|
end
|
||||||
|
|
||||||
opts.on("-v", "--version", "Display version") do |t|
|
opts.on("-v", "--version", "Display version") do |t|
|
||||||
options[:version] = t
|
options[:version] = t
|
||||||
end
|
end
|
||||||
end.parse!
|
end.parse!
|
||||||
|
|
||||||
if base_url = ARGV[0]
|
# this used to be 0. we want to look at the /last/ option.
|
||||||
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
|
#
|
||||||
|
# TODO: this argument needs to be handled better. argument handling is sorta messy.
|
||||||
|
if base_url = ARGV[-1]
|
||||||
|
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], accept_regex: options[:accept_regex]
|
||||||
wayback_machine_downloader.download_files
|
wayback_machine_downloader.download_files
|
||||||
elsif options[:version]
|
elsif options[:version]
|
||||||
puts WaybackMachineDownloader::VERSION
|
puts WaybackMachineDownloader::VERSION
|
||||||
|
|||||||
@ -13,6 +13,7 @@ class WaybackMachineDownloader
|
|||||||
def initialize params
|
def initialize params
|
||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]
|
||||||
@timestamp = params[:timestamp].to_i
|
@timestamp = params[:timestamp].to_i
|
||||||
|
@accept_regex = /#{params[:accept_regex]}/
|
||||||
end
|
end
|
||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
@ -48,7 +49,21 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
file_list_curated
|
|
||||||
|
# if accept_regex not defined, just return the file_list_curated
|
||||||
|
if @accept_regex.nil?
|
||||||
|
return file_list_curated
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# accept_regex defined. now we need to create a filtered list.
|
||||||
|
filtered_file_list_curated = Hash.new
|
||||||
|
file_list_curated.each do |file_id, fileinfo|
|
||||||
|
if fileinfo[:file_url].match @accept_regex
|
||||||
|
filtered_file_list_curated[file_id] = fileinfo
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return filtered_file_list_curated
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_file_list_by_timestamp
|
def get_file_list_by_timestamp
|
||||||
@ -64,6 +79,10 @@ class WaybackMachineDownloader
|
|||||||
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
||||||
puts
|
puts
|
||||||
file_list_by_timestamp = get_file_list_by_timestamp
|
file_list_by_timestamp = get_file_list_by_timestamp
|
||||||
|
if file_list_by_timestamp.count == 0
|
||||||
|
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
|
||||||
|
return
|
||||||
|
end
|
||||||
count = 0
|
count = 0
|
||||||
file_list_by_timestamp.each do |file_remote_info|
|
file_list_by_timestamp.each do |file_remote_info|
|
||||||
count += 1
|
count += 1
|
||||||
|
|||||||
@ -30,6 +30,16 @@ class WaybackMachineDownloaderTest < Minitest::Test
|
|||||||
assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-1]
|
assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-1]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_file_list_notthere_regex
|
||||||
|
regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'abc123'
|
||||||
|
assert_equal 0, regextester.get_file_list_curated.length
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_file_list_singleresult_regex
|
||||||
|
regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'menu.html$'
|
||||||
|
assert_equal 1, regextester.get_file_list_curated.length
|
||||||
|
end
|
||||||
|
|
||||||
def test_file_download
|
def test_file_download
|
||||||
@wayback_machine_downloader.download_files
|
@wayback_machine_downloader.download_files
|
||||||
linux_page = open 'websites/www.onlyfreegames.net/linux.htm'
|
linux_page = open 'websites/www.onlyfreegames.net/linux.htm'
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user