diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 64ace86..c4ef13f 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -2,6 +2,7 @@ require_relative '../lib/wayback_machine_downloader' require 'optparse' +require 'pp' options = {} option_parser = OptionParser.new do |opts| @@ -11,19 +12,26 @@ option_parser = OptionParser.new do |opts| opts.separator "Download any website from the Wayback Machine." opts.separator "" - opts.separator "Optional option:" + opts.separator "Optional options:" opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t| options[:timestamp] = t end + opts.on("--accept-regex [ACCEPT_REGEX]", String,"Specify a regular expression to download. If a path doesn't meet this regex, it won't get downloaded.") do |accept_regex| + options[:accept_regex] = accept_regex + end + opts.on("-v", "--version", "Display version") do |t| options[:version] = t end end.parse! -if base_url = ARGV[0] - wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp] +# this used to be 0. we want to look at the /last/ option. +# +# TODO: this argument needs to be handled better. argument handling is sorta messy. +if base_url = ARGV[-1] + wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], accept_regex: options[:accept_regex] wayback_machine_downloader.download_files elsif options[:version] puts WaybackMachineDownloader::VERSION diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index b33dfa5..a92b1af 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -13,6 +13,7 @@ class WaybackMachineDownloader def initialize params @base_url = params[:base_url] @timestamp = params[:timestamp].to_i + @accept_regex = /#{params[:accept_regex]}/ end def backup_name @@ -48,7 +49,21 @@ class WaybackMachineDownloader end end end - file_list_curated + + # if accept_regex not defined, just return the file_list_curated + if @accept_regex.nil? + return file_list_curated + end + + + # accept_regex defined. now we need to create a filtered list. + filtered_file_list_curated = Hash.new + file_list_curated.each do |file_id, fileinfo| + if fileinfo[:file_url].match @accept_regex + filtered_file_list_curated[file_id] = fileinfo + end + end + return filtered_file_list_curated end def get_file_list_by_timestamp @@ -64,6 +79,10 @@ class WaybackMachineDownloader puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..." puts file_list_by_timestamp = get_file_list_by_timestamp + if file_list_by_timestamp.count == 0 + puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine." + return + end count = 0 file_list_by_timestamp.each do |file_remote_info| count += 1 diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index 5299487..79479e2 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -30,6 +30,16 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-1] end + def test_file_list_notthere_regex + regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'abc123' + assert_equal 0, regextester.get_file_list_curated.length + end + + def test_file_list_singleresult_regex + regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'menu.html$' + assert_equal 1, regextester.get_file_list_curated.length + end + def test_file_download @wayback_machine_downloader.download_files linux_page = open 'websites/www.onlyfreegames.net/linux.htm'