Added a new option for regex acceptance, so you don't have to download the entire site when looking for a specific path.

2025-12-29 16:16:06 +00:00 · 2015-11-06 13:11:26 -05:00
parent ed7948a372
commit 432ca1d5b5
3 changed files with 41 additions and 4 deletions
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -2,6 +2,7 @@

 require_relative '../lib/wayback_machine_downloader'
 require 'optparse'
+require 'pp'

 options = {}
 option_parser = OptionParser.new do |opts|
@@ -11,19 +12,26 @@ option_parser = OptionParser.new do |opts|
  opts.separator "Download any website from the Wayback Machine."

  opts.separator ""
-  opts.separator "Optional option:"
+  opts.separator "Optional options:"

  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
    options[:timestamp] = t
  end

+  opts.on("--accept-regex [ACCEPT_REGEX]", String,"Specify a regular expression to download. If a path doesn't meet this regex, it won't get downloaded.") do |accept_regex|
+    options[:accept_regex] = accept_regex
+  end
+
  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
 end.parse!

-if base_url = ARGV[0]
-  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
+# this used to be 0. we want to look at the /last/ option.
+# 
+# TODO: this argument needs to be handled better. argument handling is sorta messy.
+if base_url = ARGV[-1]
+  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], accept_regex: options[:accept_regex]
  wayback_machine_downloader.download_files
 elsif options[:version]
  puts WaybackMachineDownloader::VERSION
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -13,6 +13,7 @@ class WaybackMachineDownloader
  def initialize params
    @base_url = params[:base_url]
    @timestamp = params[:timestamp].to_i
+    @accept_regex = /#{params[:accept_regex]}/
  end

  def backup_name
@@ -48,7 +49,21 @@ class WaybackMachineDownloader
        end
      end
    end
-    file_list_curated
+
+    # if accept_regex not defined, just return the file_list_curated
+    if @accept_regex.nil?
+      return file_list_curated
+    end
+
+
+    # accept_regex defined. now we need to create a filtered list.
+    filtered_file_list_curated = Hash.new
+    file_list_curated.each do |file_id, fileinfo|
+      if fileinfo[:file_url].match @accept_regex
+        filtered_file_list_curated[file_id] = fileinfo
+      end
+    end
+    return filtered_file_list_curated
  end

  def get_file_list_by_timestamp
@@ -64,6 +79,10 @@ class WaybackMachineDownloader
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
    puts
    file_list_by_timestamp = get_file_list_by_timestamp
+    if file_list_by_timestamp.count == 0
+      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
+      return
+    end
    count = 0
    file_list_by_timestamp.each do |file_remote_info|
      count += 1
--- a/test/test_wayback_machine_downloader.rb
+++ b/test/test_wayback_machine_downloader.rb
@@ -30,6 +30,16 @@ class WaybackMachineDownloaderTest < Minitest::Test
    assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-1]
  end

+  def test_file_list_notthere_regex
+    regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'abc123'
+    assert_equal 0, regextester.get_file_list_curated.length
+  end
+
+  def test_file_list_singleresult_regex
+    regextester = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net', accept_regex: 'menu.html$'
+    assert_equal 1, regextester.get_file_list_curated.length
+  end
+
  def test_file_download
    @wayback_machine_downloader.download_files
    linux_page = open 'websites/www.onlyfreegames.net/linux.htm'