From 1eaa8098c0498d7b42cd3d5b6967d5c5ff86ca0e Mon Sep 17 00:00:00 2001
From: tedder <ted@timmons.me>
Date: Tue, 28 Jun 2016 23:27:36 -0700
Subject: [PATCH 1/7] add --exclude filter

- add exclude filter. Has precedence over the --only filter.
- bumped the version.
- tests: I modified the tests, but the whole suite is hopelessly broken. And when I say hopeless, I'm talking about myself; my Ruby is so rudimentary I was looking up how to create a variable. It really needs a static list of the website 'contents' mocked into it. I have a branch in my repo showing my hopeless work.
---
 bin/wayback_machine_downloader    |  6 +++++-
 lib/wayback_machine_downloader.rb | 28 +++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 0077d70..379e024 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -22,13 +22,17 @@ option_parser = OptionParser.new do |opts|
     options[:only_filter] = t
   end
 
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
 end.parse!
 
 if (base_url = ARGV[-1])
-  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
+  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
   wayback_machine_downloader.download_files
 elsif options[:version]
   puts WaybackMachineDownloader::VERSION
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index edd72c7..ccd3e12 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -2,19 +2,21 @@
 
 require 'open-uri'
 require 'fileutils'
+require 'cgi'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 
 class WaybackMachineDownloader
 
-  VERSION = "0.2.4"
+  VERSION = "0.2.5"
 
-  attr_accessor :base_url, :timestamp, :only_filter
+  attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
 
   def initialize params
     @base_url = params[:base_url]
     @timestamp = params[:timestamp].to_i
     @only_filter = params[:only_filter]
+    @exclude_filter = params[:exclude_filter]
   end
 
   def backup_name
@@ -38,6 +40,19 @@ class WaybackMachineDownloader
     end
   end
 
+  def match_exclude_filter file_url
+    if @exclude_filter
+      exclude_filter_regex = @exclude_filter.to_regex
+      if exclude_filter_regex
+        exclude_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @exclude_filter.downcase
+      end
+    else
+      true
+    end
+  end
+
   def get_file_list_curated
     index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
     all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
@@ -48,12 +63,15 @@ class WaybackMachineDownloader
         file_timestamp = line[1].to_i
         file_url = line[2]
         file_id = file_url.split('/')[3..-1].join('/')
-        file_id = URI.unescape file_id
+        file_id = CGI::unescape file_id
         file_id = file_id.tidy_bytes unless file_id == ""
         if file_id.nil?
           puts "Malformed file url, ignoring: #{file_url}"
         elsif @timestamp == 0 or file_timestamp <= @timestamp
-          if not match_only_filter(file_url)
+          # match exclude first so it has precedence
+          if match_exclude_filter(file_url)
+            puts "File url matches exclude filter, ignoring: #{file_url}"
+          elsif not match_only_filter(file_url)
             puts "File url not in supplied only filter, ignoring: #{file_url}"
           elsif file_list_curated[file_id]
             unless file_list_curated[file_id][:timestamp] > file_timestamp
@@ -82,7 +100,7 @@ class WaybackMachineDownloader
     puts
     file_list_by_timestamp = get_file_list_by_timestamp
     if file_list_by_timestamp.count == 0
-      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
+      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@only_filter.to_s}\")\n\t* Site is not in wayback machine."
       return
     end
     count = 0

From 68cb0172b7bed1376202ce79ea1754ef45776658 Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 17:57:17 -0500
Subject: [PATCH 2/7] Make the wording consistent between filters

---
 bin/wayback_machine_downloader | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 379e024..ee4ae11 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -18,7 +18,7 @@ option_parser = OptionParser.new do |opts|
     options[:timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 

From 6f138d014f65ba5b8343ea8d9b7b3cfca2a36a3a Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 17:58:12 -0500
Subject: [PATCH 3/7] Bump minor version (new feature)

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index ccd3e12..6a8f496 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -8,7 +8,7 @@ require_relative 'wayback_machine_downloader/to_regex'
 
 class WaybackMachineDownloader
 
-  VERSION = "0.2.5"
+  VERSION = "0.3.0"
 
   attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
 

From 6fb6af6547388d4f9e4b7e7a12fb54f7b0e98e67 Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 17:58:54 -0500
Subject: [PATCH 4/7] Fix behavior when no exclude filter is supplied

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 6a8f496..c136758 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -49,7 +49,7 @@ class WaybackMachineDownloader
         file_url.downcase.include? @exclude_filter.downcase
       end
     else
-      true
+      false
     end
   end
 

From a3ebcec9375734d73b1b83e71f0f042a58a0f2be Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 17:59:59 -0500
Subject: [PATCH 5/7] Improve user output

---
 lib/wayback_machine_downloader.rb | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index c136758..c4018c7 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -68,11 +68,10 @@ class WaybackMachineDownloader
         if file_id.nil?
           puts "Malformed file url, ignoring: #{file_url}"
         elsif @timestamp == 0 or file_timestamp <= @timestamp
-          # match exclude first so it has precedence
           if match_exclude_filter(file_url)
             puts "File url matches exclude filter, ignoring: #{file_url}"
           elsif not match_only_filter(file_url)
-            puts "File url not in supplied only filter, ignoring: #{file_url}"
+            puts "File url doesn't match only filter, ignoring: #{file_url}"
           elsif file_list_curated[file_id]
             unless file_list_curated[file_id][:timestamp] > file_timestamp
               file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
@@ -100,7 +99,11 @@ class WaybackMachineDownloader
     puts
     file_list_by_timestamp = get_file_list_by_timestamp
     if file_list_by_timestamp.count == 0
-      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@only_filter.to_s}\")\n\t* Site is not in wayback machine."
+      puts "No files to download."
+      puts "Possible reasons:"
+      puts "\t* Site is not in Wayback Machine Archive."
+      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
+      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
     count = 0

From 7eba273c6f165ab40019d4a0203cc87694955a05 Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 18:00:18 -0500
Subject: [PATCH 6/7] Add tests for exclude filter

---
 test/test_wayback_machine_downloader.rb | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb
index 5640caf..8066945 100644
--- a/test/test_wayback_machine_downloader.rb
+++ b/test/test_wayback_machine_downloader.rb
@@ -44,6 +44,21 @@ class WaybackMachineDownloaderTest < Minitest::Test
     assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
   end
 
+  def test_file_list_exclude_filter_without_matches
+    @wayback_machine_downloader.exclude_filter = 'abc123'
+    assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
+  end
+
+  def test_file_list_exclude_filter_with_1_match
+    @wayback_machine_downloader.exclude_filter = 'menu.html'
+    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
+  end
+
+  def test_file_list_exclude_filter_with_a_regex
+    @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
+    assert_equal 32, @wayback_machine_downloader.get_file_list_curated.size
+  end
+
   def test_file_download
     @wayback_machine_downloader.download_files
     linux_page = open 'websites/www.onlyfreegames.net/linux.htm'

From 9f79ca107af5a97127a3ffa9605ce63604633e49 Mon Sep 17 00:00:00 2001
From: hartator <hartator@gmail.com>
Date: Thu, 28 Jul 2016 18:09:14 -0500
Subject: [PATCH 7/7] Add exclude filter to Readme

---
 README.md | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 7841a5f..3aac9ce 100644
--- a/README.md
+++ b/README.md
@@ -21,16 +21,16 @@ Run wayback_machine_downloader with the base url of the website you want to retr
 
 It will download the last version of every file present on Wayback Machine to `./websites/example.com/`. It will also re-create a directory structure and auto-create `index.html` pages to work seamlessly with Apache and Nginx. All files downloaded are the original ones and not Wayback Machine rewritten versions. This way, URLs and links structure are the same than before.
 
-## Optional Timestamp
+## On or Before Timestamp
 
-You may want to supply a specific timestamp to lock your backup to an older version of the website, which can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20060716231334/http://example.com).
+Optional. You may want to supply a specific timestamp to lock your backup to an older version of the website, which can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/*20060716231334*/http://example.com).
 Wayback Machine Downloader will then fetch only file versions on or prior to the timestamp specified:
 
     wayback_machine_downloader http://example.com --timestamp 20060716231334
 
-## Optional Only URL Filter
+## Only URL Filter
 
-You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
+Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
 
 For example, if you only want to download files inside a specific `my_directory`:
 
@@ -40,6 +40,19 @@ Or if you want to download every images without anything else:
     
     wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"
 
+## Exclude URL Filter
+
+Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
+
+For example, if you want to avoid downloading files inside `my_directory`:
+
+    wayback_machine_downloader http://example.com --exclude my_directory
+    
+Or if you want to download everything except images:
+    
+    wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"
+
+
 ## Contributing
 
 Contributions are welcome! Just submit a pull request via GitHub.