Merge exclude filter branch to master

This commit is contained in:
hartator 2016-07-30 10:19:54 -05:00
commit f21f14c676
4 changed files with 66 additions and 14 deletions

View File

@ -21,16 +21,16 @@ Run wayback_machine_downloader with the base url of the website you want to retr
It will download the last version of every file present on Wayback Machine to `./websites/example.com/`. It will also re-create a directory structure and auto-create `index.html` pages to work seamlessly with Apache and Nginx. All files downloaded are the original ones and not Wayback Machine rewritten versions. This way, URLs and links structure are the same as before.
## Optional Timestamp
## On or Before Timestamp
You may want to supply a specific timestamp to lock your backup to an older version of the website, which can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20060716231334/http://example.com).
Optional. You may want to supply a specific timestamp to lock your backup to an older version of the website, which can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/*20060716231334*/http://example.com).
Wayback Machine Downloader will then fetch only file versions on or prior to the timestamp specified:
wayback_machine_downloader http://example.com --timestamp 20060716231334
## Optional Only URL Filter
## Only URL Filter
You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
For example, if you only want to download files inside a specific `my_directory`:
@ -40,15 +40,27 @@ Or if you want to download every images without anything else:
wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"
## Exclude URL Filter
Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
For example, if you want to avoid downloading files inside `my_directory`:
wayback_machine_downloader http://example.com --exclude my_directory
Or if you want to download everything except images:
wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"
## Using the Docker image
As an alternative installation method, we have a Docker image! Retrieve the wayback-machine-downloader Docker image this way:
docker pull hartator/wayback-machine-downloader
docker pull hartator/wayback-machine-downloader
Then, you should be able to use the Docker image to download websites. For example:
docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com
docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com
## Contributing

View File

@ -18,17 +18,21 @@ option_parser = OptionParser.new do |opts|
options[:timestamp] = t
end
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
options[:only_filter] = t
end
opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
options[:exclude_filter] = t
end
opts.on("-v", "--version", "Display version") do |t|
options[:version] = t
end
end.parse!
if (base_url = ARGV[-1])
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
wayback_machine_downloader.download_files
elsif options[:version]
puts WaybackMachineDownloader::VERSION

View File

@ -2,19 +2,21 @@
require 'open-uri'
require 'fileutils'
require 'cgi'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
class WaybackMachineDownloader
VERSION = "0.2.4"
VERSION = "0.3.0"
attr_accessor :base_url, :timestamp, :only_filter
attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
def initialize params
@base_url = params[:base_url]
@timestamp = params[:timestamp].to_i
@only_filter = params[:only_filter]
@exclude_filter = params[:exclude_filter]
end
def backup_name
@ -38,6 +40,19 @@ class WaybackMachineDownloader
end
end
# Returns truthy when file_url should be skipped because it matches the
# --exclude option. If the filter uses '/regex/' notation it is matched as a
# regex; otherwise a case-insensitive substring comparison is used.
# Returns false when no exclude filter was supplied.
def match_exclude_filter file_url
  return false unless @exclude_filter
  filter_as_regex = @exclude_filter.to_regex
  if filter_as_regex
    filter_as_regex =~ file_url
  else
    file_url.downcase.include? @exclude_filter.downcase
  end
end
def get_file_list_curated
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
@ -48,13 +63,15 @@ class WaybackMachineDownloader
file_timestamp = line[1].to_i
file_url = line[2]
file_id = file_url.split('/')[3..-1].join('/')
file_id = URI.unescape file_id
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
elsif @timestamp == 0 or file_timestamp <= @timestamp
if not match_only_filter(file_url)
puts "File url not in supplied only filter, ignoring: #{file_url}"
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
@ -82,7 +99,11 @@ class WaybackMachineDownloader
puts
file_list_by_timestamp = get_file_list_by_timestamp
if file_list_by_timestamp.count == 0
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
puts "No files to download."
puts "Possible reasons:"
puts "\t* Site is not in Wayback Machine Archive."
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
return
end
count = 0

View File

@ -44,6 +44,21 @@ class WaybackMachineDownloaderTest < Minitest::Test
assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
end
# An exclude filter that matches no archived url should leave the curated
# file list untouched (69 files in the www.onlyfreegames.net fixture —
# presumably the full snapshot count; verify against the live CDX data).
def test_file_list_exclude_filter_without_matches
@wayback_machine_downloader.exclude_filter = 'abc123'
assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
end
# A substring exclude filter matching exactly one url ('menu.html') should
# drop one file from the curated list (69 - 1 = 68 per the fixture above).
def test_file_list_exclude_filter_with_1_match
@wayback_machine_downloader.exclude_filter = 'menu.html'
assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
end
# An exclude filter in '/regex/' notation is applied as a case-insensitive
# regex: every gif/jpg/jepg/bmp url is skipped, leaving 32 files.
# NOTE(review): 'je?pg' matches 'jpg'/'jepg' but NOT 'jpeg' — confirm the
# fixture contains no .jpeg files, otherwise the pattern is likely a typo.
def test_file_list_exclude_filter_with_a_regex
@wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
assert_equal 32, @wayback_machine_downloader.get_file_list_curated.size
end
def test_file_download
@wayback_machine_downloader.download_files
linux_page = open 'websites/www.onlyfreegames.net/linux.htm'