From 1eaa8098c0498d7b42cd3d5b6967d5c5ff86ca0e Mon Sep 17 00:00:00 2001
From: tedder
Date: Tue, 28 Jun 2016 23:27:36 -0700
Subject: [PATCH] add --exclude filter

- add exclude filter. Has precedence over the --only filter.
- bumped the version.
- tests: I modified the tests, but.. the whole suite is hopelessly broken. And
  when I say hopeless, I'm talking about myself; my Ruby is so rudimentary I
  was looking up how to create a variable. It really needs a static list of
  the website 'contents' mocked into it. I have a branch in my repo showing my
  hopeless work.
---
 bin/wayback_machine_downloader    |  6 +++++-
 lib/wayback_machine_downloader.rb | 31 ++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 0077d70..379e024 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -22,13 +22,17 @@ option_parser = OptionParser.new do |opts|
     options[:only_filter] = t
   end
 
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
 end.parse!
 
 if (base_url = ARGV[-1])
-  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
+  wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
   wayback_machine_downloader.download_files
 elsif options[:version]
   puts WaybackMachineDownloader::VERSION
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index edd72c7..ccd3e12 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -2,19 +2,21 @@
 
 require 'open-uri'
 require 'fileutils'
+require 'cgi'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 
 class WaybackMachineDownloader
 
-  VERSION = "0.2.4"
+  VERSION = "0.2.5"
 
-  attr_accessor :base_url, :timestamp, :only_filter
+  attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
 
   def initialize params
     @base_url = params[:base_url]
     @timestamp = params[:timestamp].to_i
     @only_filter = params[:only_filter]
+    @exclude_filter = params[:exclude_filter]
   end
 
   def backup_name
@@ -38,6 +40,22 @@ class WaybackMachineDownloader
     end
   end
 
+  # Tells whether file_url matches the exclude filter and must be skipped.
+  # Uses a regex match when to_regex yields one, otherwise a case-insensitive
+  # substring test; with no exclude filter set, nothing is excluded.
+  def match_exclude_filter file_url
+    if @exclude_filter
+      exclude_filter_regex = @exclude_filter.to_regex
+      if exclude_filter_regex
+        exclude_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @exclude_filter.downcase
+      end
+    else
+      false
+    end
+  end
+
   def get_file_list_curated
     index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
     all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
@@ -48,12 +66,15 @@ class WaybackMachineDownloader
         file_timestamp = line[1].to_i
         file_url = line[2]
         file_id = file_url.split('/')[3..-1].join('/')
-        file_id = URI.unescape file_id
+        file_id = CGI::unescape file_id
         file_id = file_id.tidy_bytes unless file_id == ""
         if file_id.nil?
           puts "Malformed file url, ignoring: #{file_url}"
         elsif @timestamp == 0 or file_timestamp <= @timestamp
-          if not match_only_filter(file_url)
+          # match exclude first so it has precedence
+          if match_exclude_filter(file_url)
+            puts "File url matches exclude filter, ignoring: #{file_url}"
+          elsif not match_only_filter(file_url)
             puts "File url not in supplied only filter, ignoring: #{file_url}"
           elsif file_list_curated[file_id]
             unless file_list_curated[file_id][:timestamp] > file_timestamp
@@ -82,7 +103,7 @@ class WaybackMachineDownloader
     puts
     file_list_by_timestamp = get_file_list_by_timestamp
     if file_list_by_timestamp.count == 0
-      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
+      puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@only_filter.to_s}\")\n\t* Site is not in wayback machine."
       return
     end
     count = 0