mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00
add --exclude filter
- Add an exclude filter; it takes precedence over the --only filter. - Bumped the version. - Tests: I modified the tests, but the whole suite is hopelessly broken. And when I say hopeless, I'm talking about myself; my Ruby is so rudimentary that I was looking up how to create a variable. The suite really needs a static list of the website 'contents' mocked into it. I have a branch in my repo showing my hopeless work.
This commit is contained in:
parent
7000035a30
commit
1eaa8098c0
@ -22,13 +22,17 @@ option_parser = OptionParser.new do |opts|
|
|||||||
options[:only_filter] = t
|
options[:only_filter] = t
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
|
||||||
|
options[:exclude_filter] = t
|
||||||
|
end
|
||||||
|
|
||||||
opts.on("-v", "--version", "Display version") do |t|
|
opts.on("-v", "--version", "Display version") do |t|
|
||||||
options[:version] = t
|
options[:version] = t
|
||||||
end
|
end
|
||||||
end.parse!
|
end.parse!
|
||||||
|
|
||||||
if (base_url = ARGV[-1])
|
if (base_url = ARGV[-1])
|
||||||
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
|
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
|
||||||
wayback_machine_downloader.download_files
|
wayback_machine_downloader.download_files
|
||||||
elsif options[:version]
|
elsif options[:version]
|
||||||
puts WaybackMachineDownloader::VERSION
|
puts WaybackMachineDownloader::VERSION
|
||||||
|
|||||||
@ -2,19 +2,21 @@
|
|||||||
|
|
||||||
require 'open-uri'
|
require 'open-uri'
|
||||||
require 'fileutils'
|
require 'fileutils'
|
||||||
|
require 'cgi'
|
||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
require_relative 'wayback_machine_downloader/to_regex'
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
|
|
||||||
class WaybackMachineDownloader
|
class WaybackMachineDownloader
|
||||||
|
|
||||||
VERSION = "0.2.4"
|
VERSION = "0.2.5"
|
||||||
|
|
||||||
attr_accessor :base_url, :timestamp, :only_filter
|
attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
|
||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]
|
||||||
@timestamp = params[:timestamp].to_i
|
@timestamp = params[:timestamp].to_i
|
||||||
@only_filter = params[:only_filter]
|
@only_filter = params[:only_filter]
|
||||||
|
@exclude_filter = params[:exclude_filter]
|
||||||
end
|
end
|
||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
@ -38,6 +40,19 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Tells whether +file_url+ matches the user-supplied exclude filter and
# should therefore be skipped.
#
# file_url - the URL string to test.
#
# If @exclude_filter yields a regex through String#to_regex (a helper defined
# outside this method; presumably it returns nil for non-/regex/ strings),
# the URL is matched against that regex. Otherwise a case-insensitive
# substring check is performed.
#
# Returns a truthy value when the URL matches the filter, falsy otherwise.
def match_exclude_filter file_url
  # No filter configured: nothing is excluded. Answering true here would make
  # the caller treat every URL as excluded and download nothing.
  return false unless @exclude_filter

  exclude_filter_regex = @exclude_filter.to_regex
  if exclude_filter_regex
    exclude_filter_regex =~ file_url
  else
    file_url.downcase.include? @exclude_filter.downcase
  end
end
|
||||||
|
|
||||||
def get_file_list_curated
|
def get_file_list_curated
|
||||||
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
||||||
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
||||||
@ -48,12 +63,15 @@ class WaybackMachineDownloader
|
|||||||
file_timestamp = line[1].to_i
|
file_timestamp = line[1].to_i
|
||||||
file_url = line[2]
|
file_url = line[2]
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
file_id = file_url.split('/')[3..-1].join('/')
|
||||||
file_id = URI.unescape file_id
|
file_id = CGI::unescape file_id
|
||||||
file_id = file_id.tidy_bytes unless file_id == ""
|
file_id = file_id.tidy_bytes unless file_id == ""
|
||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
||||||
if not match_only_filter(file_url)
|
# match exclude first so it has precedence
|
||||||
|
if match_exclude_filter(file_url)
|
||||||
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||||
|
elsif not match_only_filter(file_url)
|
||||||
puts "File url not in supplied only filter, ignoring: #{file_url}"
|
puts "File url not in supplied only filter, ignoring: #{file_url}"
|
||||||
elsif file_list_curated[file_id]
|
elsif file_list_curated[file_id]
|
||||||
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
||||||
@ -82,7 +100,7 @@ class WaybackMachineDownloader
|
|||||||
puts
|
puts
|
||||||
file_list_by_timestamp = get_file_list_by_timestamp
|
file_list_by_timestamp = get_file_list_by_timestamp
|
||||||
if file_list_by_timestamp.count == 0
|
if file_list_by_timestamp.count == 0
|
||||||
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
|
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@only_filter.to_s}\")\n\t* Site is not in wayback machine."
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
count = 0
|
count = 0
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user