diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 7931250..0815950 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts| options[:to_timestamp] = t end + opts.on("-e", "--exact_url", "Download only the url provided and not the full site") do |t| + options[:exact_url] = t + end + opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t| options[:only_filter] = t end @@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts| options[:threads_count] = t end - opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t| + opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t| options[:maximum_pages] = t end - opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t| + opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t| options[:list] = true end @@ -58,7 +62,7 @@ end.parse! 
if (base_url = ARGV[-1]) options[:base_url] = base_url wayback_machine_downloader = WaybackMachineDownloader.new options - if wayback_machine_downloader.list + if options[:list] wayback_machine_downloader.list_files else wayback_machine_downloader.download_files diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 91376b5..3fbf33a 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,17 +16,19 @@ class WaybackMachineDownloader VERSION = "2.0.0" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + attr_accessor :base_url, :exact_url, :directory, + :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, + :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] + @exact_url = params[:exact_url] @directory = params[:directory] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] - @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i end @@ -78,18 +80,19 @@ class WaybackMachineDownloader end def get_all_snapshots_to_consider - # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index + # Note: Passing a page index parameter allows us to get more snapshots, + # but from a less fresh index print "Getting snapshot pages" snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." - @maximum_pages.times do |page_index| - snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) - break if snapshot_list.empty? 
- snapshot_list_to_consider += snapshot_list - print "." + unless @exact_url + @maximum_pages.times do |page_index| + snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) + break if snapshot_list.empty? + snapshot_list_to_consider += snapshot_list + print "." + end end puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider." puts @@ -134,8 +137,10 @@ class WaybackMachineDownloader end def list_files + # retrieval produces its own output + files = get_file_list_by_timestamp puts "[" - get_file_list_by_timestamp.each do |file| + files.each do |file| puts file.to_json + "," end puts "]" @@ -179,7 +184,7 @@ class WaybackMachineDownloader def structure_dir_path dir_path begin - FileUtils::mkdir_p dir_path unless File.exists? dir_path + FileUtils::mkdir_p dir_path unless File.exist? dir_path rescue Errno::EEXIST => e error_to_string = e.to_s puts "# #{error_to_string}" @@ -219,7 +224,7 @@ class WaybackMachineDownloader if Gem.win_platform? file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } end - unless File.exists? file_path + unless File.exist? file_path begin structure_dir_path dir_path open(file_path, "wb") do |file| @@ -240,7 +245,7 @@ class WaybackMachineDownloader rescue StandardError => e puts "#{file_url} # #{e}" ensure - if not @all and File.exists?(file_path) and File.size(file_path) == 0 + if not @all and File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) puts "#{file_path} was empty and was removed." 
end diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index d3e82b7..f87c6f6 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -1,15 +1,15 @@ module ArchiveAPI - def get_raw_list_from_api url, page_index - request_url = "http://web.archive.org/cdx/search/xd?url=" - request_url += url - request_url += parameters_for_api page_index + def get_raw_list_from_api url, page_index + request_url = "http://web.archive.org/cdx/search/xd?url=" + request_url += url + request_url += parameters_for_api page_index open(request_url).read - end + end - def parameters_for_api page_index - parameters = "&fl=timestamp,original&collapse=digest&gzip=false" + def parameters_for_api page_index + parameters = "&fl=timestamp,original&collapse=digest&gzip=false" if @all parameters += "" else diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb index f584103..ba000b4 100644 --- a/lib/wayback_machine_downloader/tidy_bytes.rb +++ b/lib/wayback_machine_downloader/tidy_bytes.rb @@ -60,7 +60,7 @@ module TibyBytes bytes.each_index do |i| byte = bytes[i] - is_ascii = byte < 128 + _is_ascii = byte < 128 is_cont = byte > 127 && byte < 192 is_lead = byte > 191 && byte < 245 is_unused = byte > 240 @@ -78,7 +78,7 @@ module TibyBytes # the leading byte. begin (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - rescue NoMethodError => e + rescue NoMethodError next end conts_expected = 0 @@ -98,7 +98,7 @@ module TibyBytes end begin bytes.empty? ? 
nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - rescue ArgumentError => e + rescue ArgumentError nil end end diff --git a/lib/wayback_machine_downloader/to_regex.rb b/lib/wayback_machine_downloader/to_regex.rb index a059ccf..d9f0a67 100644 --- a/lib/wayback_machine_downloader/to_regex.rb +++ b/lib/wayback_machine_downloader/to_regex.rb @@ -25,7 +25,7 @@ module ToRegex # @option options [true,false] :lang /foo/[nesu] def to_regex(options = {}) if args = as_regexp(options) - ::Regexp.new *args + ::Regexp.new(*args) end end diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index cd5b822..a250c3c 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -4,7 +4,8 @@ require 'wayback_machine_downloader' class WaybackMachineDownloaderTest < Minitest::Test def setup - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'http://www.onlyfreegames.net') $stdout = StringIO.new end @@ -38,6 +39,16 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2] end + def test_without_exact_url + @wayback_machine_downloader.exact_url = false + assert @wayback_machine_downloader.get_file_list_curated.size > 1 + end + + def test_exact_url + @wayback_machine_downloader.exact_url = true + assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size + end + def test_file_list_only_filter_without_matches @wayback_machine_downloader.only_filter = 'abc123' assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size @@ -85,20 +96,22 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"] end - def test_file_list_exclude_filter_with_a_regex + def test_all_get_file_list_curated_size 
@wayback_machine_downloader.all = true assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size end # Testing encoding conflicts needs a different base_url def test_nonascii_suburls_download - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # Once just for the downloading... @wayback_machine_downloader.download_files end def test_nonascii_suburls_already_present - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # ... twice to test the "is already present" case @wayback_machine_downloader.download_files @wayback_machine_downloader.download_files