diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 7931250..b778049 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,7 +58,7 @@ end.parse! if (base_url = ARGV[-1]) options[:base_url] = base_url wayback_machine_downloader = WaybackMachineDownloader.new options - if wayback_machine_downloader.list + if options[:list] wayback_machine_downloader.list_files else wayback_machine_downloader.download_files diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 91376b5..e2c4b11 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,17 +16,19 @@ class WaybackMachineDownloader VERSION = "2.0.0" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + attr_accessor :base_url, :exact_match, :directory, + :from_timestamp, :to_timestamp, + :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] + @exact_match = params[:exact_match] @directory = params[:directory] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] - @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i end @@ -78,18 +80,19 @@ class WaybackMachineDownloader end def get_all_snapshots_to_consider - # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index + # Note: Passing a page index parameter allow us to get more snapshots, + # but from a less fresh index print "Getting snapshot pages" snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." - @maximum_pages.times do |page_index| - snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) - break if snapshot_list.empty? - snapshot_list_to_consider += snapshot_list - print "." + unless @exact_match + @maximum_pages.times do |page_index| + snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) + break if snapshot_list.empty? + snapshot_list_to_consider += snapshot_list + print "." + end end puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider." puts @@ -134,8 +137,11 @@ class WaybackMachineDownloader end def list_files + # retrieval produces its own output + files = get_file_list_by_timestamp + # ... hence delay printing the opening bracket puts "[" - get_file_list_by_timestamp.each do |file| + files.each do |file| puts file.to_json + "," end puts "]" @@ -179,7 +185,7 @@ class WaybackMachineDownloader def structure_dir_path dir_path begin - FileUtils::mkdir_p dir_path unless File.exists? dir_path + FileUtils::mkdir_p dir_path unless File.exist? dir_path rescue Errno::EEXIST => e error_to_string = e.to_s puts "# #{error_to_string}" @@ -219,7 +225,7 @@ class WaybackMachineDownloader if Gem.win_platform? file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } end - unless File.exists? file_path + unless File.exist? file_path begin structure_dir_path dir_path open(file_path, "wb") do |file| @@ -240,7 +246,7 @@ class WaybackMachineDownloader rescue StandardError => e puts "#{file_url} # #{e}" ensure - if not @all and File.exists?(file_path) and File.size(file_path) == 0 + if not @all and File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) puts "#{file_path} was empty and was removed." end diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index d3e82b7..f87c6f6 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -1,15 +1,15 @@ module ArchiveAPI - def get_raw_list_from_api url, page_index - request_url = "http://web.archive.org/cdx/search/xd?url=" - request_url += url - request_url += parameters_for_api page_index + def get_raw_list_from_api url, page_index + request_url = "http://web.archive.org/cdx/search/xd?url=" + request_url += url + request_url += parameters_for_api page_index open(request_url).read - end + end - def parameters_for_api page_index - parameters = "&fl=timestamp,original&collapse=digest&gzip=false" + def parameters_for_api page_index + parameters = "&fl=timestamp,original&collapse=digest&gzip=false" if @all parameters += "" else diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb index f584103..ba000b4 100644 --- a/lib/wayback_machine_downloader/tidy_bytes.rb +++ b/lib/wayback_machine_downloader/tidy_bytes.rb @@ -60,7 +60,7 @@ module TibyBytes bytes.each_index do |i| byte = bytes[i] - is_ascii = byte < 128 + _is_ascii = byte < 128 is_cont = byte > 127 && byte < 192 is_lead = byte > 191 && byte < 245 is_unused = byte > 240 @@ -78,7 +78,7 @@ module TibyBytes # the leading byte. begin (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - rescue NoMethodError => e + rescue NoMethodError next end conts_expected = 0 @@ -98,7 +98,7 @@ module TibyBytes end begin bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - rescue ArgumentError => e + rescue ArgumentError nil end end diff --git a/lib/wayback_machine_downloader/to_regex.rb b/lib/wayback_machine_downloader/to_regex.rb index a059ccf..d9f0a67 100644 --- a/lib/wayback_machine_downloader/to_regex.rb +++ b/lib/wayback_machine_downloader/to_regex.rb @@ -25,7 +25,7 @@ module ToRegex # @option options [true,false] :lang /foo/[nesu] def to_regex(options = {}) if args = as_regexp(options) - ::Regexp.new *args + ::Regexp.new(*args) end end diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index cd5b822..bf2bac5 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -4,7 +4,8 @@ require 'wayback_machine_downloader' class WaybackMachineDownloaderTest < Minitest::Test def setup - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'http://www.onlyfreegames.net') $stdout = StringIO.new end @@ -38,6 +39,16 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2] end + def test_without_exact_match + @wayback_machine_downloader.exact_match = false + assert @wayback_machine_downloader.get_file_list_curated.size > 1 + end + + def test_exact_match + @wayback_machine_downloader.exact_match = true + assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size + end + def test_file_list_only_filter_without_matches @wayback_machine_downloader.only_filter = 'abc123' assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size @@ -85,20 +96,22 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"] end - def test_file_list_exclude_filter_with_a_regex + def test_all_get_file_list_curated_size @wayback_machine_downloader.all = true assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size end # Testing encoding conflicts needs a different base_url def test_nonascii_suburls_download - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # Once just for the downloading... @wayback_machine_downloader.download_files end def test_nonascii_suburls_already_present - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # ... twice to test the "is already present" case @wayback_machine_downloader.download_files @wayback_machine_downloader.download_files