From 6779971dc9a19262a77e6a85c02abb0ddfe3d5bc Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:08:40 -0400 Subject: [PATCH 01/11] Fix whitespace --- lib/wayback_machine_downloader/archive_api.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index d3e82b7..f87c6f6 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -1,15 +1,15 @@ module ArchiveAPI - def get_raw_list_from_api url, page_index - request_url = "http://web.archive.org/cdx/search/xd?url=" - request_url += url - request_url += parameters_for_api page_index + def get_raw_list_from_api url, page_index + request_url = "http://web.archive.org/cdx/search/xd?url=" + request_url += url + request_url += parameters_for_api page_index open(request_url).read - end + end - def parameters_for_api page_index - parameters = "&fl=timestamp,original&collapse=digest&gzip=false" + def parameters_for_api page_index + parameters = "&fl=timestamp,original&collapse=digest&gzip=false" if @all parameters += "" else From ea73ed5ed6096effa7392cb2d95bbcf607f4b76d Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:10:16 -0400 Subject: [PATCH 02/11] Shorten some lines for readability --- lib/wayback_machine_downloader.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 913174b..5ac873b 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,7 +16,8 @@ class WaybackMachineDownloader VERSION = "1.1.5" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, + :only_filter, :exclude_filter, :all, :list, :maximum_pages, 
:threads_count def initialize params @base_url = params[:base_url] @@ -78,7 +79,8 @@ class WaybackMachineDownloader end def get_all_snapshots_to_consider - # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index + # Note: Passing a page index parameter allow us to get more snapshots, + # but from a less fresh index print "Getting snapshot pages" snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) From 6b8c1aa194fa886b70b6678d1303ad3dcf67da54 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:12:41 -0400 Subject: [PATCH 03/11] Remove list attribute from the downloader. Whether to list or download is a program option external to the downloader --- bin/wayback_machine_downloader | 2 +- lib/wayback_machine_downloader.rb | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index af75e71..46171f2 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,7 +58,7 @@ end.parse! 
if (base_url = ARGV[-1]) options[:base_url] = base_url wayback_machine_downloader = WaybackMachineDownloader.new options - if wayback_machine_downloader.list + if options[:list] wayback_machine_downloader.list_files else wayback_machine_downloader.download_files diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 5ac873b..bc71597 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -17,7 +17,7 @@ class WaybackMachineDownloader VERSION = "1.1.5" attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, - :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] @@ -27,7 +27,6 @@ class WaybackMachineDownloader @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] - @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i end From 65b19485176e4cde850fb2120742fbcf412fcdb6 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:19:34 -0400 Subject: [PATCH 04/11] Avoid interleaving status output with file listing. Before: [ Getting snapshot pages.. found 1 snaphots to consider. {"file_url":"http://www.trackpedia.com:80/forums/archive/index.php/f-115.html","timestamp":20131221124252,"file_id":"forums/archive/index.php/f-115.html"}, ] After: Getting snapshot pages.. found 1 snaphots to consider. 
[ {"file_url":"http://www.trackpedia.com:80/forums/archive/index.php/f-115.html","timestamp":20131221124252,"file_id":"forums/archive/index.php/f-115.html"}, ] --- lib/wayback_machine_downloader.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index bc71597..0e79623 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -135,8 +135,11 @@ class WaybackMachineDownloader end def list_files + # retrieval produces its own output + files = get_file_list_by_timestamp + # ... hence delay printing the opening bracket puts "[" - get_file_list_by_timestamp.each do |file| + files.each do |file| puts file.to_json + "," end puts "]" From d926f965f9ce914d9bc14381c54254c7b241f7c7 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:58:05 -0400 Subject: [PATCH 05/11] Add exact_match option. With this option set, Wayback Machine Downloader will only look for snapshots matching the exact base_url passed in rather than base_url and its children. This is useful when trying to download a single file rather than mirroring a site. 
--- lib/wayback_machine_downloader.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 0e79623..9f466df 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,11 +16,13 @@ class WaybackMachineDownloader VERSION = "1.1.5" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, + attr_accessor :base_url, :exact_match, :directory, + :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] + @exact_match = params[:exact_match] @directory = params[:directory] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @@ -84,8 +86,10 @@ class WaybackMachineDownloader snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." + unless @exact_match + snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) + print "." + end @maximum_pages.times do |page_index| snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) break if snapshot_list.empty? From e73a88ab56dfa2d5d7863e9f1b5678ef4a868c6c Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:53:37 -0400 Subject: [PATCH 06/11] File.exists? causes warning in ruby 2.4.1, use exist? --- lib/wayback_machine_downloader.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 9f466df..ffbfb6e 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -187,7 +187,7 @@ class WaybackMachineDownloader def structure_dir_path dir_path begin - FileUtils::mkdir_p dir_path unless File.exists? dir_path + FileUtils::mkdir_p dir_path unless File.exist? 
dir_path rescue Errno::EEXIST => e error_to_string = e.to_s puts "# #{error_to_string}" @@ -227,7 +227,7 @@ class WaybackMachineDownloader if Gem.win_platform? file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } end - unless File.exists? file_path + unless File.exist? file_path begin structure_dir_path dir_path open(file_path, "wb") do |file| @@ -248,7 +248,7 @@ class WaybackMachineDownloader rescue StandardError => e puts "#{file_url} # #{e}" ensure - if not @all and File.exists?(file_path) and File.size(file_path) == 0 + if not @all and File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) puts "#{file_path} was empty and was removed." end From 43ab0b2f484d476dc688361ad99876f4d7028061 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:54:51 -0400 Subject: [PATCH 07/11] Fix test name so that it does not shadow another test --- test/test_wayback_machine_downloader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index cd5b822..d435721 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -85,7 +85,7 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"] end - def test_file_list_exclude_filter_with_a_regex + def test_all_get_file_list_curated_size @wayback_machine_downloader.all = true assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size end From e6157c21b9a955040ba2110d2543cdac3cb4dde2 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:59:08 -0400 Subject: [PATCH 08/11] Parens are required before * when used for splatting. 
https://stackoverflow.com/questions/41821628/ruby-how-can-i-kill-warning-interpreted-as-argument-prefix --- lib/wayback_machine_downloader/to_regex.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader/to_regex.rb b/lib/wayback_machine_downloader/to_regex.rb index a059ccf..d9f0a67 100644 --- a/lib/wayback_machine_downloader/to_regex.rb +++ b/lib/wayback_machine_downloader/to_regex.rb @@ -25,7 +25,7 @@ module ToRegex # @option options [true,false] :lang /foo/[nesu] def to_regex(options = {}) if args = as_regexp(options) - ::Regexp.new *args + ::Regexp.new(*args) end end From aab9a49509a0535046b79bbb892eb753c1432111 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:00:50 -0400 Subject: [PATCH 09/11] Get rid of assigned but unused variable warnings under ruby 2.4 --- lib/wayback_machine_downloader/tidy_bytes.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb index f584103..ba000b4 100644 --- a/lib/wayback_machine_downloader/tidy_bytes.rb +++ b/lib/wayback_machine_downloader/tidy_bytes.rb @@ -60,7 +60,7 @@ module TibyBytes bytes.each_index do |i| byte = bytes[i] - is_ascii = byte < 128 + _is_ascii = byte < 128 is_cont = byte > 127 && byte < 192 is_lead = byte > 191 && byte < 245 is_unused = byte > 240 @@ -78,7 +78,7 @@ module TibyBytes # the leading byte. begin (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - rescue NoMethodError => e + rescue NoMethodError next end conts_expected = 0 @@ -98,7 +98,7 @@ module TibyBytes end begin bytes.empty? ? 
nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - rescue ArgumentError => e + rescue ArgumentError nil end end From e166e9443d68abb5fd0115c85d3fd830c9f473e9 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:24:17 -0400 Subject: [PATCH 10/11] Line length --- test/test_wayback_machine_downloader.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index d435721..a602bf4 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -4,7 +4,8 @@ require 'wayback_machine_downloader' class WaybackMachineDownloaderTest < Minitest::Test def setup - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'http://www.onlyfreegames.net') $stdout = StringIO.new end @@ -92,13 +93,15 @@ class WaybackMachineDownloaderTest < Minitest::Test # Testing encoding conflicts needs a different base_url def test_nonascii_suburls_download - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # Once just for the downloading... @wayback_machine_downloader.download_files end def test_nonascii_suburls_already_present - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # ... 
twice to test the "is already present" case @wayback_machine_downloader.download_files @wayback_machine_downloader.download_files From 4af80adca691ce29a78b97391aa41c00801635f6 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:45:06 -0400 Subject: [PATCH 11/11] Fix exact match logic and add a test --- lib/wayback_machine_downloader.rb | 14 ++++++-------- test/test_wayback_machine_downloader.rb | 10 ++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index ffbfb6e..4f5aa25 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -87,14 +87,12 @@ class WaybackMachineDownloader snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." unless @exact_match - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." - end - @maximum_pages.times do |page_index| - snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) - break if snapshot_list.empty? - snapshot_list_to_consider += snapshot_list - print "." + @maximum_pages.times do |page_index| + snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) + break if snapshot_list.empty? + snapshot_list_to_consider += snapshot_list + print "." + end end puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider." 
puts diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index a602bf4..bf2bac5 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -39,6 +39,16 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2] end + def test_without_exact_match + @wayback_machine_downloader.exact_match = false + assert @wayback_machine_downloader.get_file_list_curated.size > 1 + end + + def test_exact_match + @wayback_machine_downloader.exact_match = true + assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size + end + def test_file_list_only_filter_without_matches @wayback_machine_downloader.only_filter = 'abc123' assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size