From 6779971dc9a19262a77e6a85c02abb0ddfe3d5bc Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:08:40 -0400 Subject: [PATCH 01/15] Fix whitespace --- lib/wayback_machine_downloader/archive_api.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index d3e82b7..f87c6f6 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -1,15 +1,15 @@ module ArchiveAPI - def get_raw_list_from_api url, page_index - request_url = "http://web.archive.org/cdx/search/xd?url=" - request_url += url - request_url += parameters_for_api page_index + def get_raw_list_from_api url, page_index + request_url = "http://web.archive.org/cdx/search/xd?url=" + request_url += url + request_url += parameters_for_api page_index open(request_url).read - end + end - def parameters_for_api page_index - parameters = "&fl=timestamp,original&collapse=digest&gzip=false" + def parameters_for_api page_index + parameters = "&fl=timestamp,original&collapse=digest&gzip=false" if @all parameters += "" else From ea73ed5ed6096effa7392cb2d95bbcf607f4b76d Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:10:16 -0400 Subject: [PATCH 02/15] Shorten some lines for readability --- lib/wayback_machine_downloader.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 913174b..5ac873b 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,7 +16,8 @@ class WaybackMachineDownloader VERSION = "1.1.5" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, + :only_filter, :exclude_filter, :all, :list, :maximum_pages, 
:threads_count def initialize params @base_url = params[:base_url] @@ -78,7 +79,8 @@ class WaybackMachineDownloader end def get_all_snapshots_to_consider - # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index + # Note: Passing a page index parameter allow us to get more snapshots, + # but from a less fresh index print "Getting snapshot pages" snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) From 6b8c1aa194fa886b70b6678d1303ad3dcf67da54 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:12:41 -0400 Subject: [PATCH 03/15] Remove list attribute from the downloader. Whether to list or download is a program option external to the downloader --- bin/wayback_machine_downloader | 2 +- lib/wayback_machine_downloader.rb | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index af75e71..46171f2 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,7 +58,7 @@ end.parse! 
if (base_url = ARGV[-1]) options[:base_url] = base_url wayback_machine_downloader = WaybackMachineDownloader.new options - if wayback_machine_downloader.list + if options[:list] wayback_machine_downloader.list_files else wayback_machine_downloader.download_files diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 5ac873b..bc71597 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -17,7 +17,7 @@ class WaybackMachineDownloader VERSION = "1.1.5" attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, - :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] @@ -27,7 +27,6 @@ class WaybackMachineDownloader @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] - @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i end From 65b19485176e4cde850fb2120742fbcf412fcdb6 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:19:34 -0400 Subject: [PATCH 04/15] Avoid interleaving status output with file listing. Before: [ Getting snapshot pages.. found 1 snaphots to consider. {"file_url":"http://www.trackpedia.com:80/forums/archive/index.php/f-115.html","timestamp":20131221124252,"file_id":"forums/archive/index.php/f-115.html"}, ] After: Getting snapshot pages.. found 1 snaphots to consider. 
[ {"file_url":"http://www.trackpedia.com:80/forums/archive/index.php/f-115.html","timestamp":20131221124252,"file_id":"forums/archive/index.php/f-115.html"}, ] --- lib/wayback_machine_downloader.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index bc71597..0e79623 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -135,8 +135,11 @@ class WaybackMachineDownloader end def list_files + # retrieval produces its own output + files = get_file_list_by_timestamp + # ... hence delay printing the opening bracket puts "[" - get_file_list_by_timestamp.each do |file| + files.each do |file| puts file.to_json + "," end puts "]" From d926f965f9ce914d9bc14381c54254c7b241f7c7 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Wed, 15 Mar 2017 17:58:05 -0400 Subject: [PATCH 05/15] Add exact_match option. With this option set, Wayback Machine Downloader will only look for snapshots matching the exact base_url passed in rather than base_url and its children. This is useful when trying to download a single file rather than mirroring a site. 
--- lib/wayback_machine_downloader.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 0e79623..9f466df 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,11 +16,13 @@ class WaybackMachineDownloader VERSION = "1.1.5" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, + attr_accessor :base_url, :exact_match, :directory, + :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] + @exact_match = params[:exact_match] @directory = params[:directory] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @@ -84,8 +86,10 @@ class WaybackMachineDownloader snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." + unless @exact_match + snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) + print "." + end @maximum_pages.times do |page_index| snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) break if snapshot_list.empty? From e73a88ab56dfa2d5d7863e9f1b5678ef4a868c6c Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:53:37 -0400 Subject: [PATCH 06/15] File.exists? causes a warning in ruby 2.4.1, use exist? --- lib/wayback_machine_downloader.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 9f466df..ffbfb6e 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -187,7 +187,7 @@ class WaybackMachineDownloader def structure_dir_path dir_path begin - FileUtils::mkdir_p dir_path unless File.exists? dir_path + FileUtils::mkdir_p dir_path unless File.exist? 
dir_path rescue Errno::EEXIST => e error_to_string = e.to_s puts "# #{error_to_string}" @@ -227,7 +227,7 @@ class WaybackMachineDownloader if Gem.win_platform? file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } end - unless File.exists? file_path + unless File.exist? file_path begin structure_dir_path dir_path open(file_path, "wb") do |file| @@ -248,7 +248,7 @@ class WaybackMachineDownloader rescue StandardError => e puts "#{file_url} # #{e}" ensure - if not @all and File.exists?(file_path) and File.size(file_path) == 0 + if not @all and File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) puts "#{file_path} was empty and was removed." end From 43ab0b2f484d476dc688361ad99876f4d7028061 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:54:51 -0400 Subject: [PATCH 07/15] Fix test name so that it does not shadow another test --- test/test_wayback_machine_downloader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index cd5b822..d435721 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -85,7 +85,7 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"] end - def test_file_list_exclude_filter_with_a_regex + def test_all_get_file_list_curated_size @wayback_machine_downloader.all = true assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size end From e6157c21b9a955040ba2110d2543cdac3cb4dde2 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 16:59:08 -0400 Subject: [PATCH 08/15] Parens are required before * when used for splatting. 
https://stackoverflow.com/questions/41821628/ruby-how-can-i-kill-warning-interpreted-as-argument-prefix --- lib/wayback_machine_downloader/to_regex.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader/to_regex.rb b/lib/wayback_machine_downloader/to_regex.rb index a059ccf..d9f0a67 100644 --- a/lib/wayback_machine_downloader/to_regex.rb +++ b/lib/wayback_machine_downloader/to_regex.rb @@ -25,7 +25,7 @@ module ToRegex # @option options [true,false] :lang /foo/[nesu] def to_regex(options = {}) if args = as_regexp(options) - ::Regexp.new *args + ::Regexp.new(*args) end end From aab9a49509a0535046b79bbb892eb753c1432111 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:00:50 -0400 Subject: [PATCH 09/15] Get rid of assigned but unused variable warnings under ruby 2.4 --- lib/wayback_machine_downloader/tidy_bytes.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb index f584103..ba000b4 100644 --- a/lib/wayback_machine_downloader/tidy_bytes.rb +++ b/lib/wayback_machine_downloader/tidy_bytes.rb @@ -60,7 +60,7 @@ module TibyBytes bytes.each_index do |i| byte = bytes[i] - is_ascii = byte < 128 + _is_ascii = byte < 128 is_cont = byte > 127 && byte < 192 is_lead = byte > 191 && byte < 245 is_unused = byte > 240 @@ -78,7 +78,7 @@ module TibyBytes # the leading byte. begin (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} - rescue NoMethodError => e + rescue NoMethodError next end conts_expected = 0 @@ -98,7 +98,7 @@ module TibyBytes end begin bytes.empty? ? 
nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") - rescue ArgumentError => e + rescue ArgumentError nil end end From e166e9443d68abb5fd0115c85d3fd830c9f473e9 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:24:17 -0400 Subject: [PATCH 10/15] Line length --- test/test_wayback_machine_downloader.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index d435721..a602bf4 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -4,7 +4,8 @@ require 'wayback_machine_downloader' class WaybackMachineDownloaderTest < Minitest::Test def setup - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'http://www.onlyfreegames.net') $stdout = StringIO.new end @@ -92,13 +93,15 @@ class WaybackMachineDownloaderTest < Minitest::Test # Testing encoding conflicts needs a different base_url def test_nonascii_suburls_download - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # Once just for the downloading... @wayback_machine_downloader.download_files end def test_nonascii_suburls_already_present - @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + @wayback_machine_downloader = WaybackMachineDownloader.new( + base_url: 'https://en.wikipedia.org/wiki/%C3%84') # ... 
twice to test the "is already present" case @wayback_machine_downloader.download_files @wayback_machine_downloader.download_files From 4af80adca691ce29a78b97391aa41c00801635f6 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 3 Jun 2017 17:45:06 -0400 Subject: [PATCH 11/15] Fix exact match logic and add a test --- lib/wayback_machine_downloader.rb | 14 ++++++-------- test/test_wayback_machine_downloader.rb | 10 ++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index ffbfb6e..4f5aa25 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -87,14 +87,12 @@ class WaybackMachineDownloader snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." unless @exact_match - snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil) - print "." - end - @maximum_pages.times do |page_index| - snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) - break if snapshot_list.empty? - snapshot_list_to_consider += snapshot_list - print "." + @maximum_pages.times do |page_index| + snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) + break if snapshot_list.empty? + snapshot_list_to_consider += snapshot_list + print "." + end end puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider." 
puts diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index a602bf4..bf2bac5 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -39,6 +39,16 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2] end + def test_without_exact_match + @wayback_machine_downloader.exact_match = false + assert @wayback_machine_downloader.get_file_list_curated.size > 1 + end + + def test_exact_match + @wayback_machine_downloader.exact_match = true + assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size + end + def test_file_list_only_filter_without_matches @wayback_machine_downloader.only_filter = 'abc123' assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size From 28fd1e10a2090a745054bcf6cc101be8d78c6a06 Mon Sep 17 00:00:00 2001 From: hartator Date: Sun, 11 Jun 2017 13:19:36 -0500 Subject: [PATCH 12/15] Fix length of arguments per line --- lib/wayback_machine_downloader.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index e2c4b11..fe77758 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -17,8 +17,8 @@ class WaybackMachineDownloader VERSION = "2.0.0" attr_accessor :base_url, :exact_match, :directory, - :from_timestamp, :to_timestamp, - :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count + :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, + :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] From 4eca581257584d36774674c5fc134337f1802da8 Mon Sep 17 00:00:00 2001 From: hartator Date: Sun, 11 Jun 2017 13:19:50 -0500 Subject: [PATCH 13/15] Remove too verbose comment --- lib/wayback_machine_downloader.rb | 1 - 1 file changed, 1 deletion(-) diff --git 
a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index fe77758..1777ec0 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -139,7 +139,6 @@ class WaybackMachineDownloader def list_files # retrieval produces its own output files = get_file_list_by_timestamp - # ... hence delay printing the opening bracket puts "[" files.each do |file| puts file.to_json + "," From 246441ff17a4eab14a3797e04c360a9f4b620ef9 Mon Sep 17 00:00:00 2001 From: hartator Date: Sun, 11 Jun 2017 21:53:13 -0500 Subject: [PATCH 14/15] Replace exact match by exact url --- lib/wayback_machine_downloader.rb | 6 +++--- test/test_wayback_machine_downloader.rb | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 1777ec0..3fbf33a 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,13 +16,13 @@ class WaybackMachineDownloader VERSION = "2.0.0" - attr_accessor :base_url, :exact_match, :directory, + attr_accessor :base_url, :exact_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] - @exact_match = params[:exact_match] + @exact_url = params[:exact_url] @directory = params[:directory] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @@ -86,7 +86,7 @@ class WaybackMachineDownloader snapshot_list_to_consider = "" snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) print "." - unless @exact_match + unless @exact_url @maximum_pages.times do |page_index| snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index) break if snapshot_list.empty? 
diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index bf2bac5..a250c3c 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -39,13 +39,13 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2] end - def test_without_exact_match - @wayback_machine_downloader.exact_match = false + def test_without_exact_url + @wayback_machine_downloader.exact_url = false assert @wayback_machine_downloader.get_file_list_curated.size > 1 end - def test_exact_match - @wayback_machine_downloader.exact_match = true + def test_exact_url + @wayback_machine_downloader.exact_url = true assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size end From af8ef28d6703756fa2a17b7ea9f98f2ad8297ad0 Mon Sep 17 00:00:00 2001 From: hartator Date: Sun, 11 Jun 2017 21:53:45 -0500 Subject: [PATCH 15/15] Add explanation to use exact url flag for the CLI --- bin/wayback_machine_downloader | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index b778049..0815950 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts| options[:to_timestamp] = t end + opts.on("-e", "--exact_url", "Download only the url provided and not the full site") do |t| + options[:exact_url] = true # argument-less flag; the downloader reads it as params[:exact_url], not :only_filter + end + opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t| options[:only_filter] = t end @@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts| options[:threads_count] = t end - opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t| + 
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t| options[:maximum_pages] = t end - opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t| + opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t| options[:list] = true end