From 330c1888fbf1201f5593b63107b56ee8601724d8 Mon Sep 17 00:00:00 2001 From: insaner Date: Tue, 24 Jan 2017 04:55:51 -0500 Subject: [PATCH 1/5] Update wayback_machine_downloader --- bin/wayback_machine_downloader | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index af75e71..1d0bb18 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts| options[:directory] = t end + opts.on("-s", "--all-timestamps", "Download all snapshots (ie. all timestamps)") do |t| + options[:all_timestamps] = true + end + opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t| options[:from_timestamp] = t end From 5bd9fbffdd4d6f37e9263008e04bb88b72346b62 Mon Sep 17 00:00:00 2001 From: insaner Date: Tue, 24 Jan 2017 05:03:07 -0500 Subject: [PATCH 2/5] Update wayback_machine_downloader.rb --- lib/wayback_machine_downloader.rb | 51 ++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 924d7ac..9394082 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -26,6 +26,7 @@ class WaybackMachineDownloader @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] + @all_timestamps = params[:all_timestamps] @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i @@ -123,13 +124,53 @@ class WaybackMachineDownloader end file_list_curated end + def get_file_list_ALL + file_list_curated = Hash.new + puts "snapshots_to_consider: " + get_all_snapshots_to_consider.lines.count.to_s + get_all_snapshots_to_consider.each_line do |line| + unless line.include?('/') + print "**" + line + next + end + file_timestamp = line[0..13].to_i + file_url = line[15..-2] + file_id = file_url.split('/')[3..-1].join('/') + file_id = [file_timestamp, file_id].join('/') + file_id = CGI::unescape file_id + file_id = file_id.tidy_bytes unless file_id == "" + if file_id.nil? + puts "Malformed file url, ignoring: #{file_url}" + else + if match_exclude_filter(file_url) + puts "File url matches exclude filter, ignoring: #{file_url}" + elsif not match_only_filter(file_url) + puts "File url doesn't match only filter, ignoring: #{file_url}" + elsif file_list_curated[file_id] + #puts "__" + file_id + else + file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} + end + end + end + puts "file_list_curated: " + file_list_curated.count.to_s + file_list_curated + end + def get_file_list_by_timestamp - file_list_curated = get_file_list_curated - file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse - file_list_curated.map do |file_remote_info| - file_remote_info[1][:file_id] = file_remote_info[0] - file_remote_info[1] + if @all_timestamps + file_list_curated = get_file_list_ALL + file_list_curated.map do |file_remote_info| + file_remote_info[1][:file_id] = file_remote_info[0] + file_remote_info[1] + end + else + file_list_curated = get_file_list_curated + file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse + file_list_curated.map do |file_remote_info| + file_remote_info[1][:file_id] = file_remote_info[0] + file_remote_info[1] + end end end From 4a6fdb14624294381873a0c6a5c9c56f6392b8a7 Mon Sep 17 00:00:00 2001 From: hartator Date: Thu, 26 Oct 2017 19:34:36 -0500 Subject: [PATCH 3/5] Test `all_timestamps` option download all timestamps --- test/test_wayback_machine_downloader.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index 0e4ceb4..8e83bfe 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -74,6 +74,11 @@ class WaybackMachineDownloaderTest < Minitest::Test assert_includes linux_page.read, "Linux Games" end + def test_all_timestamps_being_respected + @wayback_machine_downloader.all_timestamps = true + assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size + end + def test_from_timestamp_being_respected @wayback_machine_downloader.from_timestamp = 20050716231334 file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url] From d80b51f502de63cf3c978ca7bad8e3ecce0057ed Mon Sep 17 00:00:00 2001 From: hartator Date: Thu, 26 Oct 2017 19:35:29 -0500 Subject: [PATCH 4/5] Use more explicit variable name `all_timestamps` --- lib/wayback_machine_downloader.rb | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 9394082..67e32cd 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -16,17 +16,17 @@ class WaybackMachineDownloader VERSION = "1.1.4" - attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count + attr_accessor :base_url, :directory, :all_timestamps, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count def initialize params @base_url = params[:base_url] @directory = params[:directory] + @all_timestamps = params[:all_timestamps] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] - @all_timestamps = params[:all_timestamps] @list = params[:list] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i @@ -124,20 +124,17 @@ class WaybackMachineDownloader end file_list_curated end - def get_file_list_ALL + + def get_file_list_all_timestamps file_list_curated = Hash.new - puts "snapshots_to_consider: " + get_all_snapshots_to_consider.lines.count.to_s get_all_snapshots_to_consider.each_line do |line| - unless line.include?('/') - print "**" + line - next - end + next unless line.include?('/') file_timestamp = line[0..13].to_i file_url = line[15..-2] file_id = file_url.split('/')[3..-1].join('/') - file_id = [file_timestamp, file_id].join('/') - file_id = CGI::unescape file_id - file_id = file_id.tidy_bytes unless file_id == "" + file_id_and_timestamp = [file_timestamp, file_id].join('/') + file_id_and_timestamp = CGI::unescape file_id_and_timestamp + file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == "" if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" else @@ -145,10 +142,10 @@ class WaybackMachineDownloader puts "File url matches exclude filter, ignoring: #{file_url}" elsif not match_only_filter(file_url) puts "File url doesn't match only filter, ignoring: #{file_url}" - elsif file_list_curated[file_id] - #puts "__" + file_id + elsif file_list_curated[file_id_and_timestamp] + puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose else - file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} + file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp} end end end @@ -159,7 +156,7 @@ class WaybackMachineDownloader def get_file_list_by_timestamp if @all_timestamps - file_list_curated = get_file_list_ALL + file_list_curated = get_file_list_all_timestamps file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] From 5116b77eb9d1ad757cec50eb849d893b5ed64951 Mon Sep 17 00:00:00 2001 From: hartator Date: Thu, 26 Oct 2017 19:36:06 -0500 Subject: [PATCH 5/5] Give more details about what the option `--all-timestamps` do --- bin/wayback_machine_downloader | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 1d0bb18..e666575 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -18,7 +18,7 @@ option_parser = OptionParser.new do |opts| options[:directory] = t end - opts.on("-s", "--all-timestamps", "Download all snapshots (ie. all timestamps)") do |t| + opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t| options[:all_timestamps] = true end