Merge branch 'master' of https://github.com/p/wayback-machine-downloader into p-master

2025-12-29 16:16:06 +00:00 · 2017-06-11 12:46:00 -05:00
parent 5b0ed26ab5 4af80adca6
commit 62f424b6d1
6 changed files with 49 additions and 30 deletions
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -16,17 +16,19 @@ class WaybackMachineDownloader

  VERSION = "2.0.0"

-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
+  attr_accessor :base_url, :exact_match, :directory,
+    :from_timestamp, :to_timestamp,
+    :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count

  def initialize params
    @base_url = params[:base_url]
+    @exact_match = params[:exact_match]
    @directory = params[:directory]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
-    @list = params[:list]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
  end
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
  end

  def get_all_snapshots_to_consider
-    # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
+    # Note: Passing a page index parameter allow us to get more snapshots,
+    # but from a less fresh index
    print "Getting snapshot pages"
    snapshot_list_to_consider = ""
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
    print "."
-    snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
-    print "."
-    @maximum_pages.times do |page_index|
-      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
-      break if snapshot_list.empty?
-      snapshot_list_to_consider += snapshot_list
-      print "."
+    unless @exact_match
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
    end
    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
    puts
@@ -134,8 +137,11 @@ class WaybackMachineDownloader
  end

  def list_files
+    # retrieval produces its own output
+    files = get_file_list_by_timestamp
+    # ... hence delay printing the opening bracket
    puts "["
-    get_file_list_by_timestamp.each do |file|
+    files.each do |file|
      puts file.to_json + ","
    end
    puts "]"
@@ -179,7 +185,7 @@ class WaybackMachineDownloader

  def structure_dir_path dir_path
    begin
-      FileUtils::mkdir_p dir_path unless File.exists? dir_path
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
@@ -219,7 +225,7 @@ class WaybackMachineDownloader
    if Gem.win_platform?
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
-    unless File.exists? file_path
+    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        open(file_path, "wb") do |file|
@@ -240,7 +246,7 @@ class WaybackMachineDownloader
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
-        if not @all and File.exists?(file_path) and File.size(file_path) == 0
+        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -1,15 +1,15 @@
 module ArchiveAPI

-	def get_raw_list_from_api url, page_index
-		request_url = "http://web.archive.org/cdx/search/xd?url="
-		request_url += url
-		request_url += parameters_for_api page_index
+  def get_raw_list_from_api url, page_index
+    request_url = "http://web.archive.org/cdx/search/xd?url="
+    request_url += url
+    request_url += parameters_for_api page_index

    open(request_url).read
-	end
+  end

-	def parameters_for_api page_index
-		parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
    if @all
      parameters += ""
    else
--- a/lib/wayback_machine_downloader/tidy_bytes.rb
+++ b/lib/wayback_machine_downloader/tidy_bytes.rb
@@ -60,7 +60,7 @@ module TibyBytes
      bytes.each_index do |i|

        byte          = bytes[i]
-        is_ascii      = byte < 128
+        _is_ascii     = byte < 128
        is_cont       = byte > 127 && byte < 192
        is_lead       = byte > 191 && byte < 245
        is_unused     = byte > 240
@@ -78,7 +78,7 @@ module TibyBytes
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
-            rescue NoMethodError => e
+            rescue NoMethodError
              next
            end
            conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
-      rescue ArgumentError => e
+      rescue ArgumentError
        nil
      end
    end
--- a/lib/wayback_machine_downloader/to_regex.rb
+++ b/lib/wayback_machine_downloader/to_regex.rb
@@ -25,7 +25,7 @@ module ToRegex
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
-        ::Regexp.new *args
+        ::Regexp.new(*args)
      end
    end