Added retry command

Fixes "[Feature request] Retry flag". Fixes StrawberryMaster/wayback-machine-downloader#31
2025-12-29 16:16:06 +00:00 · 2025-08-20 01:21:29 +00:00
parent fa306ac92b
commit fc8d8a9441
3 changed files with 40 additions and 129 deletions
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -11,7 +11,6 @@ require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
 require 'stringio'
-require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -117,7 +116,7 @@ class WaybackMachineDownloader
  include ArchiveAPI
  include SubdomainProcessor

-  VERSION = "2.4.3"
+  VERSION = "2.4.0"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -172,19 +171,12 @@ class WaybackMachineDownloader

  def backup_name
    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-    raw = if url_to_process.include?('//')
+    
+    if url_to_process.include? '//'
      url_to_process.split('/')[2]
    else
      url_to_process
    end
-
-    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
-    if Gem.win_platform?
-      raw = raw.gsub(/[:*?"<>|]/, '_')
-      raw = raw.gsub(/[ .]+\z/, '')
-    end
-    raw = 'site' if raw.nil? || raw.empty?
-    raw
  end

  def backup_path
@@ -348,15 +340,15 @@ class WaybackMachineDownloader
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      next if file_timestamp.to_i > target_timestamp
-
-      raw_tail = file_url.split('/')[3..-1]&.join('/')
-      file_id = sanitize_and_prepare_id(raw_tail, file_url)
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
      next if file_id.nil?
      next if match_exclude_filter(file_url)
      next unless match_only_filter(file_url)
-
+      # Select the most recent version <= target_timestamp
      if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
-        file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
+        file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
      end
    end
    file_versions.values
@@ -376,27 +368,22 @@ class WaybackMachineDownloader
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
-
-      raw_tail = file_url.split('/')[3..-1]&.join('/')
-      file_id = sanitize_and_prepare_id(raw_tail, file_url)
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
-        next
-      end
-
-      if file_id.include?('<') || file_id.include?('>')
-        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif !match_only_filter(file_url)
+        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
-            file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
-          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
@@ -407,32 +394,21 @@ class WaybackMachineDownloader
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
-
-      raw_tail = file_url.split('/')[3..-1]&.join('/')
-      file_id = sanitize_and_prepare_id(raw_tail, file_url)
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
-        next
-      end
-
-      file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
-      file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
-      if file_id_and_timestamp.nil?
-        puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
-        next
-      end
-
-      if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
-        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif !match_only_filter(file_url)
+        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
-          # duplicate combo, ignore silently (verbose flag not shown here)
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
-          file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
@@ -773,86 +749,6 @@ class WaybackMachineDownloader
    end
    logger
  end
-    
-  # safely sanitize a file id (or id+timestamp)
-  def sanitize_and_prepare_id(raw, file_url)
-    return nil if raw.nil?
-    return ""  if raw.empty?
-    original = raw.dup
-    begin
-      # work on a binary copy to avoid premature encoding errors
-      raw = raw.dup.force_encoding(Encoding::BINARY)
-
-      # percent-decode (repeat until stable in case of double-encoding)
-      loop do
-        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
-        break if decoded == raw
-        raw = decoded
-      end
-
-      # try tidy_bytes
-      begin
-        raw = raw.tidy_bytes
-      rescue StandardError
-        # fallback: scrub to UTF-8
-        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
-      end
-
-      # ensure UTF-8 and scrub again
-      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
-        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
-      end
-
-      # strip HTML/comment artifacts & control chars
-      raw.gsub!(/<!--+/, '')
-      raw.gsub!(/[\x00-\x1F]/, '')
-
-      # split query; hash it for stable short name
-      path_part, query_part = raw.split('?', 2)
-      if query_part && !query_part.empty?
-        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
-        if path_part.include?('.')
-          pre, _sep, post = path_part.rpartition('.')
-          path_part = "#{pre}__q#{q_digest}.#{post}"
-        else
-          path_part = "#{path_part}__q#{q_digest}"
-        end
-      end
-      raw = path_part
-
-      # collapse slashes & trim leading slash
-      raw.gsub!(%r{/+}, '/')
-      raw.sub!(%r{\A/}, '')
-
-      # segment-wise sanitation
-      raw = raw.split('/').map do |segment|
-        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
-        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
-        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
-        seg.empty? ? '_' : seg
-      end.join('/')
-
-      # remove any remaining angle brackets
-      raw.tr!('<>', '')
-
-      # final fallback if empty
-      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
-
-      raw
-    rescue => e
-      @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
-      # deterministic fallback – never return nil so caller won’t mark malformed
-      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
-    end
-  end
-
-  # wrap URL in parentheses if it contains characters that commonly break unquoted
-  # Windows CMD usage (e.g., &). This is only for display; user still must quote
-  # when invoking manually.
-  def safe_display_url(url)
-    return url unless url && url.match?(/[&]/)
-    "(#{url})"
-  end

  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
    retries = 0