2 Commits

Author SHA1 Message Date
Felipe
40e9c9bb51 Bumped version 2025-08-16 19:38:01 +00:00
Felipe
6bc08947b7 More aggressive sanitization
This should deal with some of the issues we've seen, hopefully. What a ride!
2025-08-12 18:55:00 -03:00
2 changed files with 82 additions and 9 deletions

View File

@@ -11,6 +11,7 @@ require 'concurrent-ruby'
require 'logger' require 'logger'
require 'zlib' require 'zlib'
require 'stringio' require 'stringio'
require 'digest'
require_relative 'wayback_machine_downloader/tidy_bytes' require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex' require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api' require_relative 'wayback_machine_downloader/archive_api'
@@ -116,7 +117,7 @@ class WaybackMachineDownloader
include ArchiveAPI include ArchiveAPI
include SubdomainProcessor include SubdomainProcessor
VERSION = "2.4.1" VERSION = "2.4.2"
DEFAULT_TIMEOUT = 30 DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_DELAY = 2 RETRY_DELAY = 2
@@ -171,12 +172,19 @@ class WaybackMachineDownloader
def backup_name def backup_name
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
raw = if url_to_process.include?('//')
if url_to_process.include? '//'
url_to_process.split('/')[2] url_to_process.split('/')[2]
else else
url_to_process url_to_process
end end
# sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
if Gem.win_platform?
raw = raw.gsub(/[:*?"<>|]/, '_')
raw = raw.gsub(/[ .]+\z/, '')
end
raw = 'site' if raw.nil? || raw.empty?
raw
end end
def backup_path def backup_path
@@ -768,18 +776,83 @@ class WaybackMachineDownloader
# safely sanitize a file id (or id+timestamp) # safely sanitize a file id (or id+timestamp)
def sanitize_and_prepare_id(raw, file_url) def sanitize_and_prepare_id(raw, file_url)
return nil if raw.nil? return nil if raw.nil? || raw.empty?
original = raw.dup
begin begin
raw = CGI.unescape(raw) rescue raw # work on a binary copy to avoid premature encoding errors
raw.gsub!(/<[^>]*>/, '') raw = raw.dup.force_encoding(Encoding::BINARY)
raw = raw.tidy_bytes unless raw.empty?
# percent-decode (repeat until stable in case of double-encoding)
loop do
decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
break if decoded == raw
raw = decoded
end
# try tidy_bytes
begin
raw = raw.tidy_bytes
rescue StandardError
# fallback: scrub to UTF-8
raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
end
# ensure UTF-8 and scrub again
unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
end
# strip HTML/comment artifacts & control chars
raw.gsub!(/<!--+/, '')
raw.gsub!(/[\x00-\x1F]/, '')
# split query; hash it for stable short name
path_part, query_part = raw.split('?', 2)
if query_part && !query_part.empty?
q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
if path_part.include?('.')
pre, _sep, post = path_part.rpartition('.')
path_part = "#{pre}__q#{q_digest}.#{post}"
else
path_part = "#{path_part}__q#{q_digest}"
end
end
raw = path_part
# collapse slashes & trim leading slash
raw.gsub!(%r{/+}, '/')
raw.sub!(%r{\A/}, '')
# segment-wise sanitization
raw = raw.split('/').map do |segment|
seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
seg.empty? ? '_' : seg
end.join('/')
# remove any remaining angle brackets
raw.tr!('<>', '')
# final fallback if empty
raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
raw raw
rescue => e rescue => e
@logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}") @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
nil # deterministic fallback: never return nil, so the caller won't mark it as malformed
"file__#{Digest::SHA1.hexdigest(original)[0,10]}"
end end
end end
# Returns a display-only form of +url+: when it contains an ampersand
# (a character that commonly breaks unquoted Windows CMD usage), the URL
# is wrapped in parentheses; otherwise it is returned unchanged.
# A nil/false +url+ is passed straight through. This affects display only;
# the user must still quote the URL when invoking the command manually.
def safe_display_url(url)
  has_ampersand = url && url.match?(/[&]/)
  has_ampersand ? "(#{url})" : url
end
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0) def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
retries = 0 retries = 0
begin begin

View File

@@ -1,6 +1,6 @@
Gem::Specification.new do |s| Gem::Specification.new do |s|
s.name = "wayback_machine_downloader_straw" s.name = "wayback_machine_downloader_straw"
s.version = "2.4.1" s.version = "2.4.2"
s.executables << "wayback_machine_downloader" s.executables << "wayback_machine_downloader"
s.summary = "Download an entire website from the Wayback Machine." s.summary = "Download an entire website from the Wayback Machine."
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)" s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"