mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
More aggressive sanitization
This should hopefully deal with some of the issues we've seen. What a ride!
This commit is contained in:
parent
c731e0c7bd
commit
6bc08947b7
@ -11,6 +11,7 @@ require 'concurrent-ruby'
|
|||||||
require 'logger'
|
require 'logger'
|
||||||
require 'zlib'
|
require 'zlib'
|
||||||
require 'stringio'
|
require 'stringio'
|
||||||
|
require 'digest'
|
||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
require_relative 'wayback_machine_downloader/to_regex'
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
require_relative 'wayback_machine_downloader/archive_api'
|
require_relative 'wayback_machine_downloader/archive_api'
|
||||||
@ -171,12 +172,19 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
||||||
|
raw = if url_to_process.include?('//')
|
||||||
if url_to_process.include? '//'
|
|
||||||
url_to_process.split('/')[2]
|
url_to_process.split('/')[2]
|
||||||
else
|
else
|
||||||
url_to_process
|
url_to_process
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# sanitize on Windows only (the result is also a safe name on other platforms) to avoid ENOTDIR on mkdir (colon in host:port)
|
||||||
|
if Gem.win_platform?
|
||||||
|
raw = raw.gsub(/[:*?"<>|]/, '_')
|
||||||
|
raw = raw.gsub(/[ .]+\z/, '')
|
||||||
|
end
|
||||||
|
raw = 'site' if raw.nil? || raw.empty?
|
||||||
|
raw
|
||||||
end
|
end
|
||||||
|
|
||||||
def backup_path
|
def backup_path
|
||||||
@ -768,18 +776,83 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
# safely sanitize a file id (or id+timestamp)
|
# safely sanitize a file id (or id+timestamp)
|
||||||
def sanitize_and_prepare_id(raw, file_url)
|
def sanitize_and_prepare_id(raw, file_url)
|
||||||
return nil if raw.nil?
|
return nil if raw.nil? || raw.empty?
|
||||||
|
original = raw.dup
|
||||||
begin
|
begin
|
||||||
raw = CGI.unescape(raw) rescue raw
|
# work on a binary copy to avoid premature encoding errors
|
||||||
raw.gsub!(/<[^>]*>/, '')
|
raw = raw.dup.force_encoding(Encoding::BINARY)
|
||||||
raw = raw.tidy_bytes unless raw.empty?
|
|
||||||
|
# percent-decode (repeat until stable in case of double-encoding)
|
||||||
|
loop do
|
||||||
|
decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
|
||||||
|
break if decoded == raw
|
||||||
|
raw = decoded
|
||||||
|
end
|
||||||
|
|
||||||
|
# try tidy_bytes
|
||||||
|
begin
|
||||||
|
raw = raw.tidy_bytes
|
||||||
|
rescue StandardError
|
||||||
|
# fallback: scrub to UTF-8
|
||||||
|
raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
|
||||||
|
end
|
||||||
|
|
||||||
|
# ensure UTF-8 and scrub again
|
||||||
|
unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
|
||||||
|
raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
|
||||||
|
end
|
||||||
|
|
||||||
|
# strip HTML/comment artifacts & control chars
|
||||||
|
raw.gsub!(/<!--+/, '')
|
||||||
|
raw.gsub!(/[\x00-\x1F]/, '')
|
||||||
|
|
||||||
|
# split query; hash it for stable short name
|
||||||
|
path_part, query_part = raw.split('?', 2)
|
||||||
|
if query_part && !query_part.empty?
|
||||||
|
q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
|
||||||
|
if path_part.include?('.')
|
||||||
|
pre, _sep, post = path_part.rpartition('.')
|
||||||
|
path_part = "#{pre}__q#{q_digest}.#{post}"
|
||||||
|
else
|
||||||
|
path_part = "#{path_part}__q#{q_digest}"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
raw = path_part
|
||||||
|
|
||||||
|
# collapse slashes & trim leading slash
|
||||||
|
raw.gsub!(%r{/+}, '/')
|
||||||
|
raw.sub!(%r{\A/}, '')
|
||||||
|
|
||||||
|
# segment-wise sanitation
|
||||||
|
raw = raw.split('/').map do |segment|
|
||||||
|
seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
|
||||||
|
seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
|
||||||
|
seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
|
||||||
|
seg.empty? ? '_' : seg
|
||||||
|
end.join('/')
|
||||||
|
|
||||||
|
# remove any remaining angle brackets
|
||||||
|
raw.tr!('<>', '')
|
||||||
|
|
||||||
|
# final fallback if empty
|
||||||
|
raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
|
||||||
|
|
||||||
raw
|
raw
|
||||||
rescue => e
|
rescue => e
|
||||||
@logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
|
@logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
|
||||||
nil
|
# deterministic fallback – never return nil, so the caller won't mark it as malformed
|
||||||
|
"file__#{Digest::SHA1.hexdigest(original)[0,10]}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# wrap URL in parentheses if it contains characters that commonly break unquoted
|
||||||
|
# Windows CMD usage (e.g., &). This is only for display; user still must quote
|
||||||
|
# when invoking manually.
|
||||||
|
def safe_display_url(url)
|
||||||
|
return url unless url && url.match?(/[&]/)
|
||||||
|
"(#{url})"
|
||||||
|
end
|
||||||
|
|
||||||
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
||||||
retries = 0
|
retries = 0
|
||||||
begin
|
begin
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user