diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 50a13c2..376fcce 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -11,6 +11,7 @@ require 'concurrent-ruby' require 'logger' require 'zlib' require 'stringio' +require 'digest' require_relative 'wayback_machine_downloader/tidy_bytes' require_relative 'wayback_machine_downloader/to_regex' require_relative 'wayback_machine_downloader/archive_api' @@ -171,12 +172,19 @@ class WaybackMachineDownloader def backup_name url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url - - if url_to_process.include? '//' + raw = if url_to_process.include?('//') url_to_process.split('/')[2] else url_to_process end + + # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port) + if Gem.win_platform? + raw = raw.gsub(/[:*?"<>|]/, '_') + raw = raw.gsub(/[ .]+\z/, '') + end + raw = 'site' if raw.nil? || raw.empty? + raw end def backup_path @@ -768,18 +776,83 @@ class WaybackMachineDownloader # safely sanitize a file id (or id+timestamp) def sanitize_and_prepare_id(raw, file_url) - return nil if raw.nil? + return nil if raw.nil? || raw.empty? + original = raw.dup begin - raw = CGI.unescape(raw) rescue raw - raw.gsub!(/<[^>]*>/, '') - raw = raw.tidy_bytes unless raw.empty? + # work on a binary copy to avoid premature encoding errors + raw = raw.dup.force_encoding(Encoding::BINARY) + + # percent-decode (repeat until stable in case of double-encoding) + loop do + decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') } + break if decoded == raw + raw = decoded + end + + # try tidy_bytes + begin + raw = raw.tidy_bytes + rescue StandardError + # fallback: scrub to UTF-8 + raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '') + end + + # ensure UTF-8 and scrub again + unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding? + raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '') + end + + # strip HTML/comment artifacts & control chars + raw.gsub!(/