From c30ee7397749c4794ee6de8cf96e12047a67a076 Mon Sep 17 00:00:00 2001 From: Felipe <41008398+StrawberryMaster@users.noreply.github.com> Date: Sat, 26 Jul 2025 20:58:50 +0000 Subject: [PATCH] Sanitize file_id We were not consistently handling non-UTF-8 characters here, especially after commit e4487baafcab64d2b81a5fd7a6b572ac8fa772e2. This also fixes #25. --- lib/wayback_machine_downloader.rb | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index eee812d..cf6c2e4 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -115,7 +115,7 @@ class WaybackMachineDownloader include ArchiveAPI include SubdomainProcessor - VERSION = "2.3.12" + VERSION = "2.3.11" DEFAULT_TIMEOUT = 30 MAX_RETRIES = 3 RETRY_DELAY = 2 @@ -352,16 +352,6 @@ class WaybackMachineDownloader file_versions.values end - # Returns a list of files for the composite snapshot - def get_file_list_composite_snapshot(target_timestamp) - file_list = get_composite_snapshot_file_list(target_timestamp) - file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse - file_list.map do |file_remote_info| - file_remote_info[1][:file_id] = file_remote_info[0] - file_remote_info[1] - end - end - def get_file_list_curated file_list_curated = Hash.new get_all_snapshots_to_consider.each do |file_timestamp, file_url| @@ -694,6 +684,7 @@ class WaybackMachineDownloader file_timestamp = file_remote_info[:timestamp] # sanitize file_id to ensure it is a valid path component + file_id = file_id.tidy_bytes if file_id raw_path_elements = file_id.split('/') sanitized_path_elements = raw_path_elements.map do |element|