Sanitize file_id

We were not consistently handling non-UTF-8 characters here, especially after commit e4487baafcab64d2b81a5fd7a6b572ac8fa772e2. This also fixes #25.
This commit is contained in:
Felipe 2025-07-26 20:58:50 +00:00 committed by GitHub
parent d3466b3387
commit c30ee73977

View File

@ -115,7 +115,7 @@ class WaybackMachineDownloader
include ArchiveAPI include ArchiveAPI
include SubdomainProcessor include SubdomainProcessor
VERSION = "2.3.12" VERSION = "2.3.11"
DEFAULT_TIMEOUT = 30 DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_DELAY = 2 RETRY_DELAY = 2
@ -352,16 +352,6 @@ class WaybackMachineDownloader
file_versions.values file_versions.values
end end
# Builds the file list for the composite snapshot at target_timestamp.
# Entries are ordered by their timestamp (compared as strings), newest first,
# and each info hash is tagged in place with the :file_id it was keyed under.
def get_file_list_composite_snapshot(target_timestamp)
  snapshot_files = get_composite_snapshot_file_list(target_timestamp)
  newest_first = snapshot_files.sort_by { |_id, info| info[:timestamp].to_s }.reverse
  newest_first.map do |id, info|
    info[:file_id] = id
    info
  end
end
def get_file_list_curated def get_file_list_curated
file_list_curated = Hash.new file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
@ -694,6 +684,7 @@ class WaybackMachineDownloader
file_timestamp = file_remote_info[:timestamp] file_timestamp = file_remote_info[:timestamp]
# sanitize file_id to ensure it is a valid path component # sanitize file_id to ensure it is a valid path component
file_id = file_id.tidy_bytes if file_id
raw_path_elements = file_id.split('/') raw_path_elements = file_id.split('/')
sanitized_path_elements = raw_path_elements.map do |element| sanitized_path_elements = raw_path_elements.map do |element|