Correct file path and sanitization in Windows

Not only we weren't normalizing the file directories, we were also agressively sanitizing incorrect characters, leading to some funny stuff on Windows. Fixes #16
This commit is contained in:
Felipe 2025-06-15 13:48:11 +00:00 committed by GitHub
parent 3fdfd70fc1
commit 18357a77ed

View File

@ -131,7 +131,11 @@ class WaybackMachineDownloader
validate_params(params) validate_params(params)
@base_url = params[:base_url] @base_url = params[:base_url]
@exact_url = params[:exact_url] @exact_url = params[:exact_url]
@directory = params[:directory] if params[:directory]
@directory = File.expand_path(params[:directory])
else
@directory = nil
end
@all_timestamps = params[:all_timestamps] @all_timestamps = params[:all_timestamps]
@from_timestamp = params[:from_timestamp].to_i @from_timestamp = params[:from_timestamp].to_i
@to_timestamp = params[:to_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i
@ -165,13 +169,11 @@ class WaybackMachineDownloader
def backup_path def backup_path
if @directory if @directory
if @directory[-1] == '/' # because @directory is already an absolute path, we just ensure it exists
@directory @directory
else else
@directory + '/' # ensure the default path is absolute and normalized
end File.expand_path(File.join('websites', backup_name))
else
'websites/' + backup_name + '/'
end end
end end
@ -638,21 +640,35 @@ class WaybackMachineDownloader
file_url = file_remote_info[:file_url].encode(current_encoding) file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id] file_id = file_remote_info[:file_id]
file_timestamp = file_remote_info[:timestamp] file_timestamp = file_remote_info[:timestamp]
file_path_elements = file_id.split('/')
# sanitize file_id to ensure it is a valid path component
raw_path_elements = file_id.split('/')
sanitized_path_elements = raw_path_elements.map do |element|
if Gem.win_platform?
# for Windows, we need to sanitize path components to avoid invalid characters
# this prevents issues with file names that contain characters not allowed in
# Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
element.gsub(/[:\*?"<>\|\&\=\/\\]/, ->(match) { '%' + match.ord.to_s(16).upcase })
else
element
end
end
current_backup_path = backup_path
if file_id == "" if file_id == ""
dir_path = backup_path dir_path = current_backup_path
file_path = backup_path + 'index.html' file_path = File.join(dir_path, 'index.html')
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.' elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
dir_path = backup_path + file_path_elements[0..-1].join('/') # if file_id is a directory, we treat it as such
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html' dir_path = File.join(current_backup_path, *sanitized_path_elements)
file_path = File.join(dir_path, 'index.html')
else else
dir_path = backup_path + file_path_elements[0..-2].join('/') # if file_id is a file, we treat it as such
file_path = backup_path + file_path_elements[0..-1].join('/') filename = sanitized_path_elements.pop
end dir_path = File.join(current_backup_path, *sanitized_path_elements)
if Gem.win_platform? file_path = File.join(dir_path, filename)
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
end end
# check existence *before* download attempt # check existence *before* download attempt