Minor refactoring of HTML tag sanitization

This commit is contained in:
Felipe 2025-08-12 08:42:27 -03:00
parent 6ad312f31f
commit 9fd2a7f8d1
No known key found for this signature in database
GPG Key ID: 4A41D9F5AD5E5D67

View File

@ -340,16 +340,15 @@ class WaybackMachineDownloader
get_all_snapshots_to_consider.each do |file_timestamp, file_url| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/') next unless file_url.include?('/')
next if file_timestamp.to_i > target_timestamp next if file_timestamp.to_i > target_timestamp
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id raw_tail = file_url.split('/')[3..-1]&.join('/')
file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags file_id = sanitize_and_prepare_id(raw_tail, file_url)
file_id = file_id.tidy_bytes unless file_id == ""
next if file_id.nil? next if file_id.nil?
next if match_exclude_filter(file_url) next if match_exclude_filter(file_url)
next unless match_only_filter(file_url) next unless match_only_filter(file_url)
# Select the most recent version <= target_timestamp
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id} file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
end end
end end
file_versions.values file_versions.values
@ -369,25 +368,27 @@ class WaybackMachineDownloader
file_list_curated = Hash.new file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/') next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id raw_tail = file_url.split('/')[3..-1]&.join('/')
file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags file_id = sanitize_and_prepare_id(raw_tail, file_url)
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil? if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}" puts "Malformed file url, ignoring: #{file_url}"
elsif file_id.include?('<') || file_id.include?('>') next
end
if file_id.include?('<') || file_id.include?('>')
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
else else
if match_exclude_filter(file_url) if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}" puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url) elsif !match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}" puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id] elsif file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
end end
else else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
end end
end end
end end
@ -398,24 +399,32 @@ class WaybackMachineDownloader
file_list_curated = Hash.new file_list_curated = Hash.new
get_all_snapshots_to_consider.each do |file_timestamp, file_url| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless file_url.include?('/') next unless file_url.include?('/')
file_id = file_url.split('/')[3..-1].join('/')
file_id_and_timestamp = [file_timestamp, file_id].join('/') raw_tail = file_url.split('/')[3..-1]&.join('/')
file_id_and_timestamp = CGI::unescape file_id_and_timestamp file_id = sanitize_and_prepare_id(raw_tail, file_url)
file_id_and_timestamp.gsub!(/<[^>]*>/, '') # sanitize HTML tags
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
if file_id.nil? if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}" puts "Malformed file url, ignoring: #{file_url}"
elsif file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>') next
end
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
if file_id_and_timestamp.nil?
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
next
end
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
else else
if match_exclude_filter(file_url) if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}" puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url) elsif !match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}" puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id_and_timestamp] elsif file_list_curated[file_id_and_timestamp]
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose # duplicate combo, ignore silently (verbose flag not shown here)
else else
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp} file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
end end
end end
end end
@ -756,6 +765,20 @@ class WaybackMachineDownloader
end end
logger logger
end end
# Turns a raw file id (or a "timestamp/id" combination) into its cleaned form.
#
# Steps, in order:
#   1. URL-decode the value with CGI.unescape; if decoding raises, carry on
#      with the value exactly as it was passed in.
#   2. Strip every "<...>" sequence (HTML-tag-shaped text) in place.
#   3. Call String#tidy_bytes on a non-empty result. That method is defined
#      outside this view — presumably it repairs invalid byte sequences;
#      confirm against its definition.
#
# raw      - the candidate id; nil is accepted and yields nil.
# file_url - only interpolated into the warning message on failure.
#
# Returns the cleaned string (possibly empty), or nil when +raw+ is nil or
# when any step after decoding raises a StandardError. In the failure case a
# warning is sent to @logger if one is set.
def sanitize_and_prepare_id(raw, file_url)
  return if raw.nil?

  # Decoding is best-effort: a failure here must not discard the id.
  decoded =
    begin
      CGI.unescape(raw)
    rescue StandardError
      raw
    end

  # In-place removal; the nil that gsub! returns on "no match" is ignored.
  decoded.gsub!(/<[^>]*>/, '')

  decoded.empty? ? decoded : decoded.tidy_bytes
rescue => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  nil
end
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0) def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
retries = 0 retries = 0