Sanitizing HTML tags

Some sites contain HTML tags *in* their URLs, which then fail to save on some systems, such as Windows.
2025-12-29 16:16:06 +00:00 · 2025-08-05 23:44:34 +00:00 · 2025-08-05 23:44:34 +00:00 · 6ad312f31f
commit 6ad312f31f
parent 62ea35daa6
1 changed file with 7 additions and 0 deletions
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -342,6 +342,7 @@ class WaybackMachineDownloader
      next if file_timestamp.to_i > target_timestamp
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags
      file_id = file_id.tidy_bytes unless file_id == ""
      next if file_id.nil?
      next if match_exclude_filter(file_url)
@@ -370,9 +371,12 @@ class WaybackMachineDownloader
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id.gsub!(/<[^>]*>/, '') # sanitize HTML tags
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      elsif file_id.include?('<') || file_id.include?('>')
        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
@@ -397,9 +401,12 @@ class WaybackMachineDownloader
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp.gsub!(/<[^>]*>/, '') # sanitize HTML tags
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      elsif file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"