Fixes for tidy_bytes

Admittedly not the cleanest way to do this, but it works for #25.
2025-12-29 16:16:06 +00:00 · 2025-07-31 12:58:22 -03:00 · 2025-07-31 12:58:22 -03:00 · 1f4202908f
commit 1f4202908f
parent bed3f6101c
1 changed files with 45 additions and 13 deletions
--- a/lib/wayback_machine_downloader/tidy_bytes.rb
+++ b/lib/wayback_machine_downloader/tidy_bytes.rb
@@ -2,24 +2,56 @@
 # essentially, this is for converting a string with a potentially
 # broken or unknown encoding into a valid UTF-8 string
 # @todo: consider using charlock_holmes for this in the future
 module TidyBytes
  UNICODE_REPLACEMENT_CHARACTER = "\uFFFD"
  # common encodings to try for best multilingual compatibility
  COMMON_ENCODINGS = [
    Encoding::UTF_8,
    Encoding::Windows_1251, # Cyrillic/Russian legacy
    Encoding::GB18030,      # Simplified Chinese
    Encoding::Shift_JIS,    # Japanese
    Encoding::EUC_KR,       # Korean
    Encoding::ISO_8859_1,   # Western European
    Encoding::Windows_1252  # Western European/Latin1 superset
  ].select { |enc| Encoding.name_list.include?(enc.name) }
  # returns true if the string appears to be binary (has null bytes)
  def binary_data?
    self.include?("\x00".b)
  end
  # attempts to return a valid UTF-8 version of the string
  def tidy_bytes
-    # return if the string is already valid UTF-8
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
-    return self if self.valid_encoding? && self.encoding == Encoding::UTF_8
+    return self.dup.force_encoding("BINARY") if binary_data?
    # create a mutable copy so we don't modify the original string
    str = self.dup
-
+    COMMON_ENCODINGS.each do |enc|
-    # attempt to encode to UTF-8
+      str.force_encoding(enc)
-    begin
+      begin
-      return str.encode(Encoding::UTF-8)
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
-    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        # try next encoding
      end
    end
-    # if it failed, force the encoding to ISO-8859-1, transcode the
+    # if no clean conversion found, try again but accept replacement characters
-    # string to UTF-8, and use replacement options for any characters
+    str = self.dup
-    # that might still be problematic
+    COMMON_ENCODINGS.each do |enc|
-    str.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '<27>')
+      str.force_encoding(enc)
      begin
        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
        return utf8 if utf8.valid_encoding?
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        # try next encoding
      end
    end
    # fallback: replace all invalid/undefined bytes
    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
  end
  def tidy_bytes!
@@ -43,4 +75,4 @@ end
 class String
  include TidyBytes
-end
+end