Fixes for tidy_bytes

Admittedly not the cleanest way to do this, but it works for #25.
This commit is contained in:
Felipe 2025-07-31 12:58:22 -03:00
parent bed3f6101c
commit 1f4202908f
No known key found for this signature in database
GPG Key ID: 4A41D9F5AD5E5D67

View File

@ -2,24 +2,56 @@
# essentially, this is for converting a string with a potentially # essentially, this is for converting a string with a potentially
# broken or unknown encoding into a valid UTF-8 string # broken or unknown encoding into a valid UTF-8 string
# @todo: consider using charlock_holmes for this in the future
module TidyBytes module TidyBytes
# U+FFFD REPLACEMENT CHARACTER, written as an escape so the literal cannot be
# damaged by tools that mishandle its raw bytes (EF BF BD in UTF-8)
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD".freeze
# common encodings to try, in order, for best multilingual compatibility
#
# ISO-8859-1 assigns a character to every possible byte, so it always decodes
# "cleanly"; it has to stay last or nothing listed after it could ever be tried.
COMMON_ENCODINGS = [
  Encoding::UTF_8,
  Encoding::Windows_1251, # Cyrillic/Russian legacy
  Encoding::GB18030,      # Simplified Chinese
  Encoding::Shift_JIS,    # Japanese
  Encoding::EUC_KR,       # Korean
  Encoding::Windows_1252, # Western European/Latin1 superset
  Encoding::ISO_8859_1    # Western European, catch-all
].freeze
# returns true if the string appears to be binary (has null bytes)
#
# The search runs on a BINARY-tagged copy of the bytes so that it works for
# any receiver encoding; comparing directly raises
# Encoding::CompatibilityError for non-ASCII-compatible encodings such as
# UTF-16.
#
# @return [Boolean]
def binary_data?
  b.include?("\x00")
end
# attempts to return a valid UTF-8 version of the string
#
# Resolution order:
# 1. a string that is already valid UTF-8 is returned as-is (same object);
# 2. a string containing null bytes is treated as binary data and returned
#    as a BINARY-tagged copy, not as UTF-8;
# 3. a string that is valid in its own declared encoding is transcoded
#    from that encoding;
# 4. otherwise each encoding in COMMON_ENCODINGS is tried in order and the
#    first one that decodes the bytes without any invalid or unmappable
#    sequence wins;
# 5. as a last resort the bytes are read as UTF-8 and every invalid
#    sequence is replaced with UNICODE_REPLACEMENT_CHARACTER.
#
# The receiver is never modified.
#
# @return [String] valid UTF-8, or a BINARY copy when null bytes are present
def tidy_bytes
  return self if encoding == Encoding::UTF_8 && valid_encoding?
  return dup.force_encoding(Encoding::BINARY) if binary_data?

  # trust the declared encoding before guessing; this fails for e.g. BINARY
  # strings with high bytes, in which case we fall through to the guesses
  if valid_encoding?
    begin
      return encode(Encoding::UTF_8)
    rescue EncodingError
      # declared encoding cannot be converted; guess instead
    end
  end

  COMMON_ENCODINGS.each do |enc|
    candidate = dup.force_encoding(enc)
    next unless candidate.valid_encoding?

    begin
      # strict conversion: an unmappable byte raises instead of silently
      # becoming a replacement character, so text that legitimately
      # contains U+FFFD is not mistaken for a failed decode
      return candidate.encode(Encoding::UTF_8)
    rescue EncodingError
      next
    end
  end

  # fallback: replace all invalid bytes
  dup.force_encoding(Encoding::UTF_8).scrub(UNICODE_REPLACEMENT_CHARACTER)
end
def tidy_bytes! def tidy_bytes!
@ -43,4 +75,4 @@ end
class String class String
include TidyBytes include TidyBytes
end end