2024-12-31 00:11:58 +00:00
|
|
|
|
# frozen_string_literal: true
|
2015-08-17 19:40:43 -05:00
|
|
|
|
|
2025-07-29 10:10:56 -03:00
|
|
|
|
# Converts a string with a potentially broken or unknown encoding
# into a valid UTF-8 string.
#
# @todo Consider using charlock_holmes for this in the future.
|
2024-12-31 00:11:58 +00:00
|
|
|
|
# Mixin that adds best-effort "make this valid UTF-8" helpers to string-like
# objects. Including it also mixes in TidyBytes::InstanceMethods (see
# TidyBytes.included).
module TidyBytes
  # U+FFFD REPLACEMENT CHARACTER, substituted for bytes that cannot be
  # converted. Written as an escape so the constant cannot be corrupted by a
  # mis-encoded source file.
  UNICODE_REPLACEMENT_CHARACTER = "\uFFFD".freeze

  # common encodings to try for best multilingual compatibility.
  # Order matters: candidates are tried first to last and the first
  # acceptable result wins.
  COMMON_ENCODINGS = [
    Encoding::UTF_8,
    Encoding::Windows_1251, # Cyrillic/Russian legacy
    Encoding::GB18030,      # Simplified Chinese
    Encoding::Shift_JIS,    # Japanese
    Encoding::EUC_KR,       # Korean
    Encoding::ISO_8859_1,   # Western European
    Encoding::Windows_1252  # Western European/Latin1 superset
  ].select { |enc| Encoding.name_list.include?(enc.name) }.freeze

  # Options handed to String#encode for every conversion attempt: invalid and
  # undefined sequences become UNICODE_REPLACEMENT_CHARACTER instead of raising.
  ENCODE_OPTIONS = {
    invalid: :replace,
    undef: :replace,
    replace: UNICODE_REPLACEMENT_CHARACTER
  }.freeze

  # Re-tags +buffer+ as +encoding+ (this mutates the buffer's encoding, not
  # its bytes) and transcodes the result to UTF-8 using ENCODE_OPTIONS.
  #
  # @param buffer [String] mutable working copy of the bytes to decode
  # @param encoding [Encoding] encoding to interpret the bytes as
  # @return [String, nil] the UTF-8 string, or nil when the conversion raises
  #   one of the rescued encoding errors
  def self.utf8_from(buffer, encoding)
    buffer.force_encoding(encoding).encode(Encoding::UTF_8, **ENCODE_OPTIONS)
  rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
    nil
  end

  # returns true if the string appears to be binary (has null bytes)
  #
  # The check is done on a binary view of the receiver so it works on raw
  # bytes and does not raise Encoding::CompatibilityError for receivers in
  # non-ASCII-compatible encodings.
  #
  # @return [Boolean]
  def binary_data?
    b.include?("\x00")
  end

  # attempts to return a valid UTF-8 version of the string
  #
  # Strategy, in order:
  # 1. a string that is already valid UTF-8 is returned as-is (same object);
  # 2. a string containing null bytes is returned as a BINARY-tagged copy;
  # 3. each of COMMON_ENCODINGS is tried, accepting only a conversion that
  #    contains no replacement character;
  # 4. each of COMMON_ENCODINGS is tried again, accepting replacement
  #    characters;
  # 5. as a last resort the working copy is encoded with replacement.
  #
  # @return [String] the receiver itself in case 1, otherwise a new string
  def tidy_bytes
    return self if encoding == Encoding::UTF_8 && valid_encoding?
    return dup.force_encoding("BINARY") if binary_data?

    scratch = dup

    # First pass (lossy == false) demands a clean conversion; the second pass
    # (lossy == true) settles for one that contains replacement characters.
    [false, true].each do |lossy|
      COMMON_ENCODINGS.each do |enc|
        utf8 = TidyBytes.utf8_from(scratch, enc)
        next unless utf8&.valid_encoding?
        return utf8 if lossy || !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
      end
    end

    # fallback: replace all invalid/undefined bytes
    scratch.encode(Encoding::UTF_8, **ENCODE_OPTIONS)
  end

  # In-place variant of #tidy_bytes: replaces the receiver's contents with
  # the tidied string.
  #
  # @return [String] the receiver
  def tidy_bytes!
    replace(tidy_bytes)
  end

  # Hook run when TidyBytes is included: additionally mixes InstanceMethods
  # into the including class/module.
  #
  # @param base [Module] the class or module including TidyBytes
  def self.included(base)
    base.send(:include, InstanceMethods)
  end

  # Thin wrappers that dispatch explicitly to the TidyBytes implementations.
  # NOTE(review): presumably this exists so the TidyBytes versions are the
  # ones called even if another same-named method sits in the ancestor
  # chain; confirm intent.
  module InstanceMethods
    # @return [String] see TidyBytes#tidy_bytes
    def tidy_bytes
      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end

    # @return [String] see TidyBytes#tidy_bytes!
    def tidy_bytes!
      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
end
|
|
|
|
|
|
|
|
|
|
|
|
# Make #tidy_bytes, #tidy_bytes! and #binary_data? available on every String.
String.include(TidyBytes)
|