2024-12-31 00:11:58 +00:00
|
|
|
|
# frozen_string_literal: true
|
2015-08-17 19:40:43 -05:00
|
|
|
|
|
2025-07-29 10:10:56 -03:00
|
|
|
|
# essentially, this is for converting a string with a potentially
|
|
|
|
|
|
# broken or unknown encoding into a valid UTF-8 string
|
2024-12-31 00:11:58 +00:00
|
|
|
|
module TidyBytes
  # Returns a valid UTF-8 version of the receiver.
  #
  # Strategy, in order:
  # 1. A string that is already tagged UTF-8 and valid is returned as-is
  #    (the very same object, no copy is made).
  # 2. Otherwise the string is transcoded from its current encoding to UTF-8.
  # 3. If that raises, or yields bytes that are still not valid UTF-8, the raw
  #    bytes are reinterpreted as ISO-8859-1 and transcoded to UTF-8.
  #
  # The receiver itself is never modified.
  #
  # @return [String] the receiver when it is already valid UTF-8, otherwise a
  #   new UTF-8 string
  def tidy_bytes
    return self if encoding == Encoding::UTF_8 && valid_encoding?

    begin
      transcoded = encode(Encoding::UTF_8)
      # Transcoding a UTF-8-tagged string to UTF-8 without options is a no-op
      # that keeps any invalid bytes, so only accept a validated result.
      return transcoded if transcoded.valid_encoding?
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
      # Fall through to the ISO-8859-1 reinterpretation below.
    end

    # force_encoding mutates its receiver, so work on a copy.
    # NOTE(review): the replacement literal looks like a garbled U+FFFD
    # replacement character -- confirm against the upstream file.
    dup.force_encoding(Encoding::ISO_8859_1)
       .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '<27>')
  end

  # In-place variant of #tidy_bytes: replaces the receiver's contents with the
  # tidied string.
  #
  # @return [String] the receiver
  def tidy_bytes!
    replace(tidy_bytes)
  end

  # Hook run when TidyBytes is included into +base+; it additionally mixes
  # InstanceMethods into the including class or module.
  #
  # @param base [Module] the class or module including TidyBytes
  def self.included(base)
    base.include(InstanceMethods)
  end

  # Thin delegators that dispatch explicitly to the TidyBytes implementations
  # above by binding them to the receiver.
  module InstanceMethods
    # @return [String] see TidyBytes#tidy_bytes
    def tidy_bytes
      TidyBytes.instance_method(:tidy_bytes).bind(self).call
    end

    # @return [String] see TidyBytes#tidy_bytes!
    def tidy_bytes!
      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
end
|
|
|
|
|
|
|
|
|
|
|
|
# Mix TidyBytes into String so every string responds to
# tidy_bytes and tidy_bytes!.
String.include(TidyBytes)
|