diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb
index dbc97af..48b5d35 100644
--- a/lib/wayback_machine_downloader/tidy_bytes.rb
+++ b/lib/wayback_machine_downloader/tidy_bytes.rb
@@ -2,24 +2,61 @@
 
 # essentially, this is for converting a string with a potentially
 # broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
+  UNICODE_REPLACEMENT_CHARACTER = "�"
+
+  # Candidate source encodings, tried in order by #tidy_bytes; the first one
+  # that decodes the bytes without error wins, so the order matters.
+  # NOTE(review): Windows-1251 assigns a character to almost every byte value,
+  # so it probably shadows the multibyte encodings listed after it -- confirm
+  # that this ordering is intended.
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030,      # Simplified Chinese
+    Encoding::Shift_JIS,    # Japanese
+    Encoding::EUC_KR,       # Korean
+    # Windows-1252 has to come before ISO-8859-1: ISO-8859-1 accepts every
+    # byte, so anything listed after it is never tried.
+    Encoding::Windows_1252, # Western European/Latin1 superset
+    Encoding::ISO_8859_1    # Western European
+  ].freeze
+
+  # returns true if the string appears to be binary (has null bytes);
+  # the search runs on a binary-tagged copy so strings in encodings that are
+  # not ASCII-compatible (e.g. UTF-16) do not raise Encoding::CompatibilityError
+  def binary_data?
+    b.include?("\x00")
+  end
+
+  # attempts to return a valid UTF-8 version of the string:
+  # - self, when it is already valid UTF-8
+  # - a BINARY-tagged copy, when the data looks binary (see binary_data?)
+  # - otherwise a new UTF-8 string decoded with the first entry of
+  #   COMMON_ENCODINGS that fits the bytes
   def tidy_bytes
-    # return if the string is already valid UTF-8
-    return self if self.valid_encoding? && self.encoding == Encoding::UTF_8
+    return self if encoding == Encoding::UTF_8 && valid_encoding?
+    return dup.force_encoding(Encoding::BINARY) if binary_data?
 
-    # create a mutable copy so we don't modify the original string
     str = self.dup
-
-    # attempt to encode to UTF-8
-    begin
-      return str.encode(Encoding::UTF-8)
-    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      # same-encoding #encode does not validate, so check the bytes first
+      next unless str.valid_encoding?
+
+      begin
+        # strict transcode: an error (rather than a U+FFFD scan of the result)
+        # marks a bad guess, so input that really contains U+FFFD is kept
+        return str.encode(Encoding::UTF_8)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        next
+      end
     end
 
-    # if it failed, force the encoding to ISO-8859-1, transcode the
-    # string to UTF-8, and use replacement options for any characters
-    # that might still be problematic
-    str.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '�')
+    # no encoding decoded cleanly: treat the bytes as UTF-8 and replace
+    # every invalid sequence with the replacement character
+    str.force_encoding(Encoding::UTF_8).scrub(UNICODE_REPLACEMENT_CHARACTER)
   end
 
   def tidy_bytes!
@@ -43,4 +80,4 @@ end
 
 class String
   include TidyBytes
-end
+end
\ No newline at end of file