diff --git a/lib/wayback_machine_downloader/tidy_bytes.rb b/lib/wayback_machine_downloader/tidy_bytes.rb index 52222ef..dbc97af 100644 --- a/lib/wayback_machine_downloader/tidy_bytes.rb +++ b/lib/wayback_machine_downloader/tidy_bytes.rb @@ -1,78 +1,46 @@ # frozen_string_literal: true +# essentially, this is for converting a string with a potentially +# broken or unknown encoding into a valid UTF-8 string module TidyBytes - # precomputing CP1252 to UTF-8 mappings for bytes 128-159 - CP1252_MAP = (128..159).map do |byte| - case byte - when 128 then [226, 130, 172] # EURO SIGN - when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK - when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK - when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK - when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS - when 134 then [226, 128, 160] # DAGGER - when 135 then [226, 128, 161] # DOUBLE DAGGER - when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT - when 137 then [226, 128, 176] # PER MILLE SIGN - when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON - when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK - when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE - when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON - when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK - when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK - when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK - when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK - when 149 then [226, 128, 162] # BULLET - when 150 then [226, 128, 147] # EN DASH - when 151 then [226, 128, 148] # EM DASH - when 152 then [203, 156] # SMALL TILDE - when 153 then [226, 132, 162] # TRADE MARK SIGN - when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON - when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - when 156 then [197, 147] # LATIN SMALL LIGATURE OE - when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON - when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS - else nil # ANYTHING ELSE... - end - end.freeze + def tidy_bytes + # return if the string is already valid UTF-8 + return self if self.valid_encoding? && self.encoding == Encoding::UTF_8 - # precomputing all possible byte conversions - CP1252_TO_UTF8 = Array.new(256) do |b| - if (128..159).cover?(b) - CP1252_MAP[b - 128]&.pack('C*') - elsif b < 128 - b.chr - else - b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*') + # create a mutable copy so we don't modify the original string + str = self.dup + + # attempt to encode to UTF-8 + begin + return str.encode(Encoding::UTF-8) + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError end - end.freeze + + # if it failed, force the encoding to ISO-8859-1, transcode the + # string to UTF-8, and use replacement options for any characters + # that might still be problematic + str.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '�') + end + + def tidy_bytes! + replace(self.tidy_bytes) + end def self.included(base) - base.class_eval do - def tidy_bytes(force = false) - return nil if empty? - - if force - buffer = String.new(capacity: bytesize) - each_byte { |b| buffer << CP1252_TO_UTF8[b] } - return buffer.force_encoding(Encoding::UTF_8) - end + base.send(:include, InstanceMethods) + end - begin - encode('UTF-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - buffer = String.new(capacity: bytesize) - scrub { |b| CP1252_TO_UTF8[b.ord] } - end - end + module InstanceMethods + def tidy_bytes + TidyBytes.instance_method(:tidy_bytes).bind(self).call + end - def tidy_bytes!(force = false) - result = tidy_bytes(force) - result ? replace(result) : self - end + def tidy_bytes! + TidyBytes.instance_method(:tidy_bytes!).bind(self).call end end end class String include TidyBytes -end \ No newline at end of file +end