diff --git a/lib/tidy_bytes.rb b/lib/tidy_bytes.rb new file mode 100644 index 0000000..f584103 --- /dev/null +++ b/lib/tidy_bytes.rb @@ -0,0 +1,122 @@ +module TibyBytes + + # CP-1252 decimal byte => UTF-8 approximation as an array of bytes + CP1252 = { + 128 => [226, 130, 172], + 129 => nil, + 130 => [226, 128, 154], + 131 => [198, 146], + 132 => [226, 128, 158], + 133 => [226, 128, 166], + 134 => [226, 128, 160], + 135 => [226, 128, 161], + 136 => [203, 134], + 137 => [226, 128, 176], + 138 => [197, 160], + 139 => [226, 128, 185], + 140 => [197, 146], + 141 => nil, + 142 => [197, 189], + 143 => nil, + 144 => nil, + 145 => [226, 128, 152], + 146 => [226, 128, 153], + 147 => [226, 128, 156], + 148 => [226, 128, 157], + 149 => [226, 128, 162], + 150 => [226, 128, 147], + 151 => [226, 128, 148], + 152 => [203, 156], + 153 => [226, 132, 162], + 154 => [197, 161], + 155 => [226, 128, 186], + 156 => [197, 147], + 157 => nil, + 158 => [197, 190], + 159 => [197, 184] + } + + module StringMixin + + # Attempt to replace invalid UTF-8 bytes with valid ones. This method + # naively assumes if you have invalid UTF8 bytes, they are either Windows + # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not + # always work. + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's + # encoding is CP-1252 or ISO-8859-1. + def tidy_bytes(force = false) + + if force + return unpack("C*").map do |b| + tidy_byte(b) + end.flatten.compact.pack("C*").unpack("U*").pack("U*") + end + + bytes = unpack("C*") + conts_expected = 0 + last_lead = 0 + + bytes.each_index do |i| + + byte = bytes[i] + is_ascii = byte < 128 + is_cont = byte > 127 && byte < 192 + is_lead = byte > 191 && byte < 245 + is_unused = byte > 240 + is_restricted = byte > 244 + + # Impossible or highly unlikely byte? Clean it. + if is_unused || is_restricted + bytes[i] = tidy_byte(byte) + elsif is_cont + # Not expecting contination byte? Clean up. Otherwise, now expect one less. + conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 + else + if conts_expected > 0 + # Expected continuation, but got ASCII or leading? Clean backwards up to + # the leading byte. + begin + (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} + rescue NoMethodError => e + next + end + conts_expected = 0 + end + if is_lead + # Final byte is leading? Clean it. + if i == bytes.length - 1 + bytes[i] = tidy_byte(bytes.last) + else + # Valid leading byte? Expect continuations determined by position of + # first zero bit, with max of 3. + conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 + last_lead = i + end + end + end + end + begin + bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") + rescue ArgumentError => e + nil + end + end + + # Tidy bytes in-place. + def tidy_bytes!(force = false) + replace tidy_bytes(force) + end + + private + + def tidy_byte(byte) + byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64] + end + + end +end + +class String + include TibyBytes::StringMixin +end