# frozen_string_literal: true

# Best-effort repair of strings that are meant to be UTF-8 but contain stray
# Windows-1252 (CP-1252) or ISO-8859-1 bytes.
#
# Including this module adds +tidy_bytes+ and +tidy_bytes!+ to the including
# class; at the bottom of this file it is mixed into String.
module TidyBytes
  # Lookup table indexed by byte value (0..159). Entries 128..159 hold the
  # UTF-8 byte sequence of the corresponding CP-1252 character; every other
  # entry, including the five code points CP-1252 leaves undefined (129, 141,
  # 143, 144, 157), is nil. The table and its inner arrays are frozen.
  CP1252_MAP = Array.new(160).tap do |map|
    {
      128 => [226, 130, 172], # €
      130 => [226, 128, 154], # ‚
      131 => [198, 146],      # ƒ
      132 => [226, 128, 158], # „
      133 => [226, 128, 166], # …
      134 => [226, 128, 160], # †
      135 => [226, 128, 161], # ‡
      136 => [203, 134],      # ˆ
      137 => [226, 128, 176], # ‰
      138 => [197, 160],      # Š
      139 => [226, 128, 185], # ‹
      140 => [197, 146],      # Œ
      142 => [197, 189],      # Ž
      145 => [226, 128, 152], # ‘
      146 => [226, 128, 153], # ’
      147 => [226, 128, 156], # “
      148 => [226, 128, 157], # ”
      149 => [226, 128, 162], # •
      150 => [226, 128, 147], # –
      151 => [226, 128, 148], # —
      152 => [203, 156],      # ˜
      153 => [226, 132, 162], # ™
      154 => [197, 161],      # š
      155 => [226, 128, 186], # ›
      156 => [197, 147],      # œ
      158 => [197, 190],      # ž
      159 => [197, 184]       # Ÿ
    }.each { |byte, utf8| map[byte] = utf8.freeze }
  end.freeze

  # Returns a copy of the receiver in which every byte that is not part of a
  # well-formed UTF-8 sequence has been re-encoded as if it were a CP-1252 /
  # ISO-8859-1 character. Well-formed UTF-8 is left untouched. Bytes that have
  # no CP-1252 meaning are dropped.
  #
  # Passing +true+ forcibly re-encodes *every* byte, assuming the whole string
  # is CP-1252 or ISO-8859-1 (already-valid UTF-8 will be double-encoded).
  #
  # @param force [Boolean] treat all bytes as CP-1252 instead of only the invalid ones
  # @return [String, nil] a UTF-8 encoded string, or nil when the receiver is empty
  def tidy_bytes(force = false)
    # Callers rely on nil (not "") for empty input.
    return nil if empty?

    return tidy_byte_run(self) if force

    # Work on the raw bytes regardless of the encoding the receiver is tagged
    # with. +scrub+ yields each run of bytes that does not form a well-formed
    # UTF-8 sequence, so only those runs are rewritten.
    dup.force_encoding(Encoding::UTF_8).scrub { |invalid| tidy_byte_run(invalid) }
  end

  # Tidy bytes in place.
  #
  # @param force [Boolean] see #tidy_bytes
  # @return [String] the receiver (unchanged when it is empty)
  def tidy_bytes!(force = false)
    result = tidy_bytes(force)
    result ? replace(result) : self
  end

  private

  # Maps a single byte, interpreted as CP-1252 / ISO-8859-1, to the bytes of
  # its UTF-8 encoding.
  #
  # @param byte [Integer] value in 0..255
  # @return [Array<Integer>, nil] UTF-8 bytes, or nil when CP-1252 does not
  #   define the code point
  def tidy_byte(byte)
    if byte < 128
      [byte] # ASCII is identical in CP-1252 and UTF-8
    elsif byte < 160
      CP1252_MAP[byte]
    elsif byte < 192
      [194, byte] # U+00A0..U+00BF
    else
      [195, byte - 64] # U+00C0..U+00FF
    end
  end

  # Re-encodes every byte of +raw+ with #tidy_byte, skipping undefined code
  # points, and returns the result tagged as UTF-8.
  def tidy_byte_run(raw)
    raw.bytes
       .flat_map { |byte| tidy_byte(byte) || [] }
       .pack("C*")
       .force_encoding(Encoding::UTF_8)
  end
end

class String
  include TidyBytes
end