More testing

This commit is contained in:
Felipe 2024-12-31 00:11:58 +00:00
parent a78b62ae7a
commit 9bbb67cd90

View File

@ -1,42 +1,52 @@
module TibyBytes # frozen_string_literal: true
# CP-1252 decimal byte => UTF-8 approximation as an array of bytes module TidyBytes
CP1252 = { # Using a frozen array so we have an O(1) lookup time
128 => [226, 130, 172], CP1252_MAP = Array.new(160) do |i|
129 => nil, case i
130 => [226, 128, 154], when 128 then [226, 130, 172]
131 => [198, 146], when 130 then [226, 128, 154]
132 => [226, 128, 158], when 131 then [198, 146]
133 => [226, 128, 166], when 132 then [226, 128, 158]
134 => [226, 128, 160], when 133 then [226, 128, 166]
135 => [226, 128, 161], when 134 then [226, 128, 160]
136 => [203, 134], when 135 then [226, 128, 161]
137 => [226, 128, 176], when 136 then [203, 134]
138 => [197, 160], when 137 then [226, 128, 176]
139 => [226, 128, 185], when 138 then [197, 160]
140 => [197, 146], when 139 then [226, 128, 185]
141 => nil, when 140 then [197, 146]
142 => [197, 189], when 142 then [197, 189]
143 => nil, when 145 then [226, 128, 152]
144 => nil, when 146 then [226, 128, 153]
145 => [226, 128, 152], when 147 then [226, 128, 156]
146 => [226, 128, 153], when 148 then [226, 128, 157]
147 => [226, 128, 156], when 149 then [226, 128, 162]
148 => [226, 128, 157], when 150 then [226, 128, 147]
149 => [226, 128, 162], when 151 then [226, 128, 148]
150 => [226, 128, 147], when 152 then [203, 156]
151 => [226, 128, 148], when 153 then [226, 132, 162]
152 => [203, 156], when 154 then [197, 161]
153 => [226, 132, 162], when 155 then [226, 128, 186]
154 => [197, 161], when 156 then [197, 147]
155 => [226, 128, 186], when 158 then [197, 190]
156 => [197, 147], when 159 then [197, 184]
157 => nil, end
158 => [197, 190], end.freeze
159 => [197, 184]
}
module StringMixin def self.included(base)
base.class_eval do
private
def tidy_byte(byte)
if byte < 160
CP1252_MAP[byte]
else
byte < 192 ? [194, byte] : [195, byte - 64]
end
end
public
# Attempt to replace invalid UTF-8 bytes with valid ones. This method # Attempt to replace invalid UTF-8 bytes with valid ones. This method
# naively assumes if you have invalid UTF-8 bytes, they are either Windows # naively assumes if you have invalid UTF-8 bytes, they are either Windows
@ -45,78 +55,90 @@ module TibyBytes
# #
# Passing +true+ will forcibly tidy all bytes, assuming that the string's # Passing +true+ will forcibly tidy all bytes, assuming that the string's
# encoding is CP-1252 or ISO-8859-1. # encoding is CP-1252 or ISO-8859-1.
def tidy_bytes(force = false)
if force def tidy_bytes(force = false)
return unpack("C*").map do |b| return nil if empty?
tidy_byte(b)
end.flatten.compact.pack("C*").unpack("U*").pack("U*") if force
end buffer = String.new(capacity: bytesize)
each_byte do |b|
bytes = unpack("C*") cleaned = tidy_byte(b)
conts_expected = 0 buffer << cleaned.pack("C*") if cleaned
last_lead = 0
bytes.each_index do |i|
byte = bytes[i]
_is_ascii = byte < 128
is_cont = byte > 127 && byte < 192
is_lead = byte > 191 && byte < 245
is_unused = byte > 240
is_restricted = byte > 244
# Impossible or highly unlikely byte? Clean it.
if is_unused || is_restricted
bytes[i] = tidy_byte(byte)
elsif is_cont
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
else
if conts_expected > 0
# Expected continuation, but got ASCII or leading? Clean backwards up to
# the leading byte.
begin
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
rescue NoMethodError
next
end
conts_expected = 0
end end
if is_lead return buffer.force_encoding(Encoding::UTF_8)
# Final byte is leading? Clean it. end
if i == bytes.length - 1
bytes[i] = tidy_byte(bytes.last) buffer = String.new(capacity: bytesize)
bytes = each_byte.to_a
conts_expected = 0
last_lead = 0
bytes.each_with_index do |byte, i|
if byte < 128 # ASCII
buffer << byte
next
end
if byte > 244 || byte > 240 # invalid bytes
cleaned = tidy_byte(byte)
buffer << cleaned.pack("C*") if cleaned
next
end
is_cont = byte > 127 && byte < 192
is_lead = byte > 191 && byte < 245
if is_cont
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
if conts_expected == 0
cleaned = tidy_byte(byte)
buffer << cleaned.pack("C*") if cleaned
else else
# Valid leading byte? Expect continuations determined by position of buffer << byte
# first zero bit, with max of 3. conts_expected -= 1
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 end
last_lead = i else
if conts_expected > 0
# Expected continuation, but got ASCII or leading? Clean backwards up to
# the leading byte.
(1..(i - last_lead)).each do |j|
back_byte = bytes[i - j]
cleaned = tidy_byte(back_byte)
buffer << cleaned.pack("C*") if cleaned
end
conts_expected = 0
end
if is_lead
# Final byte is leading? Clean it.
if i == bytes.length - 1
cleaned = tidy_byte(byte)
buffer << cleaned.pack("C*") if cleaned
else
# Valid leading byte? Expect continuations determined by position of
# first zero bit, with max of 3.
buffer << byte
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
last_lead = i
end
end end
end end
end end
end
begin buffer.force_encoding(Encoding::UTF_8)
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") rescue
rescue ArgumentError
nil nil
end end
# Tidy bytes in place.
def tidy_bytes!(force = false)
result = tidy_bytes(force)
result ? replace(result) : self
end
end end
# Tidy bytes in-place.
def tidy_bytes!(force = false)
replace tidy_bytes(force)
end
private
def tidy_byte(byte)
byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
end
end end
end end
class String class String
include TibyBytes::StringMixin include TidyBytes
end end