mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-18 02:06:35 +00:00
More testing
This commit is contained in:
parent
a78b62ae7a
commit
9bbb67cd90
@ -1,42 +1,52 @@
|
||||
module TibyBytes
|
||||
# frozen_string_literal: true
|
||||
|
||||
# CP-1252 decimal byte => UTF-8 approximation as an array of bytes.
# Keys mapped to nil have no replacement bytes in this table.
# The hash and each byte array are frozen so that a caller receiving an
# entry cannot modify the shared table.
CP1252 = {
  128 => [226, 130, 172],
  129 => nil,
  130 => [226, 128, 154],
  131 => [198, 146],
  132 => [226, 128, 158],
  133 => [226, 128, 166],
  134 => [226, 128, 160],
  135 => [226, 128, 161],
  136 => [203, 134],
  137 => [226, 128, 176],
  138 => [197, 160],
  139 => [226, 128, 185],
  140 => [197, 146],
  141 => nil,
  142 => [197, 189],
  143 => nil,
  144 => nil,
  145 => [226, 128, 152],
  146 => [226, 128, 153],
  147 => [226, 128, 156],
  148 => [226, 128, 157],
  149 => [226, 128, 162],
  150 => [226, 128, 147],
  151 => [226, 128, 148],
  152 => [203, 156],
  153 => [226, 132, 162],
  154 => [197, 161],
  155 => [226, 128, 186],
  156 => [197, 147],
  157 => nil,
  158 => [197, 190],
  159 => [197, 184]
}.each_value(&:freeze).freeze
|
||||
module TidyBytes
|
||||
# Lookup table indexed by byte value (0..159). Indexes listed below hold the
# UTF-8 replacement for that CP-1252 byte as an array of bytes; every other
# index is nil. An Array is used so that a lookup is an O(1) index operation.
# Both the table and each byte array are frozen, so an entry handed to a
# caller cannot be used to modify the shared table.
CP1252_MAP = Array.new(160).tap do |table|
  {
    128 => [226, 130, 172],
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    142 => [197, 189],
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    158 => [197, 190],
    159 => [197, 184]
  }.each { |byte, utf8| table[byte] = utf8.freeze }
end.freeze
|
||||
|
||||
module StringMixin
|
||||
def self.included(base)
|
||||
base.class_eval do
|
||||
private
|
||||
|
||||
# Maps a single byte to its replacement UTF-8 byte sequence.
#
# byte - Integer byte value.
#
# Returns the CP1252_MAP entry for bytes below 160 (nil where the table has
# no entry), [194, byte] for bytes 160..191, and [195, byte - 64] for bytes
# from 192 upward.
def tidy_byte(byte)
  return CP1252_MAP[byte] if byte < 160
  return [194, byte] if byte < 192

  [195, byte - 64]
end
|
||||
|
||||
public
|
||||
|
||||
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
||||
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
||||
@ -45,78 +55,90 @@ module TibyBytes
|
||||
#
|
||||
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
||||
# encoding is CP-1252 or ISO-8859-1.
|
||||
def tidy_bytes(force = false)
|
||||
|
||||
if force
|
||||
return unpack("C*").map do |b|
|
||||
tidy_byte(b)
|
||||
end.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
||||
end
|
||||
def tidy_bytes(force = false)
|
||||
return nil if empty?
|
||||
|
||||
bytes = unpack("C*")
|
||||
conts_expected = 0
|
||||
last_lead = 0
|
||||
|
||||
bytes.each_index do |i|
|
||||
|
||||
byte = bytes[i]
|
||||
_is_ascii = byte < 128
|
||||
is_cont = byte > 127 && byte < 192
|
||||
is_lead = byte > 191 && byte < 245
|
||||
is_unused = byte > 240
|
||||
is_restricted = byte > 244
|
||||
|
||||
# Impossible or highly unlikely byte? Clean it.
|
||||
if is_unused || is_restricted
|
||||
bytes[i] = tidy_byte(byte)
|
||||
elsif is_cont
|
||||
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
||||
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
||||
else
|
||||
if conts_expected > 0
|
||||
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
||||
# the leading byte.
|
||||
begin
|
||||
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
||||
rescue NoMethodError
|
||||
next
|
||||
end
|
||||
conts_expected = 0
|
||||
if force
|
||||
buffer = String.new(capacity: bytesize)
|
||||
each_byte do |b|
|
||||
cleaned = tidy_byte(b)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
end
|
||||
if is_lead
|
||||
# Final byte is leading? Clean it.
|
||||
if i == bytes.length - 1
|
||||
bytes[i] = tidy_byte(bytes.last)
|
||||
return buffer.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
|
||||
buffer = String.new(capacity: bytesize)
|
||||
bytes = each_byte.to_a
|
||||
conts_expected = 0
|
||||
last_lead = 0
|
||||
|
||||
bytes.each_with_index do |byte, i|
|
||||
if byte < 128 # ASCII
|
||||
buffer << byte
|
||||
next
|
||||
end
|
||||
|
||||
if byte > 244 || byte > 240 # invalid bytes
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
next
|
||||
end
|
||||
|
||||
is_cont = byte > 127 && byte < 192
|
||||
is_lead = byte > 191 && byte < 245
|
||||
|
||||
if is_cont
|
||||
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
||||
if conts_expected == 0
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
else
|
||||
# Valid leading byte? Expect continuations determined by position of
|
||||
# first zero bit, with max of 3.
|
||||
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
||||
last_lead = i
|
||||
buffer << byte
|
||||
conts_expected -= 1
|
||||
end
|
||||
else
|
||||
if conts_expected > 0
|
||||
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
||||
# the leading byte.
|
||||
(1..(i - last_lead)).each do |j|
|
||||
back_byte = bytes[i - j]
|
||||
cleaned = tidy_byte(back_byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
end
|
||||
conts_expected = 0
|
||||
end
|
||||
|
||||
if is_lead
|
||||
# Final byte is leading? Clean it.
|
||||
if i == bytes.length - 1
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
else
|
||||
# Valid leading byte? Expect continuations determined by position of
|
||||
# first zero bit, with max of 3.
|
||||
buffer << byte
|
||||
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
||||
last_lead = i
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
begin
|
||||
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
||||
rescue ArgumentError
|
||||
|
||||
buffer.force_encoding(Encoding::UTF_8)
|
||||
rescue
|
||||
nil
|
||||
end
|
||||
|
||||
# In-place variant of tidy_bytes.
#
# force - passed straight through to tidy_bytes.
#
# Returns the receiver. Its contents are replaced with the tidied string
# when tidy_bytes yields one, and left untouched when tidy_bytes returns
# nil (or false).
def tidy_bytes!(force = false)
  tidied = tidy_bytes(force)
  return self unless tidied

  replace(tidied)
end
|
||||
end
|
||||
|
||||
# Tidy bytes in-place.
#
# force - passed straight through to tidy_bytes.
#
# Returns the receiver. tidy_bytes can return nil (for example for an empty
# string); passing nil to String#replace raises TypeError, so in that case
# the receiver is returned unchanged instead.
def tidy_bytes!(force = false)
  tidied = tidy_bytes(force)
  tidied.nil? ? self : replace(tidied)
end
|
||||
|
||||
private
|
||||
|
||||
# Maps a single byte to its replacement UTF-8 byte sequence.
#
# byte - Integer byte value.
#
# Returns the TibyBytes::CP1252 entry for bytes below 160 (nil when the
# hash has no value for that byte), [194, byte] for bytes 160..191, and
# [195, byte - 64] for bytes from 192 upward.
def tidy_byte(byte)
  if byte < 160
    TibyBytes::CP1252[byte]
  elsif byte < 192
    [194, byte]
  else
    [195, byte - 64]
  end
end
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
# Reopens the core String class to mix in the byte-tidying module.
# NOTE(review): the two include lines below look like the removed and added
# sides of the same diff line, so only one is expected in the real file —
# confirm against the repository.
# NOTE(review): the `included` hook shown in this file is defined on
# StringMixin, so `include TidyBytes` on its own would presumably not run it
# and the tidy_bytes methods would not be added to String — verify.
class String
|
||||
include TibyBytes::StringMixin
|
||||
include TidyBytes
|
||||
end
|
||||
Loading…
x
Reference in New Issue
Block a user