Refactor tidy_bytes.rb

I'm not sure we can easily determine the encoding behind each site (and I don't think the Wayback Machine does that either), *but* we can at least transcode the content to UTF-8 so that it downloads. This should be mostly useful for non-Western European languages. See #25
This commit is contained in:
Felipe 2025-07-29 10:10:56 -03:00 committed by GitHub
parent 2bf04aff48
commit bc868e6b39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,74 +1,42 @@
# frozen_string_literal: true # frozen_string_literal: true
# essentially, this is for converting a string with a potentially
# broken or unknown encoding into a valid UTF-8 string
module TidyBytes module TidyBytes
# precomputing CP1252 to UTF-8 mappings for bytes 128-159 def tidy_bytes
CP1252_MAP = (128..159).map do |byte| # return if the string is already valid UTF-8
case byte return self if self.valid_encoding? && self.encoding == Encoding::UTF_8
when 128 then [226, 130, 172] # EURO SIGN
when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
when 134 then [226, 128, 160] # DAGGER
when 135 then [226, 128, 161] # DOUBLE DAGGER
when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
when 137 then [226, 128, 176] # PER MILLE SIGN
when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
when 149 then [226, 128, 162] # BULLET
when 150 then [226, 128, 147] # EN DASH
when 151 then [226, 128, 148] # EM DASH
when 152 then [203, 156] # SMALL TILDE
when 153 then [226, 132, 162] # TRADE MARK SIGN
when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
else nil # ANYTHING ELSE...
end
end.freeze
# precomputing all possible byte conversions # create a mutable copy so we don't modify the original string
CP1252_TO_UTF8 = Array.new(256) do |b| str = self.dup
if (128..159).cover?(b)
CP1252_MAP[b - 128]&.pack('C*') # attempt to encode to UTF-8
elsif b < 128 begin
b.chr return str.encode(Encoding::UTF-8)
else rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
end end
end.freeze
# if it failed, force the encoding to ISO-8859-1, transcode the
# string to UTF-8, and use replacement options for any characters
# that might still be problematic
str.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '<27>')
end
def tidy_bytes!
replace(self.tidy_bytes)
end
def self.included(base) def self.included(base)
base.class_eval do base.send(:include, InstanceMethods)
def tidy_bytes(force = false) end
return nil if empty?
if force module InstanceMethods
buffer = String.new(capacity: bytesize) def tidy_bytes
each_byte { |b| buffer << CP1252_TO_UTF8[b] } TidyBytes.instance_method(:tidy_bytes).bind(self).call
return buffer.force_encoding(Encoding::UTF_8) end
end
begin def tidy_bytes!
encode('UTF-8') TidyBytes.instance_method(:tidy_bytes!).bind(self).call
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
buffer = String.new(capacity: bytesize)
scrub { |b| CP1252_TO_UTF8[b.ord] }
end
end
def tidy_bytes!(force = false)
result = tidy_bytes(force)
result ? replace(result) : self
end
end end
end end
end end