mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00
78 lines
2.8 KiB
Ruby
78 lines
2.8 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Repairs strings containing bytes that are not valid UTF-8 by reading the
# offending bytes as Windows CP1252 / ISO-8859-1 and converting them to UTF-8.
#
# Including this module into a class defines +tidy_bytes+ and +tidy_bytes!+
# on that class (see TidyBytes.included).
module TidyBytes
  # UTF-8 byte sequences for the CP1252 bytes 128-159, indexed by (byte - 128).
  # Bytes that have no entry below (129, 141, 143, 144, 157) map to nil.
  CP1252_MAP = (128..159).map do |byte|
    case byte
    when 128 then [226, 130, 172] # EURO SIGN
    when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
    when 131 then [198, 146]      # LATIN SMALL LETTER F WITH HOOK
    when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
    when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
    when 134 then [226, 128, 160] # DAGGER
    when 135 then [226, 128, 161] # DOUBLE DAGGER
    when 136 then [203, 134]      # MODIFIER LETTER CIRCUMFLEX ACCENT
    when 137 then [226, 128, 176] # PER MILLE SIGN
    when 138 then [197, 160]      # LATIN CAPITAL LETTER S WITH CARON
    when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    when 140 then [197, 146]      # LATIN CAPITAL LIGATURE OE
    when 142 then [197, 189]      # LATIN CAPITAL LETTER Z WITH CARON
    when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
    when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
    when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
    when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
    when 149 then [226, 128, 162] # BULLET
    when 150 then [226, 128, 147] # EN DASH
    when 151 then [226, 128, 148] # EM DASH
    when 152 then [203, 156]      # SMALL TILDE
    when 153 then [226, 132, 162] # TRADE MARK SIGN
    when 154 then [197, 161]      # LATIN SMALL LETTER S WITH CARON
    when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    when 156 then [197, 147]      # LATIN SMALL LIGATURE OE
    when 158 then [197, 190]      # LATIN SMALL LETTER Z WITH CARON
    when 159 then [197, 184]      # LATIN CAPITAL LETTER Y WITH DIAERESIS
    end
  end.freeze

  # Lookup table indexed by byte value (0-255). Each entry is the string of
  # UTF-8 bytes for that byte, or nil for the unmapped CP1252 bytes, so every
  # consumer must be prepared to skip nil entries.
  CP1252_TO_UTF8 = Array.new(256) do |b|
    if (128..159).cover?(b)
      CP1252_MAP[b - 128]&.pack('C*')
    elsif b < 128
      b.chr
    else
      # Bytes 160-255 become the two-byte UTF-8 form of the same code point.
      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
    end
  end.freeze

  # Internal helper: converts every byte of +raw+ through CP1252_TO_UTF8.
  #
  # @param raw [String] bytes to convert; its encoding tag is ignored
  # @return [String] a new UTF-8 string; bytes without a mapping are dropped
  def self.cp1252_to_utf8(raw)
    buffer = String.new(capacity: raw.bytesize)
    raw.each_byte do |byte|
      chunk = CP1252_TO_UTF8[byte]
      # Unmapped bytes have a nil entry; appending nil would raise TypeError.
      buffer << chunk if chunk
    end
    buffer.force_encoding(Encoding::UTF_8)
  end

  # Hook run on +include+: defines the tidy methods directly on +base+.
  def self.included(base)
    base.class_eval do
      # Returns a UTF-8 copy of the string with invalid bytes repaired.
      #
      # @param force [Boolean] when true, every byte is treated as
      #   CP1252/ISO-8859-1 and converted, even if the string is already
      #   valid UTF-8
      # @return [String, nil] the repaired UTF-8 string, or nil when the
      #   receiver is empty
      def tidy_bytes(force = false)
        return nil if empty?
        return TidyBytes.cp1252_to_utf8(self) if force

        text = begin
          encode('UTF-8')
        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
          # Transcoding failed (e.g. binary data with high bytes): read the
          # raw bytes as UTF-8 and repair whatever turns out to be invalid.
          dup.force_encoding(Encoding::UTF_8)
        end

        # encode is a no-op for strings already tagged UTF-8, so the result
        # still has to be validated explicitly.
        return text if text.valid_encoding?

        # scrub yields each run of invalid bytes; the replacement must itself
        # be valid UTF-8, which the helper guarantees.
        text.scrub { |bad| TidyBytes.cp1252_to_utf8(bad) }
      end

      # In-place variant of tidy_bytes.
      #
      # @param force [Boolean] see tidy_bytes
      # @return [String] the receiver (left unchanged when it is empty)
      def tidy_bytes!(force = false)
        result = tidy_bytes(force)
        result ? replace(result) : self
      end
    end
  end
end
|
|
|
|
# Mix the byte-tidying methods into every String.
String.include(TidyBytes)