Refactor tidy_bytes.rb

I'm not sure we can easily determine the original encoding of each site (and I don't think the Wayback Machine records it), *but* we can at least transcode the content to UTF-8 so that it downloads. This should mostly help with sites in languages other than Western European ones. See #25
2025-12-29 16:16:06 +00:00 · 2025-07-29 10:10:56 -03:00 · 2025-07-29 10:10:56 -03:00 · bc868e6b39
commit bc868e6b39
parent 2bf04aff48
1 changed files with 31 additions and 63 deletions
--- a/lib/wayback_machine_downloader/tidy_bytes.rb
+++ b/lib/wayback_machine_downloader/tidy_bytes.rb
@@ -1,74 +1,42 @@
 # frozen_string_literal: true
 # essentially, this is for converting a string with a potentially
 # broken or unknown encoding into a valid UTF-8 string
 module TidyBytes
-  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
+  def tidy_bytes
-  CP1252_MAP = (128..159).map do |byte|
+    # return if the string is already valid UTF-8
-    case byte
+    return self if self.valid_encoding? && self.encoding == Encoding::UTF_8
    when 128 then [226, 130, 172]  # EURO SIGN
    when 130 then [226, 128, 154]  # SINGLE LOW-9 QUOTATION MARK
    when 131 then [198, 146]       # LATIN SMALL LETTER F WITH HOOK
    when 132 then [226, 128, 158]  # DOUBLE LOW-9 QUOTATION MARK
    when 133 then [226, 128, 166]  # HORIZONTAL ELLIPSIS
    when 134 then [226, 128, 160]  # DAGGER
    when 135 then [226, 128, 161]  # DOUBLE DAGGER
    when 136 then [203, 134]       # MODIFIER LETTER CIRCUMFLEX ACCENT
    when 137 then [226, 128, 176]  # PER MILLE SIGN
    when 138 then [197, 160]       # LATIN CAPITAL LETTER S WITH CARON
    when 139 then [226, 128, 185]  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    when 140 then [197, 146]       # LATIN CAPITAL LIGATURE OE
    when 142 then [197, 189]       # LATIN CAPITAL LETTER Z WITH CARON
    when 145 then [226, 128, 152]  # LEFT SINGLE QUOTATION MARK
    when 146 then [226, 128, 153]  # RIGHT SINGLE QUOTATION MARK
    when 147 then [226, 128, 156]  # LEFT DOUBLE QUOTATION MARK
    when 148 then [226, 128, 157]  # RIGHT DOUBLE QUOTATION MARK
    when 149 then [226, 128, 162]  # BULLET
    when 150 then [226, 128, 147]  # EN DASH
    when 151 then [226, 128, 148]  # EM DASH
    when 152 then [203, 156]       # SMALL TILDE
    when 153 then [226, 132, 162]  # TRADE MARK SIGN
    when 154 then [197, 161]       # LATIN SMALL LETTER S WITH CARON
    when 155 then [226, 128, 186]  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    when 156 then [197, 147]       # LATIN SMALL LIGATURE OE
    when 158 then [197, 190]       # LATIN SMALL LETTER Z WITH CARON
    when 159 then [197, 184]       # LATIN SMALL LETTER Y WITH DIAERESIS
    else nil                       # ANYTHING ELSE...
    end
  end.freeze
-  # precomputing all possible byte conversions 
+    # create a mutable copy so we don't modify the original string
-  CP1252_TO_UTF8 = Array.new(256) do |b|
+    str = self.dup
-    if (128..159).cover?(b)
+
-      CP1252_MAP[b - 128]&.pack('C*')
+    # attempt to encode to UTF-8
-    elsif b < 128
+    begin
-      b.chr
+      return str.encode(Encoding::UTF_8)
-    else
+    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
    end
-  end.freeze
+
    # if it failed, force the encoding to ISO-8859-1, transcode the
    # string to UTF-8, and use replacement options for any characters
    # that might still be problematic
    str.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '<27>')
  end
  def tidy_bytes!
    replace(self.tidy_bytes)
  end
  def self.included(base)
-    base.class_eval do
+    base.send(:include, InstanceMethods)
-      def tidy_bytes(force = false)
+  end
        return nil if empty?
-        if force
+  module InstanceMethods
-          buffer = String.new(capacity: bytesize)
+    def tidy_bytes
-          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
-          return buffer.force_encoding(Encoding::UTF_8)
+    end
        end
-        begin
+    def tidy_bytes!
-          encode('UTF-8')
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
          buffer = String.new(capacity: bytesize)
          scrub { |b| CP1252_TO_UTF8[b.ord] }
        end
      end
      def tidy_bytes!(force = false)
        result = tidy_bytes(force)
        result ? replace(result) : self
      end
    end
  end
 end