Bumped version

More aggressive sanitization
this should deal with some of the issues we've seen, luckily. What a ride!
2025-12-29 16:16:06 +00:00 · 2025-08-16 19:38:01 +00:00 · 2025-08-12 18:55:00 -03:00 · 2025-08-12 11:46:03 +00:00 · 2025-08-12 08:42:27 -03:00 · 2025-08-05 23:44:34 +00:00
9 changed files with 361 additions and 198 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,7 @@ tmp
 *.rbc

 test.rb
+
+# Dev environment
+.vscode
+*.code-workspace
--- a/7
+++ b/7
@@ -1,4 +1,4 @@
-FROM ruby:3.4.4-alpine
+FROM ruby:3.4.5-alpine
 USER root
 WORKDIR /build

@@ -6,10 +6,9 @@ COPY Gemfile /build/
 COPY *.gemspec /build/

 RUN bundle config set jobs "$(nproc)" \
-    && bundle config set without 'development test' \
    && bundle install

 COPY . /build

-WORKDIR /
-ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
+WORKDIR /build
+ENTRYPOINT [ "/build/bin/wayback_machine_downloader", "--directory", "/build/websites" ]
--- a/README.md
+++ b/README.md
@@ -81,36 +81,19 @@ services:
    tty: true
    image: wayback_machine_downloader:latest
    container_name: wayback_machine_downloader
-    environment:
-      - ENVIRONMENT=${ENVIRONMENT:-development}
-      - OPTIONS=${OPTIONS:-""}
-      - TARGET_URL=${TARGET_URL}
    volumes:
      - .:/build:rw
      - ./websites:/build/websites:rw
-    command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
 ```
 #### Usage:
-Now You can create a Docker image as named "wayback_machine_downloader" with the following command:
+Now you can create a Docker image named "wayback_machine_downloader" with the following command:
 ```bash
 docker compose up -d --build
 ```

-After that you must set TARGET_URL environment variable:
-```bash
-export TARGET_URL="https://example.com/"
-```
-
-The **OPTIONS** env. variable is optional this may include additional settings which are found in the "**Advanced usage**" section below.
-
-Example:
-```bash
-export OPTIONS="--list -f 20060121"
-```
-
 After that you can run the exists container with the following command:
 ```bash
-docker compose run --rm wayback_machine_downloader https://example.com
+docker compose run --rm wayback_machine_downloader https://example.com [options]
 ```

 ## ⚙️ Configuration
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,11 +5,6 @@ services:
    tty: true
    image: wayback_machine_downloader:latest
    container_name: wayback_machine_downloader
-    environment:
-      - ENVIRONMENT=${DEVELOPMENT:-production}
-      - OPTIONS=${OPTIONS:-""}
-      - TARGET_URL=${TARGET_URL}
    volumes:
      - .:/build:rw
-      - ./websites:/websites:rw
-    command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
+      - ./websites:/build/websites:rw
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -11,10 +11,12 @@ require 'concurrent-ruby'
 require 'logger'
 require 'zlib'
 require 'stringio'
+require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
 require_relative 'wayback_machine_downloader/subdom_processor'
+require_relative 'wayback_machine_downloader/url_rewrite'

 class ConnectionPool
  MAX_AGE = 300
@@ -115,7 +117,7 @@ class WaybackMachineDownloader
  include ArchiveAPI
  include SubdomainProcessor

-  VERSION = "2.3.11"
+  VERSION = "2.4.2"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -128,14 +130,16 @@ class WaybackMachineDownloader

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
+    :snapshot_at

  def initialize params
    validate_params(params)
-    @base_url = params[:base_url]
+    @base_url = params[:base_url]&.tidy_bytes
    @exact_url = params[:exact_url]
    if params[:directory]
-      @directory = File.expand_path(params[:directory])
+      sanitized_dir = params[:directory].tidy_bytes
+      @directory = File.expand_path(sanitized_dir)
    else
      @directory = nil
    end
@@ -158,6 +162,7 @@ class WaybackMachineDownloader
    @rewrite = params[:rewrite] || false
    @recursive_subdomains = params[:recursive_subdomains] || false
    @subdomain_depth = params[:subdomain_depth] || 1
+    @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil

    # URL for rejecting invalid/unencoded wayback urls
    @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -167,12 +172,19 @@ class WaybackMachineDownloader

  def backup_name
    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-    
-    if url_to_process.include? '//'
+    raw = if url_to_process.include?('//')
      url_to_process.split('/')[2]
    else
      url_to_process
    end
+
+    # On Windows only: replace characters that are invalid in directory names (e.g. the colon in host:port) and strip trailing spaces/dots, to avoid ENOTDIR on mkdir
+    if Gem.win_platform?
+      raw = raw.gsub(/[:*?"<>|]/, '_')
+      raw = raw.gsub(/[ .]+\z/, '')
+    end
+    raw = 'site' if raw.nil? || raw.empty?
+    raw
  end

  def backup_path
@@ -330,26 +342,61 @@ class WaybackMachineDownloader
    snapshot_list_to_consider
  end

+  # Get a composite snapshot file list for a specific timestamp
+  def get_composite_snapshot_file_list(target_timestamp)
+    file_versions = {}
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      next if file_timestamp.to_i > target_timestamp
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
+      next if file_id.nil?
+      next if match_exclude_filter(file_url)
+      next unless match_only_filter(file_url)
+
+      if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
+        file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
+      end
+    end
+    file_versions.values
+  end
+
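+  # Returns the composite snapshot file list, newest first. FIXME(review): get_composite_snapshot_file_list returns an Array of Hashes, so the |_, v| and [0]/[1] pair handling below raises NoMethodError on a non-empty list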
+  def get_file_list_composite_snapshot(target_timestamp)
+    file_list = get_composite_snapshot_file_list(target_timestamp)
+    file_list = file_list.sort_by { |_,v| v[:timestamp].to_s }.reverse
+    file_list.map do |file_remote_info|
+      file_remote_info[1][:file_id] = file_remote_info[0]
+      file_remote_info[1]
+    end
+  end
+
  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id = CGI::unescape file_id
-      file_id = file_id.tidy_bytes unless file_id == ""
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+
+      if file_id.include?('<') || file_id.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
-            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+            file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
          end
        else
-          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
        end
      end
    end
@@ -360,21 +407,32 @@ class WaybackMachineDownloader
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
-      file_id = file_url.split('/')[3..-1].join('/')
-      file_id_and_timestamp = [file_timestamp, file_id].join('/')
-      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
-      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+
+      raw_tail = file_url.split('/')[3..-1]&.join('/')
+      file_id = sanitize_and_prepare_id(raw_tail, file_url)
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
+        next
+      end
+
+      file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
+      if file_id_and_timestamp.nil?
+        puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
+        next
+      end
+
+      if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
+        puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
-        elsif not match_only_filter(file_url)
+        elsif !match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
-          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+          # duplicate combo, ignore silently (verbose flag not shown here)
        else
-          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+          file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
        end
      end
    end
@@ -384,7 +442,9 @@ class WaybackMachineDownloader


  def get_file_list_by_timestamp
-    if @all_timestamps
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
@@ -439,6 +499,39 @@ class WaybackMachineDownloader
    end
  end

+  def processing_files(pool, files_to_process)
+    files_to_process.each do |file_remote_info|
+      pool.post do
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
+               download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
+          end
+          # append to DB only after successful download outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+           @download_mutex.synchronize do
+              @processed_file_count += 1
+           end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+  end
+
  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -485,36 +578,7 @@ class WaybackMachineDownloader
    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
    pool = Concurrent::FixedThreadPool.new(thread_count)

-    files_to_process.each do |file_remote_info|
-      pool.post do
-        download_success = false
-        begin
-          @connection_pool.with_connection do |connection|
-            result_message = download_file(file_remote_info, connection)
-            # assume download success if the result message contains ' -> '
-            if result_message && result_message.include?(' -> ')
-               download_success = true
-            end
-            @download_mutex.synchronize do
-              @processed_file_count += 1
-              # adjust progress message to reflect remaining files
-              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
-              puts progress_message if progress_message
-            end
-          end
-          # sppend to DB only after successful download outside the connection block
-          if download_success
-            append_to_db(file_remote_info[:file_id])
-          end
-        rescue => e
-          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
-           @download_mutex.synchronize do
-              @processed_file_count += 1
-           end
-        end
-        sleep(RATE_LIMIT)
-      end
-    end
+    processing_files(pool, files_to_process)

    pool.shutdown
    pool.wait_for_termination
@@ -574,64 +638,13 @@ class WaybackMachineDownloader
      end

      # URLs in HTML attributes
-      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        prefix, url, suffix = $1, $2, $3
-        
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{prefix}#{path}#{suffix}"
-          rescue
-            "#{prefix}#{url}#{suffix}"
-          end
-        elsif url.start_with?('/')
-          "#{prefix}./#{url[1..-1]}#{suffix}"
-        else
-          "#{prefix}#{url}#{suffix}"
-        end
-      end
+      rewrite_html_attr_urls(content)
      
      # URLs in CSS
-      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-        url = $1
-        
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "url(\"#{path}\")"
-          rescue
-            "url(\"#{url}\")"
-          end
-        elsif url.start_with?('/')
-          "url(\"./#{url[1..-1]}\")"
-        else
-          "url(\"#{url}\")"
-        end
-      end
+      rewrite_css_urls(content)
      
      # URLs in JavaScript
-      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-        quote_start, url, quote_end = $1, $2, $3
-        
-        if url.start_with?('http')
-          begin
-            uri = URI.parse(url)
-            path = uri.path
-            path = path[1..-1] if path.start_with?('/')
-            "#{quote_start}#{path}#{quote_end}"
-          rescue
-            "#{quote_start}#{url}#{quote_end}"
-          end
-        elsif url.start_with?('/')
-          "#{quote_start}./#{url[1..-1]}#{quote_end}"
-        else
-          "#{quote_start}#{url}#{quote_end}"
-        end
-      end
+      rewrite_js_urls(content)
      
      # for URLs in HTML attributes that start with a single slash
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
@@ -727,7 +740,22 @@ class WaybackMachineDownloader
  end

  def file_list_by_timestamp
-    @file_list_by_timestamp ||= get_file_list_by_timestamp
+    if @snapshot_at
+      @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
+    elsif @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    end
  end

  private
@@ -745,6 +773,85 @@ class WaybackMachineDownloader
    end
    logger
  end
+    
+  # safely sanitize a file id (or id+timestamp)
+  def sanitize_and_prepare_id(raw, file_url)
+    return nil if raw.nil? || raw.empty?
+    original = raw.dup
+    begin
+      # work on a binary copy to avoid premature encoding errors
+      raw = raw.dup.force_encoding(Encoding::BINARY)
+
+      # percent-decode (repeat until stable in case of double-encoding)
+      loop do
+        decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [$1].pack('H2') }
+        break if decoded == raw
+        raw = decoded
+      end
+
+      # try tidy_bytes
+      begin
+        raw = raw.tidy_bytes
+      rescue StandardError
+        # fallback: scrub to UTF-8
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # ensure UTF-8 and scrub again
+      unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
+        raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+      end
+
+      # strip HTML/comment artifacts & control chars
+      raw.gsub!(/<!--+/, '')
+      raw.gsub!(/[\x00-\x1F]/, '')
+
+      # split query; hash it for stable short name
+      path_part, query_part = raw.split('?', 2)
+      if query_part && !query_part.empty?
+        q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
+        if path_part.include?('.')
+          pre, _sep, post = path_part.rpartition('.')
+          path_part = "#{pre}__q#{q_digest}.#{post}"
+        else
+          path_part = "#{path_part}__q#{q_digest}"
+        end
+      end
+      raw = path_part
+
+      # collapse slashes & trim leading slash
+      raw.gsub!(%r{/+}, '/')
+      raw.sub!(%r{\A/}, '')
+
+      # segment-wise sanitation
+      raw = raw.split('/').map do |segment|
+        seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+        seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
+        seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
+        seg.empty? ? '_' : seg
+      end.join('/')
+
+      # remove any remaining angle brackets
+      raw.tr!('<>', '')
+
+      # final fallback if empty
+      raw = "file__#{Digest::SHA1.hexdigest(original)[0,10]}" if raw.nil? || raw.empty?
+
+      raw
+    rescue => e
+      @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
+      # deterministic fallback – never return nil so caller won’t mark malformed
+      "file__#{Digest::SHA1.hexdigest(original)[0,10]}"
+    end
+  end
+
+  # wrap URL in parentheses if it contains characters that commonly break unquoted
+  # Windows CMD usage (e.g., &). This is only for display; user still must quote
+  # when invoking manually.
+  def safe_display_url(url)
+    return url unless url && url.match?(/[&]/)
+    "(#{url})"
+  end

  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
    retries = 0
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -25,7 +25,7 @@ module ArchiveAPI
      # Check if the response contains the header ["timestamp", "original"]
      json.shift if json.first == ["timestamp", "original"]
      json
-    rescue JSON::ParserError, StandardError => e
+    rescue JSON::ParserError => e
      warn "Failed to fetch data from API: #{e.message}"
      []
    end
--- a/lib/wayback_machine_downloader/tidy_bytes.rb
+++ b/lib/wayback_machine_downloader/tidy_bytes.rb
@@ -1,73 +1,74 @@
 # frozen_string_literal: true

+# essentially, this is for converting a string with a potentially
+# broken or unknown encoding into a valid UTF-8 string
+# @todo: consider using charlock_holmes for this in the future
 module TidyBytes
-  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
-  CP1252_MAP = (128..159).map do |byte|
-    case byte
-    when 128 then [226, 130, 172]  # EURO SIGN
-    when 130 then [226, 128, 154]  # SINGLE LOW-9 QUOTATION MARK
-    when 131 then [198, 146]       # LATIN SMALL LETTER F WITH HOOK
-    when 132 then [226, 128, 158]  # DOUBLE LOW-9 QUOTATION MARK
-    when 133 then [226, 128, 166]  # HORIZONTAL ELLIPSIS
-    when 134 then [226, 128, 160]  # DAGGER
-    when 135 then [226, 128, 161]  # DOUBLE DAGGER
-    when 136 then [203, 134]       # MODIFIER LETTER CIRCUMFLEX ACCENT
-    when 137 then [226, 128, 176]  # PER MILLE SIGN
-    when 138 then [197, 160]       # LATIN CAPITAL LETTER S WITH CARON
-    when 139 then [226, 128, 185]  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-    when 140 then [197, 146]       # LATIN CAPITAL LIGATURE OE
-    when 142 then [197, 189]       # LATIN CAPITAL LETTER Z WITH CARON
-    when 145 then [226, 128, 152]  # LEFT SINGLE QUOTATION MARK
-    when 146 then [226, 128, 153]  # RIGHT SINGLE QUOTATION MARK
-    when 147 then [226, 128, 156]  # LEFT DOUBLE QUOTATION MARK
-    when 148 then [226, 128, 157]  # RIGHT DOUBLE QUOTATION MARK
-    when 149 then [226, 128, 162]  # BULLET
-    when 150 then [226, 128, 147]  # EN DASH
-    when 151 then [226, 128, 148]  # EM DASH
-    when 152 then [203, 156]       # SMALL TILDE
-    when 153 then [226, 132, 162]  # TRADE MARK SIGN
-    when 154 then [197, 161]       # LATIN SMALL LETTER S WITH CARON
-    when 155 then [226, 128, 186]  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-    when 156 then [197, 147]       # LATIN SMALL LIGATURE OE
-    when 158 then [197, 190]       # LATIN SMALL LETTER Z WITH CARON
-    when 159 then [197, 184]       # LATIN SMALL LETTER Y WITH DIAERESIS
-    end
-  end.freeze
+  UNICODE_REPLACEMENT_CHARACTER = "<EFBFBD>"

-  # precomputing all possible byte conversions 
-  CP1252_TO_UTF8 = Array.new(256) do |b|
-    if (128..159).cover?(b)
-      CP1252_MAP[b - 128]&.pack('C*')
-    elsif b < 128
-      b.chr
-    else
-      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+  # common encodings to try for best multilingual compatibility
+  COMMON_ENCODINGS = [
+    Encoding::UTF_8,
+    Encoding::Windows_1251, # Cyrillic/Russian legacy
+    Encoding::GB18030,      # Simplified Chinese
+    Encoding::Shift_JIS,    # Japanese
+    Encoding::EUC_KR,       # Korean
+    Encoding::ISO_8859_1,   # Western European
+    Encoding::Windows_1252  # Western European/Latin1 superset
+  ].select { |enc| Encoding.name_list.include?(enc.name) }
+
+  # returns true if the string appears to be binary (has null bytes)
+  def binary_data?
+    self.include?("\x00".b)
+  end
+
+  # attempts to return a valid UTF-8 version of the string
+  def tidy_bytes
+    return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
+    return self.dup.force_encoding("BINARY") if binary_data?
+
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
    end
-  end.freeze
+
+    # if no clean conversion found, try again but accept replacement characters
+    str = self.dup
+    COMMON_ENCODINGS.each do |enc|
+      str.force_encoding(enc)
+      begin
+        utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+        return utf8 if utf8.valid_encoding?
+      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+        # try next encoding
+      end
+    end
+
+    # fallback: replace all invalid/undefined bytes
+    str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
+  end
+
+  def tidy_bytes!
+    replace(self.tidy_bytes)
+  end

  def self.included(base)
-    base.class_eval do
-      def tidy_bytes(force = false)
-        return nil if empty?
-        
-        if force
-          buffer = String.new(capacity: bytesize)
-          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
-          return buffer.force_encoding(Encoding::UTF_8)
-        end
+    base.send(:include, InstanceMethods)
+  end

-        begin
-          encode('UTF-8')
-        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          buffer = String.new(capacity: bytesize)
-          scrub { |b| CP1252_TO_UTF8[b.ord] }
-        end
-      end
+  module InstanceMethods
+    def tidy_bytes
+      TidyBytes.instance_method(:tidy_bytes).bind(self).call
+    end

-      def tidy_bytes!(force = false)
-        result = tidy_bytes(force)
-        result ? replace(result) : self
-      end
+    def tidy_bytes!
+      TidyBytes.instance_method(:tidy_bytes!).bind(self).call
    end
  end
 end
--- a/lib/wayback_machine_downloader/url_rewrite.rb
+++ b/lib/wayback_machine_downloader/url_rewrite.rb
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+# URLs in HTML attributes
+def rewrite_html_attr_urls(content)
+  
+  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    prefix, url, suffix = $1, $2, $3
+    
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{prefix}#{path}#{suffix}"
+      rescue
+        "#{prefix}#{url}#{suffix}"
+      end
+    elsif url.start_with?('/')
+      "#{prefix}./#{url[1..-1]}#{suffix}"
+    else
+      "#{prefix}#{url}#{suffix}"
+    end
+  end
+  content
+end
+
+# URLs in CSS
+def rewrite_css_urls(content)
+
+  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+    url = $1
+    
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "url(\"#{path}\")"
+      rescue
+        "url(\"#{url}\")"
+      end
+    elsif url.start_with?('/')
+      "url(\"./#{url[1..-1]}\")"
+    else
+      "url(\"#{url}\")"
+    end
+  end
+  content
+end
+
+# URLs in JavaScript
+def rewrite_js_urls(content)
+  
+  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+    quote_start, url, quote_end = $1, $2, $3
+    
+    if url.start_with?('http')
+      begin
+        uri = URI.parse(url)
+        path = uri.path
+        path = path[1..-1] if path.start_with?('/')
+        "#{quote_start}#{path}#{quote_end}"
+      rescue
+        "#{quote_start}#{url}#{quote_end}"
+      end
+    elsif url.start_with?('/')
+      "#{quote_start}./#{url[1..-1]}#{quote_end}"
+    else
+      "#{quote_start}#{url}#{quote_end}"
+    end
+  end
+  
+  content
+end
--- a/wayback_machine_downloader.gemspec
+++ b/wayback_machine_downloader.gemspec
@@ -1,12 +1,12 @@
 Gem::Specification.new do |s|
  s.name        = "wayback_machine_downloader_straw"
-  s.version     = "2.3.11"
+  s.version     = "2.4.2"
  s.executables << "wayback_machine_downloader"
  s.summary     = "Download an entire website from the Wayback Machine."
  s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
  s.authors     = ["strawberrymaster"]
  s.email       = "strawberrymaster@vivaldi.net"
-  s.files       = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
+  s.files       = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
  s.homepage    = "https://github.com/StrawberryMaster/wayback-machine-downloader"
  s.license     = "MIT"
  s.required_ruby_version = ">= 3.4.3"
Author	SHA1	Message	Date
Felipe	40e9c9bb51	Bumped version	2025-08-16 19:38:01 +00:00
Felipe	6bc08947b7	More aggressive sanitization this should deal with some of the issues we've seen, luckily. What a ride!	2025-08-12 18:55:00 -03:00
Felipe	c731e0c7bd	Bumped version	2025-08-12 11:46:03 +00:00
Felipe	9fd2a7f8d1	Minor refactoring of HTML tag sanitization	2025-08-12 08:42:27 -03:00
Felipe	6ad312f31f	Sanitizing HTML tags some sites contain tags in their URL, and fail to save on some devices like Windows	2025-08-05 23:44:34 +00:00
Felipe	62ea35daa6	Bumping version	2025-08-04 21:23:48 +00:00
Felipe	1f4202908f	Fixes for tidy_bytes admittedly not the cleanest way to do this, although it works for #25.	2025-07-31 12:58:22 -03:00
Felipe	bed3f6101c	Added missing gemspec file	2025-07-31 12:57:03 -03:00
Felipe	754df6b8d6	Merge pull request #27 from adampweb/master Refactored huge functions & cleanup	2025-07-29 18:09:51 -03:00
adampweb	801fb77f79	Perf: Refactored a huge function into smaller subprocesses	2025-07-29 21:12:20 +02:00
adampweb	e9849e6c9c	Cleanup: I removed the obsolete options. The classic way provides more flexibility	2025-07-29 20:55:10 +02:00
Felipe	bc868e6b39	Refactor tidy_bytes.rb I'm not sure if we can easily determine the encoding behind each site (and I don't think Wayback Machine does that), but we can at least translate it and get it to download. This should be mostly useful for other, non-Western European languages. See #25	2025-07-29 10:10:56 -03:00
Felipe	2bf04aff48	Sanitize base_url and directory parameters this might be the cause of #25, at least from what it appears	2025-07-27 17:18:57 +00:00
Felipe	51becde916	Minor fix	2025-07-26 21:01:40 +00:00
Felipe	c30ee73977	Sanitize file_id we were not consistently handling non-UTF-8 characters here, especially after commit `e4487baafc`. This also fixes #25	2025-07-26 20:58:50 +00:00
Felipe	d3466b3387	Bumping version normally I would've yanked the old gem, but that's not working here	2025-07-22 12:41:26 +00:00
Felipe	0250579f0e	Added missing file	2025-07-22 12:38:12 +00:00
Felipe	0663c1c122	Merge pull request #23 from adampweb/master Fixed base image vulnerability	2025-07-21 14:44:43 -03:00
adampweb	93115f70ec	Merge pull request #5 from adampweb/snyk-fix-88576ceadf7e0c41b63a2af504a3c8ae [Snyk] Security upgrade ruby from 3.4.4-alpine to 3.4.5-alpine	2025-07-21 18:46:03 +02:00
snyk-bot	3d37ae10fd	fix: Dockerfile to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-ALPINE322-OPENSSL-10597997 - https://snyk.io/vuln/SNYK-ALPINE322-OPENSSL-10597997	2025-07-21 16:45:10 +00:00
Felipe	bff10e7260	Initial implementation of a composite snapshot see issue #22. TBF	2025-07-21 15:30:49 +00:00
adampweb	ffdce7e4ec	Exclude dev enviroment config	2025-07-20 17:14:09 +02:00
adampweb	e4487baafc	Fix: Handle default case in tidy_bytes	2025-07-20 17:13:36 +02:00