Bump version

Merge pull request #14 from elidickinson/fix-bracket-urls
Fix bug with archive urls containing square brackets
2025-12-29 16:16:06 +00:00 · 2025-06-05 22:34:40 +00:00 · 2025-06-03 23:12:07 -03:00 · 2025-06-03 16:36:03 -04:00 · 2025-05-30 14:34:32 -03:00 · 2025-05-30 12:50:48 -04:00
7 changed files with 233 additions and 48 deletions
--- a/14
+++ b/14
@@ -1,11 +1,15 @@
-FROM ruby:3.1.6-alpine
+FROM ruby:3.4.4-alpine
 USER root
 WORKDIR /build
 COPY . /build
-RUN gem update \
+COPY Gemfile /build/
-    && gem install concurrent-ruby \
+COPY *.gemspec /build/
 RUN bundle config set jobs "$(nproc)" \
    && bundle config set without 'development test' \
    && bundle install
 COPY . /build
 WORKDIR /
-ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
+ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
--- a/README.md
+++ b/README.md
@@ -62,6 +62,12 @@ docker build -t wayback_machine_downloader .
 docker run -it --rm wayback_machine_downloader [options] URL
 ```
 or the example without cloning the repo - fetching smallrockets.com until the year 2013:
 ```bash
 docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
 ```
 ### 🐳 Using Docker Compose
 We can also use it with Docker Compose, which makes it easier to extend the setup with additional functionality (such as storing previous downloads in a database):
@@ -74,21 +80,49 @@ services:
    tty: true
    image: wayback_machine_downloader:latest
    container_name: wayback_machine_downloader
    environment:
      - ENVIRONMENT=${ENVIRONMENT:-development}
      - OPTIONS=${OPTIONS:-""}
      - TARGET_URL=${TARGET_URL}
    volumes:
      - .:/build:rw
      - ./websites:/build/websites:rw
    command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
 ```
 #### Usage:
 Now you can build a Docker image named "wayback_machine_downloader" with the following command:
 ```bash
 docker compose up -d --build
 ```
 After that, you must set the TARGET_URL environment variable:
 ```bash
 export TARGET_URL="https://example.com/"
 ```
 The **OPTIONS** environment variable is optional; it may include additional settings, which are described in the "**Advanced usage**" section below.
 Example:
 ```bash
 export OPTIONS="--list -f 20060121"
 ```
 After that, you can run the existing container with the following command:
 ```bash
 docker compose run --rm wayback_machine_downloader https://example.com
 ```
 ## ⚙️ Configuration
 There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
 ```ruby
 DEFAULT_TIMEOUT = 30        # HTTP timeout (in seconds)
-MAX_RETRIES = 3             # Failed request retries
+MAX_RETRIES = 3             # Number of times to retry failed requests
-RETRY_DELAY = 2             # Wait between retries
+RETRY_DELAY = 2             # Wait time between retries (seconds)
-RATE_LIMIT = 0.25           # Throttle between requests
+RATE_LIMIT = 0.25           # Throttle between requests (seconds)
-CONNECTION_POOL_SIZE = 10   # No. of simultaneous connections
+CONNECTION_POOL_SIZE = 10   # Maximum simultaneous connections
-MEMORY_BUFFER_SIZE = 16384  # Size of download buffer
+MEMORY_BUFFER_SIZE = 16384  # Download buffer size (bytes)
 STATE_CDX_FILENAME = '.cdx.json'       # Stores snapshot listing
 STATE_DB_FILENAME = '.downloaded.txt'  # Tracks completed downloads
 ```
 ## 🛠️ Advanced usage
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
    options[:rewritten] = true
  end
  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
    options[:rewrite] = true
  end
  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
    options[:reset] = true
  end
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,8 +6,10 @@ services:
    image: wayback_machine_downloader:latest
    container_name: wayback_machine_downloader
    environment:
-      - ENVIRONMENT=${ENVIRONMENT}
+      - ENVIRONMENT=${DEVELOPMENT:-production}
      - OPTIONS=${OPTIONS:-""}
      - TARGET_URL=${TARGET_URL}
    volumes:
      - .:/build:rw
-      - ./websites:/build/websites:rw
+      - ./websites:/websites:rw
-    command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
+    command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
  include ArchiveAPI
-  VERSION = "2.3.4"
+  VERSION = "2.3.8"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
  def initialize params
    validate_params(params)
@@ -148,15 +148,18 @@ class WaybackMachineDownloader
    @failed_downloads = Concurrent::Array.new
    @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
    @db_mutex = Mutex.new
    @rewrite = params[:rewrite] || false
    handle_reset
  end
  def backup_name
-    if @base_url.include? '//'
+    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
-      @base_url.split('/')[2]
+    
    if url_to_process.include? '//'
      url_to_process.split('/')[2]
    else
-      @base_url
+      url_to_process
    end
  end
@@ -240,6 +243,7 @@ class WaybackMachineDownloader
    # Fetch the initial set of snapshots, sequentially
    @connection_pool.with_connection do |connection|
      initial_list = get_raw_list_from_api(@base_url, nil, connection)
      initial_list ||= []
      mutex.synchronize do
        snapshot_list_to_consider.concat(initial_list)
        print "."
@@ -264,6 +268,7 @@ class WaybackMachineDownloader
            @connection_pool.with_connection do |connection|
              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
            end
            result ||= []
            [page, result]
          end
        end
@@ -283,7 +288,7 @@ class WaybackMachineDownloader
        # Process results and check for empty pages
        results.each do |page, result|
-          if result.empty?
+          if result.nil? || result.empty?
            continue_fetching = false
            break
          else
@@ -476,8 +481,8 @@ class WaybackMachineDownloader
        begin
          @connection_pool.with_connection do |connection|
            result_message = download_file(file_remote_info, connection)
-            # for now, assume success if no exception and message doesn't indicate error/skip
+            # assume download success if the result message contains ' -> '
-            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+            if result_message && result_message.include?(' -> ')
               download_success = true
            end
            @download_mutex.synchronize do
@@ -533,15 +538,109 @@ class WaybackMachineDownloader
    end
  end
  # Rewrites Wayback Machine URLs inside a downloaded file so that they point
  # to relative local paths, then saves the file in place.
  #
  # Five passes are applied, in order:
  #   1. archived URLs in HTML attributes (href, src, action, data-src, data-url)
  #   2. archived URLs in CSS `url(...)` expressions
  #   3. any remaining quoted archived URLs (e.g. in JavaScript)
  #   4. HTML attribute values starting with a single slash
  #   5. CSS `url(...)` values starting with a single slash
  #
  # file_path - path of the file to rewrite. Nothing happens if it does not exist.
  #
  # Returns nil. A file that disappears between the existence check and the
  # read is logged as a warning; other I/O errors propagate to the caller.
  def rewrite_urls_to_relative(file_path)
    return unless File.exist?(file_path)

    file_ext = File.extname(file_path).downcase

    begin
      content = File.binread(file_path)

      if file_ext == '.html' || file_ext == '.htm'
        # Honor a <meta charset=...> declaration; fall back to UTF-8 when the
        # declaration is missing or names an encoding Ruby does not know.
        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
        begin
          content.force_encoding(encoding)
        rescue StandardError
          content.force_encoding('UTF-8')
        end
      else
        content.force_encoding('UTF-8')
      end

      # Regexp matching raises on strings whose bytes are invalid for their
      # encoding, and on encodings that are not ASCII-compatible. Treat such
      # content as raw bytes so the ASCII-only patterns below still apply and
      # the remaining bytes are written back untouched.
      unless content.encoding.ascii_compatible? && content.valid_encoding?
        content.force_encoding(Encoding::BINARY)
      end

      # URLs in HTML attributes
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
        prefix, url, suffix = $1, $2, $3
        "#{prefix}#{relativize_archived_url(url)}#{suffix}"
      end

      # URLs in CSS
      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
        url = $1
        "url(\"#{relativize_archived_url(url)}\")"
      end

      # URLs in JavaScript
      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
        quote_start, url, quote_end = $1, $2, $3
        "#{quote_start}#{relativize_archived_url(url)}#{quote_end}"
      end

      # for URLs in HTML attributes that start with a single slash
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
        prefix, path, suffix = $1, $2, $3
        "#{prefix}./#{path}#{suffix}"
      end

      # for URLs in CSS that start with a single slash
      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
        path = $1
        "url(\"./#{path}\")"
      end

      # save the modified content back to the file
      File.binwrite(file_path, content)
      puts "Rewrote URLs in #{file_path} to be relative."
    rescue Errno::ENOENT => e
      @logger.warn("Error reading file #{file_path}: #{e.message}")
    end
  end

  # Internal helper for rewrite_urls_to_relative: converts the original URL
  # that followed the Wayback Machine prefix into a relative reference.
  #
  # url - the captured remainder of an archived URL.
  #
  # Returns the URL's path without its leading slash when url starts with
  # "http" (or url unchanged if it cannot be parsed), "./" plus the rest when
  # url starts with a slash, and url unchanged otherwise.
  def relativize_archived_url(url)
    if url.start_with?('http')
      begin
        path = URI.parse(url).path
        path.start_with?('/') ? path[1..-1] : path
      rescue StandardError
        url
      end
    elsif url.start_with?('/')
      "./#{url[1..-1]}"
    else
      url
    end
  end
  def download_file (file_remote_info, http)
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
-    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+    file_path_elements = file_id.split('/')
    file_path_elements = original_file_id.split('/')
-    if original_file_id == ""
+    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -564,11 +663,24 @@ class WaybackMachineDownloader
    begin
      structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+
      case status
      when :saved
        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
          rewrite_urls_to_relative(file_path)
        end
        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
      when :skipped_not_found
        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
      else
        # ideally, this case should not be reached if download_with_retry behaves as expected.
        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
      end
    rescue StandardError => e
      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
-      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+      if File.exist?(file_path) and File.size(file_path) == 0
        File.delete(file_path)
        msg += "\n#{file_path} was empty and was removed."
      end
@@ -609,6 +721,9 @@ class WaybackMachineDownloader
        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
      end
      # Escape square brackets because they are not valid in URI()
      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
@@ -616,8 +731,7 @@ class WaybackMachineDownloader
      response = connection.request(request)
-      case response
+      save_response_body = lambda do
      when Net::HTTPSuccess
        File.open(file_path, "wb") do |file|
          body = response.body
          if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -627,26 +741,48 @@ class WaybackMachineDownloader
              gz.close
              file.write(decompressed_body)
            rescue Zlib::GzipFile::Error => e
-              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
              file.write(body)
            end
          else
            file.write(body) if body
          end
        end
-      when Net::HTTPRedirection
+      end
-        raise "Too many redirects for #{file_url}" if redirect_count >= 2
+
-        location = response['location']
+      if @all
-        @logger.warn("Redirect found for #{file_url} -> #{location}")
+        case response
-        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+        when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
-      when Net::HTTPTooManyRequests
+          save_response_body.call
-        sleep(RATE_LIMIT * 2)
+          if response.is_a?(Net::HTTPRedirection)
-        raise "Rate limited, retrying..."
+            @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
-      when Net::HTTPNotFound
+          elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
-        @logger.warn("File not found, skipping: #{file_url}")
+            @logger.info("Saved error page for #{file_url} (status #{response.code}).")
-        return
+          end
-      else
+          return :saved
-        raise "HTTP Error: #{response.code} #{response.message}"
+        else
          # for any other response type when --all is true, treat as an error to be retried or failed
          raise "Unhandled HTTP response: #{response.code} #{response.message}"
        end
      else # not @all (our default behavior)
        case response
        when Net::HTTPSuccess
          save_response_body.call
          return :saved
        when Net::HTTPRedirection
          raise "Too many redirects for #{file_url}" if redirect_count >= 2
          location = response['location']
          @logger.warn("Redirect found for #{file_url} -> #{location}")
          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
        when Net::HTTPTooManyRequests
          sleep(RATE_LIMIT * 2)
          raise "Rate limited, retrying..."
        when Net::HTTPNotFound
          @logger.warn("File not found, skipping: #{file_url}")
          return :skipped_not_found
        else
          raise "HTTP Error: #{response.code} #{response.message}"
        end
      end
    rescue StandardError => e
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -4,6 +4,13 @@ require 'uri'
 module ArchiveAPI
  def get_raw_list_from_api(url, page_index, http)
    # Automatically append /* if the URL doesn't contain a path after the domain
    # This is a workaround for an issue with the API and *some* domains.
    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
    if url && !url.match(/^https?:\/\/.*\//i)
      url = "#{url}/*"
    end
    request_url = URI("https://web.archive.org/cdx/search/cdx")
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)
--- a/wayback_machine_downloader.gemspec
+++ b/wayback_machine_downloader.gemspec
@@ -1,8 +1,6 @@
 require './lib/wayback_machine_downloader'
 Gem::Specification.new do |s|
  s.name        = "wayback_machine_downloader_straw"
-  s.version     = WaybackMachineDownloader::VERSION
+  s.version     = "2.3.8"
  s.executables << "wayback_machine_downloader"
  s.summary     = "Download an entire website from the Wayback Machine."
  s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
@@ -11,7 +9,7 @@ Gem::Specification.new do |s|
  s.files       = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
  s.homepage    = "https://github.com/StrawberryMaster/wayback-machine-downloader"
  s.license     = "MIT"
-  s.required_ruby_version = ">= 1.9.2"
+  s.required_ruby_version = ">= 3.4.3"
  s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
  s.add_development_dependency "rake", "~> 12.2"
  s.add_development_dependency "minitest", "~> 5.2"
Author	SHA1	Message	Date
Felipe	3fdfd70fc1	Bump version	2025-06-05 22:34:40 +00:00
Felipe	2bf74b4173	Merge pull request #14 from elidickinson/fix-bracket-urls Fix bug with archive urls containing square brackets	2025-06-03 23:12:07 -03:00
Eli Dickinson	79cbb639e7	Fix bug with archive urls containing square brackets	2025-06-03 16:36:03 -04:00
Felipe	071d208b31	Merge pull request #13 from elidickinson/master workaround for API only showing html files for some domains (fixes #6)	2025-05-30 14:34:32 -03:00
Eli Dickinson	1681a12579	workaround for API only showing html files for some domains See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6	2025-05-30 12:50:48 -04:00
Felipe	f38756dd76	Correction for downloaded data folder if you downloaded content from example.org/, it would be listed in a folder titled instead of the sitename. See #6 (and thanks to elidickinson for pointing it out!)	2025-05-30 14:00:32 +00:00
Felipe	9452411e32	Added nil checks	2025-05-30 13:52:25 +00:00
Felipe	61e22cfe25	Bump versions	2025-05-27 18:10:09 +00:00
Felipe	183ed61104	Attempt at fixing --all I honestly don't recall if this was implemented in the original code, and I'm guessing this worked at some point during this fork. It seems to work correctly now, however. See #6 and #11	2025-05-27 17:17:34 +00:00
Felipe	e6ecf32a43	Dockerfile test 2 I really should not be using deprecated parameters.	2025-05-21 21:34:36 -03:00
Felipe	375c6314ad	Dockerfile test ...again	2025-05-21 21:26:37 -03:00
Felipe	6e2739f5a8	Testing	2025-05-18 18:00:10 +00:00
Felipe	caba6a665f	Rough attempt to make this more efficient	2025-05-18 17:52:28 +00:00
Felipe	ab4324c0eb	Bumping to 2.3.6	2025-05-18 16:49:44 +00:00
Felipe	e28d7d578b	Experimental ability to rewrite URLs to local browsing	2025-05-18 16:48:50 +00:00
Felipe	a7a25574cf	Merge pull request #10 from adampweb/master Using ghcr.io for pulling Docker image	2025-05-15 08:50:33 -03:00
Felipe	23cc3d69b1	Merge pull request #9 from adampweb/feature/increase-performance Increase performance of Bundler processes	2025-05-15 08:50:04 -03:00
adampweb	01fa1f8c9f	Merge pull request #2 from vitaly-zdanevich/patch-1 README.md: add docker example without cloning the repo	2025-05-14 21:19:11 +02:00
adampweb	d2f98d9428	Merge remote-tracking branch 'upstream/master' into feature/increase-performance	2025-05-14 15:41:07 +02:00
adampweb	c7a5381eaf	Using nproc in Bundler processes	2025-05-14 15:03:22 +02:00
Felipe	9709834e20	Merge pull request #8 from adampweb/master Fix: delete empty files, Compose command fixes	2025-05-12 10:36:10 -03:00
adampweb	77998372cb	Docker: If you load any component of the app before (or during) the Docker build process, it may cause failures	2025-05-11 20:05:00 +02:00
adampweb	2c789b7df6	Restructure Docker Compose config	2025-05-11 11:27:08 +02:00
adampweb	1ef8c14c48	Removed unused variable from `if` condition	2025-05-11 10:57:36 +02:00
Felipe	780e45343f	Merge pull request #7 from adampweb/master Vulnerablity fix: Ruby 3.x	2025-05-10 11:34:07 -03:00
adampweb	42e6d62284	Merge remote-tracking branch 'upstream/master'	2025-05-09 20:17:01 +02:00
adampweb	543161d7fb	Supplement of docs	2025-05-09 19:54:15 +02:00
adampweb	99a6de981e	Env. vars: set default values and related docs	2025-05-09 19:38:39 +02:00
adampweb	d85c880d23	Vulnerablity fix: Updates Ruby version to address vulnerability Updates the Ruby version in the Dockerfile to the latest stable release in the 3.x series to address identified vulnerabilities. Details: https://hub.docker.com/layers/library/ruby/3.1.6-alpine/images/sha256-7ff1261ca74033c38e86b04e30a6078567ec17e59d465d96250665897fb52180	2025-05-09 18:32:47 +02:00
Felipe	917f4f8798	Bumping version	2025-04-30 13:05:30 +00:00
Felipe	787bc2e535	Added missing configs	2025-04-30 13:05:21 +00:00
Felipe	4db13a7792	Fix --all-timestamps we were accidentally removing the timestamp prefix from `file_id`, rendering that option useless in 2.3.4. This should again now. This will fix #4	2025-04-30 13:01:29 +00:00
Vitaly Zdanevich	13e88ce04a	README.md: add -v .:/websites	2025-04-14 10:56:01 +04:00
Vitaly Zdanevich	c7fc7c7b58	README.md: add docker example without cloning the repo	2025-04-14 10:43:49 +04:00