Bump version

Merge pull request #14 from elidickinson/fix-bracket-urls
Fix bug with archive urls containing square brackets
2025-12-29 16:16:06 +00:00 · 2025-06-05 22:34:40 +00:00 · 2025-06-03 23:12:07 -03:00 · 2025-06-03 16:36:03 -04:00 · 2025-05-30 14:34:32 -03:00 · 2025-05-30 12:50:48 -04:00
4 changed files with 77 additions and 32 deletions
--- a/8
+++ b/8
@@ -1,15 +1,15 @@
-FROM ruby:3.4.3-alpine
+FROM ruby:3.4.4-alpine
 USER root
 WORKDIR /build

 COPY Gemfile /build/
 COPY *.gemspec /build/

-RUN gem update \
-    && bundle config set jobs $(nproc) \
+RUN bundle config set jobs "$(nproc)" \
+    && bundle config set without 'development test' \
    && bundle install

 COPY . /build

 WORKDIR /
-ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
+ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader

  include ArchiveAPI

-  VERSION = "2.3.6"
+  VERSION = "2.3.8"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
  end

  def backup_name
-    if @base_url.include? '//'
-      @base_url.split('/')[2]
+    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+    
+    if url_to_process.include? '//'
+      url_to_process.split('/')[2]
    else
-      @base_url
+      url_to_process
    end
  end

@@ -241,6 +243,7 @@ class WaybackMachineDownloader
    # Fetch the initial set of snapshots, sequentially
    @connection_pool.with_connection do |connection|
      initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      initial_list ||= []
      mutex.synchronize do
        snapshot_list_to_consider.concat(initial_list)
        print "."
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
            @connection_pool.with_connection do |connection|
              result = get_raw_list_from_api("#{@base_url}/*", page, connection)
            end
+            result ||= []
            [page, result]
          end
        end
@@ -284,7 +288,7 @@ class WaybackMachineDownloader

        # Process results and check for empty pages
        results.each do |page, result|
-          if result.empty?
+          if result.nil? || result.empty?
            continue_fetching = false
            break
          else
@@ -477,8 +481,8 @@ class WaybackMachineDownloader
        begin
          @connection_pool.with_connection do |connection|
            result_message = download_file(file_remote_info, connection)
-            # for now, assume success if no exception and message doesn't indicate error/skip
-            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+            # assume download success if the result message contains ' -> '
+            if result_message && result_message.include?(' -> ')
               download_success = true
            end
            @download_mutex.synchronize do
@@ -659,11 +663,21 @@ class WaybackMachineDownloader

    begin
      structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
-      if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
-        rewrite_urls_to_relative(file_path)
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
+
+      case status
+      when :saved
+        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
+          rewrite_urls_to_relative(file_path)
+        end
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      when :skipped_not_found
+        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      else
+        # ideally, this case should not be reached if download_with_retry behaves as expected.
+        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
+        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
      end
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
    rescue StandardError => e
      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
      if File.exist?(file_path) and File.size(file_path) == 0
@@ -707,6 +721,9 @@ class WaybackMachineDownloader
        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
      end

+      # Escape square brackets because they are not valid in URI()
+      wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
+
      request = Net::HTTP::Get.new(URI(wayback_url))
      request["Connection"] = "keep-alive"
      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
@@ -714,8 +731,7 @@ class WaybackMachineDownloader

      response = connection.request(request)

-      case response
-      when Net::HTTPSuccess
+      save_response_body = lambda do
        File.open(file_path, "wb") do |file|
          body = response.body
          if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +741,48 @@ class WaybackMachineDownloader
              gz.close
              file.write(decompressed_body)
            rescue Zlib::GzipFile::Error => e
-              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
              file.write(body)
            end
          else
            file.write(body) if body
          end
        end
-      when Net::HTTPRedirection
-        raise "Too many redirects for #{file_url}" if redirect_count >= 2
-        location = response['location']
-        @logger.warn("Redirect found for #{file_url} -> #{location}")
-        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
-      when Net::HTTPTooManyRequests
-        sleep(RATE_LIMIT * 2)
-        raise "Rate limited, retrying..."
-      when Net::HTTPNotFound
-        @logger.warn("File not found, skipping: #{file_url}")
-        return
-      else
-        raise "HTTP Error: #{response.code} #{response.message}"
+      end
+
+      if @all
+        case response
+        when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
+          save_response_body.call
+          if response.is_a?(Net::HTTPRedirection)
+            @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
+          elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
+            @logger.info("Saved error page for #{file_url} (status #{response.code}).")
+          end
+          return :saved
+        else
+          # for any other response type when --all is true, treat as an error to be retried or failed
+          raise "Unhandled HTTP response: #{response.code} #{response.message}"
+        end
+      else # not @all (our default behavior)
+        case response
+        when Net::HTTPSuccess
+          save_response_body.call
+          return :saved
+        when Net::HTTPRedirection
+          raise "Too many redirects for #{file_url}" if redirect_count >= 2
+          location = response['location']
+          @logger.warn("Redirect found for #{file_url} -> #{location}")
+          return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+        when Net::HTTPTooManyRequests
+          sleep(RATE_LIMIT * 2)
+          raise "Rate limited, retrying..."
+        when Net::HTTPNotFound
+          @logger.warn("File not found, skipping: #{file_url}")
+          return :skipped_not_found
+        else
+          raise "HTTP Error: #{response.code} #{response.message}"
+        end
      end

    rescue StandardError => e
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -4,6 +4,13 @@ require 'uri'
 module ArchiveAPI

  def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    if url && !url.match(/^https?:\/\/.*\//i)
+      url = "#{url}/*"
+    end
+
    request_url = URI("https://web.archive.org/cdx/search/cdx")
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)
--- a/wayback_machine_downloader.gemspec
+++ b/wayback_machine_downloader.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
  s.name        = "wayback_machine_downloader_straw"
-  s.version     = "2.3.6"
+  s.version     = "2.3.8"
  s.executables << "wayback_machine_downloader"
  s.summary     = "Download an entire website from the Wayback Machine."
  s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
Author	SHA1	Message	Date
Felipe	3fdfd70fc1	Bump version	2025-06-05 22:34:40 +00:00
Felipe	2bf74b4173	Merge pull request #14 from elidickinson/fix-bracket-urls Fix bug with archive urls containing square brackets	2025-06-03 23:12:07 -03:00
Eli Dickinson	79cbb639e7	Fix bug with archive urls containing square brackets	2025-06-03 16:36:03 -04:00
Felipe	071d208b31	Merge pull request #13 from elidickinson/master workaround for API only showing html files for some domains (fixes #6)	2025-05-30 14:34:32 -03:00
Eli Dickinson	1681a12579	Workaround for API only showing HTML files for some domains. See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6	2025-05-30 12:50:48 -04:00
Felipe	f38756dd76	Correction for downloaded data folder: if you downloaded content from example.org/*, it would be listed in a folder titled * instead of the site name. See #6 (and thanks to elidickinson for pointing it out!)	2025-05-30 14:00:32 +00:00
Felipe	9452411e32	Added nil checks	2025-05-30 13:52:25 +00:00
Felipe	61e22cfe25	Bump versions	2025-05-27 18:10:09 +00:00
Felipe	183ed61104	Attempt at fixing --all I honestly don't recall if this was implemented in the original code, and I'm guessing this worked at some point during this fork. It seems to work correctly now, however. See #6 and #11	2025-05-27 17:17:34 +00:00
Felipe	e6ecf32a43	Dockerfile test 2 I really should not be using deprecated parameters.	2025-05-21 21:34:36 -03:00
Felipe	375c6314ad	Dockerfile test ...again	2025-05-21 21:26:37 -03:00
Felipe	6e2739f5a8	Testing	2025-05-18 18:00:10 +00:00
Felipe	caba6a665f	Rough attempt to make this more efficient	2025-05-18 17:52:28 +00:00