Improve URL detection for timestamps + fix retry download links

another page_requisites fix
page requisites fix
2025-12-29 16:16:06 +00:00 · 2025-12-18 14:41:35 +00:00 · 2025-12-10 12:16:26 +00:00 · 2025-12-10 12:13:39 +00:00 · 2025-12-10 11:59:00 +00:00 · 2025-11-15 12:59:07 +00:00
7 changed files with 318 additions and 88 deletions
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ docker run -it --rm wayback_machine_downloader [options] URL
 As an example of how this works without cloning this repo, this command fetches smallrockets.com until the year 2013:

 ```bash
-docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
+docker run -v .:/build/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
 ```

 ### 🐳 Using Docker Compose
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -86,6 +86,10 @@ option_parser = OptionParser.new do |opts|
    options[:subdomain_depth] = t
  end

+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -15,6 +15,7 @@ require 'digest'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/page_requisites'
 require_relative 'wayback_machine_downloader/subdom_processor'
 require_relative 'wayback_machine_downloader/url_rewrite'

@@ -127,8 +128,9 @@ class WaybackMachineDownloader

  include ArchiveAPI
  include SubdomainProcessor
+  include URLRewrite

-  VERSION = "2.4.4"
+  VERSION = "2.4.5"
  DEFAULT_TIMEOUT = 30
  MAX_RETRIES = 3
  RETRY_DELAY = 2
@@ -142,7 +144,7 @@ class WaybackMachineDownloader
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
-    :snapshot_at
+    :snapshot_at, :page_requisites

  def initialize params
    validate_params(params)
@@ -175,6 +177,8 @@ class WaybackMachineDownloader
    @subdomain_depth = params[:subdomain_depth] || 1
    @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
    @max_retries = params[:max_retries] ? params[:max_retries].to_i : MAX_RETRIES
+    @page_requisites = params[:page_requisites] || false
+    @pending_jobs = Concurrent::AtomicFixnum.new(0)

    # URL for rejecting invalid/unencoded wayback urls
    @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
@@ -183,18 +187,29 @@ class WaybackMachineDownloader
  end

  def backup_name
-    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+    url_to_process = @base_url
+    url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*')
+
    raw = if url_to_process.include?('//')
      url_to_process.split('/')[2]
    else
      url_to_process
    end

+    # if it looks like a wildcard pattern, normalize to a safe host-ish name
+    if raw&.start_with?('*.')
+      raw = raw.sub(/\A\*\./, 'all-')
+    end
+
    # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
    if Gem.win_platform?
      raw = raw.gsub(/[:*?"<>|]/, '_')
      raw = raw.gsub(/[ .]+\z/, '')
+    else
+      # on POSIX, still replace path separators and other unsafe characters (: * ? " < > |)
+      raw = raw.gsub(/[\/:*?"<>|]/, '_')
    end
+
    raw = 'site' if raw.nil? || raw.empty?
    raw
  end
@@ -549,7 +564,7 @@ class WaybackMachineDownloader
      end
    end
  end
-
+  
  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@@ -570,6 +585,12 @@ class WaybackMachineDownloader

    # Load IDs of already downloaded files
    downloaded_ids = load_downloaded_ids
+    
+    # We use a thread-safe Set to track what we have queued/downloaded in this session
+    # to avoid infinite loops with page requisites
+    @session_downloaded_ids = Concurrent::Set.new
+    downloaded_ids.each { |id| @session_downloaded_ids.add(id) }
+
    files_to_process = files_to_download.reject do |file_info|
      downloaded_ids.include?(file_info[:file_id])
    end
@@ -580,8 +601,8 @@ class WaybackMachineDownloader
    if skipped_count > 0
      puts "Found #{skipped_count} previously downloaded files, skipping them."
    end
-
-    if remaining_count == 0
+    
+    if remaining_count == 0 && !@page_requisites
      puts "All matching files have already been downloaded."
      cleanup
      return
@@ -594,12 +615,22 @@ class WaybackMachineDownloader
    @download_mutex = Mutex.new

    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
-    pool = Concurrent::FixedThreadPool.new(thread_count)
+    @worker_pool = Concurrent::FixedThreadPool.new(thread_count)

-    processing_files(pool, files_to_process)
+    # initial batch
+    files_to_process.each do |file_remote_info|
+      @session_downloaded_ids.add(file_remote_info[:file_id])
+      submit_download_job(file_remote_info)
+    end

-    pool.shutdown
-    pool.wait_for_termination
+    # wait for all jobs to finish
+    loop do
+      sleep 0.5
+      break if @pending_jobs.value == 0
+    end
+
+    @worker_pool.shutdown
+    @worker_pool.wait_for_termination

    end_time = Time.now
    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
@@ -617,6 +648,138 @@ class WaybackMachineDownloader
    cleanup
  end

+  # helper to post a job to the worker pool; bumps the pending counter, which the job decrements when it finishes
+  def submit_download_job(file_remote_info)
+    @pending_jobs.increment
+    @worker_pool.post do
+      begin
+        process_single_file(file_remote_info)
+      ensure
+        @pending_jobs.decrement
+      end
+    end
+  end
+
+  def process_single_file(file_remote_info)
+    download_success = false
+    downloaded_path = nil
+    
+    @connection_pool.with_connection do |connection|
+      result_message, path = download_file(file_remote_info, connection)
+      downloaded_path = path
+      
+      if result_message && result_message.include?(' -> ')
+         download_success = true
+      end
+      
+      @download_mutex.synchronize do
+        @processed_file_count += 1 if @processed_file_count < @total_to_download
+        # print the result line for every file (initial or requisite) that returned a message
+        puts result_message if result_message
+      end
+    end
+
+    if download_success
+      append_to_db(file_remote_info[:file_id])
+      
+      if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i
+        process_page_requisites(downloaded_path, file_remote_info)
+      end
+    end
+  rescue => e
+    @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+  end
+  
+  def process_page_requisites(file_path, parent_remote_info)
+    return unless File.exist?(file_path)
+
+    content = File.read(file_path)
+    content = content.force_encoding('UTF-8').scrub
+
+    assets = PageRequisites.extract(content)
+
+    # prepare base URI for resolving relative paths
+    parent_raw = parent_remote_info[:file_url]
+    parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
+    
+    begin
+      base_uri = URI(parent_raw)
+      # calculate the "root" host of the site we are downloading to compare later
+      current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
+    rescue URI::InvalidURIError
+      return
+    end
+
+    parent_timestamp = parent_remote_info[:timestamp]
+
+    assets.each do |asset_rel_url|
+      begin
+        # resolve full URL (handles relative paths like "../img/logo.png")
+        resolved_uri = base_uri + asset_rel_url
+
+        # detect if the asset URL is already a Wayback "/web/<timestamp><optional suffix>/http(s)://..." embed
+        asset_timestamp = parent_timestamp
+        if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z}
+          embedded_ts = $1
+          begin
+            orig_uri = URI($2)
+            resolved_uri = orig_uri
+            asset_timestamp = embedded_ts.to_i
+          rescue URI::InvalidURIError
+            # fall back to original resolved_uri and parent timestamp
+          end
+        end
+
+        # filter out navigation links (pages) vs assets
+        # skip if extension is empty or looks like an HTML page
+        path = resolved_uri.path
+        ext = File.extname(path).downcase
+        if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
+           next 
+        end
+
+        # construct the URL for the Wayback API
+        asset_wbm_url = resolved_uri.host + resolved_uri.path
+        asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
+
+        # construct the local file ID
+        #  if the asset is on the SAME domain, strip the domain from the folder path
+        #  if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
+        if resolved_uri.host == current_project_host
+           # e.g. /static/css/style.css
+           asset_file_id = resolved_uri.path
+           asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
+        else
+           # e.g. cdn.google.com/jquery.js
+           asset_file_id = asset_wbm_url
+        end
+
+      rescue URI::InvalidURIError, StandardError
+        next
+      end
+
+      # sanitize and queue
+      asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
+
+      unless @session_downloaded_ids.include?(asset_id)
+        @session_downloaded_ids.add(asset_id)
+
+        new_file_info = {
+          file_url: asset_wbm_url,
+          timestamp: asset_timestamp,
+          file_id: asset_id
+        }
+
+        @download_mutex.synchronize do
+          @total_to_download += 1
+          puts "Queued requisite: #{asset_file_id}"
+        end
+
+        submit_download_job(new_file_info)
+      end
+    end
+  end
+
  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
@@ -648,7 +811,8 @@ class WaybackMachineDownloader
    begin
      content = File.binread(file_path)

-      if file_ext == '.html' || file_ext == '.htm'
+      # detect encoding for HTML-like files (.html, .htm, .php, .asp)
+      if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp'
        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
      else
@@ -664,13 +828,13 @@ class WaybackMachineDownloader
      # URLs in JavaScript
      content = rewrite_js_urls(content)
      
-      # for URLs in HTML attributes that start with a single slash
+      # for URLs in HTML attributes that start with a single slash, make them relative
      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
        prefix, path, suffix = $1, $2, $3
        "#{prefix}./#{path}#{suffix}"
      end
      
-      # for URLs in CSS that start with a single slash
+      # for URLs in CSS that start with a single slash, make them relative
      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
        path = $1
        "url(\"./#{path}\")"
@@ -723,7 +887,7 @@ class WaybackMachineDownloader
    # check existence *before* download attempt
    # this handles cases where a file was created manually or by a previous partial run without a .db entry
    if File.exist? file_path
-       return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+       return ["#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
    end

    begin
@@ -735,13 +899,13 @@ class WaybackMachineDownloader
        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
          rewrite_urls_to_relative(file_path)
        end
-        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})", file_path]
      when :skipped_not_found
-        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
      else
        # ideally, this case should not be reached if download_with_retry behaves as expected.
        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
-        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+        return ["Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})", nil]
      end
    rescue StandardError => e
      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
@@ -749,7 +913,7 @@ class WaybackMachineDownloader
        File.delete(file_path)
        msg += "\n#{file_path} was empty and was removed."
      end
-      msg
+      return [msg, nil]
    end
  end

--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -16,6 +16,10 @@ module ArchiveAPI
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)

+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
    begin
      response = http.get(request_url)
      body = response.body.to_s.strip
@@ -26,7 +30,21 @@ module ArchiveAPI
      json.shift if json.first == ["timestamp", "original"]
      json
    rescue JSON::ParserError => e
-      warn "Failed to fetch data from API: #{e.message}"
+      warn "Failed to parse JSON from API for #{url}: #{e.message}"
+      []
+    rescue Net::ReadTimeout, Net::OpenTimeout => e
+      if retries < max_retries
+        retries += 1
+        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
+        []
+      end
+    rescue StandardError => e
+      # treat any other transient-ish error similarly, though without retries for now
+      warn "Error fetching CDX data for #{url}: #{e.message}"
      []
    end
  end
--- a/lib/wayback_machine_downloader/page_requisites.rb
+++ b/lib/wayback_machine_downloader/page_requisites.rb
@@ -0,0 +1,33 @@
+module PageRequisites
+  # regex to find links in href, src, data-src, data-url, url(), and srcset
+  # (data: URIs, mailto:, javascript:, and anchors are filtered out afterwards by valid_asset?)
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+    
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+      
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
+end
--- a/lib/wayback_machine_downloader/url_rewrite.rb
+++ b/lib/wayback_machine_downloader/url_rewrite.rb
@@ -1,74 +1,85 @@
 # frozen_string_literal: true

-# URLs in HTML attributes
-def rewrite_html_attr_urls(content)
-  
-  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    prefix, url, suffix = $1, $2, $3
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "#{prefix}#{path}#{suffix}"
-      rescue
-        "#{prefix}#{url}#{suffix}"
-      end
-    elsif url.start_with?('/')
-      "#{prefix}./#{url[1..-1]}#{suffix}"
-    else
-      "#{prefix}#{url}#{suffix}"
-    end
-  end
-  content
-end
+module URLRewrite
+  # server-side extensions that should work locally
+  SERVER_SIDE_EXTS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze

-# URLs in CSS
-def rewrite_css_urls(content)
-
-  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
-    url = $1
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "url(\"#{path}\")"
-      rescue
-        "url(\"#{url}\")"
-      end
-    elsif url.start_with?('/')
-      "url(\"./#{url[1..-1]}\")"
-    else
-      "url(\"#{url}\")"
+  def rewrite_html_attr_urls(content)
+    # rewrite web.archive.org snapshot URLs to relative local paths
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
    end
-  end
-  content
-end

-# URLs in JavaScript
-def rewrite_js_urls(content)
-  
-  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
-    quote_start, url, quote_end = $1, $2, $3
-    
-    if url.start_with?('http')
-      begin
-        uri = URI.parse(url)
-        path = uri.path
-        path = path[1..-1] if path.start_with?('/')
-        "#{quote_start}#{path}#{quote_end}"
-      rescue
-        "#{quote_start}#{url}#{quote_end}"
-      end
-    elsif url.start_with?('/')
-      "#{quote_start}./#{url[1..-1]}#{quote_end}"
-    else
-      "#{quote_start}#{url}#{quote_end}"
+    # rewrite any remaining absolute http(s) URLs to relative local paths (the host is not checked)
+    content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      prefix, path, suffix = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{prefix}#{path}#{suffix}"
    end
+
+    content
+  end
+
+  def rewrite_css_urls(content)
+    # rewrite web.archive.org snapshot URLs in CSS url(...) values
+    content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
+    end
+
+    # rewrite absolute URLs in CSS
+    content.gsub!(/url\(\s*["']?https?:\/\/[^\/]+([^"'\)]*?)["']?\s*\)/i) do
+      path = normalize_path_for_local($1)
+      "url(\"#{path}\")"
+    end
+
+    content
+  end
+
+  def rewrite_js_urls(content)
+    # rewrite archive.org URLs in JavaScript strings
+    content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    # rewrite absolute URLs in JavaScript
+    content.gsub!(/(["'])https?:\/\/[^\/]+([^"']*)(["'])/i) do
+      quote_start, path, quote_end = $1, $2, $3
+      next "#{quote_start}http#{$2}#{quote_end}" if $2.start_with?('s://', '://')
+      path = normalize_path_for_local(path)
+      "#{quote_start}#{path}#{quote_end}"
+    end
+
+    content
+  end
+
+  private
+
+  def normalize_path_for_local(path)
+    return "./index.html" if path.empty? || path == "/"
+    
+    # drop the query string; only the part before '?' is kept
+    path = path.split('?').first if path.include?('?')
+    
+    # check if this is a server-side script
+    ext = File.extname(path).downcase
+    if SERVER_SIDE_EXTS.include?(ext)
+      # keep the path as-is, adding a ./ prefix unless it already starts with ./ or /
+      path = "./#{path}" unless path.start_with?('./', '/')
+    else
+      # regular file handling
+      path = "./#{path}" unless path.start_with?('./', '/')
+      
+      # if it looks like a directory, add index.html
+      if path.end_with?('/') || !path.include?('.')
+        path = "#{path.chomp('/')}/index.html"
+      end
+    end
+    
+    path
  end
-  
-  content
 end
--- a/wayback_machine_downloader.gemspec
+++ b/wayback_machine_downloader.gemspec
@@ -1,12 +1,12 @@
 Gem::Specification.new do |s|
  s.name        = "wayback_machine_downloader_straw"
-  s.version     = "2.4.4"
+  s.version     = "2.4.5"
  s.executables << "wayback_machine_downloader"
  s.summary     = "Download an entire website from the Wayback Machine."
  s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
  s.authors     = ["strawberrymaster"]
  s.email       = "strawberrymaster@vivaldi.net"
-  s.files       = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
+  s.files       = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/page_requisites.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
  s.homepage    = "https://github.com/StrawberryMaster/wayback-machine-downloader"
  s.license     = "MIT"
  s.required_ruby_version = ">= 3.4.3"
Author	SHA1	Message	Date
Felipe	d572207122	Improve URL detection for timestamps + fix retry download links	2025-12-18 14:41:35 +00:00
Felipe	b2fc748c2c	another page_requisites fix	2025-12-10 12:16:26 +00:00
Felipe	8632050c45	page requisites fix	2025-12-10 12:13:39 +00:00
Felipe	2aa694eed0	Initial implementation of --page-requisites see StrawberryMaster/wayback-machine-downloader#39	2025-12-10 11:59:00 +00:00
Felipe	4d2513eca8	Be a bit more tolerant of timeouts here	2025-11-15 12:59:07 +00:00
Felipe	67685b781e	Improve handling for wildcard URLs fixes #38	2025-11-15 12:45:34 +00:00
Felipe	f7c0f1a964	Better support for .php, .asp, and other files when using --local see #37	2025-11-04 23:18:04 +00:00
Nicolai Weitkemper	99da3ca48e	Fix Docker command volume mount path in README (#35 )	2025-10-28 15:30:19 -03:00