Merge remote-tracking branch 'upstream/master'

adampweb 2025-05-09 20:17:01 +02:00
commit 42e6d62284
5 changed files with 240 additions and 59 deletions


@@ -108,14 +108,15 @@ docker compose run --rm wayback_machine_downloader https://example.com
## ⚙️ Configuration
There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
```ruby
DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
-MAX_RETRIES = 3 # Failed request retries
-RETRY_DELAY = 2 # Wait between retries
-RATE_LIMIT = 0.25 # Throttle between requests
-CONNECTION_POOL_SIZE = 10 # No. of simultaneous connections
-MEMORY_BUFFER_SIZE = 16384 # Size of download buffer
+MAX_RETRIES = 3 # Number of times to retry failed requests
+RETRY_DELAY = 2 # Wait time between retries (seconds)
+RATE_LIMIT = 0.25 # Throttle between requests (seconds)
+CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
+MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
+STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
+STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
```
## 🛠️ Advanced usage
@@ -244,6 +245,29 @@ ruby wayback_machine_downloader https://example.com --list
```
It will only display the files to be downloaded, with their snapshot timestamps and URLs, as JSON. Nothing is downloaded. This is useful for debugging or for feeding the list into another application.
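As a rough sketch, the snippet below repeats the listing command and notes the shape an entry might take; the field names are assumptions drawn from the downloader's internal file records, not guaranteed output.
```bash
# List matching files without downloading anything; the output is a JSON array
ruby wayback_machine_downloader https://example.com --list
# an entry is expected to look roughly like (illustrative only):
# { "file_url": "https://example.com/index.html", "timestamp": 20230101120000, "file_id": "index.html" }
```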
### Job management
The downloader automatically saves its progress (`.cdx.json` for the snapshot list, `.downloaded.txt` for completed files) in the output directory. If you run the same command again pointing to the same output directory, it will resume where it left off, skipping already downloaded files.
> [!NOTE]
> Automatic resumption can be affected by changing the URL, the mode (such as `--all-timestamps`), the filter options, or other settings. If you want to ensure a clean start, use the `--reset` option.
| Option | Description |
|--------|-------------|
| `--reset` | Delete state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. |
| `--keep` | Keep state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. |
**Example 1** - Restart a download job from the beginning:
```bash
ruby wayback_machine_downloader https://example.com --reset
```
This is useful if you suspect the state files are corrupted or want to ensure a completely fresh download process without deleting the files you already have.
**Example 2** - Keep state files after download:
```bash
ruby wayback_machine_downloader https://example.com --keep
```
This can be useful for debugging or if you plan to extend the download later with different parameters (e.g., adding `--to` timestamp) while leveraging the existing snapshot list.
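As a sketch of that workflow (the timestamp below is only a placeholder, and whether the cached snapshot list is reused after changing filters is subject to the note above):
```bash
# First run: download and keep the state files for later reuse
ruby wayback_machine_downloader https://example.com --keep

# Later run: reuse the kept snapshot list while restricting the time range
# (20240101000000 is a placeholder timestamp)
ruby wayback_machine_downloader https://example.com --to 20240101000000
```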
## 🤝 Contributing
1. Fork the repository
2. Create a feature branch


@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
end
opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-options[:rewritten] = t
+options[:rewritten] = true
end
opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
options[:reset] = true
end
opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
options[:keep] = true
end
opts.on("-v", "--version", "Display version") do |t|


@@ -9,6 +9,8 @@ require 'json'
require 'time'
require 'concurrent-ruby'
require 'logger'
require 'zlib'
require 'stringio'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader
include ArchiveAPI
-VERSION = "2.3.3"
+VERSION = "2.3.5"
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3
RETRY_DELAY = 2
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
STATE_CDX_FILENAME = ".cdx.json"
STATE_DB_FILENAME = ".downloaded.txt"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-:all, :maximum_pages, :threads_count, :logger
+:all, :maximum_pages, :threads_count, :logger, :reset, :keep
def initialize params
validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@reset = params[:reset]
@keep = params[:keep]
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@failed_downloads = Concurrent::Array.new
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
@db_mutex = Mutex.new
handle_reset
end
def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
end
end
def cdx_path
File.join(backup_path, STATE_CDX_FILENAME)
end
def db_path
File.join(backup_path, STATE_DB_FILENAME)
end
def handle_reset
if @reset
puts "Resetting download state..."
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
puts "Removed state files: #{cdx_path}, #{db_path}"
end
end
def match_only_filter file_url
if @only_filter
only_filter_regex = @only_filter.to_regex
@@ -190,10 +216,26 @@ class WaybackMachineDownloader
end
def get_all_snapshots_to_consider
if File.exist?(cdx_path) && !@reset
puts "Loading snapshot list from #{cdx_path}"
begin
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
puts
return Concurrent::Array.new(snapshot_list_to_consider)
rescue JSON::ParserError => e
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
rescue => e
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
end
end
snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
-puts "Getting snapshot pages"
+puts "Getting snapshot pages from Wayback Machine API..."
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection|
@@ -209,12 +251,12 @@ class WaybackMachineDownloader
page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
@@ -225,9 +267,9 @@ class WaybackMachineDownloader
[page, result]
end
end
results = []
futures.each do |future|
begin
results << future.value
@@ -235,10 +277,10 @@ class WaybackMachineDownloader
puts "\nError fetching page #{future}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.empty?
@@ -251,14 +293,23 @@ class WaybackMachineDownloader
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
-puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+puts " found #{snapshot_list_to_consider.length} snapshots."
# Save the fetched list to the cache file
begin
FileUtils.mkdir_p(File.dirname(cdx_path))
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
puts "Saved snapshot list to #{cdx_path}"
rescue => e
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
end
puts
snapshot_list_to_consider
@@ -348,32 +399,103 @@ class WaybackMachineDownloader
puts "]"
end
def load_downloaded_ids
downloaded_ids = Set.new
if File.exist?(db_path) && !@reset
puts "Loading list of already downloaded files from #{db_path}"
begin
File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
rescue => e
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
downloaded_ids.clear
end
end
downloaded_ids
end
def append_to_db(file_id)
@db_mutex.synchronize do
begin
FileUtils.mkdir_p(File.dirname(db_path))
File.open(db_path, 'a') { |f| f.puts(file_id) }
rescue => e
@logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
end
end
end
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-if file_list_by_timestamp.empty?
-puts "No files to download."
+FileUtils.mkdir_p(backup_path)
+# Load the list of files to potentially download
+files_to_download = file_list_by_timestamp
+if files_to_download.empty?
+puts "No files found matching criteria."
cleanup
return
end
-total_files = file_list_by_timestamp.count
-puts "#{total_files} files to download:"
+total_files = files_to_download.count
+puts "#{total_files} files found matching criteria."
# Load IDs of already downloaded files
downloaded_ids = load_downloaded_ids
files_to_process = files_to_download.reject do |file_info|
downloaded_ids.include?(file_info[:file_id])
end
remaining_count = files_to_process.count
skipped_count = total_files - remaining_count
if skipped_count > 0
puts "Found #{skipped_count} previously downloaded files, skipping them."
end
if remaining_count == 0
puts "All matching files have already been downloaded."
cleanup
return
end
puts "#{remaining_count} files to download:"
@processed_file_count = 0
@total_to_download = remaining_count
@download_mutex = Mutex.new
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
-file_list_by_timestamp.each do |file_remote_info|
+files_to_process.each do |file_remote_info|
pool.post do
-@connection_pool.with_connection do |connection|
-result = download_file(file_remote_info, connection)
-@download_mutex.synchronize do
-@processed_file_count += 1
-puts result if result
+download_success = false
+begin
+@connection_pool.with_connection do |connection|
+result_message = download_file(file_remote_info, connection)
+# for now, assume success if no exception and message doesn't indicate error/skip
+if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+download_success = true
+end
+@download_mutex.synchronize do
+@processed_file_count += 1
+# adjust progress message to reflect remaining files
+progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+puts progress_message if progress_message
+end
+end
+# append to DB only after successful download, outside the connection block
+if download_success
+append_to_db(file_remote_info[:file_id])
+end
+rescue => e
+@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+@download_mutex.synchronize do
+@processed_file_count += 1
+end
+end
sleep(RATE_LIMIT)
end
@@ -383,7 +505,8 @@ class WaybackMachineDownloader
pool.wait_for_termination
end_time = Time.now
-puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+puts "Results saved in #{backup_path}"
cleanup
end
@@ -431,21 +554,24 @@ class WaybackMachineDownloader
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
end
-unless File.exist? file_path
-begin
-structure_dir_path dir_path
-download_with_retry(file_path, file_url, file_timestamp, http)
-"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
-rescue StandardError => e
-msg = "#{file_url} # #{e}"
-if not @all and File.exist?(file_path) and File.size(file_path) == 0
-File.delete(file_path)
-msg += "\n#{file_path} was empty and was removed."
-end
-msg
+# check existence *before* download attempt
+# this handles cases where a file was created manually or by a previous partial run without a .db entry
+if File.exist? file_path
+return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+end
+begin
+structure_dir_path dir_path
+download_with_retry(file_path, file_url, file_timestamp, http)
+"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+rescue StandardError => e
+msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+if not @all and File.exist?(file_path) and File.size(file_path) == 0
+File.delete(file_path)
+msg += "\n#{file_path} was empty and was removed."
+end
-else
-"#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
msg
end
end
@@ -478,23 +604,33 @@ class WaybackMachineDownloader
begin
wayback_url = if @rewritten
"https://web.archive.org/web/#{file_timestamp}/#{file_url}"
else
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
end
request = Net::HTTP::Get.new(URI(wayback_url))
request["Connection"] = "keep-alive"
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
request["Accept-Encoding"] = "gzip, deflate"
response = connection.request(request)
case response
when Net::HTTPSuccess
File.open(file_path, "wb") do |file|
-if block_given?
-yield(response, file)
+body = response.body
+if response['content-encoding'] == 'gzip' && body && !body.empty?
+begin
+gz = Zlib::GzipReader.new(StringIO.new(body))
+decompressed_body = gz.read
+gz.close
+file.write(decompressed_body)
+rescue Zlib::GzipFile::Error => e
+@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+file.write(body)
+end
else
-file.write(response.body)
+file.write(body) if body
end
end
when Net::HTTPRedirection
@@ -511,7 +647,7 @@ class WaybackMachineDownloader
else
raise "HTTP Error: #{response.code} #{response.message}"
end
rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@@ -527,12 +663,25 @@ class WaybackMachineDownloader
def cleanup
@connection_pool.shutdown
if @failed_downloads.any?
@logger.error("Download completed with errors.")
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
unless @reset
puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
return
end
end
if !@keep || @reset
puts "Cleaning up state files..." unless @keep && !@reset
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
elsif @keep
puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
end
end
end


@@ -4,7 +4,7 @@ require 'uri'
module ArchiveAPI
def get_raw_list_from_api(url, page_index, http)
-request_url = URI("https://web.archive.org/cdx/search/xd")
+request_url = URI("https://web.archive.org/cdx/search/cdx")
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
request_url.query = URI.encode_www_form(params)