Added support for resuming incomplete downloads
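Download progress is now tracked in two state files inside the backup directory: .cdx.json caches the snapshot list fetched from the CDX API, and .downloaded.txt records the ID of each file that finished downloading. Re-running the same command skips everything already recorded; a new :reset option wipes the state files and starts fresh, while :keep preserves them even after a successful run. Also fixes the CDX API endpoint URL ("xd" -> "cdx").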

Author: Felipe
Date: 2025-04-19 13:40:14 +00:00
Committed by: GitHub
parent 27dd619aa4
commit febffe5de4
4 changed files with 216 additions and 47 deletions
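
For orientation, a minimal usage sketch of the resume flow from Ruby. Only the :reset and :keep params are confirmed by this diff; the require name, the :base_url param, and the default websites/<host> backup layout are assumptions based on the surrounding gem:

require 'wayback_machine_downloader'

# First run: fetches the snapshot list, caches it to .cdx.json, and
# appends each completed file ID to .downloaded.txt as it goes.
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',   # assumed param name
  threads_count: 4,
  keep: true  # keep the state files even after a clean finish
)
downloader.download_files

# If the run is interrupted, calling download_files again skips every
# ID already recorded; pass reset: true to discard the state instead.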


@@ -120,10 +120,12 @@ class WaybackMachineDownloader
RATE_LIMIT = 0.25 # Delay between requests in seconds
CONNECTION_POOL_SIZE = 10
MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
STATE_CDX_FILENAME = ".cdx.json"
STATE_DB_FILENAME = ".downloaded.txt"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
:all, :maximum_pages, :threads_count, :logger
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
def initialize params
validate_params(params)
@@ -139,10 +141,15 @@ class WaybackMachineDownloader
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
@threads_count = [params[:threads_count].to_i, 1].max
@rewritten = params[:rewritten]
@reset = params[:reset]
@keep = params[:keep]
@timeout = params[:timeout] || DEFAULT_TIMEOUT
@logger = setup_logger
@failed_downloads = Concurrent::Array.new
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
@db_mutex = Mutex.new
handle_reset
end
def backup_name
@@ -165,6 +172,23 @@ class WaybackMachineDownloader
end
end
def cdx_path
File.join(backup_path, STATE_CDX_FILENAME)
end
def db_path
File.join(backup_path, STATE_DB_FILENAME)
end
def handle_reset
if @reset
puts "Resetting download state..."
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
puts "Removed state files: #{cdx_path}, #{db_path}"
end
end
def match_only_filter file_url
if @only_filter
only_filter_regex = @only_filter.to_regex
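
Deleting the two state files by hand has the same effect as handle_reset; a sketch, assuming the gem's default websites/<host> backup directory:

require 'fileutils'

FileUtils.rm_f('websites/example.com/.cdx.json')        # snapshot cache
FileUtils.rm_f('websites/example.com/.downloaded.txt')  # download log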
@@ -192,10 +216,26 @@ class WaybackMachineDownloader
end
def get_all_snapshots_to_consider
if File.exist?(cdx_path) && !@reset
puts "Loading snapshot list from #{cdx_path}"
begin
snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
puts
return Concurrent::Array.new(snapshot_list_to_consider)
rescue JSON::ParserError => e
puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
rescue => e
puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
FileUtils.rm_f(cdx_path)
end
end
snapshot_list_to_consider = Concurrent::Array.new
mutex = Mutex.new
puts "Getting snapshot pages"
puts "Getting snapshot pages from Wayback Machine API..."
# Fetch the initial set of snapshots, sequentially
@connection_pool.with_connection do |connection|
@@ -211,12 +251,12 @@ class WaybackMachineDownloader
page_index = 0
batch_size = [@threads_count, 5].min
continue_fetching = true
while continue_fetching && page_index < @maximum_pages
# Determine the range of pages to fetch in this batch
end_index = [page_index + batch_size, @maximum_pages].min
current_batch = (page_index...end_index).to_a
# Create futures for concurrent API calls
futures = current_batch.map do |page|
Concurrent::Future.execute do
@@ -227,9 +267,9 @@ class WaybackMachineDownloader
[page, result]
end
end
results = []
futures.each do |future|
begin
results << future.value
@@ -237,10 +277,10 @@ class WaybackMachineDownloader
puts "\nError fetching page #{future}: #{e.message}"
end
end
# Sort results by page number to maintain order
results.sort_by! { |page, _| page }
# Process results and check for empty pages
results.each do |page, result|
if result.empty?
@@ -253,14 +293,23 @@ class WaybackMachineDownloader
end
end
end
page_index = end_index
sleep(RATE_LIMIT) if continue_fetching
end
end
puts " found #{snapshot_list_to_consider.length} snapshots to consider."
puts " found #{snapshot_list_to_consider.length} snapshots."
# Save the fetched list to the cache file
begin
FileUtils.mkdir_p(File.dirname(cdx_path))
File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
puts "Saved snapshot list to #{cdx_path}"
rescue => e
puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
end
puts
snapshot_list_to_consider
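
Because the cache is plain JSON, it can be inspected (or pruned) between runs; a minimal sketch, assuming the default backup layout and the array written by JSON.pretty_generate above:

require 'json'

snapshots = JSON.parse(File.read('websites/example.com/.cdx.json'))
puts "#{snapshots.length} cached snapshots"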
@@ -350,32 +399,103 @@ class WaybackMachineDownloader
puts "]"
end
def load_downloaded_ids
downloaded_ids = Set.new
if File.exist?(db_path) && !@reset
puts "Loading list of already downloaded files from #{db_path}"
begin
File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
rescue => e
puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
downloaded_ids.clear
end
end
downloaded_ids
end
def append_to_db(file_id)
@db_mutex.synchronize do
begin
FileUtils.mkdir_p(File.dirname(db_path))
File.open(db_path, 'a') { |f| f.puts(file_id) }
rescue => e
@logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
end
end
end
def download_files
start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
if file_list_by_timestamp.empty?
puts "No files to download."
FileUtils.mkdir_p(backup_path)
# Load the list of files to potentially download
files_to_download = file_list_by_timestamp
if files_to_download.empty?
puts "No files found matching criteria."
cleanup
return
end
total_files = file_list_by_timestamp.count
puts "#{total_files} files to download:"
total_files = files_to_download.count
puts "#{total_files} files found matching criteria."
# Load IDs of already downloaded files
downloaded_ids = load_downloaded_ids
files_to_process = files_to_download.reject do |file_info|
downloaded_ids.include?(file_info[:file_id])
end
remaining_count = files_to_process.count
skipped_count = total_files - remaining_count
if skipped_count > 0
puts "Found #{skipped_count} previously downloaded files, skipping them."
end
if remaining_count == 0
puts "All matching files have already been downloaded."
cleanup
return
end
puts "#{remaining_count} files to download:"
@processed_file_count = 0
@total_to_download = remaining_count
@download_mutex = Mutex.new
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count)
file_list_by_timestamp.each do |file_remote_info|
files_to_process.each do |file_remote_info|
pool.post do
@connection_pool.with_connection do |connection|
result = download_file(file_remote_info, connection)
@download_mutex.synchronize do
@processed_file_count += 1
puts result if result
download_success = false
begin
@connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection)
# for now, assume success if no exception and message doesn't indicate error/skip
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
download_success = true
end
@download_mutex.synchronize do
@processed_file_count += 1
# adjust progress message to reflect remaining files
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
puts progress_message if progress_message
end
end
# append to DB only after a successful download, outside the connection block
if download_success
append_to_db(file_remote_info[:file_id])
end
rescue => e
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
@download_mutex.synchronize do
@processed_file_count += 1
end
end
sleep(RATE_LIMIT)
end
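
Two things worth noting in the loop above. The resume filter is an ordinary Set difference keyed on :file_id, and success detection is string-based for now (as the inline comment says), so any result whose message mentions an error, failure, or skip is left out of .downloaded.txt and will be retried on the next run. The filter pattern in isolation, where all_files is a hypothetical array of { file_id:, file_url:, timestamp: } hashes:

require 'set'

downloaded_ids = Set.new(File.foreach('.downloaded.txt').map(&:strip))
files_to_process = all_files.reject { |f| downloaded_ids.include?(f[:file_id]) }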
@@ -385,7 +505,8 @@ class WaybackMachineDownloader
pool.wait_for_termination
end_time = Time.now
puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
puts "Results saved in #{backup_path}"
cleanup
end
@@ -417,9 +538,10 @@ class WaybackMachineDownloader
file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id]
file_timestamp = file_remote_info[:timestamp]
file_path_elements = file_id.split('/')
original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
file_path_elements = original_file_id.split('/')
if file_id == ""
if original_file_id == ""
dir_path = backup_path
file_path = backup_path + 'index.html'
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -433,21 +555,24 @@ class WaybackMachineDownloader
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
end
unless File.exist? file_path
begin
structure_dir_path dir_path
download_with_retry(file_path, file_url, file_timestamp, http)
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
rescue StandardError => e
msg = "#{file_url} # #{e}"
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
msg
# check existence *before* download attempt
# this handles cases where a file was created manually or by a previous partial run without a .db entry
if File.exist? file_path
return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
end
begin
structure_dir_path dir_path
download_with_retry(file_path, file_url, file_timestamp, http)
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
rescue StandardError => e
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
if not @all and File.exist?(file_path) and File.size(file_path) == 0
File.delete(file_path)
msg += "\n#{file_path} was empty and was removed."
end
else
"#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
msg
end
end
@@ -523,7 +648,7 @@ class WaybackMachineDownloader
else
raise "HTTP Error: #{response.code} #{response.message}"
end
rescue StandardError => e
if retries < MAX_RETRIES
retries += 1
@@ -539,12 +664,25 @@ class WaybackMachineDownloader
def cleanup
@connection_pool.shutdown
if @failed_downloads.any?
@logger.error("Download completed with errors.")
@logger.error("Failed downloads summary:")
@failed_downloads.each do |failure|
@logger.error(" #{failure[:url]} - #{failure[:error]}")
end
unless @reset
puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
return
end
end
if !@keep || @reset
puts "Cleaning up state files..." unless @keep && !@reset
FileUtils.rm_f(cdx_path)
FileUtils.rm_f(db_path)
elsif @keep
puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
end
end
end
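
Net effect of cleanup: after a run with failures the state files are kept (unless :reset was given) so the next invocation can resume, and after a clean run they are removed unless :keep asks for them to be preserved.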


@@ -4,7 +4,7 @@ require 'uri'
module ArchiveAPI
def get_raw_list_from_api(url, page_index, http)
request_url = URI("https://web.archive.org/cdx/search/xd")
request_url = URI("https://web.archive.org/cdx/search/cdx")
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
request_url.query = URI.encode_www_form(params)
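
The one-character change above points requests at the real CDX endpoint. A self-contained sketch of the URL the corrected code builds, leaving out parameters_for_api since its output is not shown in this diff:

require 'uri'

request_url = URI('https://web.archive.org/cdx/search/cdx')
request_url.query = URI.encode_www_form([['output', 'json'], ['url', 'example.com']])
puts request_url
# => https://web.archive.org/cdx/search/cdx?output=json&url=example.com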