From 576298dca8f3312a031b0009d38f7f7b7b10b2a1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:05:09 +0000
Subject: [PATCH 1/7] License fix

---
 LICENSE.md => LICENSE | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename LICENSE.md => LICENSE (100%)

diff --git a/LICENSE.md b/LICENSE
similarity index 100%
rename from LICENSE.md
rename to LICENSE

From 27dd619aa4a025fe7853a8a179e74f92b8b6bec3 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:07:07 +0000
Subject: [PATCH 2/7] gzip support

---
 lib/wayback_machine_downloader.rb | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index ff4f6f9..0e9b15a 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -9,6 +9,8 @@ require 'json'
 require 'time'
 require 'concurrent-ruby'
 require 'logger'
+require 'zlib'
+require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -478,23 +480,33 @@ class WaybackMachineDownloader
     begin
       wayback_url = if @rewritten
         "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
-      else 
+      else
         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
       end
-      
+
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-      
+      request["Accept-Encoding"] = "gzip"
+
       response = connection.request(request)
-      
+
       case response
       when Net::HTTPSuccess
         File.open(file_path, "wb") do |file|
-          if block_given?
-            yield(response, file)
+          body = response.body
+          if response['content-encoding'] == 'gzip' && body && !body.empty?
+            begin
+              gz = Zlib::GzipReader.new(StringIO.new(body))
+              decompressed_body = gz.read
+              gz.close
+              file.write(decompressed_body)
+            rescue Zlib::GzipFile::Error => e
+              @logger.warn("Failed to decompress gzip response for #{file_url}: #{e.message}")
+              file.write(body)
+            end
           else
-            file.write(response.body)
+            file.write(body) if body
           end
         end
       when Net::HTTPRedirection

From febffe5de485943a7f91707cec2ecfb242620c1a Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:40:14 +0000
Subject: [PATCH 3/7] Added support for resuming incomplete downloads

---
 README.md                                     |  30 ++++
 bin/wayback_machine_downloader                |  10 +-
 lib/wayback_machine_downloader.rb             | 229 ++++++++++++++----
 lib/wayback_machine_downloader/archive_api.rb |   2 +-
 4 files changed, 224 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index bf9c8e2..bf42abe 100644
--- a/README.md
+++ b/README.md
@@ -217,6 +217,36 @@ ruby wayback_machine_downloader https://example.com --list
 ```
 It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or to connect to another application.
 
+### Job management
+The downloader automatically saves its progress (`.cdx.json` for the snapshot list, `.downloaded.txt` for completed files) in the output directory. If you run the same command again pointing to the same output directory, it will resume where it left off, skipping already downloaded files.
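+
+For example, re-running the same command after an interruption (e.g., Ctrl+C) should pick up from the saved state:
+```bash
+ruby wayback_machine_downloader https://example.com
+# ...interrupted; run the identical command again to resume...
+ruby wayback_machine_downloader https://example.com
+```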
+
+> [!NOTE]
+> Automatic resumption can be affected by changing the URL, the mode (such as `--all-timestamps`), the filters, or other options. If you want to ensure a clean start, use the `--reset` option.
+
+| Option | Description |
+|--------|-------------|
+| `--reset` | Delete the state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. |
+| `--keep` | Keep the state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. |
+
+**Example 1** - Restart a download job from the beginning:
+```bash
+ruby wayback_machine_downloader https://example.com --reset
+```
+This is useful if you suspect the state files are corrupted or want to ensure a completely fresh download process without deleting the files you already have.
+
+**Example 2** - Keep state files after download:
+```bash
+ruby wayback_machine_downloader https://example.com --keep
+```
+This can be useful for debugging or if you plan to extend the download later with different parameters (e.g., adding a `--to` timestamp) while leveraging the existing snapshot list.
+
 ## 🤝 Contributing
 1. Fork the repository
 2. Create a feature branch
diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 00b2fd9..9f63b65 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
   end
 
   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] = t
+    options[:rewritten] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
   end
 
   opts.on("-v", "--version", "Display version") do |t|
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 0e9b15a..824801a 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -120,10 +120,12 @@ class WaybackMachineDownloader
   RATE_LIMIT = 0.25 # Delay between requests in seconds
   CONNECTION_POOL_SIZE = 10
   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+  STATE_CDX_FILENAME = ".cdx.json"
+  STATE_DB_FILENAME = ".downloaded.txt"
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
   def initialize params
     validate_params(params)
@@ -139,10 +141,15 @@ class WaybackMachineDownloader
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
+    @reset = params[:reset]
+    @keep = params[:keep]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+    @db_mutex = Mutex.new
+
+    handle_reset
   end
 
   def backup_name
@@ -165,6 +172,24 @@ class WaybackMachineDownloader
     end
   end
 
+  def cdx_path
+    File.join(backup_path, STATE_CDX_FILENAME)
+  end
+
+  def db_path
+    File.join(backup_path, STATE_DB_FILENAME)
+  end
+
+  def handle_reset
+    if @reset
+      puts "Resetting download state..."
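+      # Remove the snapshot cache and the download log; rm_f is a no-op for missing files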
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+      puts "Removed state files: #{cdx_path}, #{db_path}"
+    end
+  end
+
   def match_only_filter file_url
     if @only_filter
       only_filter_regex = @only_filter.to_regex
@@ -192,10 +216,26 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
+    if File.exist?(cdx_path) && !@reset
+      puts "Loading snapshot list from #{cdx_path}"
+      begin
+        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+        puts
+        return Concurrent::Array.new(snapshot_list_to_consider)
+      rescue JSON::ParserError => e
+        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      rescue => e
+        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      end
+    end
+
     snapshot_list_to_consider = Concurrent::Array.new
     mutex = Mutex.new
-    
-    puts "Getting snapshot pages"
+
+    puts "Getting snapshot pages from Wayback Machine API..."
 
     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
@@ -211,12 +251,12 @@ class WaybackMachineDownloader
       page_index = 0
       batch_size = [@threads_count, 5].min
       continue_fetching = true
-      
+
       while continue_fetching && page_index < @maximum_pages
         # Determine the range of pages to fetch in this batch
         end_index = [page_index + batch_size, @maximum_pages].min
         current_batch = (page_index...end_index).to_a
-        
+
         # Create futures for concurrent API calls
         futures = current_batch.map do |page|
           Concurrent::Future.execute do
@@ -230,38 +270,47 @@ class WaybackMachineDownloader
            [page, result]
          end
        end
-        
+
        results = []
-        
+
        futures.each do |future|
          begin
            results << future.value
          rescue => e
            puts "\nError fetching page #{future}: #{e.message}"
          end
        end
-        
+
        # Sort results by page number to maintain order
        results.sort_by! { |page, _| page }
-        
+
        # Process results and check for empty pages
        results.each do |page, result|
          if result.empty?
            continue_fetching = false
          else
            mutex.synchronize do
              snapshot_list_to_consider.concat(result)
            end
          end
        end
-        
+
        page_index = end_index
-        
+
        sleep(RATE_LIMIT) if continue_fetching
      end
    end
 
-    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+    puts " found #{snapshot_list_to_consider.length} snapshots."
+
+    # Save the fetched list to the cache file
+    begin
+      FileUtils.mkdir_p(File.dirname(cdx_path))
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
+    rescue => e
+      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+    end
     puts
 
     snapshot_list_to_consider
@@ -350,32 +399,103 @@ class WaybackMachineDownloader
     puts "]"
   end
 
+  def load_downloaded_ids
+    downloaded_ids = Set.new
+    if File.exist?(db_path) && !@reset
+      puts "Loading list of already downloaded files from #{db_path}"
+      begin
+        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+      rescue => e
+        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+        downloaded_ids.clear
+      end
+    end
+    downloaded_ids
+  end
+
+  def append_to_db(file_id)
+    @db_mutex.synchronize do
+      begin
+        FileUtils.mkdir_p(File.dirname(db_path))
+        File.open(db_path, 'a') { |f| f.puts(file_id) }
+      rescue => e
+        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-    
-    if file_list_by_timestamp.empty?
-      puts "No files to download."
+
+    FileUtils.mkdir_p(backup_path)
+
+    # Load the list of files to potentially download
+    files_to_download = file_list_by_timestamp
+
+    if files_to_download.empty?
+      puts "No files found matching criteria."
+      cleanup
       return
     end
-    
-    total_files = file_list_by_timestamp.count
-    puts "#{total_files} files to download:"
-    
+
+    total_files = files_to_download.count
+    puts "#{total_files} files found matching criteria."
+
+    # Load IDs of already downloaded files
+    downloaded_ids = load_downloaded_ids
+    files_to_process = files_to_download.reject do |file_info|
+      downloaded_ids.include?(file_info[:file_id])
+    end
+
+    remaining_count = files_to_process.count
+    skipped_count = total_files - remaining_count
+
+    if skipped_count > 0
+      puts "Found #{skipped_count} previously downloaded files, skipping them."
+    end
+
+    if remaining_count == 0
+      puts "All matching files have already been downloaded."
+      cleanup
+      return
+    end
+
+    puts "#{remaining_count} files to download:"
+
     @processed_file_count = 0
+    @total_to_download = remaining_count
     @download_mutex = Mutex.new
-    
+
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-    
-    file_list_by_timestamp.each do |file_remote_info|
+
+    files_to_process.each do |file_remote_info|
       pool.post do
-        @connection_pool.with_connection do |connection|
-          result = download_file(file_remote_info, connection)
-          @download_mutex.synchronize do
-            @processed_file_count += 1
-            puts result if result
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # for now, assume success if there was no exception and the message doesn't indicate an error or skip
+            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust the progress message to reflect the number of remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
           end
+          # append to the DB only after a successful download, outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
         end
         sleep(RATE_LIMIT)
       end
@@ -384,8 +504,9 @@ class WaybackMachineDownloader
     pool.shutdown
     pool.wait_for_termination
 
     end_time = Time.now
-    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+    puts "Results saved in #{backup_path}"
 
     cleanup
   end
@@ -417,9 +538,10 @@ class WaybackMachineDownloader
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-    file_path_elements = file_id.split('/')
+    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+    file_path_elements = original_file_id.split('/')
 
-    if file_id == ""
+    if original_file_id == ""
       dir_path = backup_path
       file_path = backup_path + 'index.html'
     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -433,20 +555,23 @@ class WaybackMachineDownloader
       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.exist? file_path
-      begin
-        structure_dir_path dir_path
-        download_with_retry(file_path, file_url, file_timestamp, http)
-        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
-      rescue StandardError => e
-        msg = "#{file_url} # #{e}"
-        if not @all and File.exist?(file_path) and File.size(file_path) == 0
-          File.delete(file_path)
-          msg += "\n#{file_path} was empty and was removed."
-        end
-        msg
+
+    # check existence *before* the download attempt
+    # this handles cases where a file was created manually or by a previous partial run without a .db entry
+    if File.exist? file_path
+      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+    end
+
+    begin
+      structure_dir_path dir_path
+      download_with_retry(file_path, file_url, file_timestamp, http)
+      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+    rescue StandardError => e
+      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+        File.delete(file_path)
+        msg += "\n#{file_path} was empty and was removed."
       end
-    else
-      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      msg
     end
   end
@@ -523,7 +648,7 @@ class WaybackMachineDownloader
       else
         raise "HTTP Error: #{response.code} #{response.message}"
       end
-      
+
     rescue StandardError => e
       if retries < MAX_RETRIES
         retries += 1
@@ -539,12 +664,25 @@ class WaybackMachineDownloader
 
   def cleanup
     @connection_pool.shutdown
-    
+
     if @failed_downloads.any?
+      @logger.error("Download completed with errors.")
       @logger.error("Failed downloads summary:")
       @failed_downloads.each do |failure|
        @logger.error("  #{failure[:url]} - #{failure[:error]}")
       end
+      unless @reset
+        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+        return
+      end
+    end
+
+    if !@keep || @reset
+      puts "Cleaning up state files..."
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+    elsif @keep
+      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
     end
   end
 end
diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb
index 45eb799..ab99758 100644
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -4,7 +4,7 @@ require 'uri'
 module ArchiveAPI
 
   def get_raw_list_from_api(url, page_index, http)
-    request_url = URI("https://web.archive.org/cdx/search/xd")
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 

From 31d51728aff2f584a4e9544d8339db58a85a0021 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 14:07:05 +0000
Subject: [PATCH 4/7] Bump version

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 824801a..1d99769 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.3"
+  VERSION = "2.3.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2

From 4db13a7792a5161987d59fc747e521e7805daac6 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:01:29 +0000
Subject: [PATCH 5/7] Fix --all-timestamps

We were accidentally removing the timestamp prefix from `file_id`,
rendering that option useless in 2.3.4. It should work again now.
This will fix #4
---
 lib/wayback_machine_downloader.rb | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 1d99769..0118f7c 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -538,10 +538,9 @@ class WaybackMachineDownloader
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
-    file_path_elements = original_file_id.split('/')
+    file_path_elements = file_id.split('/')
 
-    if original_file_id == ""
+    if file_id == ""
       dir_path = backup_path
       file_path = backup_path + 'index.html'
     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'

From 787bc2e535e39eeab1a263fa83f9045d7ae7cdf1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:05:21 +0000
Subject: [PATCH 6/7] Added missing configs

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bf42abe..7da8763 100644
--- a/README.md
+++ b/README.md
@@ -81,14 +81,15 @@ services:
 
 ## ⚙️ Configuration
 There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
-
 ```ruby
 DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
-MAX_RETRIES = 3 # Failed request retries
-RETRY_DELAY = 2 # Wait between retries
-RATE_LIMIT = 0.25 # Throttle between requests
-CONNECTION_POOL_SIZE = 10 # No. of simultaneous connections
-MEMORY_BUFFER_SIZE = 16384 # Size of download buffer
+MAX_RETRIES = 3 # Number of times to retry failed requests
+RETRY_DELAY = 2 # Wait time between retries (seconds)
+RATE_LIMIT = 0.25 # Throttle between requests (seconds)
+CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
+MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
+STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
+STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
 ```
 
 ## 🛠️ Advanced usage

From 917f4f879858307fda7410737b2b80bb1d9c11f1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:05:30 +0000
Subject: [PATCH 7/7] Bumping version

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 0118f7c..beba6c5 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.4"
+  VERSION = "2.3.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
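
For reference, the resume machinery introduced in this series reduces to two pieces of state: a cached snapshot list (`.cdx.json`) and an append-only log of completed file IDs (`.downloaded.txt`). A minimal, self-contained sketch of that bookkeeping protocol, assuming a hypothetical path and file IDs (these are illustrative placeholders, not the gem's API):

```ruby
require 'set'
require 'fileutils'

# Illustrative path; the gem derives the real one from the backup
# directory and the STATE_DB_FILENAME constant.
DB_PATH = "websites/example.com/.downloaded.txt"

# On startup, load the append-only log of completed file IDs (one per line).
def load_downloaded_ids(db_path)
  return Set.new unless File.exist?(db_path)
  File.foreach(db_path).map(&:strip).to_set
end

# Record a completed download. Appending one line at a time keeps the
# log usable even if the process is interrupted mid-run.
def mark_downloaded(db_path, file_id)
  FileUtils.mkdir_p(File.dirname(db_path))
  File.open(db_path, 'a') { |f| f.puts(file_id) }
end

done = load_downloaded_ids(DB_PATH)
queue = ["index.html", "css/style.css"].reject { |id| done.include?(id) }
queue.each do |id|
  # ...the actual download would happen here...
  mark_downloaded(DB_PATH, id)
end
```

Because each ID is appended only after its download succeeds, a crash can at worst lose the final line of the log, so a rerun only re-fetches files whose IDs never reached disk.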