From 576298dca8f3312a031b0009d38f7f7b7b10b2a1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:05:09 +0000
Subject: [PATCH 1/7] License fix

---
 LICENSE.md => LICENSE | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename LICENSE.md => LICENSE (100%)

diff --git a/LICENSE.md b/LICENSE
similarity index 100%
rename from LICENSE.md
rename to LICENSE

From 27dd619aa4a025fe7853a8a179e74f92b8b6bec3 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:07:07 +0000
Subject: [PATCH 2/7] gzip support

---
 lib/wayback_machine_downloader.rb | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index ff4f6f9..0e9b15a 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -9,6 +9,8 @@ require 'json'
 require 'time'
 require 'concurrent-ruby'
 require 'logger'
+require 'zlib'
+require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -478,23 +480,33 @@ class WaybackMachineDownloader
     begin
       wayback_url = if @rewritten
         "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
-      else 
+      else
         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
       end
-      
+
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
-      
+      request["Accept-Encoding"] = "gzip"
+
       response = connection.request(request)
-      
+
       case response
       when Net::HTTPSuccess
         File.open(file_path, "wb") do |file|
-          if block_given?
-            yield(response, file)
+          body = response.body
+          if response['content-encoding'] == 'gzip' && body && !body.empty?
+            begin
+              gz = Zlib::GzipReader.new(StringIO.new(body))
+              decompressed_body = gz.read
+              gz.close
+              file.write(decompressed_body)
+            rescue Zlib::GzipFile::Error => e
+              @logger.warn("Failed to decompress gzip response for #{file_url}: #{e.message}")
+              file.write(body)
+            end
           else
-            file.write(response.body)
+            file.write(body) if body
           end
         end
       when Net::HTTPRedirection

From febffe5de485943a7f91707cec2ecfb242620c1a Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:40:14 +0000
Subject: [PATCH 3/7] Added support for resuming incomplete downloads

---
 README.md                                     |  30 ++++
 bin/wayback_machine_downloader                |  10 +-
 lib/wayback_machine_downloader.rb             | 229 ++++++++++++++----
 lib/wayback_machine_downloader/archive_api.rb |   2 +-
 4 files changed, 224 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index bf9c8e2..bf42abe 100644
--- a/README.md
+++ b/README.md
@@ -217,6 +217,36 @@ ruby wayback_machine_downloader https://example.com --list
 ```
 It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or to connect to another application.
 
+### Job management
+The downloader automatically saves its progress (`.cdx.json` for the snapshot list, `.downloaded.txt` for completed files) in the output directory. If you run the same command again pointing to the same output directory, it will resume where it left off, skipping already downloaded files.
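+
+For example, re-running the same command after an interruption (e.g., Ctrl+C) should pick up from the saved state:
+```bash
+ruby wayback_machine_downloader https://example.com
+# ...interrupted; run the identical command again to resume...
+ruby wayback_machine_downloader https://example.com
+```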
+
+> [!NOTE]
+> Automatic resumption can be affected by changing the URL, the mode (such as `--all-timestamps`), the filters, or other options. If you want to ensure a clean start, use the `--reset` option.
+
+| Option | Description |
+|--------|-------------|
+| `--reset` | Delete the state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. |
+| `--keep` | Keep the state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. |
+
+**Example 1** - Restart a download job from the beginning:
+```bash
+ruby wayback_machine_downloader https://example.com --reset
+```
+This is useful if you suspect the state files are corrupted or want to ensure a completely fresh download process without deleting the files you already have.
+
+**Example 2** - Keep state files after download:
+```bash
+ruby wayback_machine_downloader https://example.com --keep
+```
+This can be useful for debugging or if you plan to extend the download later with different parameters (e.g., adding a `--to` timestamp) while leveraging the existing snapshot list.
+
 ## 🤝 Contributing
 1. Fork the repository
 2. Create a feature branch
diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 00b2fd9..9f63b65 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
   end
 
   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] = t
+    options[:rewritten] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
   end
 
   opts.on("-v", "--version", "Display version") do |t|
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 0e9b15a..824801a 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -120,10 +120,12 @@ class WaybackMachineDownloader
   RATE_LIMIT = 0.25 # Delay between requests in seconds
   CONNECTION_POOL_SIZE = 10
   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+  STATE_CDX_FILENAME = ".cdx.json"
+  STATE_DB_FILENAME = ".downloaded.txt"
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
 
   def initialize params
     validate_params(params)
@@ -139,10 +141,15 @@ class WaybackMachineDownloader
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
+    @reset = params[:reset]
+    @keep = params[:keep]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+    @db_mutex = Mutex.new
+
+    handle_reset
   end
 
   def backup_name
@@ -165,6 +172,24 @@ class WaybackMachineDownloader
     end
   end
 
+  def cdx_path
+    File.join(backup_path, STATE_CDX_FILENAME)
+  end
+
+  def db_path
+    File.join(backup_path, STATE_DB_FILENAME)
+  end
+
+  def handle_reset
+    if @reset
+      puts "Resetting download state..."
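+      # Remove the snapshot cache and the download log; rm_f is a no-op for missing files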
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+      puts "Removed state files: #{cdx_path}, #{db_path}"
+    end
+  end
+
   def match_only_filter file_url
     if @only_filter
       only_filter_regex = @only_filter.to_regex
@@ -192,10 +216,26 @@ class WaybackMachineDownloader
   end
 
   def get_all_snapshots_to_consider
+    if File.exist?(cdx_path) && !@reset
+      puts "Loading snapshot list from #{cdx_path}"
+      begin
+        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+        puts
+        return Concurrent::Array.new(snapshot_list_to_consider)
+      rescue JSON::ParserError => e
+        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      rescue => e
+        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      end
+    end
+
     snapshot_list_to_consider = Concurrent::Array.new
     mutex = Mutex.new
-    
-    puts "Getting snapshot pages"
+
+    puts "Getting snapshot pages from Wayback Machine API..."
 
     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
@@ -211,12 +251,12 @@ class WaybackMachineDownloader
       page_index = 0
       batch_size = [@threads_count, 5].min
       continue_fetching = true
-      
+
       while continue_fetching && page_index < @maximum_pages
         # Determine the range of pages to fetch in this batch
         end_index = [page_index + batch_size, @maximum_pages].min
         current_batch = (page_index...end_index).to_a
-        
+
         # Create futures for concurrent API calls
         futures = current_batch.map do |page|
           Concurrent::Future.execute do
@@ -230,38 +270,47 @@ class WaybackMachineDownloader
            [page, result]
          end
        end
-        
+
        results = []
-        
+
        futures.each do |future|
          begin
            results << future.value
          rescue => e
            puts "\nError fetching page #{future}: #{e.message}"
          end
        end
-        
+
        # Sort results by page number to maintain order
        results.sort_by! { |page, _| page }
-        
+
        # Process results and check for empty pages
        results.each do |page, result|
          if result.empty?
            continue_fetching = false
          else
            mutex.synchronize do
              snapshot_list_to_consider.concat(result)
            end
          end
        end
-        
+
        page_index = end_index
-        
+
        sleep(RATE_LIMIT) if continue_fetching
      end
    end
 
-    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+    puts " found #{snapshot_list_to_consider.length} snapshots."
+
+    # Save the fetched list to the cache file
+    begin
+      FileUtils.mkdir_p(File.dirname(cdx_path))
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
+    rescue => e
+      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+    end
     puts
 
     snapshot_list_to_consider
@@ -350,32 +399,103 @@ class WaybackMachineDownloader
     puts "]"
   end
 
+  def load_downloaded_ids
+    downloaded_ids = Set.new
+    if File.exist?(db_path) && !@reset
+      puts "Loading list of already downloaded files from #{db_path}"
+      begin
+        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+      rescue => e
+        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+        downloaded_ids.clear
+      end
+    end
+    downloaded_ids
+  end
+
+  def append_to_db(file_id)
+    @db_mutex.synchronize do
+      begin
+        FileUtils.mkdir_p(File.dirname(db_path))
+        File.open(db_path, 'a') { |f| f.puts(file_id) }
+      rescue => e
+        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
-    
-    if file_list_by_timestamp.empty?
-      puts "No files to download."
+
+    FileUtils.mkdir_p(backup_path)
+
+    # Load the list of files to potentially download
+    files_to_download = file_list_by_timestamp
+
+    if files_to_download.empty?
+      puts "No files found matching criteria."
+      cleanup
       return
     end
-    
-    total_files = file_list_by_timestamp.count
-    puts "#{total_files} files to download:"
-    
+
+    total_files = files_to_download.count
+    puts "#{total_files} files found matching criteria."
+
+    # Load IDs of already downloaded files
+    downloaded_ids = load_downloaded_ids
+    files_to_process = files_to_download.reject do |file_info|
+      downloaded_ids.include?(file_info[:file_id])
+    end
+
+    remaining_count = files_to_process.count
+    skipped_count = total_files - remaining_count
+
+    if skipped_count > 0
+      puts "Found #{skipped_count} previously downloaded files, skipping them."
+    end
+
+    if remaining_count == 0
+      puts "All matching files have already been downloaded."
+      cleanup
+      return
+    end
+
+    puts "#{remaining_count} files to download:"
+
     @processed_file_count = 0
+    @total_to_download = remaining_count
     @download_mutex = Mutex.new
-    
+
     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)
-    
-    file_list_by_timestamp.each do |file_remote_info|
+
+    files_to_process.each do |file_remote_info|
       pool.post do
-        @connection_pool.with_connection do |connection|
-          result = download_file(file_remote_info, connection)
-          @download_mutex.synchronize do
-            @processed_file_count += 1
-            puts result if result
+        download_success = false
+        begin
+          @connection_pool.with_connection do |connection|
+            result_message = download_file(file_remote_info, connection)
+            # for now, assume success if there was no exception and the message doesn't indicate an error or skip
+            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+              download_success = true
+            end
+            @download_mutex.synchronize do
+              @processed_file_count += 1
+              # adjust the progress message to reflect the number of remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
+            end
           end
+          # append to the DB only after a successful download, outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
+        rescue => e
+          @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+          end
         end
         sleep(RATE_LIMIT)
       end
@@ -384,8 +504,9 @@ class WaybackMachineDownloader
     pool.shutdown
     pool.wait_for_termination
 
     end_time = Time.now
-    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+    puts "Results saved in #{backup_path}"
 
     cleanup
   end
@@ -417,9 +538,10 @@ class WaybackMachineDownloader
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-    file_path_elements = file_id.split('/')
+    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
+    file_path_elements = original_file_id.split('/')
 
-    if file_id == ""
+    if original_file_id == ""
       dir_path = backup_path
       file_path = backup_path + 'index.html'
     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
@@ -433,20 +555,23 @@ class WaybackMachineDownloader
       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.exist? file_path
-      begin
-        structure_dir_path dir_path
-        download_with_retry(file_path, file_url, file_timestamp, http)
-        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
-      rescue StandardError => e
-        msg = "#{file_url} # #{e}"
-        if not @all and File.exist?(file_path) and File.size(file_path) == 0
-          File.delete(file_path)
-          msg += "\n#{file_path} was empty and was removed."
-        end
-        msg
+
+    # check existence *before* the download attempt
+    # this handles cases where a file was created manually or by a previous partial run without a .db entry
+    if File.exist? file_path
+      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+    end
+
+    begin
+      structure_dir_path dir_path
+      download_with_retry(file_path, file_url, file_timestamp, http)
+      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+    rescue StandardError => e
+      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
+      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+        File.delete(file_path)
+        msg += "\n#{file_path} was empty and was removed."
       end
-    else
-      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      msg
     end
   end
@@ -523,7 +648,7 @@ class WaybackMachineDownloader
       else
         raise "HTTP Error: #{response.code} #{response.message}"
       end
-      
+
     rescue StandardError => e
       if retries < MAX_RETRIES
         retries += 1
@@ -539,12 +664,25 @@ class WaybackMachineDownloader
 
   def cleanup
     @connection_pool.shutdown
-    
+
     if @failed_downloads.any?
+      @logger.error("Download completed with errors.")
       @logger.error("Failed downloads summary:")
       @failed_downloads.each do |failure|
        @logger.error("  #{failure[:url]} - #{failure[:error]}")
       end
+      unless @reset
+        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+        return
+      end
+    end
+
+    if !@keep || @reset
+      puts "Cleaning up state files..."
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+    elsif @keep
+      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
     end
   end
 end
diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb
index 45eb799..ab99758 100644
--- a/lib/wayback_machine_downloader/archive_api.rb
+++ b/lib/wayback_machine_downloader/archive_api.rb
@@ -4,7 +4,7 @@ require 'uri'
 module ArchiveAPI
 
   def get_raw_list_from_api(url, page_index, http)
-    request_url = URI("https://web.archive.org/cdx/search/xd")
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
 

From 31d51728aff2f584a4e9544d8339db58a85a0021 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Sat, 19 Apr 2025 14:07:05 +0000
Subject: [PATCH 4/7] Bump version

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 824801a..1d99769 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.3"
+  VERSION = "2.3.4"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2

From 4db13a7792a5161987d59fc747e521e7805daac6 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:01:29 +0000
Subject: [PATCH 5/7] Fix --all-timestamps

We were accidentally removing the timestamp prefix from `file_id`,
rendering that option useless in 2.3.4. It should work again now.
This will fix #4
---
 lib/wayback_machine_downloader.rb | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 1d99769..0118f7c 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -538,10 +538,9 @@ class WaybackMachineDownloader
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
-    file_path_elements = original_file_id.split('/')
+    file_path_elements = file_id.split('/')
 
-    if original_file_id == ""
+    if file_id == ""
       dir_path = backup_path
       file_path = backup_path + 'index.html'
     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'

From 787bc2e535e39eeab1a263fa83f9045d7ae7cdf1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:05:21 +0000
Subject: [PATCH 6/7] Added missing configs

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bf42abe..7da8763 100644
--- a/README.md
+++ b/README.md
@@ -81,14 +81,15 @@ services:
 
 ## ⚙️ Configuration
 There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
-
 ```ruby
 DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
-MAX_RETRIES = 3 # Failed request retries
-RETRY_DELAY = 2 # Wait between retries
-RATE_LIMIT = 0.25 # Throttle between requests
-CONNECTION_POOL_SIZE = 10 # No. of simultaneous connections
-MEMORY_BUFFER_SIZE = 16384 # Size of download buffer
+MAX_RETRIES = 3 # Number of times to retry failed requests
+RETRY_DELAY = 2 # Wait time between retries (seconds)
+RATE_LIMIT = 0.25 # Throttle between requests (seconds)
+CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
+MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
+STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
+STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
 ```
 
 ## 🛠️ Advanced usage

From 917f4f879858307fda7410737b2b80bb1d9c11f1 Mon Sep 17 00:00:00 2001
From: Felipe <41008398+StrawberryMaster@users.noreply.github.com>
Date: Wed, 30 Apr 2025 13:05:30 +0000
Subject: [PATCH 7/7] Bumping version

---
 lib/wayback_machine_downloader.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 0118f7c..beba6c5 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "2.3.4"
+  VERSION = "2.3.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
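
For reference, the resume machinery introduced in this series reduces to two pieces of state: a cached snapshot list (`.cdx.json`) and an append-only log of completed file IDs (`.downloaded.txt`). A minimal, self-contained sketch of that bookkeeping protocol, assuming a hypothetical path and file IDs (these are illustrative placeholders, not the gem's API):

```ruby
require 'set'
require 'fileutils'

# Illustrative path; the gem derives the real one from the backup
# directory and the STATE_DB_FILENAME constant.
DB_PATH = "websites/example.com/.downloaded.txt"

# On startup, load the append-only log of completed file IDs (one per line).
def load_downloaded_ids(db_path)
  return Set.new unless File.exist?(db_path)
  File.foreach(db_path).map(&:strip).to_set
end

# Record a completed download. Appending one line at a time keeps the
# log usable even if the process is interrupted mid-run.
def mark_downloaded(db_path, file_id)
  FileUtils.mkdir_p(File.dirname(db_path))
  File.open(db_path, 'a') { |f| f.puts(file_id) }
end

done = load_downloaded_ids(DB_PATH)
queue = ["index.html", "css/style.css"].reject { |id| done.include?(id) }
queue.each do |id|
  # ...the actual download would happen here...
  mark_downloaded(DB_PATH, id)
end
```

Because each ID is appended only after its download succeeds, a crash can at worst lose the final line of the log, so a rerun only re-fetches files whose IDs never reached disk.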