Merge remote-tracking branch 'upstream/master'

commit 42e6d62284

README.md | 36
@@ -108,14 +108,15 @@ docker compose run --rm wayback_machine_downloader https://example.com

## ⚙️ Configuration

There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:

```ruby
 DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
-MAX_RETRIES = 3 # Failed request retries
-RETRY_DELAY = 2 # Wait between retries
-RATE_LIMIT = 0.25 # Throttle between requests
-CONNECTION_POOL_SIZE = 10 # No. of simultaneous connections
-MEMORY_BUFFER_SIZE = 16384 # Size of download buffer
+MAX_RETRIES = 3 # Number of times to retry failed requests
+RETRY_DELAY = 2 # Wait time between retries (seconds)
+RATE_LIMIT = 0.25 # Throttle between requests (seconds)
+CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
+MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
+STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
+STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
```
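For large mirrors you may want a more aggressive profile: a bigger connection pool and a shorter throttle. The values below are purely illustrative, not recommendations; shorter delays and larger pools increase the chance of being rate-limited by web.archive.org:

```ruby
# Hypothetical edits to wayback_machine_downloader.rb - illustrative values only.
CONNECTION_POOL_SIZE = 20 # more simultaneous connections than the default 10
RATE_LIMIT = 0.1          # shorter pause between requests (seconds)
MAX_RETRIES = 5           # retry flaky snapshots a couple more times
```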
## 🛠️ Advanced usage
@@ -244,6 +245,29 @@ ruby wayback_machine_downloader https://example.com --list
```

This only lists the files that would be downloaded, together with their snapshot timestamps and URLs, in JSON format; nothing is downloaded. It is useful for debugging or for feeding the list into another application, for example with a small script like the one below.
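This is only a sketch of such post-processing: the field names used below (`timestamp`, `file_url`) are assumptions about the `--list` output, so check them against what your version actually prints.

```ruby
# Sketch: read JSON saved via `... --list > list.json` and print one line per snapshot.
# NOTE: "timestamp" and "file_url" are assumed field names - verify against real output.
require 'json'

listing = JSON.parse(File.read('list.json'))
listing.each do |entry|
  puts "#{entry['timestamp']}  #{entry['file_url']}"
end
```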
### Job management

The downloader automatically saves its progress in the output directory (`.cdx.json` for the snapshot list, `.downloaded.txt` for completed files). If you run the same command again against the same output directory, it resumes where it left off and skips files that were already downloaded, roughly as sketched below.
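Concretely, `.cdx.json` caches the snapshot list so the CDX API does not have to be queried again, and `.downloaded.txt` records one file ID per line as downloads complete. A stripped-down sketch of the skip logic, with simplified names and hypothetical paths rather than the library's actual internals:

```ruby
require 'set'

# Hypothetical output directory; the real state files live wherever you downloaded to.
state_file = File.join('websites', 'example.com', '.downloaded.txt')

downloaded = Set.new
downloaded.merge(File.readlines(state_file, chomp: true)) if File.exist?(state_file)

# Pretend snapshot list; the real one comes from the cached .cdx.json.
all_files = [
  { file_id: 'index.html', file_url: 'http://example.com/' },
  { file_id: 'about/index.html', file_url: 'http://example.com/about' }
]

pending = all_files.reject { |f| downloaded.include?(f[:file_id]) }
puts "#{pending.size} of #{all_files.size} files still to download"
```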
> [!NOTE]
> Automatic resumption can be affected by changes to the URL, the mode (such as `--all-timestamps`), the filtering options, or other flags. If you want to ensure a clean start, use the `--reset` option.

| Option | Description |
|--------|-------------|
| `--reset` | Delete state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. |
| `--keep` | Keep state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. |

**Example 1** - Restart a download job from the beginning:

```bash
ruby wayback_machine_downloader https://example.com --reset
```

This is useful if you suspect the state files are corrupted, or if you want a completely fresh download run without deleting the files you already have.

**Example 2** - Keep state files after download:

```bash
ruby wayback_machine_downloader https://example.com --keep
```

This can be useful for debugging, or if you plan to extend the download later with different parameters (e.g., adding a `--to` timestamp) while reusing the existing snapshot list.

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch

@@ -59,7 +59,15 @@ option_parser = OptionParser.new do |opts|
   end

   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] = t
+    options[:rewritten] = true
   end

+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
+  end
+
   opts.on("-v", "--version", "Display version") do |t|

@@ -9,6 +9,8 @@ require 'json'
 require 'time'
 require 'concurrent-ruby'
 require 'logger'
+require 'zlib'
+require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
@@ -111,17 +113,19 @@ class WaybackMachineDownloader

   include ArchiveAPI

-  VERSION = "2.3.3"
+  VERSION = "2.3.5"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
   RATE_LIMIT = 0.25 # Delay between requests in seconds
   CONNECTION_POOL_SIZE = 10
   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+  STATE_CDX_FILENAME = ".cdx.json"
+  STATE_DB_FILENAME = ".downloaded.txt"

   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep

   def initialize params
     validate_params(params)
@@ -137,10 +141,15 @@ class WaybackMachineDownloader
     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = [params[:threads_count].to_i, 1].max
     @rewritten = params[:rewritten]
+    @reset = params[:reset]
+    @keep = params[:keep]
     @timeout = params[:timeout] || DEFAULT_TIMEOUT
     @logger = setup_logger
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+    @db_mutex = Mutex.new
+
+    handle_reset
   end

   def backup_name
@@ -163,6 +172,23 @@ class WaybackMachineDownloader
     end
   end

+  def cdx_path
+    File.join(backup_path, STATE_CDX_FILENAME)
+  end
+
+  def db_path
+    File.join(backup_path, STATE_DB_FILENAME)
+  end
+
+  def handle_reset
+    if @reset
+      puts "Resetting download state..."
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+      puts "Removed state files: #{cdx_path}, #{db_path}"
+    end
+  end
+
   def match_only_filter file_url
     if @only_filter
       only_filter_regex = @only_filter.to_regex
@@ -190,10 +216,26 @@ class WaybackMachineDownloader
   end

   def get_all_snapshots_to_consider
+    if File.exist?(cdx_path) && !@reset
+      puts "Loading snapshot list from #{cdx_path}"
+      begin
+        snapshot_list_to_consider = JSON.parse(File.read(cdx_path))
+        puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache."
+        puts
+        return Concurrent::Array.new(snapshot_list_to_consider)
+      rescue JSON::ParserError => e
+        puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      rescue => e
+        puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..."
+        FileUtils.rm_f(cdx_path)
+      end
+    end
+
     snapshot_list_to_consider = Concurrent::Array.new
     mutex = Mutex.new

-    puts "Getting snapshot pages"
+    puts "Getting snapshot pages from Wayback Machine API..."

     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
@@ -258,7 +300,16 @@ class WaybackMachineDownloader
       end
     end

-    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+    puts " found #{snapshot_list_to_consider.length} snapshots."
+
+    # Save the fetched list to the cache file
+    begin
+      FileUtils.mkdir_p(File.dirname(cdx_path))
+      File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON
+      puts "Saved snapshot list to #{cdx_path}"
+    rescue => e
+      puts "Error saving snapshot cache to #{cdx_path}: #{e.message}"
+    end
     puts

     snapshot_list_to_consider
@@ -348,31 +399,102 @@ class WaybackMachineDownloader
     puts "]"
   end

+  def load_downloaded_ids
+    downloaded_ids = Set.new
+    if File.exist?(db_path) && !@reset
+      puts "Loading list of already downloaded files from #{db_path}"
+      begin
+        File.foreach(db_path) { |line| downloaded_ids.add(line.strip) }
+      rescue => e
+        puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded."
+        downloaded_ids.clear
+      end
+    end
+    downloaded_ids
+  end
+
+  def append_to_db(file_id)
+    @db_mutex.synchronize do
+      begin
+        FileUtils.mkdir_p(File.dirname(db_path))
+        File.open(db_path, 'a') { |f| f.puts(file_id) }
+      rescue => e
+        @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
+      end
+    end
+  end
+
   def download_files
     start_time = Time.now
     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."

-    if file_list_by_timestamp.empty?
-      puts "No files to download."
+    FileUtils.mkdir_p(backup_path)
+
+    # Load the list of files to potentially download
+    files_to_download = file_list_by_timestamp
+
+    if files_to_download.empty?
+      puts "No files found matching criteria."
+      cleanup
       return
     end

-    total_files = file_list_by_timestamp.count
-    puts "#{total_files} files to download:"
+    total_files = files_to_download.count
+    puts "#{total_files} files found matching criteria."
+
+    # Load IDs of already downloaded files
+    downloaded_ids = load_downloaded_ids
+    files_to_process = files_to_download.reject do |file_info|
+      downloaded_ids.include?(file_info[:file_id])
+    end
+
+    remaining_count = files_to_process.count
+    skipped_count = total_files - remaining_count
+
+    if skipped_count > 0
+      puts "Found #{skipped_count} previously downloaded files, skipping them."
+    end
+
+    if remaining_count == 0
+      puts "All matching files have already been downloaded."
+      cleanup
+      return
+    end
+
+    puts "#{remaining_count} files to download:"

     @processed_file_count = 0
+    @total_to_download = remaining_count
     @download_mutex = Mutex.new

     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
     pool = Concurrent::FixedThreadPool.new(thread_count)

-    file_list_by_timestamp.each do |file_remote_info|
+    files_to_process.each do |file_remote_info|
       pool.post do
+        download_success = false
         begin
           @connection_pool.with_connection do |connection|
-            result = download_file(file_remote_info, connection)
+            result_message = download_file(file_remote_info, connection)
+            # for now, assume success if no exception and message doesn't indicate error/skip
+            if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+              download_success = true
+            end
             @download_mutex.synchronize do
               @processed_file_count += 1
+              # adjust progress message to reflect remaining files
+              progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
+              puts progress_message if progress_message
             end
           end
+          # append to DB only after a successful download, outside the connection block
+          if download_success
+            append_to_db(file_remote_info[:file_id])
+          end
         rescue => e
           @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
           @download_mutex.synchronize do
             @processed_file_count += 1
             puts result if result
           end
         end
         sleep(RATE_LIMIT)
@@ -383,7 +505,8 @@ class WaybackMachineDownloader
     pool.wait_for_termination

     end_time = Time.now
-    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+    puts "Results saved in #{backup_path}"
     cleanup
   end
@@ -431,22 +554,25 @@ class WaybackMachineDownloader
       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
     end
-    unless File.exist? file_path

+    # check existence *before* download attempt
+    # this handles cases where a file was created manually or by a previous partial run without a .db entry
+    if File.exist? file_path
+      return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})"
+    end
+
     begin
       structure_dir_path dir_path
       download_with_retry(file_path, file_url, file_timestamp, http)
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
     rescue StandardError => e
-      msg = "#{file_url} # #{e}"
+      msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
       if not @all and File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
       msg
     end
-    else
-      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
-    end
   end

   def file_queue
@@ -485,16 +611,26 @@ class WaybackMachineDownloader
       request = Net::HTTP::Get.new(URI(wayback_url))
       request["Connection"] = "keep-alive"
       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
+      request["Accept-Encoding"] = "gzip, deflate"

       response = connection.request(request)

       case response
       when Net::HTTPSuccess
         File.open(file_path, "wb") do |file|
-          if block_given?
-            yield(response, file)
+          body = response.body
+          if response['content-encoding'] == 'gzip' && body && !body.empty?
+            begin
+              gz = Zlib::GzipReader.new(StringIO.new(body))
+              decompressed_body = gz.read
+              gz.close
+              file.write(decompressed_body)
+            rescue Zlib::GzipFile::Error => e
+              @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+              file.write(body)
+            end
           else
-            file.write(response.body)
+            file.write(body) if body
           end
         end
       when Net::HTTPRedirection
@@ -529,10 +665,23 @@ class WaybackMachineDownloader
     @connection_pool.shutdown

     if @failed_downloads.any?
       @logger.error("Download completed with errors.")
       @logger.error("Failed downloads summary:")
       @failed_downloads.each do |failure|
         @logger.error("  #{failure[:url]} - #{failure[:error]}")
       end
+      unless @reset
+        puts "State files kept due to download errors: #{cdx_path}, #{db_path}"
+        return
+      end
     end

+    if !@keep || @reset
+      puts "Cleaning up state files..." unless @keep && !@reset
+      FileUtils.rm_f(cdx_path)
+      FileUtils.rm_f(db_path)
+    elsif @keep
+      puts "Keeping state files as requested: #{cdx_path}, #{db_path}"
+    end
   end
 end

@@ -4,7 +4,7 @@ require 'uri'
 module ArchiveAPI

   def get_raw_list_from_api(url, page_index, http)
-    request_url = URI("https://web.archive.org/cdx/search/xd")
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)