#!/usr/bin/env ruby

# Command-line entry point for wayback_machine_downloader.
#
# Parses the command-line switches into an options hash, treats the last
# remaining argument as the base URL, and hands everything to
# WaybackMachineDownloader, which either lists or downloads the files.

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."

  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do
    options[:list] = true
  end

  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do
    options[:rewritten] = true
  end

  # Note: the --local switch is stored under the :rewrite key.
  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do
    options[:rewrite] = true
  end

  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do
    options[:reset] = true
  end

  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do
    options[:keep] = true
  end

  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
    options[:max_retries] = t
  end

  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do
    options[:recursive_subdomains] = true
  end

  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
    options[:subdomain_depth] = t
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end

# parse! strips the recognised switches out of ARGV, leaving only positional
# arguments. Unknown or malformed switches raise OptionParser::ParseError;
# report them briefly instead of dumping a backtrace on the user.
begin
  option_parser.parse!
rescue OptionParser::ParseError => e
  warn e.message
  warn "Run `wayback_machine_downloader --help` for more help."
  exit 1
end

# The last remaining argument is taken as the website to back up.
if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end