diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader
index 560e92e..f990789 100755
--- a/bin/wayback_machine_downloader
+++ b/bin/wayback_machine_downloader
@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb
index 86d950d..9082307 100644
--- a/lib/wayback_machine_downloader.rb
+++ b/lib/wayback_machine_downloader.rb
@@ -14,6 +14,7 @@ require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/subdom_processor'
 
 class ConnectionPool
   MAX_AGE = 300
@@ -112,6 +113,7 @@ end
 class WaybackMachineDownloader
 
   include ArchiveAPI
+  include SubdomainProcessor
 
   VERSION = "2.3.10"
   DEFAULT_TIMEOUT = 30
@@ -153,6 +155,8 @@ class WaybackMachineDownloader
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
     @rewrite = params[:rewrite] || false
+    @recursive_subdomains = params[:recursive_subdomains] || false
+    @subdomain_depth = params[:subdomain_depth] || 1
 
     handle_reset
   end
@@ -513,6 +517,16 @@ class WaybackMachineDownloader
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+    # process subdomains if enabled
+    if @recursive_subdomains
+      subdomain_start_time = Time.now
+      process_subdomains
+      subdomain_end_time = Time.now
+      subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+      puts "Subdomain processing finished in #{subdomain_time}s."
+    end
+
     puts "Results saved in #{backup_path}"
     cleanup
   end
diff --git a/lib/wayback_machine_downloader/subdom_processor.rb b/lib/wayback_machine_downloader/subdom_processor.rb
new file mode 100644
index 0000000..f95a55f
--- /dev/null
+++ b/lib/wayback_machine_downloader/subdom_processor.rb
@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+
+module SubdomainProcessor
+  def process_subdomains
+    return unless @recursive_subdomains
+
+    puts "Starting subdomain processing..."
+
+    # extract base domain from the URL for comparison
+    base_domain = extract_base_domain(@base_url)
+    @processed_domains = Set.new([base_domain])
+    @subdomain_queue = Queue.new
+
+    # scan downloaded files for subdomain links
+    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+    if subdomains_found.empty?
+      puts "No subdomains found in downloaded content."
+      return
+    end
+
+    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+    # add found subdomains to the queue
+    subdomains_found.each do |subdomain|
+      full_domain = "#{subdomain}.#{base_domain}"
+      @subdomain_queue << "https://#{full_domain}/"
+    end
+
+    # process the subdomain queue
+    download_subdomains(base_domain)
+
+    # after all downloads, rewrite all URLs to make local references
+    rewrite_subdomain_links(base_domain) if @rewrite
+  end
+
+  private
+
+  def extract_base_domain(url)
+    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+    return nil unless uri
+
+    host = uri.host || uri.path.split('/').first
+    host = host.downcase
+
+    # extract the base domain (e.g., "example.com" from "sub.example.com")
+    parts = host.split('.')
+    return host if parts.size <= 2
+
+    # for domains like co.uk, we want to keep the last 3 parts
+    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+      parts.last(3).join('.')
+    else
+      parts.last(2).join('.')
+    end
+  end
+
+  def scan_files_for_subdomains(files, base_domain)
+    return [] unless base_domain
+
+    subdomains = Set.new
+
+    files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+
+        # extract URLs from HTML href/src attributes
+        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from CSS
+        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from JavaScript strings
+        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+      rescue => e
+        puts "Error scanning file #{file_path}: #{e.message}"
+      end
+    end
+
+    subdomains.to_a
+  end
+
+  def download_subdomains(base_domain)
+    puts "Starting subdomain downloads..."
+    depth = 0
+    max_depth = @subdomain_depth || 1
+
+    while depth < max_depth && !@subdomain_queue.empty?
+      current_batch = []
+
+      # get all subdomains at current depth
+      while !@subdomain_queue.empty?
+        current_batch << @subdomain_queue.pop
+      end
+
+      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+      # download each subdomain
+      current_batch.each do |subdomain_url|
+        download_subdomain(subdomain_url, base_domain)
+      end
+
+      # if we need to go deeper, scan the newly downloaded files
+      if depth + 1 < max_depth
+        # get all files in the subdomains directory
+        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+        new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+        # filter out already processed subdomains
+        new_subdomains.each do |subdomain|
+          full_domain = "#{subdomain}.#{base_domain}"
+          unless @processed_domains.include?(full_domain)
+            # queue only; download_subdomain marks the domain as processed
+            @subdomain_queue << "https://#{full_domain}/"
+          end
+        end
+
+        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+      end
+
+      depth += 1
+    end
+  end
+
+  def download_subdomain(subdomain_url, base_domain)
+    begin
+      uri = URI.parse(subdomain_url)
+      subdomain_host = uri.host
+
+      # skip if already processed
+      if @processed_domains.include?(subdomain_host)
+        puts "Skipping already processed subdomain: #{subdomain_host}"
+        return
+      end
+
+      @processed_domains.add(subdomain_host)
+      puts "Downloading subdomain: #{subdomain_url}"
+
+      # create the directory for this subdomain
+      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+      FileUtils.mkdir_p(subdomain_dir)
+
+      # create subdomain downloader with appropriate options
+      subdomain_options = {
+        base_url: subdomain_url,
+        directory: subdomain_dir,
+        from_timestamp: @from_timestamp,
+        to_timestamp: @to_timestamp,
+        all: @all,
+        threads_count: @threads_count,
+        maximum_pages: [@maximum_pages / 2, 10].max,
+        rewrite: @rewrite,
+        # don't recursively process subdomains from here
+        recursive_subdomains: false
+      }
+
+      # download the subdomain content
+      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+      subdomain_downloader.download_files
+
+      puts "Completed download of subdomain: #{subdomain_host}"
+    rescue => e
+      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+    end
+  end
+
+  def rewrite_subdomain_links(base_domain)
+    puts "Rewriting all files to use local subdomain references..."
+
+    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+    puts "Found #{all_files.size} files to check for rewriting"
+    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+    rewritten_count = 0
+
+    all_files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+        original_content = content.dup
+
+        # replace subdomain URLs with local paths
+        subdomains.each do |subdomain_host|
+          # for HTML attributes (href, src, etc.)
+          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            prefix, path, suffix = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+          end
+
+          # for CSS url()
+          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+            path = $1
+            path = "/index.html" if path.empty? || path == "/"
+            "url(\"../subdomains/#{subdomain_host}#{path}\")"
+          end
+
+          # for JavaScript strings
+          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            quote_start, path, quote_end = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+          end
+        end
+
+        # save if modified
+        if content != original_content
+          File.write(file_path, content)
+          rewritten_count += 1
+        end
+      rescue => e
+        puts "Error rewriting file #{file_path}: #{e.message}"
+      end
+    end
+
+    puts "Rewrote links in #{rewritten_count} files"
+  end
+end
\ No newline at end of file
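Usage sketch (not part of the diff): the new behaviour can be exercised through the two CLI flags added in bin/wayback_machine_downloader, or programmatically via the params read in WaybackMachineDownloader#initialize. A minimal Ruby example, assuming this patch is applied and using example.com as a placeholder site:

  # CLI equivalent:
  #   wayback_machine_downloader https://example.com --recursive-subdomains --subdomain-depth 2
  require 'wayback_machine_downloader'

  downloader = WaybackMachineDownloader.new(
    base_url: 'https://example.com',  # placeholder site
    recursive_subdomains: true,       # run process_subdomains after the main download
    subdomain_depth: 2,               # rescan downloaded subdomain content once for further subdomains
    rewrite: true                     # rewrite subdomain links to local ../subdomains/... paths
  )
  downloader.download_files

Subdomain content is written beneath the main backup directory under subdomains/<host>, which is the location the rewritten ../subdomains/... links point to.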