# frozen_string_literal: true

require 'set'
require 'uri'
require 'fileutils'

module SubdomainProcessor
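  # Scans files that have already been downloaded for links to subdomains of the
  # base domain, downloads each discovered subdomain into
  # <backup_path>/subdomains/<host>, and optionally rewrites links so the saved
  # copy references those local paths.
  #
  # The including class is expected to provide `backup_path` and the instance
  # variables read below (@recursive_subdomains, @base_url, @rewrite,
  # @from_timestamp, @to_timestamp, @all, @threads_count, @maximum_pages,
  # @subdomain_depth).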

  def process_subdomains
    return unless @recursive_subdomains

    puts "Starting subdomain processing..."

    # extract base domain from the URL for comparison
    base_domain = extract_base_domain(@base_url)
    @processed_domains = Set.new([base_domain])
    @subdomain_queue = Queue.new

    # scan downloaded files for subdomain links
    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."

    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)

    if subdomains_found.empty?
      puts "No subdomains found in downloaded content."
      return
    end

    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"

    # add found subdomains to the queue
    subdomains_found.each do |subdomain|
      full_domain = "#{subdomain}.#{base_domain}"
      @subdomain_queue << "https://#{full_domain}/"
    end

    # process the subdomain queue
    download_subdomains(base_domain)

    # after all downloads, rewrite all URLs to make local references
    rewrite_subdomain_links(base_domain) if @rewrite
  end

  private

  def extract_base_domain(url)
    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
    return nil unless uri

    host = uri.host || uri.path.split('/').first
    return nil unless host

    host = host.downcase

    # extract the base domain (e.g., "example.com" from "sub.example.com")
    parts = host.split('.')
    return host if parts.size <= 2

    # for multi-part TLDs like co.uk, keep the last three parts
    if parts[-2].length <= 3 && parts[-1].length <= 3
      parts.last(3).join('.')
    else
      parts.last(2).join('.')
    end
  end
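
  # Worked examples of the heuristic above (illustrative, not from the original
  # source):
  #   extract_base_domain("https://blog.example.com/post") #=> "example.com"
  #   extract_base_domain("https://shop.example.co.uk/")   #=> "example.co.uk"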

  def scan_files_for_subdomains(files, base_domain)
    return [] unless base_domain

    subdomains = Set.new

    files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)

        # extract URLs from HTML href/src attributes
        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from CSS
        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from JavaScript strings
        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end
      rescue => e
        puts "Error scanning file #{file_path}: #{e.message}"
      end
    end

    subdomains.to_a
  end
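
  # For reference, the patterns above capture the leading label of absolute URLs
  # on the base domain; e.g. href="https://blog.example.com/style.css" yields
  # "blog" when base_domain is "example.com" (the "www" label is skipped).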

  def download_subdomains(base_domain)
    puts "Starting subdomain downloads..."
    depth = 0
    max_depth = @subdomain_depth || 1

    while depth < max_depth && !@subdomain_queue.empty?
      current_batch = []

      # drain the queue: these are the subdomains at the current depth
      current_batch << @subdomain_queue.pop until @subdomain_queue.empty?

      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."

      # download each subdomain
      current_batch.each do |subdomain_url|
        download_subdomain(subdomain_url, base_domain)
      end

      # if we need to go deeper, scan the newly downloaded files
      if depth + 1 < max_depth
        # get all files in the subdomains directory
        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
        new_subdomains = scan_files_for_subdomains(new_files, base_domain)

        # queue subdomains that have not been processed yet; download_subdomain
        # marks them as processed once they are actually downloaded
        new_subdomains.each do |subdomain|
          full_domain = "#{subdomain}.#{base_domain}"
          unless @processed_domains.include?(full_domain)
            @subdomain_queue << "https://#{full_domain}/"
          end
        end

        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" unless @subdomain_queue.empty?
      end

      depth += 1
    end
  end
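
  # Depth note for download_subdomains above: with @subdomain_depth set to 2,
  # the first pass downloads subdomains linked from the originally downloaded
  # site and the second pass downloads subdomains linked from those subdomain
  # pages; the default of 1 stops after the first pass.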

  def download_subdomain(subdomain_url, base_domain)
    uri = URI.parse(subdomain_url)
    subdomain_host = uri.host

    # skip if already processed
    if @processed_domains.include?(subdomain_host)
      puts "Skipping already processed subdomain: #{subdomain_host}"
      return
    end

    @processed_domains.add(subdomain_host)
    puts "Downloading subdomain: #{subdomain_url}"

    # create the directory for this subdomain
    subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
    FileUtils.mkdir_p(subdomain_dir)

    # create subdomain downloader with appropriate options
    subdomain_options = {
      base_url: subdomain_url,
      directory: subdomain_dir,
      from_timestamp: @from_timestamp,
      to_timestamp: @to_timestamp,
      all: @all,
      threads_count: @threads_count,
      maximum_pages: [@maximum_pages / 2, 10].max,
      rewrite: @rewrite,
      # don't recursively process subdomains from here
      recursive_subdomains: false
    }

    # download the subdomain content
    subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
    subdomain_downloader.download_files

    puts "Completed download of subdomain: #{subdomain_host}"
  rescue => e
    puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
  end

  def rewrite_subdomain_links(base_domain)
    puts "Rewriting all files to use local subdomain references..."

    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    subdomains = @processed_domains.reject { |domain| domain == base_domain }

    puts "Found #{all_files.size} files to check for rewriting"
    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"

    rewritten_count = 0

    all_files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)
        original_content = content.dup

        # replace subdomain URLs with local paths
        subdomains.each do |subdomain_host|
          # for HTML attributes (href, src, etc.)
          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            prefix, path, suffix = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
          end

          # for CSS url()
          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
            path = $1
            path = "/index.html" if path.empty? || path == "/"
            "url(\"../subdomains/#{subdomain_host}#{path}\")"
          end

          # for JavaScript strings
          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            quote_start, path, quote_end = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
          end
        end

        # save if modified
        if content != original_content
          File.write(file_path, content)
          rewritten_count += 1
        end
      rescue => e
        puts "Error rewriting file #{file_path}: #{e.message}"
      end
    end

    puts "Rewrote links in #{rewritten_count} files"
  end
end
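
# Illustrative usage sketch (an assumption, not part of the original file): the
# module is written to be mixed into a downloader class that provides the
# `backup_path` method and the instance variables noted at the top, for example:
#
#   class WaybackMachineDownloader
#     include SubdomainProcessor
#     # ... call process_subdomains after the main download finishes ...
#   end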