# frozen_string_literal: true

require 'set'
require 'uri'
require 'fileutils'

module SubdomainProcessor
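  # Scans files that have already been downloaded for links to subdomains of the
  # base domain, downloads each discovered subdomain into
  # <backup_path>/subdomains/<host>, and optionally rewrites links so the saved
  # copy references those local paths.
  #
  # The including class is expected to provide `backup_path` and the instance
  # variables read below (@recursive_subdomains, @base_url, @rewrite,
  # @from_timestamp, @to_timestamp, @all, @threads_count, @maximum_pages,
  # @subdomain_depth).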

  def process_subdomains
    return unless @recursive_subdomains

    puts "Starting subdomain processing..."

    # extract base domain from the URL for comparison
    base_domain = extract_base_domain(@base_url)
    @processed_domains = Set.new([base_domain])
    @subdomain_queue = Queue.new

    # scan downloaded files for subdomain links
    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."

    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)

    if subdomains_found.empty?
      puts "No subdomains found in downloaded content."
      return
    end

    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"

    # add found subdomains to the queue
    subdomains_found.each do |subdomain|
      full_domain = "#{subdomain}.#{base_domain}"
      @subdomain_queue << "https://#{full_domain}/"
    end

    # process the subdomain queue
    download_subdomains(base_domain)

    # after all downloads, rewrite all URLs to make local references
    rewrite_subdomain_links(base_domain) if @rewrite
  end

  private

  def extract_base_domain(url)
    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
    return nil unless uri

    host = uri.host || uri.path.split('/').first
    return nil unless host

    host = host.downcase

    # extract the base domain (e.g., "example.com" from "sub.example.com")
    parts = host.split('.')
    return host if parts.size <= 2

    # for multi-part TLDs like co.uk, keep the last three parts
    if parts[-2].length <= 3 && parts[-1].length <= 3
      parts.last(3).join('.')
    else
      parts.last(2).join('.')
    end
  end
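
  # Worked examples of the heuristic above (illustrative, not from the original
  # source):
  #   extract_base_domain("https://blog.example.com/post") #=> "example.com"
  #   extract_base_domain("https://shop.example.co.uk/")   #=> "example.co.uk"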

  def scan_files_for_subdomains(files, base_domain)
    return [] unless base_domain

    subdomains = Set.new

    files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)

        # extract URLs from HTML href/src attributes
        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from CSS
        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end

        # extract URLs from JavaScript strings
        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"']/) do |match|
          subdomain = match[0].downcase
          next if subdomain == 'www' # skip www subdomain
          subdomains.add(subdomain)
        end
      rescue => e
        puts "Error scanning file #{file_path}: #{e.message}"
      end
    end

    subdomains.to_a
  end
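
  # For reference, the patterns above capture the leading label of absolute URLs
  # on the base domain; e.g. href="https://blog.example.com/style.css" yields
  # "blog" when base_domain is "example.com" (the "www" label is skipped).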

  def download_subdomains(base_domain)
    puts "Starting subdomain downloads..."
    depth = 0
    max_depth = @subdomain_depth || 1

    while depth < max_depth && !@subdomain_queue.empty?
      current_batch = []

      # drain the queue: these are the subdomains at the current depth
      current_batch << @subdomain_queue.pop until @subdomain_queue.empty?

      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."

      # download each subdomain
      current_batch.each do |subdomain_url|
        download_subdomain(subdomain_url, base_domain)
      end

      # if we need to go deeper, scan the newly downloaded files
      if depth + 1 < max_depth
        # get all files in the subdomains directory
        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
        new_subdomains = scan_files_for_subdomains(new_files, base_domain)

        # queue subdomains that have not been processed yet; download_subdomain
        # marks them as processed once they are actually downloaded
        new_subdomains.each do |subdomain|
          full_domain = "#{subdomain}.#{base_domain}"
          unless @processed_domains.include?(full_domain)
            @subdomain_queue << "https://#{full_domain}/"
          end
        end

        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" unless @subdomain_queue.empty?
      end

      depth += 1
    end
  end
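
  # Depth note for download_subdomains above: with @subdomain_depth set to 2,
  # the first pass downloads subdomains linked from the originally downloaded
  # site and the second pass downloads subdomains linked from those subdomain
  # pages; the default of 1 stops after the first pass.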

  def download_subdomain(subdomain_url, base_domain)
    uri = URI.parse(subdomain_url)
    subdomain_host = uri.host

    # skip if already processed
    if @processed_domains.include?(subdomain_host)
      puts "Skipping already processed subdomain: #{subdomain_host}"
      return
    end

    @processed_domains.add(subdomain_host)
    puts "Downloading subdomain: #{subdomain_url}"

    # create the directory for this subdomain
    subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
    FileUtils.mkdir_p(subdomain_dir)

    # create subdomain downloader with appropriate options
    subdomain_options = {
      base_url: subdomain_url,
      directory: subdomain_dir,
      from_timestamp: @from_timestamp,
      to_timestamp: @to_timestamp,
      all: @all,
      threads_count: @threads_count,
      maximum_pages: [@maximum_pages / 2, 10].max,
      rewrite: @rewrite,
      # don't recursively process subdomains from here
      recursive_subdomains: false
    }

    # download the subdomain content
    subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
    subdomain_downloader.download_files

    puts "Completed download of subdomain: #{subdomain_host}"
  rescue => e
    puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
  end

  def rewrite_subdomain_links(base_domain)
    puts "Rewriting all files to use local subdomain references..."

    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
    subdomains = @processed_domains.reject { |domain| domain == base_domain }

    puts "Found #{all_files.size} files to check for rewriting"
    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"

    rewritten_count = 0

    all_files.each do |file_path|
      next unless File.exist?(file_path)

      begin
        content = File.read(file_path)
        original_content = content.dup

        # replace subdomain URLs with local paths
        subdomains.each do |subdomain_host|
          # for HTML attributes (href, src, etc.)
          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            prefix, path, suffix = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
          end

          # for CSS url()
          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
            path = $1
            path = "/index.html" if path.empty? || path == "/"
            "url(\"../subdomains/#{subdomain_host}#{path}\")"
          end

          # for JavaScript strings
          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
            quote_start, path, quote_end = $1, $2, $3
            path = "/index.html" if path.empty? || path == "/"
            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
          end
        end

        # save if modified
        if content != original_content
          File.write(file_path, content)
          rewritten_count += 1
        end
      rescue => e
        puts "Error rewriting file #{file_path}: #{e.message}"
      end
    end

    puts "Rewrote links in #{rewritten_count} files"
  end
end
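
# Illustrative usage sketch (an assumption, not part of the original file): the
# module is written to be mixed into a downloader class that provides the
# `backup_path` method and the instance variables noted at the top, for example:
#
#   class WaybackMachineDownloader
#     include SubdomainProcessor
#     # ... call process_subdomains after the main download finishes ...
#   end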