mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00
another page_requisites fix
This commit is contained in:
parent
8632050c45
commit
b2fc748c2c
@ -692,74 +692,76 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
def process_page_requisites(file_path, parent_remote_info)
|
def process_page_requisites(file_path, parent_remote_info)
|
||||||
return unless File.exist?(file_path)
|
return unless File.exist?(file_path)
|
||||||
|
|
||||||
content = File.read(file_path)
|
content = File.read(file_path)
|
||||||
content = content.force_encoding('UTF-8').scrub
|
content = content.force_encoding('UTF-8').scrub
|
||||||
|
|
||||||
assets = PageRequisites.extract(content)
|
assets = PageRequisites.extract(content)
|
||||||
|
|
||||||
# FIX 1: Construct a valid URI object including the scheme (http://)
|
# prepare base URI for resolving relative paths
|
||||||
# parent_remote_info[:file_url] is usually "www.iana.org/path",
|
|
||||||
# we need "http://www.iana.org/path" to resolve relative paths correctly.
|
|
||||||
parent_raw = parent_remote_info[:file_url]
|
parent_raw = parent_remote_info[:file_url]
|
||||||
parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
|
parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
|
||||||
|
|
||||||
begin
|
begin
|
||||||
base_uri = URI(parent_raw)
|
base_uri = URI(parent_raw)
|
||||||
|
# calculate the "root" host of the site we are downloading to compare later
|
||||||
|
current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host
|
||||||
rescue URI::InvalidURIError
|
rescue URI::InvalidURIError
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
parent_timestamp = parent_remote_info[:timestamp]
|
parent_timestamp = parent_remote_info[:timestamp]
|
||||||
|
|
||||||
assets.each do |asset_rel_url|
|
assets.each do |asset_rel_url|
|
||||||
begin
|
begin
|
||||||
# resolve the relative asset URL against the parent page URL
|
# resolve full URL (handles relative paths like "../img/logo.png")
|
||||||
# e.g. parent: http://www.iana.org/help/ex
|
|
||||||
# asset: /static/style.css
|
|
||||||
# result: http://www.iana.org/static/style.css
|
|
||||||
resolved_uri = base_uri + asset_rel_url
|
resolved_uri = base_uri + asset_rel_url
|
||||||
|
|
||||||
# filter out navigation links
|
# filter out navigation links (pages) vs assets
|
||||||
# If the path has no extension (like /domains) or is .html, it's likely a link and not a requisite
|
# skip if extension is empty or looks like an HTML page
|
||||||
# this prevents spidering the whole site
|
|
||||||
path = resolved_uri.path
|
path = resolved_uri.path
|
||||||
ext = File.extname(path).downcase
|
ext = File.extname(path).downcase
|
||||||
|
|
||||||
# skip empty extensions and standard page extensions
|
|
||||||
if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
|
if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
|
||||||
next
|
next
|
||||||
end
|
end
|
||||||
|
|
||||||
# reconstruct the ID expected by Wayback Machine
|
# construct the URL for the Wayback API
|
||||||
asset_final_url = resolved_uri.host + resolved_uri.path
|
asset_wbm_url = resolved_uri.host + resolved_uri.path
|
||||||
asset_final_url += "?#{resolved_uri.query}" if resolved_uri.query
|
asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query
|
||||||
|
|
||||||
|
# construct the local file ID
|
||||||
|
# if the asset is on the SAME domain, strip the domain from the folder path
|
||||||
|
# if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder
|
||||||
|
if resolved_uri.host == current_project_host
|
||||||
|
# e.g. /static/css/style.css
|
||||||
|
asset_file_id = resolved_uri.path
|
||||||
|
asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/')
|
||||||
|
else
|
||||||
|
# e.g. cdn.google.com/jquery.js
|
||||||
|
asset_file_id = asset_wbm_url
|
||||||
|
end
|
||||||
|
|
||||||
rescue URI::InvalidURIError, StandardError
|
rescue URI::InvalidURIError, StandardError
|
||||||
next
|
next
|
||||||
end
|
end
|
||||||
|
|
||||||
# sanitize ID
|
# sanitize and queue
|
||||||
asset_id = sanitize_and_prepare_id(asset_final_url, asset_final_url)
|
asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url)
|
||||||
|
|
||||||
# queue if not already queued
|
|
||||||
# @note: we use the PARENT timestamp here. WBM usually redirects
|
|
||||||
# to the closest available timestamp if the exact one doesn't exist
|
|
||||||
unless @session_downloaded_ids.include?(asset_id)
|
unless @session_downloaded_ids.include?(asset_id)
|
||||||
@session_downloaded_ids.add(asset_id)
|
@session_downloaded_ids.add(asset_id)
|
||||||
|
|
||||||
# construct info hash
|
|
||||||
new_file_info = {
|
new_file_info = {
|
||||||
file_url: asset_final_url,
|
file_url: asset_wbm_url,
|
||||||
timestamp: parent_timestamp,
|
timestamp: parent_timestamp,
|
||||||
file_id: asset_id
|
file_id: asset_id
|
||||||
}
|
}
|
||||||
|
|
||||||
@download_mutex.synchronize do
|
@download_mutex.synchronize do
|
||||||
@total_to_download += 1
|
@total_to_download += 1
|
||||||
puts "Queued requisite: #{asset_final_url}"
|
puts "Queued requisite: #{asset_file_id}"
|
||||||
end
|
end
|
||||||
|
|
||||||
submit_download_job(new_file_info)
|
submit_download_job(new_file_info)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user