mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
page requisites fix
This commit is contained in:
parent
2aa694eed0
commit
8632050c45
@ -694,30 +694,48 @@ class WaybackMachineDownloader
|
|||||||
return unless File.exist?(file_path)
|
return unless File.exist?(file_path)
|
||||||
|
|
||||||
content = File.read(file_path)
|
content = File.read(file_path)
|
||||||
# handle encoding slightly roughly for extraction
|
|
||||||
content = content.force_encoding('UTF-8').scrub
|
content = content.force_encoding('UTF-8').scrub
|
||||||
|
|
||||||
assets = PageRequisites.extract(content)
|
assets = PageRequisites.extract(content)
|
||||||
|
|
||||||
parent_url = parent_remote_info[:file_url]
|
# FIX 1: Construct a valid URI object including the scheme (http://)
|
||||||
|
# parent_remote_info[:file_url] is usually "www.iana.org/path",
|
||||||
|
# we need "http://www.iana.org/path" to resolve relative paths correctly.
|
||||||
|
parent_raw = parent_remote_info[:file_url]
|
||||||
|
parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//)
|
||||||
|
|
||||||
|
begin
|
||||||
|
base_uri = URI(parent_raw)
|
||||||
|
rescue URI::InvalidURIError
|
||||||
|
return
|
||||||
|
end
|
||||||
|
|
||||||
parent_timestamp = parent_remote_info[:timestamp]
|
parent_timestamp = parent_remote_info[:timestamp]
|
||||||
|
|
||||||
assets.each do |asset_rel_url|
|
assets.each do |asset_rel_url|
|
||||||
# resolve absolute URL
|
|
||||||
begin
|
begin
|
||||||
# assume relative to the parent file URL
|
# resolve the relative asset URL against the parent page URL
|
||||||
# We need a fake base URI to resolve /paths and ../paths
|
# e.g. parent: http://www.iana.org/help/ex
|
||||||
base_uri = URI("http://base.example.com/" + parent_url)
|
# asset: /static/style.css
|
||||||
|
# result: http://www.iana.org/static/style.css
|
||||||
resolved_uri = base_uri + asset_rel_url
|
resolved_uri = base_uri + asset_rel_url
|
||||||
|
|
||||||
# we only want the path part + query, not the host
|
# filter out navigation links
|
||||||
asset_final_url = resolved_uri.path
|
# If the path has no extension (like /domains) or is .html, it's likely a link and not a requisite
|
||||||
asset_final_url = asset_final_url[1..-1] if asset_final_url.start_with?('/') # strip leading slash
|
# this prevents spidering the whole site
|
||||||
|
path = resolved_uri.path
|
||||||
|
ext = File.extname(path).downcase
|
||||||
|
|
||||||
# re-attach query string if present (as some assets use ?v=123)
|
# skip empty extensions and standard page extensions
|
||||||
|
if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext)
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
# reconstruct the ID expected by Wayback Machine
|
||||||
|
asset_final_url = resolved_uri.host + resolved_uri.path
|
||||||
asset_final_url += "?#{resolved_uri.query}" if resolved_uri.query
|
asset_final_url += "?#{resolved_uri.query}" if resolved_uri.query
|
||||||
|
|
||||||
rescue URI::InvalidURIError
|
rescue URI::InvalidURIError, StandardError
|
||||||
next
|
next
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user