mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c731e0c7bd | ||
|
|
9fd2a7f8d1 | ||
|
|
6ad312f31f |
@@ -116,7 +116,7 @@ class WaybackMachineDownloader
|
|||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
include SubdomainProcessor
|
include SubdomainProcessor
|
||||||
|
|
||||||
VERSION = "2.4.0"
|
VERSION = "2.4.1"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -340,15 +340,15 @@ class WaybackMachineDownloader
|
|||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
next if file_timestamp.to_i > target_timestamp
|
next if file_timestamp.to_i > target_timestamp
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id = CGI::unescape file_id
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id = file_id.tidy_bytes unless file_id == ""
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
next if file_id.nil?
|
next if file_id.nil?
|
||||||
next if match_exclude_filter(file_url)
|
next if match_exclude_filter(file_url)
|
||||||
next unless match_only_filter(file_url)
|
next unless match_only_filter(file_url)
|
||||||
# Select the most recent version <= target_timestamp
|
|
||||||
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
||||||
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
|
file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
file_versions.values
|
file_versions.values
|
||||||
@@ -368,22 +368,27 @@ class WaybackMachineDownloader
|
|||||||
file_list_curated = Hash.new
|
file_list_curated = Hash.new
|
||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id = CGI::unescape file_id
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id = file_id.tidy_bytes unless file_id == ""
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
if file_id.include?('<') || file_id.include?('>')
|
||||||
|
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||||
else
|
else
|
||||||
if match_exclude_filter(file_url)
|
if match_exclude_filter(file_url)
|
||||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||||
elsif not match_only_filter(file_url)
|
elsif !match_only_filter(file_url)
|
||||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||||
elsif file_list_curated[file_id]
|
elsif file_list_curated[file_id]
|
||||||
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
||||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -394,21 +399,32 @@ class WaybackMachineDownloader
|
|||||||
file_list_curated = Hash.new
|
file_list_curated = Hash.new
|
||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id_and_timestamp = [file_timestamp, file_id].join('/')
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
|
||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
||||||
|
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
||||||
|
if file_id_and_timestamp.nil?
|
||||||
|
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
||||||
|
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||||
else
|
else
|
||||||
if match_exclude_filter(file_url)
|
if match_exclude_filter(file_url)
|
||||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||||
elsif not match_only_filter(file_url)
|
elsif !match_only_filter(file_url)
|
||||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||||
elsif file_list_curated[file_id_and_timestamp]
|
elsif file_list_curated[file_id_and_timestamp]
|
||||||
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
|
# duplicate combo, ignore silently (verbose flag not shown here)
|
||||||
else
|
else
|
||||||
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -749,6 +765,20 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
logger
|
logger
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Safely sanitize a file id (or an id+timestamp combo) taken from a snapshot URL.
#
# Processing order:
#   1. URL-decode the value with CGI.unescape (falls back to the undecoded
#      text if decoding raises).
#   2. Remove every `<...>` span (anything that looks like a markup tag).
#   3. Run String#tidy_bytes on non-empty results.
#      NOTE(review): tidy_bytes is not part of Ruby's standard library;
#      presumably it is supplied elsewhere in this project - confirm.
#
# The caller's string is never modified; all work happens on copies.
#
# @param raw [String, nil] the raw (still URL-encoded) id
# @param file_url [String] originating URL, used only in the warning message
# @return [String, nil] the sanitized id, or nil when +raw+ is nil or any
#   step raises a StandardError (a warning is logged if @logger is set)
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?

  decoded =
    begin
      CGI.unescape(raw)
    rescue StandardError
      # Best effort: keep the undecoded text, but work on a copy so the
      # caller's (possibly frozen) string is never touched below.
      raw.dup
    end

  # Non-mutating gsub: always yields a fresh string.
  cleaned = decoded.gsub(/<[^>]*>/, '')
  cleaned = cleaned.tidy_bytes unless cleaned.empty?
  cleaned
rescue StandardError => e
  @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
  nil
end
|
||||||
|
|
||||||
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
||||||
retries = 0
|
retries = 0
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.4.0"
|
s.version = "2.4.1"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
|
|||||||
Reference in New Issue
Block a user