mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4160ff5e4a | ||
|
|
2490109cfe | ||
|
|
c3c5b8446a | ||
|
|
18357a77ed |
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
|
|
||||||
VERSION = "2.3.8"
|
VERSION = "2.3.9"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -131,7 +131,11 @@ class WaybackMachineDownloader
|
|||||||
validate_params(params)
|
validate_params(params)
|
||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]
|
||||||
@exact_url = params[:exact_url]
|
@exact_url = params[:exact_url]
|
||||||
@directory = params[:directory]
|
if params[:directory]
|
||||||
|
@directory = File.expand_path(params[:directory])
|
||||||
|
else
|
||||||
|
@directory = nil
|
||||||
|
end
|
||||||
@all_timestamps = params[:all_timestamps]
|
@all_timestamps = params[:all_timestamps]
|
||||||
@from_timestamp = params[:from_timestamp].to_i
|
@from_timestamp = params[:from_timestamp].to_i
|
||||||
@to_timestamp = params[:to_timestamp].to_i
|
@to_timestamp = params[:to_timestamp].to_i
|
||||||
@@ -165,13 +169,11 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
def backup_path
|
def backup_path
|
||||||
if @directory
|
if @directory
|
||||||
if @directory[-1] == '/'
|
# @directory was already expanded to an absolute path when it was assigned, so return it as-is
|
||||||
@directory
|
@directory
|
||||||
else
|
|
||||||
@directory + '/'
|
|
||||||
end
|
|
||||||
else
|
else
|
||||||
'websites/' + backup_name + '/'
|
# ensure the default path is absolute and normalized
|
||||||
|
File.expand_path(File.join('websites', backup_name))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -638,21 +640,35 @@ class WaybackMachineDownloader
|
|||||||
file_url = file_remote_info[:file_url].encode(current_encoding)
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
||||||
file_id = file_remote_info[:file_id]
|
file_id = file_remote_info[:file_id]
|
||||||
file_timestamp = file_remote_info[:timestamp]
|
file_timestamp = file_remote_info[:timestamp]
|
||||||
file_path_elements = file_id.split('/')
|
|
||||||
|
# sanitize file_id to ensure it is a valid path component
|
||||||
|
raw_path_elements = file_id.split('/')
|
||||||
|
|
||||||
|
sanitized_path_elements = raw_path_elements.map do |element|
|
||||||
|
if Gem.win_platform?
|
||||||
|
# for Windows, we need to sanitize path components to avoid invalid characters
|
||||||
|
# this prevents issues with file names that contain characters not allowed in
|
||||||
|
# Windows file systems. See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
|
||||||
|
# String#gsub takes a String or Hash as its second argument, not a lambda (that raises TypeError); use the block form to percent-encode each reserved character as %XX
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
|
||||||
|
else
|
||||||
|
element
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
current_backup_path = backup_path
|
||||||
|
|
||||||
if file_id == ""
|
if file_id == ""
|
||||||
dir_path = backup_path
|
dir_path = current_backup_path
|
||||||
file_path = backup_path + 'index.html'
|
file_path = File.join(dir_path, 'index.html')
|
||||||
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
|
||||||
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
# if file_id is a directory, we treat it as such
|
||||||
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
||||||
|
file_path = File.join(dir_path, 'index.html')
|
||||||
else
|
else
|
||||||
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
# if file_id is a file, we treat it as such
|
||||||
file_path = backup_path + file_path_elements[0..-1].join('/')
|
filename = sanitized_path_elements.pop
|
||||||
end
|
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
||||||
if Gem.win_platform?
|
file_path = File.join(dir_path, filename)
|
||||||
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
|
||||||
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# check existence *before* download attempt
|
# check existence *before* download attempt
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ module ArchiveAPI
|
|||||||
# Automatically append /* if the URL doesn't contain a path after the domain
|
# Automatically append /* if the URL doesn't contain a path after the domain
|
||||||
# This is a workaround for an issue with the API and *some* domains.
|
# This is a workaround for an issue with the API and *some* domains.
|
||||||
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
||||||
if url && !url.match(/^https?:\/\/.*\//i)
|
# But don't do this when exact_url flag is set
|
||||||
|
if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
|
||||||
url = "#{url}/*"
|
url = "#{url}/*"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.3.8"
|
s.version = "2.3.9"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
|
|||||||
Reference in New Issue
Block a user