mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d3466b3387 | ||
|
|
0250579f0e | ||
|
|
0663c1c122 | ||
|
|
93115f70ec | ||
|
|
3d37ae10fd | ||
|
|
bff10e7260 | ||
|
|
ffdce7e4ec | ||
|
|
e4487baafc |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -32,3 +32,7 @@ tmp
|
|||||||
*.rbc
|
*.rbc
|
||||||
|
|
||||||
test.rb
|
test.rb
|
||||||
|
|
||||||
|
# Dev environment
|
||||||
|
.vscode
|
||||||
|
*.code-workspace
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
FROM ruby:3.4.4-alpine
|
FROM ruby:3.4.5-alpine
|
||||||
USER root
|
USER root
|
||||||
WORKDIR /build
|
WORKDIR /build
|
||||||
|
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ class WaybackMachineDownloader
|
|||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
include SubdomainProcessor
|
include SubdomainProcessor
|
||||||
|
|
||||||
VERSION = "2.3.11"
|
VERSION = "2.3.12"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -128,7 +128,8 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
||||||
|
:snapshot_at
|
||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
validate_params(params)
|
validate_params(params)
|
||||||
@@ -158,6 +159,7 @@ class WaybackMachineDownloader
|
|||||||
@rewrite = params[:rewrite] || false
|
@rewrite = params[:rewrite] || false
|
||||||
@recursive_subdomains = params[:recursive_subdomains] || false
|
@recursive_subdomains = params[:recursive_subdomains] || false
|
||||||
@subdomain_depth = params[:subdomain_depth] || 1
|
@subdomain_depth = params[:subdomain_depth] || 1
|
||||||
|
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
|
||||||
|
|
||||||
# URL for rejecting invalid/unencoded wayback urls
|
# URL for rejecting invalid/unencoded wayback urls
|
||||||
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
|
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
|
||||||
@@ -330,6 +332,36 @@ class WaybackMachineDownloader
|
|||||||
snapshot_list_to_consider
|
snapshot_list_to_consider
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Get a composite snapshot file list for a specific timestamp
|
||||||
|
# Builds the composite file list for a point in time: for every file id, keeps
# the newest snapshot whose timestamp is not later than +target_timestamp+.
#
# Candidates come from get_all_snapshots_to_consider; each URL is also passed
# through match_exclude_filter and match_only_filter.
#
# @param target_timestamp [Integer, String] cut-off timestamp, coerced with
#   +to_i+ (the same coercion the initializer applies to :snapshot_at)
# @return [Array<Hash>] one hash per file id with the keys :file_url,
#   :timestamp and :file_id
def get_composite_snapshot_file_list(target_timestamp)
  cutoff = target_timestamp.to_i
  file_versions = {}

  get_all_snapshots_to_consider.each do |file_timestamp, file_url|
    next unless file_url.include?('/')
    next if file_timestamp.to_i > cutoff

    # The file id is everything after the third "/"-separated segment.
    # Array#[] returns nil when the URL has fewer than three segments
    # (e.g. "example.com/page"); skip those instead of calling join on nil.
    path_segments = file_url.split('/')[3..]
    next if path_segments.nil?

    file_id = CGI.unescape(path_segments.join('/'))
    file_id = file_id.tidy_bytes unless file_id.empty?
    next if file_id.nil?
    next if match_exclude_filter(file_url)
    next unless match_only_filter(file_url)

    # Keep only the most recent version that is still <= the cut-off.
    current = file_versions[file_id]
    next if current && current[:timestamp].to_i >= file_timestamp.to_i

    file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
  end

  file_versions.values
end
|
||||||
|
|
||||||
|
# Returns a list of files for the composite snapshot
|
||||||
|
# Returns the composite snapshot entries for +target_timestamp+, ordered from
# the newest snapshot timestamp to the oldest.
#
# get_composite_snapshot_file_list returns an Array of Hashes (it ends with
# +file_versions.values+), and each hash already carries :file_id. The entries
# are therefore sorted directly; treating them as [file_id, info] pairs would
# raise NoMethodError on nil.
#
# @param target_timestamp [Integer] cut-off timestamp passed through unchanged
# @return [Array<Hash>] entries with :file_url, :timestamp and :file_id
def get_file_list_composite_snapshot(target_timestamp)
  get_composite_snapshot_file_list(target_timestamp)
    .sort_by { |file_info| file_info[:timestamp].to_s }
    .reverse
end
|
||||||
|
|
||||||
def get_file_list_curated
|
def get_file_list_curated
|
||||||
file_list_curated = Hash.new
|
file_list_curated = Hash.new
|
||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
@@ -384,7 +416,9 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
|
|
||||||
def get_file_list_by_timestamp
|
def get_file_list_by_timestamp
|
||||||
if @all_timestamps
|
if @snapshot_at
|
||||||
|
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
|
||||||
|
elsif @all_timestamps
|
||||||
file_list_curated = get_file_list_all_timestamps
|
file_list_curated = get_file_list_all_timestamps
|
||||||
file_list_curated.map do |file_remote_info|
|
file_list_curated.map do |file_remote_info|
|
||||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||||
@@ -727,7 +761,22 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Memoized list of files to download.
#
# Delegates to get_file_list_by_timestamp, which already picks the composite
# snapshot listing when @snapshot_at is set and the all-timestamps listing when
# @all_timestamps is set. Caching the result in @file_list_by_timestamp keeps
# repeated calls from rebuilding the listing.
# NOTE(review): the fallback branch of get_file_list_by_timestamp is not visible
# in this diff; presumably it is the curated listing, as before - confirm.
#
# @return [Array<Hash>] file entries as produced by get_file_list_by_timestamp
def file_list_by_timestamp
  @file_list_by_timestamp ||= get_file_list_by_timestamp
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ module TidyBytes
|
|||||||
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
|
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
|
||||||
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
|
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
|
||||||
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
|
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
|
||||||
|
else nil # ANYTHING ELSE...
|
||||||
end
|
end
|
||||||
end.freeze
|
end.freeze
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.3.11"
|
s.version = "2.3.12"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
s.authors = ["strawberrymaster"]
|
s.authors = ["strawberrymaster"]
|
||||||
s.email = "strawberrymaster@vivaldi.net"
|
s.email = "strawberrymaster@vivaldi.net"
|
||||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
|
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb"]
|
||||||
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
||||||
s.license = "MIT"
|
s.license = "MIT"
|
||||||
s.required_ruby_version = ">= 3.4.3"
|
s.required_ruby_version = ">= 3.4.3"
|
||||||
|
|||||||
Reference in New Issue
Block a user