mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
42 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40e9c9bb51 | ||
|
|
6bc08947b7 | ||
|
|
c731e0c7bd | ||
|
|
9fd2a7f8d1 | ||
|
|
6ad312f31f | ||
|
|
62ea35daa6 | ||
|
|
1f4202908f | ||
|
|
bed3f6101c | ||
|
|
754df6b8d6 | ||
|
|
801fb77f79 | ||
|
|
e9849e6c9c | ||
|
|
bc868e6b39 | ||
|
|
2bf04aff48 | ||
|
|
51becde916 | ||
|
|
c30ee73977 | ||
|
|
d3466b3387 | ||
|
|
0250579f0e | ||
|
|
0663c1c122 | ||
|
|
93115f70ec | ||
|
|
3d37ae10fd | ||
|
|
bff10e7260 | ||
|
|
3d181ce84c | ||
|
|
999aa211ae | ||
|
|
ffdce7e4ec | ||
|
|
e4487baafc | ||
|
|
82ff2de3dc | ||
|
|
fd329afdd2 | ||
|
|
038785557d | ||
|
|
2eead8cc27 | ||
|
|
7e5cdd54fb | ||
|
|
4160ff5e4a | ||
|
|
f03d92a3c4 | ||
|
|
2490109cfe | ||
|
|
c3c5b8446a | ||
|
|
18357a77ed | ||
|
|
3fdfd70fc1 | ||
|
|
2bf74b4173 | ||
|
|
79cbb639e7 | ||
|
|
071d208b31 | ||
|
|
1681a12579 | ||
|
|
f38756dd76 | ||
|
|
9452411e32 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -32,3 +32,7 @@ tmp
|
||||
*.rbc
|
||||
|
||||
test.rb
|
||||
|
||||
# Dev environment
|
||||
.vscode
|
||||
*.code-workspace
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM ruby:3.4.4-alpine
|
||||
FROM ruby:3.4.5-alpine
|
||||
USER root
|
||||
WORKDIR /build
|
||||
|
||||
@@ -6,10 +6,9 @@ COPY Gemfile /build/
|
||||
COPY *.gemspec /build/
|
||||
|
||||
RUN bundle config set jobs "$(nproc)" \
|
||||
&& bundle config set without 'development test' \
|
||||
&& bundle install
|
||||
|
||||
COPY . /build
|
||||
|
||||
WORKDIR /
|
||||
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
||||
WORKDIR /build
|
||||
ENTRYPOINT [ "/build/bin/wayback_machine_downloader", "--directory", "/build/websites" ]
|
||||
|
||||
22
README.md
22
README.md
@@ -27,6 +27,7 @@ To run most commands, just like in the original WMD, you can use:
|
||||
```bash
|
||||
wayback_machine_downloader https://example.com
|
||||
```
|
||||
**Note**: this gem may conflict with hartator's wayback_machine_downloader gem, and so you may have to uninstall it for this WMD fork to work. A good way to know is if a command fails; it will list the gem version as 2.3.1 or earlier, while this WMD fork uses 2.3.2 or above.
|
||||
|
||||
### Step-by-step setup
|
||||
1. **Install Ruby**:
|
||||
@@ -80,36 +81,19 @@ services:
|
||||
tty: true
|
||||
image: wayback_machine_downloader:latest
|
||||
container_name: wayback_machine_downloader
|
||||
environment:
|
||||
- ENVIRONMENT=${ENVIRONMENT:-development}
|
||||
- OPTIONS=${OPTIONS:-""}
|
||||
- TARGET_URL=${TARGET_URL}
|
||||
volumes:
|
||||
- .:/build:rw
|
||||
- ./websites:/build/websites:rw
|
||||
command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
|
||||
```
|
||||
#### Usage:
|
||||
Now You can create a Docker image as named "wayback_machine_downloader" with the following command:
|
||||
Now you can create a Docker image named "wayback_machine_downloader" with the following command:
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
After that you must set TARGET_URL environment variable:
|
||||
```bash
|
||||
export TARGET_URL="https://example.com/"
|
||||
```
|
||||
|
||||
The **OPTIONS** env. variable is optional this may include additional settings which are found in the "**Advanced usage**" section below.
|
||||
|
||||
Example:
|
||||
```bash
|
||||
export OPTIONS="--list -f 20060121"
|
||||
```
|
||||
|
||||
After that you can run the existing container with the following command:
|
||||
```bash
|
||||
docker compose run --rm wayback_machine_downloader https://example.com
|
||||
docker compose run --rm wayback_machine_downloader https://example.com [options]
|
||||
```
|
||||
|
||||
## ⚙️ Configuration
|
||||
|
||||
@@ -74,6 +74,14 @@ option_parser = OptionParser.new do |opts|
|
||||
options[:keep] = true
|
||||
end
|
||||
|
||||
opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
|
||||
options[:recursive_subdomains] = true
|
||||
end
|
||||
|
||||
opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
|
||||
options[:subdomain_depth] = t
|
||||
end
|
||||
|
||||
opts.on("-v", "--version", "Display version") do |t|
|
||||
options[:version] = t
|
||||
end
|
||||
|
||||
@@ -5,11 +5,6 @@ services:
|
||||
tty: true
|
||||
image: wayback_machine_downloader:latest
|
||||
container_name: wayback_machine_downloader
|
||||
environment:
|
||||
- ENVIRONMENT=${DEVELOPMENT:-production}
|
||||
- OPTIONS=${OPTIONS:-""}
|
||||
- TARGET_URL=${TARGET_URL}
|
||||
volumes:
|
||||
- .:/build:rw
|
||||
- ./websites:/websites:rw
|
||||
command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
|
||||
- ./websites:/build/websites:rw
|
||||
@@ -11,9 +11,12 @@ require 'concurrent-ruby'
|
||||
require 'logger'
|
||||
require 'zlib'
|
||||
require 'stringio'
|
||||
require 'digest'
|
||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||
require_relative 'wayback_machine_downloader/to_regex'
|
||||
require_relative 'wayback_machine_downloader/archive_api'
|
||||
require_relative 'wayback_machine_downloader/subdom_processor'
|
||||
require_relative 'wayback_machine_downloader/url_rewrite'
|
||||
|
||||
class ConnectionPool
|
||||
MAX_AGE = 300
|
||||
@@ -112,8 +115,9 @@ end
|
||||
class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
include SubdomainProcessor
|
||||
|
||||
VERSION = "2.3.7"
|
||||
VERSION = "2.4.2"
|
||||
DEFAULT_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 2
|
||||
@@ -123,15 +127,22 @@ class WaybackMachineDownloader
|
||||
STATE_CDX_FILENAME = ".cdx.json"
|
||||
STATE_DB_FILENAME = ".downloaded.txt"
|
||||
|
||||
|
||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite,
|
||||
:snapshot_at
|
||||
|
||||
def initialize params
|
||||
validate_params(params)
|
||||
@base_url = params[:base_url]
|
||||
@base_url = params[:base_url]&.tidy_bytes
|
||||
@exact_url = params[:exact_url]
|
||||
@directory = params[:directory]
|
||||
if params[:directory]
|
||||
sanitized_dir = params[:directory].tidy_bytes
|
||||
@directory = File.expand_path(sanitized_dir)
|
||||
else
|
||||
@directory = nil
|
||||
end
|
||||
@all_timestamps = params[:all_timestamps]
|
||||
@from_timestamp = params[:from_timestamp].to_i
|
||||
@to_timestamp = params[:to_timestamp].to_i
|
||||
@@ -149,27 +160,40 @@ class WaybackMachineDownloader
|
||||
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
|
||||
@db_mutex = Mutex.new
|
||||
@rewrite = params[:rewrite] || false
|
||||
@recursive_subdomains = params[:recursive_subdomains] || false
|
||||
@subdomain_depth = params[:subdomain_depth] || 1
|
||||
@snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil
|
||||
|
||||
# URL for rejecting invalid/unencoded wayback urls
|
||||
@url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
|
||||
|
||||
handle_reset
|
||||
end
|
||||
|
||||
def backup_name
|
||||
if @base_url.include? '//'
|
||||
@base_url.split('/')[2]
|
||||
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
||||
raw = if url_to_process.include?('//')
|
||||
url_to_process.split('/')[2]
|
||||
else
|
||||
@base_url
|
||||
url_to_process
|
||||
end
|
||||
|
||||
# sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
|
||||
if Gem.win_platform?
|
||||
raw = raw.gsub(/[:*?"<>|]/, '_')
|
||||
raw = raw.gsub(/[ .]+\z/, '')
|
||||
end
|
||||
raw = 'site' if raw.nil? || raw.empty?
|
||||
raw
|
||||
end
|
||||
|
||||
def backup_path
|
||||
if @directory
|
||||
if @directory[-1] == '/'
|
||||
@directory
|
||||
else
|
||||
@directory + '/'
|
||||
end
|
||||
# because @directory is already an absolute path, we just ensure it exists
|
||||
@directory
|
||||
else
|
||||
'websites/' + backup_name + '/'
|
||||
# ensure the default path is absolute and normalized
|
||||
File.expand_path(File.join('websites', backup_name))
|
||||
end
|
||||
end
|
||||
|
||||
@@ -192,7 +216,7 @@ class WaybackMachineDownloader
|
||||
|
||||
def match_only_filter file_url
|
||||
if @only_filter
|
||||
only_filter_regex = @only_filter.to_regex
|
||||
only_filter_regex = @only_filter.to_regex(detect: true)
|
||||
if only_filter_regex
|
||||
only_filter_regex =~ file_url
|
||||
else
|
||||
@@ -205,7 +229,7 @@ class WaybackMachineDownloader
|
||||
|
||||
def match_exclude_filter file_url
|
||||
if @exclude_filter
|
||||
exclude_filter_regex = @exclude_filter.to_regex
|
||||
exclude_filter_regex = @exclude_filter.to_regex(detect: true)
|
||||
if exclude_filter_regex
|
||||
exclude_filter_regex =~ file_url
|
||||
else
|
||||
@@ -241,6 +265,7 @@ class WaybackMachineDownloader
|
||||
# Fetch the initial set of snapshots, sequentially
|
||||
@connection_pool.with_connection do |connection|
|
||||
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
||||
initial_list ||= []
|
||||
mutex.synchronize do
|
||||
snapshot_list_to_consider.concat(initial_list)
|
||||
print "."
|
||||
@@ -265,6 +290,7 @@ class WaybackMachineDownloader
|
||||
@connection_pool.with_connection do |connection|
|
||||
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
||||
end
|
||||
result ||= []
|
||||
[page, result]
|
||||
end
|
||||
end
|
||||
@@ -284,7 +310,7 @@ class WaybackMachineDownloader
|
||||
|
||||
# Process results and check for empty pages
|
||||
results.each do |page, result|
|
||||
if result.empty?
|
||||
if result.nil? || result.empty?
|
||||
continue_fetching = false
|
||||
break
|
||||
else
|
||||
@@ -316,26 +342,61 @@ class WaybackMachineDownloader
|
||||
snapshot_list_to_consider
|
||||
end
|
||||
|
||||
# Builds the file list for a composite ("as of") snapshot.
#
# For every file id, keeps the most recent capture whose timestamp is not
# later than +target_timestamp+. Captures rejected by the exclude filter or
# not accepted by the only filter are dropped, as are URLs for which no file
# id can be derived.
#
# target_timestamp - numeric upper bound; newer captures are skipped.
#
# Returns an Array of Hashes with :file_url, :timestamp and :file_id keys.
def get_composite_snapshot_file_list(target_timestamp)
  newest_by_id = get_all_snapshots_to_consider.each_with_object({}) do |(file_timestamp, file_url), versions|
    next unless file_url.include?('/')
    next if file_timestamp.to_i > target_timestamp

    # everything after "scheme://host/" identifies the file within the site
    raw_tail = file_url.split('/')[3..-1]&.join('/')
    file_id = sanitize_and_prepare_id(raw_tail, file_url)
    next if file_id.nil?
    next if match_exclude_filter(file_url)
    next unless match_only_filter(file_url)

    # an already recorded capture wins unless this one is strictly newer
    known = versions[file_id]
    next if known && known[:timestamp].to_i >= file_timestamp.to_i

    versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
  end

  newest_by_id.values
end
|
||||
|
||||
# Returns the composite-snapshot file list, newest capture first.
#
# get_composite_snapshot_file_list already yields one Hash per file (with
# :file_url, :timestamp and :file_id), so the entries are sorted directly.
# They must not be treated as [file_id, info] pairs: a Hash element is not
# destructured by a two-parameter block, which made the previous version
# raise NoMethodError on any non-empty list.
#
# target_timestamp - upper bound passed through to
#                    get_composite_snapshot_file_list.
#
# Returns an Array of Hashes ordered by descending :timestamp.
def get_file_list_composite_snapshot(target_timestamp)
  get_composite_snapshot_file_list(target_timestamp)
    .sort_by { |file_remote_info| file_remote_info[:timestamp].to_s }
    .reverse
end
|
||||
|
||||
def get_file_list_curated
|
||||
file_list_curated = Hash.new
|
||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||
next unless file_url.include?('/')
|
||||
file_id = file_url.split('/')[3..-1].join('/')
|
||||
file_id = CGI::unescape file_id
|
||||
file_id = file_id.tidy_bytes unless file_id == ""
|
||||
|
||||
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||
if file_id.nil?
|
||||
puts "Malformed file url, ignoring: #{file_url}"
|
||||
next
|
||||
end
|
||||
|
||||
if file_id.include?('<') || file_id.include?('>')
|
||||
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||
else
|
||||
if match_exclude_filter(file_url)
|
||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||
elsif not match_only_filter(file_url)
|
||||
elsif !match_only_filter(file_url)
|
||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||
elsif file_list_curated[file_id]
|
||||
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
||||
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||
end
|
||||
else
|
||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
||||
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -346,21 +407,32 @@ class WaybackMachineDownloader
|
||||
file_list_curated = Hash.new
|
||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||
next unless file_url.include?('/')
|
||||
file_id = file_url.split('/')[3..-1].join('/')
|
||||
file_id_and_timestamp = [file_timestamp, file_id].join('/')
|
||||
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
|
||||
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
||||
|
||||
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||
if file_id.nil?
|
||||
puts "Malformed file url, ignoring: #{file_url}"
|
||||
next
|
||||
end
|
||||
|
||||
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
||||
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
||||
if file_id_and_timestamp.nil?
|
||||
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
||||
next
|
||||
end
|
||||
|
||||
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
||||
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||
else
|
||||
if match_exclude_filter(file_url)
|
||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||
elsif not match_only_filter(file_url)
|
||||
elsif !match_only_filter(file_url)
|
||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||
elsif file_list_curated[file_id_and_timestamp]
|
||||
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
|
||||
# duplicate combo, ignore silently (verbose flag not shown here)
|
||||
else
|
||||
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
||||
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -370,7 +442,9 @@ class WaybackMachineDownloader
|
||||
|
||||
|
||||
def get_file_list_by_timestamp
|
||||
if @all_timestamps
|
||||
if @snapshot_at
|
||||
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
|
||||
elsif @all_timestamps
|
||||
file_list_curated = get_file_list_all_timestamps
|
||||
file_list_curated.map do |file_remote_info|
|
||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||
@@ -378,7 +452,7 @@ class WaybackMachineDownloader
|
||||
end
|
||||
else
|
||||
file_list_curated = get_file_list_curated
|
||||
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
||||
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
|
||||
file_list_curated.map do |file_remote_info|
|
||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||
file_remote_info[1]
|
||||
@@ -425,6 +499,39 @@ class WaybackMachineDownloader
|
||||
end
|
||||
end
|
||||
|
||||
# Posts one download job per entry of +files_to_process+ to +pool+.
#
# Each job downloads the file through a connection obtained from
# @connection_pool, bumps @processed_file_count under @download_mutex, prints
# the progress message, records successful downloads via append_to_db and
# finally sleeps for RATE_LIMIT. Errors are logged and still counted as
# processed.
#
# pool             - object responding to #post with a block.
# files_to_process - enumerable of Hashes with :file_url and :file_id.
def processing_files(pool, files_to_process)
  files_to_process.each do |file_remote_info|
    pool.post do
      succeeded = false
      begin
        @connection_pool.with_connection do |connection|
          outcome = download_file(file_remote_info, connection)

          # a message containing ' -> ' is taken to mean the file was written
          succeeded = true if outcome && outcome.include?(' -> ')

          @download_mutex.synchronize do
            @processed_file_count += 1
            if outcome
              # rewrite the "(n/total)" counter so it shows the real total
              puts outcome.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})")
            end
          end
        end

        # append to DB only after a successful download, outside the connection block
        append_to_db(file_remote_info[:file_id]) if succeeded
      rescue => e
        @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
        @download_mutex.synchronize { @processed_file_count += 1 }
      end
      sleep(RATE_LIMIT)
    end
  end
end
|
||||
|
||||
def download_files
|
||||
start_time = Time.now
|
||||
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
||||
@@ -471,42 +578,23 @@ class WaybackMachineDownloader
|
||||
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
||||
pool = Concurrent::FixedThreadPool.new(thread_count)
|
||||
|
||||
files_to_process.each do |file_remote_info|
|
||||
pool.post do
|
||||
download_success = false
|
||||
begin
|
||||
@connection_pool.with_connection do |connection|
|
||||
result_message = download_file(file_remote_info, connection)
|
||||
# assume download success if the result message contains ' -> '
|
||||
if result_message && result_message.include?(' -> ')
|
||||
download_success = true
|
||||
end
|
||||
@download_mutex.synchronize do
|
||||
@processed_file_count += 1
|
||||
# adjust progress message to reflect remaining files
|
||||
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
|
||||
puts progress_message if progress_message
|
||||
end
|
||||
end
|
||||
# sppend to DB only after successful download outside the connection block
|
||||
if download_success
|
||||
append_to_db(file_remote_info[:file_id])
|
||||
end
|
||||
rescue => e
|
||||
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
|
||||
@download_mutex.synchronize do
|
||||
@processed_file_count += 1
|
||||
end
|
||||
end
|
||||
sleep(RATE_LIMIT)
|
||||
end
|
||||
end
|
||||
processing_files(pool, files_to_process)
|
||||
|
||||
pool.shutdown
|
||||
pool.wait_for_termination
|
||||
|
||||
end_time = Time.now
|
||||
puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
|
||||
|
||||
# process subdomains if enabled
|
||||
if @recursive_subdomains
|
||||
subdomain_start_time = Time.now
|
||||
process_subdomains
|
||||
subdomain_end_time = Time.now
|
||||
subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
|
||||
puts "Subdomain processing finished in #{subdomain_time}s."
|
||||
end
|
||||
|
||||
puts "Results saved in #{backup_path}"
|
||||
cleanup
|
||||
end
|
||||
@@ -550,64 +638,13 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
# URLs in HTML attributes
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
||||
prefix, url, suffix = $1, $2, $3
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
"#{prefix}#{path}#{suffix}"
|
||||
rescue
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"#{prefix}./#{url[1..-1]}#{suffix}"
|
||||
else
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
end
|
||||
rewrite_html_attr_urls(content)
|
||||
|
||||
# URLs in CSS
|
||||
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
|
||||
url = $1
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
"url(\"#{path}\")"
|
||||
rescue
|
||||
"url(\"#{url}\")"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"url(\"./#{url[1..-1]}\")"
|
||||
else
|
||||
"url(\"#{url}\")"
|
||||
end
|
||||
end
|
||||
rewrite_css_urls(content)
|
||||
|
||||
# URLs in JavaScript
|
||||
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
||||
quote_start, url, quote_end = $1, $2, $3
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
"#{quote_start}#{path}#{quote_end}"
|
||||
rescue
|
||||
"#{quote_start}#{url}#{quote_end}"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"#{quote_start}./#{url[1..-1]}#{quote_end}"
|
||||
else
|
||||
"#{quote_start}#{url}#{quote_end}"
|
||||
end
|
||||
end
|
||||
rewrite_js_urls(content)
|
||||
|
||||
# for URLs in HTML attributes that start with a single slash
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
||||
@@ -634,21 +671,35 @@ class WaybackMachineDownloader
|
||||
file_url = file_remote_info[:file_url].encode(current_encoding)
|
||||
file_id = file_remote_info[:file_id]
|
||||
file_timestamp = file_remote_info[:timestamp]
|
||||
file_path_elements = file_id.split('/')
|
||||
|
||||
# sanitize file_id to ensure it is a valid path component
|
||||
raw_path_elements = file_id.split('/')
|
||||
|
||||
sanitized_path_elements = raw_path_elements.map do |element|
|
||||
if Gem.win_platform?
|
||||
# for Windows, we need to sanitize path components to avoid invalid characters
|
||||
# this prevents issues with file names that contain characters not allowed in
|
||||
# Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
|
||||
element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
|
||||
else
|
||||
element
|
||||
end
|
||||
end
|
||||
|
||||
current_backup_path = backup_path
|
||||
|
||||
if file_id == ""
|
||||
dir_path = backup_path
|
||||
file_path = backup_path + 'index.html'
|
||||
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
||||
dir_path = backup_path + file_path_elements[0..-1].join('/')
|
||||
file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
|
||||
dir_path = current_backup_path
|
||||
file_path = File.join(dir_path, 'index.html')
|
||||
elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
|
||||
# if file_id is a directory, we treat it as such
|
||||
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
||||
file_path = File.join(dir_path, 'index.html')
|
||||
else
|
||||
dir_path = backup_path + file_path_elements[0..-2].join('/')
|
||||
file_path = backup_path + file_path_elements[0..-1].join('/')
|
||||
end
|
||||
if Gem.win_platform?
|
||||
dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
||||
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
||||
# if file_id is a file, we treat it as such
|
||||
filename = sanitized_path_elements.pop
|
||||
dir_path = File.join(current_backup_path, *sanitized_path_elements)
|
||||
file_path = File.join(dir_path, filename)
|
||||
end
|
||||
|
||||
# check existence *before* download attempt
|
||||
@@ -689,7 +740,22 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def file_list_by_timestamp
|
||||
@file_list_by_timestamp ||= get_file_list_by_timestamp
|
||||
if @snapshot_at
|
||||
@file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at)
|
||||
elsif @all_timestamps
|
||||
file_list_curated = get_file_list_all_timestamps
|
||||
file_list_curated.map do |file_remote_info|
|
||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||
file_remote_info[1]
|
||||
end
|
||||
else
|
||||
file_list_curated = get_file_list_curated
|
||||
file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
|
||||
file_list_curated.map do |file_remote_info|
|
||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||
file_remote_info[1]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
@@ -707,6 +773,85 @@ class WaybackMachineDownloader
|
||||
end
|
||||
logger
|
||||
end
|
||||
|
||||
# Turns the path portion of an archived URL (or a "timestamp/path" combo)
# into a relative, filesystem-friendly identifier.
#
# Steps: percent-decode until stable, coerce to valid UTF-8, strip comment
# openers and control characters, fold the query string into a short SHA-256
# suffix, collapse slashes, then sanitize each path segment.
#
# raw      - String tail of the URL, or nil when the URL had no such part.
# file_url - the full URL; only used in the warning message.
#
# Returns nil when +raw+ is nil, "" when +raw+ is empty (the site root, which
# the download step stores as index.html), otherwise a non-empty String.
# Never raises: on an internal error a deterministic "file__<sha1>" id is
# returned instead.
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil?
  # An empty tail is the site root, not a malformed URL; returning nil here
  # would make callers drop the root page entirely.
  return +'' if raw.empty?

  original = raw.dup
  begin
    # work on a binary copy to avoid premature encoding errors
    id = raw.dup.force_encoding(Encoding::BINARY)

    # percent-decode (repeat until stable in case of double-encoding)
    loop do
      decoded = id.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
      break if decoded == id
      id = decoded
    end

    # try tidy_bytes first; fall back to a plain lossy transcode
    begin
      id = id.tidy_bytes
    rescue StandardError
      id = id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # ensure UTF-8 and scrub again
    unless id.encoding == Encoding::UTF_8 && id.valid_encoding?
      id = id.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # strip HTML/comment artifacts & control chars
    id = id.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

    # split off the query and hash it for a stable short name
    path_part, query_part = id.split('?', 2)
    path_part = path_part.to_s # split yields [] when everything was stripped
    if query_part && !query_part.empty?
      q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
      if path_part.include?('.')
        pre, _sep, post = path_part.rpartition('.')
        path_part = "#{pre}__q#{q_digest}.#{post}"
      else
        path_part = "#{path_part}__q#{q_digest}"
      end
    end

    # collapse slashes & trim leading slash
    id = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

    # segment-wise sanitation
    id = id.split('/').map do |segment|
      seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
      seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
      # "." and ".." (possibly produced by decoding %2e) must never reach
      # File.join, or the id could escape the backup directory
      seg = seg.gsub('.', '%2E') if seg == '.' || seg == '..'
      seg.empty? ? '_' : seg
    end.join('/')

    # remove any remaining angle brackets
    id = id.delete('<>')

    # final fallback if nothing usable is left
    id = "file__#{Digest::SHA1.hexdigest(original)[0, 10]}" if id.empty?

    id
  rescue => e
    @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
    # deterministic fallback: never nil, so the caller does not flag it as malformed
    "file__#{Digest::SHA1.hexdigest(original)[0, 10]}"
  end
end
|
||||
|
||||
# Formats a URL for display. URLs containing "&" (which breaks unquoted use
# in Windows CMD) are wrapped in parentheses; anything else, including nil,
# is returned unchanged. Display only: the user still has to quote the URL
# when invoking the tool manually.
def safe_display_url(url)
  needs_wrapping = url && url.match?(/[&]/)
  needs_wrapping ? "(#{url})" : url
end
|
||||
|
||||
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
||||
retries = 0
|
||||
@@ -717,6 +862,15 @@ class WaybackMachineDownloader
|
||||
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
||||
end
|
||||
|
||||
# Escape square brackets because they are not valid in URI()
|
||||
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
||||
|
||||
# reject invalid/unencoded wayback_url, behaving as if the resource weren't found
|
||||
if not @url_regexp.match?(wayback_url)
|
||||
@logger.warn("Skipped #{file_url}: invalid URL")
|
||||
return :skipped_not_found
|
||||
end
|
||||
|
||||
request = Net::HTTP::Get.new(URI(wayback_url))
|
||||
request["Connection"] = "keep-alive"
|
||||
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
||||
|
||||
@@ -4,6 +4,14 @@ require 'uri'
|
||||
module ArchiveAPI
|
||||
|
||||
def get_raw_list_from_api(url, page_index, http)
|
||||
# Automatically append /* if the URL doesn't contain a path after the domain
|
||||
# This is a workaround for an issue with the API and *some* domains.
|
||||
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
||||
# But don't do this when exact_url flag is set
|
||||
if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
|
||||
url = "#{url}/*"
|
||||
end
|
||||
|
||||
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
||||
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
||||
request_url.query = URI.encode_www_form(params)
|
||||
@@ -17,7 +25,7 @@ module ArchiveAPI
|
||||
# Check if the response contains the header ["timestamp", "original"]
|
||||
json.shift if json.first == ["timestamp", "original"]
|
||||
json
|
||||
rescue JSON::ParserError, StandardError => e
|
||||
rescue JSON::ParserError => e
|
||||
warn "Failed to fetch data from API: #{e.message}"
|
||||
[]
|
||||
end
|
||||
|
||||
238
lib/wayback_machine_downloader/subdom_processor.rb
Normal file
238
lib/wayback_machine_downloader/subdom_processor.rb
Normal file
@@ -0,0 +1,238 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
module SubdomainProcessor
|
||||
# Entry point for subdomain recursion; does nothing unless
# @recursive_subdomains is set.
#
# Scans the already downloaded html/htm/css/js files under backup_path for
# links to subdomains of the base domain, queues an https URL for each one,
# hands the queue to download_subdomains and, when @rewrite is set, finishes
# with rewrite_subdomain_links.
def process_subdomains
  return unless @recursive_subdomains

  puts "Starting subdomain processing..."

  # the base domain is what discovered hosts are compared against
  base_domain = extract_base_domain(@base_url)
  @processed_domains = Set.new([base_domain])
  @subdomain_queue = Queue.new

  # look through what has been downloaded so far
  scan_pattern = File.join(backup_path, "**/*.{html,htm,css,js}")
  downloaded_files = Dir.glob(scan_pattern)
  puts "Scanning #{downloaded_files.size} downloaded files for subdomain links..."

  discovered = scan_files_for_subdomains(downloaded_files, base_domain)
  if discovered.empty?
    puts "No subdomains found in downloaded content."
    return
  end

  puts "Found #{discovered.size} subdomains to process: #{discovered.join(', ')}"

  # queue each discovered subdomain as a full URL
  discovered.each { |label| @subdomain_queue << "https://#{label}.#{base_domain}/" }

  download_subdomains(base_domain)

  # once everything is downloaded, make cross-subdomain links local
  rewrite_subdomain_links(base_domain) if @rewrite
end
|
||||
|
||||
private
|
||||
|
||||
# Derives the registrable-looking base domain from a URL or bare host.
#
# The host is cut out textually (scheme, userinfo, port, path, query and
# fragment removed) instead of going through URI.parse: for "host:port" that
# parser yields an opaque URI with neither host nor path, which previously
# ended in a NoMethodError.
#
# url - String such as "https://sub.example.com/page" or "example.com:8080".
#
# Returns the lower-cased base domain, the full address for IPv4 hosts, or
# nil when no plain ASCII host name can be extracted.
def extract_base_domain(url)
  return nil if url.nil?

  authority = url.to_s.sub(%r{\Ahttps?://}i, '').split(%r{[/?#]}).first.to_s
  host = authority.sub(/\A[^@]*@/, '').sub(/:\d*\z/, '').downcase
  return nil unless host.match?(/\A[a-z0-9._-]+\z/)

  # an IP address has no subdomains; keep it whole
  return host if host.match?(/\A\d{1,3}(?:\.\d{1,3}){3}\z/)

  parts = host.split('.')
  return host if parts.size <= 2

  # heuristic for two-level suffixes such as co.uk: when the last two labels
  # are both short, keep the last three labels
  if parts[-2].length <= 3 && parts[-1].length <= 3
    parts.last(3).join('.')
  else
    parts.last(2).join('.')
  end
end
|
||||
|
||||
# Collects the subdomain labels of base_domain that are linked from the given
# files.
#
# files       - Array of file paths; paths that do not exist are skipped.
# base_domain - String base domain such as "example.com"; nil yields [].
#
# A URL counts when it is an http(s) URL that directly follows a quote (HTML
# attribute values, JavaScript strings, quoted CSS url()) or an unquoted CSS
# "url(", and its host is exactly "<label>.<base_domain>". Labels are
# lower-cased and "www" is ignored.
#
# Errors while reading or scanning a file are reported with puts and the file
# is skipped.
#
# Returns an Array of unique label Strings, e.g. ["cdn", "img"].
def scan_files_for_subdomains(files, base_domain)
  return [] unless base_domain

  # The pattern depends only on base_domain, so build it once rather than per
  # file. The look-ahead requires the host to END right after the base domain;
  # accepting both quote styles and ")" there means 'https://cdn.example.com'
  # and url(https://cdn.example.com) are no longer missed. Restricting the
  # label to host-name characters keeps junk such as "${sub}" out of the queue.
  link_pattern = %r{(?:["']|url\(\s*)https?://([a-z0-9][a-z0-9_-]*)\.#{Regexp.escape(base_domain)}(?=[/"'?#:)]|\z)}i

  found = Set.new

  files.each do |file_path|
    next unless File.exist?(file_path)

    begin
      # scrub first so a stray invalid byte does not abort the scan of a file
      File.read(file_path).scrub.scan(link_pattern) do |(label)|
        subdomain = label.downcase
        next if subdomain == 'www' # skip www subdomain

        found.add(subdomain)
      end
    rescue => e
      puts "Error scanning file #{file_path}: #{e.message}"
    end
  end

  found.to_a
end
|
||||
|
||||
# Drains @subdomain_queue level by level, downloading every queued subdomain.
#
# base_domain - String base domain the queued hosts belong to.
#
# At most (@subdomain_depth || 1) levels are processed. After each level except
# the last, the files under <backup_path>/subdomains are scanned again and any
# subdomain not yet in @processed_domains is queued for the next level.
def download_subdomains(base_domain)
  puts "Starting subdomain downloads..."
  max_depth = @subdomain_depth || 1
  depth = 0

  while depth < max_depth && !@subdomain_queue.empty?
    # get all subdomains at current depth
    current_batch = []
    current_batch << @subdomain_queue.pop until @subdomain_queue.empty?

    puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."

    # download each subdomain
    current_batch.each do |subdomain_url|
      download_subdomain(subdomain_url, base_domain)
    end

    # if we need to go deeper, scan the newly downloaded files
    if depth + 1 < max_depth
      new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))

      scan_files_for_subdomains(new_files, base_domain).each do |subdomain|
        full_domain = "#{subdomain}.#{base_domain}"

        # Queue only hosts that have not been downloaded yet. They must NOT be
        # marked as processed here: download_subdomain records the host itself
        # and skips anything already recorded, so marking it at queue time
        # meant no host found at depth 2 or deeper was ever downloaded.
        next if @processed_domains.include?(full_domain)

        @subdomain_queue << "https://#{full_domain}/"
      end

      puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" unless @subdomain_queue.empty?
    end

    depth += 1
  end
end
|
||||
|
||||
# Downloads the archive of one subdomain into <backup_path>/subdomains/<host>.
#
# subdomain_url - String URL of the subdomain root, e.g. "https://cdn.example.com/".
# base_domain   - String base domain; not used by this method.
#
# A host already present in @processed_domains is skipped; otherwise the host
# is recorded there before the download starts. Any StandardError raised along
# the way is reported with puts and swallowed, so one failing subdomain does
# not abort the remaining ones.
def download_subdomain(subdomain_url, base_domain)
  subdomain_host = URI.parse(subdomain_url).host

  # A URL without a host cannot be mirrored. Bail out before anything is
  # recorded: a nil entry in @processed_domains would later be handed to
  # Regexp.escape by rewrite_subdomain_links.
  if subdomain_host.nil? || subdomain_host.empty?
    puts "Error downloading subdomain #{subdomain_url}: URL has no host"
    return
  end

  # skip if already processed
  if @processed_domains.include?(subdomain_host)
    puts "Skipping already processed subdomain: #{subdomain_host}"
    return
  end

  @processed_domains.add(subdomain_host)
  puts "Downloading subdomain: #{subdomain_url}"

  # create the directory for this subdomain
  subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
  FileUtils.mkdir_p(subdomain_dir)

  # create subdomain downloader with appropriate options
  subdomain_options = {
    base_url: subdomain_url,
    directory: subdomain_dir,
    from_timestamp: @from_timestamp,
    to_timestamp: @to_timestamp,
    all: @all,
    threads_count: @threads_count,
    # half of the main page budget, but never fewer than 10 pages
    maximum_pages: [@maximum_pages / 2, 10].max,
    rewrite: @rewrite,
    # don't recursively process subdomains from here
    recursive_subdomains: false
  }

  # download the subdomain content
  WaybackMachineDownloader.new(subdomain_options).download_files

  puts "Completed download of subdomain: #{subdomain_host}"
rescue => e
  puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
end
|
||||
|
||||
# Rewrites absolute links to downloaded subdomains so they point at the local
# copies under <backup_path>/subdomains/<host>.
#
# base_domain - String base domain; every other entry of @processed_domains is
#               treated as a downloaded subdomain host.
#
# All *.html, *.htm, *.css and *.js files below backup_path are checked. A URL
# is rewritten when it is an http(s) URL whose host is exactly one of the
# subdomain hosts and it appears either inside quotes (HTML attribute values,
# JavaScript strings) or inside a CSS url(...). A URL with no path, or just
# "/", maps to "/index.html". Files are only written back when their content
# changed; per-file errors are reported with puts and the file is skipped.
def rewrite_subdomain_links(base_domain)
  puts "Rewriting all files to use local subdomain references..."

  all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
  subdomains = @processed_domains.reject { |domain| domain == base_domain }

  puts "Found #{all_files.size} files to check for rewriting"
  puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"

  # the patterns depend only on the host, so compile them once, not per file
  patterns = subdomains.reject { |host| host.to_s.empty? }
                       .to_h { |host| [host, subdomain_url_patterns(host)] }
  backup_root = File.expand_path(backup_path)
  rewritten_count = 0

  all_files.each do |file_path|
    next unless File.exist?(file_path)

    begin
      original_content = File.read(file_path)
      content = original_content.dup
      # the relative route to "subdomains" depends on how deep this file sits
      local_root = subdomains_dir_relative_to(file_path, backup_root)

      patterns.each do |host, (css_url, quoted_url)|
        # for CSS url()
        content.gsub!(css_url) do
          "url(\"#{local_root}/#{host}#{local_subdomain_path(Regexp.last_match(1))}\")"
        end

        # for quoted URLs: HTML attribute values and JavaScript strings
        content.gsub!(quoted_url) do
          open_quote, path, close_quote = Regexp.last_match.captures
          "#{open_quote}#{local_root}/#{host}#{local_subdomain_path(path)}#{close_quote}"
        end
      end

      # save if modified
      if content != original_content
        File.write(file_path, content)
        rewritten_count += 1
      end
    rescue => e
      puts "Error rewriting file #{file_path}: #{e.message}"
    end
  end

  puts "Rewrote links in #{rewritten_count} files"
end

# Builds the [css_url, quoted_url] patterns for one subdomain host. The path
# group must be empty or start with "/", "?" or "#", so a longer host such as
# "<host>.evil.org" or a URL with a port is never matched.
def subdomain_url_patterns(host)
  escaped_host = Regexp.escape(host)
  [
    %r{url\(\s*["']?https?://#{escaped_host}((?:[/?#][^"')]*?)?)["']?\s*\)}i,
    %r{(["'])https?://#{escaped_host}((?:[/?#][^"']*)?)(["'])}i
  ]
end

# Maps the path part of a subdomain URL to its local file path; the site root
# ("" or "/") is stored as "/index.html".
def local_subdomain_path(path)
  path.empty? || path == "/" ? "/index.html" : path
end

# Returns the relative path from file_path's directory to the "subdomains"
# directory directly below backup_root, e.g. "subdomains" for a top-level file
# and "../../subdomains" for a file two directories down.
def subdomains_dir_relative_to(file_path, backup_root)
  file_dir = File.expand_path(File.dirname(file_path))
  # should not happen for globbed files; fall back to the historical prefix
  return "../subdomains" unless file_dir.start_with?(backup_root)

  depth = file_dir.delete_prefix(backup_root).split('/').reject(&:empty?).size
  "#{'../' * depth}subdomains"
end
|
||||
end
|
||||
@@ -1,73 +1,74 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# essentially, this is for converting a string with a potentially
|
||||
# broken or unknown encoding into a valid UTF-8 string
|
||||
# @todo: consider using charlock_holmes for this in the future
|
||||
module TidyBytes
|
||||
# precomputing CP1252 to UTF-8 mappings for bytes 128-159
#
# Frozen array of 32 entries: index i describes CP1252 byte 128 + i and holds
# the bytes of its UTF-8 encoding as an array of integers. Bytes without a
# `when` branch below (129, 141, 143, 144 and 157) have no entry in this table
# and map to nil.
|
||||
CP1252_MAP = (128..159).map do |byte|
|
||||
case byte
|
||||
when 128 then [226, 130, 172] # EURO SIGN
|
||||
when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
|
||||
when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
|
||||
when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
|
||||
when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
|
||||
when 134 then [226, 128, 160] # DAGGER
|
||||
when 135 then [226, 128, 161] # DOUBLE DAGGER
|
||||
when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
when 137 then [226, 128, 176] # PER MILLE SIGN
|
||||
when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
|
||||
when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
|
||||
when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
|
||||
when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
|
||||
when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
|
||||
when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
|
||||
when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
|
||||
when 149 then [226, 128, 162] # BULLET
|
||||
when 150 then [226, 128, 147] # EN DASH
|
||||
when 151 then [226, 128, 148] # EM DASH
|
||||
when 152 then [203, 156] # SMALL TILDE
|
||||
when 153 then [226, 132, 162] # TRADE MARK SIGN
|
||||
when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
|
||||
when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
|
||||
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
|
||||
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
|
||||
end
|
||||
end.freeze
|
||||
# U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD), substituted for bytes
# that cannot be converted. Written as an escape so the literal survives tools
# that mangle non-ASCII characters.
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD"
|
||||
|
||||
# precomputing all possible byte conversions
|
||||
CP1252_TO_UTF8 = Array.new(256) do |b|
|
||||
if (128..159).cover?(b)
|
||||
CP1252_MAP[b - 128]&.pack('C*')
|
||||
elsif b < 128
|
||||
b.chr
|
||||
else
|
||||
b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
|
||||
# common encodings to try for best multilingual compatibility
#
# Candidates are listed in the order they are attempted, UTF-8 first. The list
# is filtered against Encoding.name_list and frozen, so the constant cannot be
# mutated by accident.
COMMON_ENCODINGS = [
  Encoding::UTF_8,
  Encoding::Windows_1251, # Cyrillic/Russian legacy
  Encoding::GB18030,      # Simplified Chinese
  Encoding::Shift_JIS,    # Japanese
  Encoding::EUC_KR,       # Korean
  Encoding::ISO_8859_1,   # Western European
  Encoding::Windows_1252  # Western European/Latin1 superset
].select { |enc| Encoding.name_list.include?(enc.name) }.freeze
|
||||
|
||||
# returns true if the string appears to be binary (has null bytes)
|
||||
def binary_data?
|
||||
self.include?("\x00".b)
|
||||
end
|
||||
|
||||
# attempts to return a valid UTF-8 version of the string
|
||||
def tidy_bytes
|
||||
return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
|
||||
return self.dup.force_encoding("BINARY") if binary_data?
|
||||
|
||||
str = self.dup
|
||||
COMMON_ENCODINGS.each do |enc|
|
||||
str.force_encoding(enc)
|
||||
begin
|
||||
utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||
return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
|
||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||
# try next encoding
|
||||
end
|
||||
end
|
||||
end.freeze
|
||||
|
||||
# if no clean conversion found, try again but accept replacement characters
|
||||
str = self.dup
|
||||
COMMON_ENCODINGS.each do |enc|
|
||||
str.force_encoding(enc)
|
||||
begin
|
||||
utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||
return utf8 if utf8.valid_encoding?
|
||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||
# try next encoding
|
||||
end
|
||||
end
|
||||
|
||||
# fallback: replace all invalid/undefined bytes
|
||||
str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||
end
|
||||
|
||||
def tidy_bytes!
|
||||
replace(self.tidy_bytes)
|
||||
end
|
||||
|
||||
def self.included(base)
|
||||
base.class_eval do
|
||||
def tidy_bytes(force = false)
|
||||
return nil if empty?
|
||||
|
||||
if force
|
||||
buffer = String.new(capacity: bytesize)
|
||||
each_byte { |b| buffer << CP1252_TO_UTF8[b] }
|
||||
return buffer.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
base.send(:include, InstanceMethods)
|
||||
end
|
||||
|
||||
begin
|
||||
encode('UTF-8')
|
||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||
buffer = String.new(capacity: bytesize)
|
||||
scrub { |b| CP1252_TO_UTF8[b.ord] }
|
||||
end
|
||||
end
|
||||
module InstanceMethods
|
||||
def tidy_bytes
|
||||
TidyBytes.instance_method(:tidy_bytes).bind(self).call
|
||||
end
|
||||
|
||||
def tidy_bytes!(force = false)
|
||||
result = tidy_bytes(force)
|
||||
result ? replace(result) : self
|
||||
end
|
||||
def tidy_bytes!
|
||||
TidyBytes.instance_method(:tidy_bytes!).bind(self).call
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
74
lib/wayback_machine_downloader/url_rewrite.rb
Normal file
74
lib/wayback_machine_downloader/url_rewrite.rb
Normal file
@@ -0,0 +1,74 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# URLs in HTML attributes
|
||||
def rewrite_html_attr_urls(content)
|
||||
|
||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
||||
prefix, url, suffix = $1, $2, $3
|
||||
|
||||
if url.start_with?('http')
|
||||
begin
|
||||
uri = URI.parse(url)
|
||||
path = uri.path
|
||||
path = path[1..-1] if path.start_with?('/')
|
||||
"#{prefix}#{path}#{suffix}"
|
||||
rescue
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
elsif url.start_with?('/')
|
||||
"#{prefix}./#{url[1..-1]}#{suffix}"
|
||||
else
|
||||
"#{prefix}#{url}#{suffix}"
|
||||
end
|
||||
end
|
||||
content
|
||||
end
|
||||
|
||||
# URLs in CSS
#
# Rewrites, in place, every CSS url(...) that points at a Wayback Machine
# capture (http(s)://web.archive.org/web/<digits>[id_]/<target>) into a
# double-quoted url("...") that references the local copy:
#
# * absolute target ("http...") -> its path without the leading slash, keeping
#   any "#fragment" (needed for references such as "sprite.svg#icon")
# * target starting with "/"    -> "./" followed by the rest
# * anything else               -> the target, unchanged
#
# An absolute target that cannot be parsed is kept as the bare target URL.
#
# content - mutable String; modified in place.
#
# Returns content.
def rewrite_css_urls(content)
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
    # an unquoted url( ... ) may carry whitespace before ")"; it is not part
    # of the URL and would make URI.parse fail
    url = Regexp.last_match(1).strip

    local =
      if url.start_with?('http')
        begin
          uri = URI.parse(url)
          path = uri.path.delete_prefix('/')
          uri.fragment ? "#{path}##{uri.fragment}" : path
        rescue StandardError
          url
        end
      elsif url.start_with?('/')
        "./#{url[1..]}"
      else
        url
      end

    "url(\"#{local}\")"
  end

  content
end
|
||||
|
||||
# URLs in JavaScript
#
# Rewrites, in place, every quoted string that is a Wayback Machine capture URL
# (http(s)://web.archive.org/web/<digits>[id_]/<target>) so that it references
# the local copy instead:
#
# * absolute target ("http...") -> its path without the leading slash, keeping
#   any "#fragment" (e.g. client-side routes such as "app/#/route")
# * target starting with "/"    -> "./" followed by the rest
# * anything else               -> the target, unchanged
#
# An absolute target that cannot be parsed is kept as the bare target URL.
#
# content - mutable String; modified in place.
#
# Returns content.
def rewrite_js_urls(content)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    quote_start, url, quote_end = Regexp.last_match.captures

    local =
      if url.start_with?('http')
        begin
          uri = URI.parse(url)
          path = uri.path.delete_prefix('/')
          uri.fragment ? "#{path}##{uri.fragment}" : path
        rescue StandardError
          url
        end
      elsif url.start_with?('/')
        "./#{url[1..]}"
      else
        url
      end

    "#{quote_start}#{local}#{quote_end}"
  end

  content
end
|
||||
@@ -1,12 +1,12 @@
|
||||
Gem::Specification.new do |s|
|
||||
s.name = "wayback_machine_downloader_straw"
|
||||
s.version = "2.3.7"
|
||||
s.version = "2.4.2"
|
||||
s.executables << "wayback_machine_downloader"
|
||||
s.summary = "Download an entire website from the Wayback Machine."
|
||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||
s.authors = ["strawberrymaster"]
|
||||
s.email = "strawberrymaster@vivaldi.net"
|
||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
|
||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
|
||||
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
||||
s.license = "MIT"
|
||||
s.required_ruby_version = ">= 3.4.3"
|
||||
|
||||
Reference in New Issue
Block a user