mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40e9c9bb51 | ||
|
|
6bc08947b7 | ||
|
|
c731e0c7bd | ||
|
|
9fd2a7f8d1 | ||
|
|
6ad312f31f | ||
|
|
62ea35daa6 | ||
|
|
1f4202908f | ||
|
|
bed3f6101c | ||
|
|
754df6b8d6 | ||
|
|
801fb77f79 | ||
|
|
e9849e6c9c | ||
|
|
bc868e6b39 | ||
|
|
2bf04aff48 | ||
|
|
51becde916 | ||
|
|
c30ee73977 |
@@ -6,10 +6,9 @@ COPY Gemfile /build/
|
|||||||
COPY *.gemspec /build/
|
COPY *.gemspec /build/
|
||||||
|
|
||||||
RUN bundle config set jobs "$(nproc)" \
|
RUN bundle config set jobs "$(nproc)" \
|
||||||
&& bundle config set without 'development test' \
|
|
||||||
&& bundle install
|
&& bundle install
|
||||||
|
|
||||||
COPY . /build
|
COPY . /build
|
||||||
|
|
||||||
WORKDIR /
|
WORKDIR /build
|
||||||
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
ENTRYPOINT [ "/build/bin/wayback_machine_downloader", "--directory", "/build/websites" ]
|
||||||
|
|||||||
21
README.md
21
README.md
@@ -81,36 +81,19 @@ services:
|
|||||||
tty: true
|
tty: true
|
||||||
image: wayback_machine_downloader:latest
|
image: wayback_machine_downloader:latest
|
||||||
container_name: wayback_machine_downloader
|
container_name: wayback_machine_downloader
|
||||||
environment:
|
|
||||||
- ENVIRONMENT=${ENVIRONMENT:-development}
|
|
||||||
- OPTIONS=${OPTIONS:-""}
|
|
||||||
- TARGET_URL=${TARGET_URL}
|
|
||||||
volumes:
|
volumes:
|
||||||
- .:/build:rw
|
- .:/build:rw
|
||||||
- ./websites:/build/websites:rw
|
- ./websites:/build/websites:rw
|
||||||
command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
|
|
||||||
```
|
```
|
||||||
#### Usage:
|
#### Usage:
|
||||||
Now You can create a Docker image as named "wayback_machine_downloader" with the following command:
|
Now you can create a Docker image named "wayback_machine_downloader" with the following command:
|
||||||
```bash
|
```bash
|
||||||
docker compose up -d --build
|
docker compose up -d --build
|
||||||
```
|
```
|
||||||
|
|
||||||
After that you must set TARGET_URL environment variable:
|
|
||||||
```bash
|
|
||||||
export TARGET_URL="https://example.com/"
|
|
||||||
```
|
|
||||||
|
|
||||||
The **OPTIONS** env. variable is optional this may include additional settings which are found in the "**Advanced usage**" section below.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
```bash
|
|
||||||
export OPTIONS="--list -f 20060121"
|
|
||||||
```
|
|
||||||
|
|
||||||
After that, you can run the existing container with the following command:
|
After that, you can run the existing container with the following command:
|
||||||
```bash
|
```bash
|
||||||
docker compose run --rm wayback_machine_downloader https://example.com
|
docker compose run --rm wayback_machine_downloader https://example.com [options]
|
||||||
```
|
```
|
||||||
|
|
||||||
## ⚙️ Configuration
|
## ⚙️ Configuration
|
||||||
|
|||||||
@@ -5,11 +5,6 @@ services:
|
|||||||
tty: true
|
tty: true
|
||||||
image: wayback_machine_downloader:latest
|
image: wayback_machine_downloader:latest
|
||||||
container_name: wayback_machine_downloader
|
container_name: wayback_machine_downloader
|
||||||
environment:
|
|
||||||
- ENVIRONMENT=${DEVELOPMENT:-production}
|
|
||||||
- OPTIONS=${OPTIONS:-""}
|
|
||||||
- TARGET_URL=${TARGET_URL}
|
|
||||||
volumes:
|
volumes:
|
||||||
- .:/build:rw
|
- .:/build:rw
|
||||||
- ./websites:/websites:rw
|
- ./websites:/build/websites:rw
|
||||||
command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
|
|
||||||
@@ -11,10 +11,12 @@ require 'concurrent-ruby'
|
|||||||
require 'logger'
|
require 'logger'
|
||||||
require 'zlib'
|
require 'zlib'
|
||||||
require 'stringio'
|
require 'stringio'
|
||||||
|
require 'digest'
|
||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
require_relative 'wayback_machine_downloader/to_regex'
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
require_relative 'wayback_machine_downloader/archive_api'
|
require_relative 'wayback_machine_downloader/archive_api'
|
||||||
require_relative 'wayback_machine_downloader/subdom_processor'
|
require_relative 'wayback_machine_downloader/subdom_processor'
|
||||||
|
require_relative 'wayback_machine_downloader/url_rewrite'
|
||||||
|
|
||||||
class ConnectionPool
|
class ConnectionPool
|
||||||
MAX_AGE = 300
|
MAX_AGE = 300
|
||||||
@@ -115,7 +117,7 @@ class WaybackMachineDownloader
|
|||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
include SubdomainProcessor
|
include SubdomainProcessor
|
||||||
|
|
||||||
VERSION = "2.3.12"
|
VERSION = "2.4.2"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -133,10 +135,11 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
validate_params(params)
|
validate_params(params)
|
||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]&.tidy_bytes
|
||||||
@exact_url = params[:exact_url]
|
@exact_url = params[:exact_url]
|
||||||
if params[:directory]
|
if params[:directory]
|
||||||
@directory = File.expand_path(params[:directory])
|
sanitized_dir = params[:directory].tidy_bytes
|
||||||
|
@directory = File.expand_path(sanitized_dir)
|
||||||
else
|
else
|
||||||
@directory = nil
|
@directory = nil
|
||||||
end
|
end
|
||||||
@@ -169,12 +172,19 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
||||||
|
raw = if url_to_process.include?('//')
|
||||||
if url_to_process.include? '//'
|
|
||||||
url_to_process.split('/')[2]
|
url_to_process.split('/')[2]
|
||||||
else
|
else
|
||||||
url_to_process
|
url_to_process
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port)
|
||||||
|
if Gem.win_platform?
|
||||||
|
raw = raw.gsub(/[:*?"<>|]/, '_')
|
||||||
|
raw = raw.gsub(/[ .]+\z/, '')
|
||||||
|
end
|
||||||
|
raw = 'site' if raw.nil? || raw.empty?
|
||||||
|
raw
|
||||||
end
|
end
|
||||||
|
|
||||||
def backup_path
|
def backup_path
|
||||||
@@ -338,15 +348,15 @@ class WaybackMachineDownloader
|
|||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
next if file_timestamp.to_i > target_timestamp
|
next if file_timestamp.to_i > target_timestamp
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id = CGI::unescape file_id
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id = file_id.tidy_bytes unless file_id == ""
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
next if file_id.nil?
|
next if file_id.nil?
|
||||||
next if match_exclude_filter(file_url)
|
next if match_exclude_filter(file_url)
|
||||||
next unless match_only_filter(file_url)
|
next unless match_only_filter(file_url)
|
||||||
# Select the most recent version <= target_timestamp
|
|
||||||
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < file_timestamp.to_i
|
||||||
file_versions[file_id] = {file_url: file_url, timestamp: file_timestamp, file_id: file_id}
|
file_versions[file_id] = { file_url: file_url, timestamp: file_timestamp, file_id: file_id }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
file_versions.values
|
file_versions.values
|
||||||
@@ -366,22 +376,27 @@ class WaybackMachineDownloader
|
|||||||
file_list_curated = Hash.new
|
file_list_curated = Hash.new
|
||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id = CGI::unescape file_id
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id = file_id.tidy_bytes unless file_id == ""
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
if file_id.include?('<') || file_id.include?('>')
|
||||||
|
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||||
else
|
else
|
||||||
if match_exclude_filter(file_url)
|
if match_exclude_filter(file_url)
|
||||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||||
elsif not match_only_filter(file_url)
|
elsif !match_only_filter(file_url)
|
||||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||||
elsif file_list_curated[file_id]
|
elsif file_list_curated[file_id]
|
||||||
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
||||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -392,21 +407,32 @@ class WaybackMachineDownloader
|
|||||||
file_list_curated = Hash.new
|
file_list_curated = Hash.new
|
||||||
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
get_all_snapshots_to_consider.each do |file_timestamp, file_url|
|
||||||
next unless file_url.include?('/')
|
next unless file_url.include?('/')
|
||||||
file_id = file_url.split('/')[3..-1].join('/')
|
|
||||||
file_id_and_timestamp = [file_timestamp, file_id].join('/')
|
raw_tail = file_url.split('/')[3..-1]&.join('/')
|
||||||
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
|
file_id = sanitize_and_prepare_id(raw_tail, file_url)
|
||||||
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
|
||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
file_id_and_timestamp_raw = [file_timestamp, file_id].join('/')
|
||||||
|
file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp_raw, file_url)
|
||||||
|
if file_id_and_timestamp.nil?
|
||||||
|
puts "Malformed file id/timestamp combo, ignoring: #{file_url}"
|
||||||
|
next
|
||||||
|
end
|
||||||
|
|
||||||
|
if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>')
|
||||||
|
puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}"
|
||||||
else
|
else
|
||||||
if match_exclude_filter(file_url)
|
if match_exclude_filter(file_url)
|
||||||
puts "File url matches exclude filter, ignoring: #{file_url}"
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
||||||
elsif not match_only_filter(file_url)
|
elsif !match_only_filter(file_url)
|
||||||
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
||||||
elsif file_list_curated[file_id_and_timestamp]
|
elsif file_list_curated[file_id_and_timestamp]
|
||||||
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
|
# duplicate combo, ignore silently (verbose flag not shown here)
|
||||||
else
|
else
|
||||||
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -473,6 +499,39 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Queues one download job per entry of +files_to_process+ on +pool+.
#
# Each job borrows a connection from @connection_pool, calls download_file,
# prints the progress message it returned, and records the file id through
# append_to_db when the download is considered successful. Every file is
# counted in @processed_file_count exactly once, whether the job succeeds or
# raises.
#
# @param pool [#post] executor that runs the block given to +post+
# @param files_to_process [Enumerable<Hash>] entries providing :file_url and :file_id
# @return [Object] whatever +files_to_process.each+ returns
def processing_files(pool, files_to_process)
  files_to_process.each do |file_remote_info|
    pool.post do
      download_success = false
      # set once this file has been added to @processed_file_count, so the
      # rescue branch below cannot count the same file a second time
      counted = false
      begin
        @connection_pool.with_connection do |connection|
          result_message = download_file(file_remote_info, connection)
          # assume download success if the result message contains ' -> '
          download_success = true if result_message && result_message.include?(' -> ')
          @download_mutex.synchronize do
            @processed_file_count += 1
            counted = true
            # adjust progress message to reflect remaining files
            progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
            puts progress_message if progress_message
          end
        end
        # append to DB only after successful download outside the connection block
        append_to_db(file_remote_info[:file_id]) if download_success
      rescue => e
        @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
        # a failure after the count was already taken (e.g. in append_to_db)
        # must not bump the counter again
        @download_mutex.synchronize { @processed_file_count += 1 } unless counted
      end
      sleep(RATE_LIMIT)
    end
  end
end
|
||||||
|
|
||||||
def download_files
|
def download_files
|
||||||
start_time = Time.now
|
start_time = Time.now
|
||||||
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
|
||||||
@@ -519,36 +578,7 @@ class WaybackMachineDownloader
|
|||||||
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
|
||||||
pool = Concurrent::FixedThreadPool.new(thread_count)
|
pool = Concurrent::FixedThreadPool.new(thread_count)
|
||||||
|
|
||||||
files_to_process.each do |file_remote_info|
|
processing_files(pool, files_to_process)
|
||||||
pool.post do
|
|
||||||
download_success = false
|
|
||||||
begin
|
|
||||||
@connection_pool.with_connection do |connection|
|
|
||||||
result_message = download_file(file_remote_info, connection)
|
|
||||||
# assume download success if the result message contains ' -> '
|
|
||||||
if result_message && result_message.include?(' -> ')
|
|
||||||
download_success = true
|
|
||||||
end
|
|
||||||
@download_mutex.synchronize do
|
|
||||||
@processed_file_count += 1
|
|
||||||
# adjust progress message to reflect remaining files
|
|
||||||
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
|
|
||||||
puts progress_message if progress_message
|
|
||||||
end
|
|
||||||
end
|
|
||||||
# sppend to DB only after successful download outside the connection block
|
|
||||||
if download_success
|
|
||||||
append_to_db(file_remote_info[:file_id])
|
|
||||||
end
|
|
||||||
rescue => e
|
|
||||||
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
|
|
||||||
@download_mutex.synchronize do
|
|
||||||
@processed_file_count += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
sleep(RATE_LIMIT)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
pool.shutdown
|
pool.shutdown
|
||||||
pool.wait_for_termination
|
pool.wait_for_termination
|
||||||
@@ -608,64 +638,13 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
|
|
||||||
# URLs in HTML attributes
|
# URLs in HTML attributes
|
||||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
rewrite_html_attr_urls(content)
|
||||||
prefix, url, suffix = $1, $2, $3
|
|
||||||
|
|
||||||
if url.start_with?('http')
|
|
||||||
begin
|
|
||||||
uri = URI.parse(url)
|
|
||||||
path = uri.path
|
|
||||||
path = path[1..-1] if path.start_with?('/')
|
|
||||||
"#{prefix}#{path}#{suffix}"
|
|
||||||
rescue
|
|
||||||
"#{prefix}#{url}#{suffix}"
|
|
||||||
end
|
|
||||||
elsif url.start_with?('/')
|
|
||||||
"#{prefix}./#{url[1..-1]}#{suffix}"
|
|
||||||
else
|
|
||||||
"#{prefix}#{url}#{suffix}"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# URLs in CSS
|
# URLs in CSS
|
||||||
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
|
rewrite_css_urls(content)
|
||||||
url = $1
|
|
||||||
|
|
||||||
if url.start_with?('http')
|
|
||||||
begin
|
|
||||||
uri = URI.parse(url)
|
|
||||||
path = uri.path
|
|
||||||
path = path[1..-1] if path.start_with?('/')
|
|
||||||
"url(\"#{path}\")"
|
|
||||||
rescue
|
|
||||||
"url(\"#{url}\")"
|
|
||||||
end
|
|
||||||
elsif url.start_with?('/')
|
|
||||||
"url(\"./#{url[1..-1]}\")"
|
|
||||||
else
|
|
||||||
"url(\"#{url}\")"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# URLs in JavaScript
|
# URLs in JavaScript
|
||||||
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
|
rewrite_js_urls(content)
|
||||||
quote_start, url, quote_end = $1, $2, $3
|
|
||||||
|
|
||||||
if url.start_with?('http')
|
|
||||||
begin
|
|
||||||
uri = URI.parse(url)
|
|
||||||
path = uri.path
|
|
||||||
path = path[1..-1] if path.start_with?('/')
|
|
||||||
"#{quote_start}#{path}#{quote_end}"
|
|
||||||
rescue
|
|
||||||
"#{quote_start}#{url}#{quote_end}"
|
|
||||||
end
|
|
||||||
elsif url.start_with?('/')
|
|
||||||
"#{quote_start}./#{url[1..-1]}#{quote_end}"
|
|
||||||
else
|
|
||||||
"#{quote_start}#{url}#{quote_end}"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# for URLs in HTML attributes that start with a single slash
|
# for URLs in HTML attributes that start with a single slash
|
||||||
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
|
||||||
@@ -794,6 +773,85 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
logger
|
logger
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# safely sanitize a file id (or id+timestamp)
|
||||||
|
# Builds a filesystem-friendly id from the path/query tail of an archived URL.
#
# Steps, in order: percent-decode (repeatedly), coerce to UTF-8, drop "<!--"
# markers and control characters, fold a query string into a short SHA-256
# suffix on the file name, collapse slashes, then clean each path segment.
#
# @param raw [String, nil] tail of the URL (or a "timestamp/tail" combination)
# @param file_url [String] full URL; used only in the warning log message
# @return [String, nil] sanitized id; nil only when +raw+ is nil or empty
def sanitize_and_prepare_id(raw, file_url)
  return nil if raw.nil? || raw.empty?

  original = raw.dup
  begin
    # work on a binary copy to avoid premature encoding errors
    raw = original.dup.force_encoding(Encoding::BINARY)

    # percent-decode (repeat until stable in case of double-encoding)
    loop do
      decoded = raw.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') }
      break if decoded == raw

      raw = decoded
    end

    # try tidy_bytes
    begin
      raw = raw.tidy_bytes
    rescue StandardError
      # fallback: scrub to UTF-8
      raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # ensure UTF-8 and scrub again
    unless raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
      raw = raw.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    end

    # strip HTML/comment artifacts & control chars
    raw = raw.gsub(/<!--+/, '').gsub(/[\x00-\x1F]/, '')

    # split query; hash it for stable short name
    path_part, query_part = raw.split('?', 2)
    # split returns [] for an empty string; keep going so the empty-id
    # fallback below is reached without raising
    path_part ||= ''
    if query_part && !query_part.empty?
      q_digest = Digest::SHA256.hexdigest(query_part)[0, 12]
      # only the last segment is a file name: a dot inside a directory name
      # (e.g. "v1.2/page") must not be mistaken for an extension separator
      dir, slash, base = path_part.rpartition('/')
      base =
        if base.include?('.')
          stem, _dot, ext = base.rpartition('.')
          "#{stem}__q#{q_digest}.#{ext}"
        else
          "#{base}__q#{q_digest}"
        end
      path_part = "#{dir}#{slash}#{base}"
    end

    # collapse slashes & trim leading slash
    raw = path_part.gsub(%r{/+}, '/').sub(%r{\A/}, '')

    # segment-wise sanitation
    raw = raw.split('/').map do |segment|
      seg = segment.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
      seg = seg.gsub(/[:*?"<>|\\]/) { |c| "%#{c.ord.to_s(16).upcase}" }
      seg = seg.gsub(/[ .]+\z/, '') if Gem.win_platform?
      # "." and ".." would change directory if the id is joined onto a base
      # path (presumably how callers use it -- confirm), so neutralize them
      seg.empty? || seg == '.' || seg == '..' ? '_' : seg
    end.join('/')

    # remove any remaining angle brackets
    raw = raw.delete('<>')

    # final fallback if empty
    raw = "file__#{Digest::SHA1.hexdigest(original)[0, 10]}" if raw.empty?

    raw
  rescue => e
    @logger&.warn("Failed to sanitize file id from #{file_url}: #{e.message}")
    # deterministic fallback -- never return nil so caller won't mark malformed
    "file__#{Digest::SHA1.hexdigest(original)[0, 10]}"
  end
end
|
||||||
|
|
||||||
|
# wrap URL in parentheses if it contains characters that commonly break unquoted
|
||||||
|
# Windows CMD usage (e.g., &). This is only for display; user still must quote
|
||||||
|
# when invoking manually.
|
||||||
|
# @param url [String, nil] URL about to be shown to the user
# @return [String, nil] +url+ wrapped in parentheses when it contains "&";
#   otherwise +url+ itself (nil and false pass through unchanged)
def safe_display_url(url)
  return url unless url && url.include?('&')

  "(#{url})"
end
|
||||||
|
|
||||||
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
||||||
retries = 0
|
retries = 0
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ module ArchiveAPI
|
|||||||
# Check if the response contains the header ["timestamp", "original"]
|
# Check if the response contains the header ["timestamp", "original"]
|
||||||
json.shift if json.first == ["timestamp", "original"]
|
json.shift if json.first == ["timestamp", "original"]
|
||||||
json
|
json
|
||||||
rescue JSON::ParserError, StandardError => e
|
rescue JSON::ParserError => e
|
||||||
warn "Failed to fetch data from API: #{e.message}"
|
warn "Failed to fetch data from API: #{e.message}"
|
||||||
[]
|
[]
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -1,74 +1,74 @@
|
|||||||
# frozen_string_literal: true
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# essentially, this is for converting a string with a potentially
|
||||||
|
# broken or unknown encoding into a valid UTF-8 string
|
||||||
|
# @todo: consider using charlock_holmes for this in the future
|
||||||
module TidyBytes
|
module TidyBytes
|
||||||
# precomputing CP1252 to UTF-8 mappings for bytes 128-159
|
UNICODE_REPLACEMENT_CHARACTER = "\uFFFD"
|
||||||
CP1252_MAP = (128..159).map do |byte|
|
|
||||||
case byte
|
|
||||||
when 128 then [226, 130, 172] # EURO SIGN
|
|
||||||
when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
|
|
||||||
when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
|
|
||||||
when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
|
|
||||||
when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
|
|
||||||
when 134 then [226, 128, 160] # DAGGER
|
|
||||||
when 135 then [226, 128, 161] # DOUBLE DAGGER
|
|
||||||
when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
|
|
||||||
when 137 then [226, 128, 176] # PER MILLE SIGN
|
|
||||||
when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
|
|
||||||
when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
|
||||||
when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
|
|
||||||
when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
|
|
||||||
when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
|
|
||||||
when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
|
|
||||||
when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
|
|
||||||
when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
|
|
||||||
when 149 then [226, 128, 162] # BULLET
|
|
||||||
when 150 then [226, 128, 147] # EN DASH
|
|
||||||
when 151 then [226, 128, 148] # EM DASH
|
|
||||||
when 152 then [203, 156] # SMALL TILDE
|
|
||||||
when 153 then [226, 132, 162] # TRADE MARK SIGN
|
|
||||||
when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
|
|
||||||
when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
|
||||||
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
|
|
||||||
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
|
|
||||||
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
|
|
||||||
else nil # ANYTHING ELSE...
|
|
||||||
end
|
|
||||||
end.freeze
|
|
||||||
|
|
||||||
# precomputing all possible byte conversions
|
# common encodings to try for best multilingual compatibility
|
||||||
CP1252_TO_UTF8 = Array.new(256) do |b|
|
COMMON_ENCODINGS = [
|
||||||
if (128..159).cover?(b)
|
Encoding::UTF_8,
|
||||||
CP1252_MAP[b - 128]&.pack('C*')
|
Encoding::Windows_1251, # Cyrillic/Russian legacy
|
||||||
elsif b < 128
|
Encoding::GB18030, # Simplified Chinese
|
||||||
b.chr
|
Encoding::Shift_JIS, # Japanese
|
||||||
else
|
Encoding::EUC_KR, # Korean
|
||||||
b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
|
Encoding::ISO_8859_1, # Western European
|
||||||
|
Encoding::Windows_1252 # Western European/Latin1 superset
|
||||||
|
].select { |enc| Encoding.name_list.include?(enc.name) }
|
||||||
|
|
||||||
|
# returns true if the string appears to be binary (has null bytes)
|
||||||
|
def binary_data?
|
||||||
|
self.include?("\x00".b)
|
||||||
|
end
|
||||||
|
|
||||||
|
# attempts to return a valid UTF-8 version of the string
|
||||||
|
def tidy_bytes
|
||||||
|
return self if self.encoding == Encoding::UTF_8 && self.valid_encoding?
|
||||||
|
return self.dup.force_encoding("BINARY") if binary_data?
|
||||||
|
|
||||||
|
str = self.dup
|
||||||
|
COMMON_ENCODINGS.each do |enc|
|
||||||
|
str.force_encoding(enc)
|
||||||
|
begin
|
||||||
|
utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||||
|
return utf8 if utf8.valid_encoding? && !utf8.include?(UNICODE_REPLACEMENT_CHARACTER)
|
||||||
|
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||||
|
# try next encoding
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end.freeze
|
|
||||||
|
# if no clean conversion found, try again but accept replacement characters
|
||||||
|
str = self.dup
|
||||||
|
COMMON_ENCODINGS.each do |enc|
|
||||||
|
str.force_encoding(enc)
|
||||||
|
begin
|
||||||
|
utf8 = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||||
|
return utf8 if utf8.valid_encoding?
|
||||||
|
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||||
|
# try next encoding
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# fallback: replace all invalid/undefined bytes
|
||||||
|
str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: UNICODE_REPLACEMENT_CHARACTER)
|
||||||
|
end
|
||||||
|
|
||||||
|
def tidy_bytes!
|
||||||
|
replace(self.tidy_bytes)
|
||||||
|
end
|
||||||
|
|
||||||
def self.included(base)
|
def self.included(base)
|
||||||
base.class_eval do
|
base.send(:include, InstanceMethods)
|
||||||
def tidy_bytes(force = false)
|
end
|
||||||
return nil if empty?
|
|
||||||
|
|
||||||
if force
|
|
||||||
buffer = String.new(capacity: bytesize)
|
|
||||||
each_byte { |b| buffer << CP1252_TO_UTF8[b] }
|
|
||||||
return buffer.force_encoding(Encoding::UTF_8)
|
|
||||||
end
|
|
||||||
|
|
||||||
begin
|
module InstanceMethods
|
||||||
encode('UTF-8')
|
def tidy_bytes
|
||||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
TidyBytes.instance_method(:tidy_bytes).bind(self).call
|
||||||
buffer = String.new(capacity: bytesize)
|
end
|
||||||
scrub { |b| CP1252_TO_UTF8[b.ord] }
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def tidy_bytes!(force = false)
|
def tidy_bytes!
|
||||||
result = tidy_bytes(force)
|
TidyBytes.instance_method(:tidy_bytes!).bind(self).call
|
||||||
result ? replace(result) : self
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
74
lib/wayback_machine_downloader/url_rewrite.rb
Normal file
74
lib/wayback_machine_downloader/url_rewrite.rb
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# URLs in HTML attributes
|
||||||
|
# Rewrites web.archive.org URLs found in href/src/action/data-src/data-url
# attributes so they point at local relative paths.
#
# @param content [String] document text; modified in place
# @return [String] the same +content+ object
def rewrite_html_attr_urls(content)
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    match = Regexp.last_match
    "#{match[1]}#{localize_archived_url(match[2])}#{match[3]}"
  end
  content
end

# URLs in CSS
#
# Rewrites web.archive.org URLs inside CSS url(...) expressions. The quote
# character used by the original expression (single, double or none) is
# preserved, so a rule embedded in an HTML attribute such as
# style="background:url('...')" does not end up with a nested double quote.
#
# @param content [String] stylesheet or document text; modified in place
# @return [String] the same +content+ object
def rewrite_css_urls(content)
  content.gsub!(/url\(\s*(["']?)https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
    match = Regexp.last_match
    quote = match[1]
    "url(#{quote}#{localize_archived_url(match[2])}#{quote})"
  end
  content
end

# URLs in JavaScript
#
# Rewrites quoted web.archive.org URLs anywhere in the text (string literals
# in scripts) to local relative paths.
#
# @param content [String] script or document text; modified in place
# @return [String] the same +content+ object
def rewrite_js_urls(content)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    match = Regexp.last_match
    "#{match[1]}#{localize_archived_url(match[2])}#{match[3]}"
  end
  content
end

# Maps the original-site URL captured after the Wayback prefix to a local path.
#
# * "http..." URLs are parsed and reduced to their path without the leading
#   slash; host, query and fragment are dropped. If parsing fails, or the URL
#   has no path component, the URL is returned unchanged.
# * root-relative URLs ("/x") become "./x".
# * anything else is returned unchanged.
#
# @param url [String] URL captured from the archived reference
# @return [String] local replacement
def localize_archived_url(url)
  if url.start_with?('http')
    begin
      path = URI.parse(url).path
      path.start_with?('/') ? path[1..-1] : path
    rescue
      url
    end
  elsif url.start_with?('/')
    "./#{url[1..-1]}"
  else
    url
  end
end
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.3.12"
|
s.version = "2.4.2"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
s.authors = ["strawberrymaster"]
|
s.authors = ["strawberrymaster"]
|
||||||
s.email = "strawberrymaster@vivaldi.net"
|
s.email = "strawberrymaster@vivaldi.net"
|
||||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb"]
|
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb", "lib/wayback_machine_downloader/subdom_processor.rb", "lib/wayback_machine_downloader/url_rewrite.rb"]
|
||||||
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
||||||
s.license = "MIT"
|
s.license = "MIT"
|
||||||
s.required_ruby_version = ">= 3.4.3"
|
s.required_ruby_version = ">= 3.4.3"
|
||||||
|
|||||||
Reference in New Issue
Block a user