Mirror of https://github.com/StrawberryMaster/wayback-machine-downloader.git (synced 2025-12-29 16:16:06 +00:00)

Compare commits (46 commits)
SHA1s of the compared commits:

3d181ce84c, 999aa211ae, 82ff2de3dc, fd329afdd2, 038785557d, 2eead8cc27, 7e5cdd54fb, 4160ff5e4a, f03d92a3c4, 2490109cfe, c3c5b8446a, 18357a77ed, 3fdfd70fc1, 2bf74b4173, 79cbb639e7, 071d208b31, 1681a12579, f38756dd76, 9452411e32, 61e22cfe25, 183ed61104, e6ecf32a43, 375c6314ad, 6e2739f5a8, caba6a665f, ab4324c0eb, e28d7d578b, a7a25574cf, 23cc3d69b1, 01fa1f8c9f, d2f98d9428, c7a5381eaf, 9709834e20, 77998372cb, 2c789b7df6, 1ef8c14c48, 780e45343f, 42e6d62284, 543161d7fb, 99a6de981e, d85c880d23, 917f4f8798, 787bc2e535, 4db13a7792, 13e88ce04a, c7fc7c7b58
Dockerfile (12 changed lines)

@@ -1,11 +1,15 @@
-FROM ruby:3.1.6-alpine
+FROM ruby:3.4.4-alpine
 USER root
 WORKDIR /build
-COPY . /build
 
-RUN gem update \
-    && gem install concurrent-ruby \
+COPY Gemfile /build/
+COPY *.gemspec /build/
+
+RUN bundle config set jobs "$(nproc)" \
+    && bundle config set without 'development test' \
     && bundle install
 
+COPY . /build
+
 WORKDIR /
 ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
README.md (47 changed lines)

@@ -27,6 +27,7 @@ To run most commands, just like in the original WMD, you can use:
 ```bash
 wayback_machine_downloader https://example.com
 ```
+**Note**: this gem may conflict with hartator's wayback_machine_downloader gem, and so you may have to uninstall it for this WMD fork to work. A good way to know is if a command fails; it will list the gem version as 2.3.1 or earlier, while this WMD fork uses 2.3.2 or above.
 
 ### Step-by-step setup
 1. **Install Ruby**:
@@ -62,6 +63,12 @@ docker build -t wayback_machine_downloader .
 docker run -it --rm wayback_machine_downloader [options] URL
 ```
 
+or, without cloning the repo - for example, fetching smallrockets.com until the year 2013:
+
+```bash
+docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
+```
+
 ### 🐳 Using Docker Compose
 
 We can also use it with Docker Compose, which makes it easier to extend functionality later (such as storing previous downloads in a database):
@@ -74,21 +81,49 @@ services:
     tty: true
     image: wayback_machine_downloader:latest
     container_name: wayback_machine_downloader
+    environment:
+      - ENVIRONMENT=${ENVIRONMENT:-development}
+      - OPTIONS=${OPTIONS:-""}
+      - TARGET_URL=${TARGET_URL}
     volumes:
       - .:/build:rw
      - ./websites:/build/websites:rw
+    command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
+```
+#### Usage:
+Now you can build a Docker image named "wayback_machine_downloader" with the following command:
+```bash
+docker compose up -d --build
+```
+
+After that, you must set the TARGET_URL environment variable:
+```bash
+export TARGET_URL="https://example.com/"
+```
+
+The **OPTIONS** environment variable is optional; it may include additional settings, which are described in the "**Advanced usage**" section below.
+
+Example:
+```bash
+export OPTIONS="--list -f 20060121"
+```
+
+After that, you can run the existing container with the following command:
+```bash
+docker compose run --rm wayback_machine_downloader https://example.com
 ```
 
 ## ⚙️ Configuration
 There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
 
 ```ruby
 DEFAULT_TIMEOUT = 30        # HTTP timeout (in seconds)
-MAX_RETRIES = 3             # Failed request retries
-RETRY_DELAY = 2             # Wait between retries
-RATE_LIMIT = 0.25           # Throttle between requests
-CONNECTION_POOL_SIZE = 10   # No. of simultaneous connections
-MEMORY_BUFFER_SIZE = 16384  # Size of download buffer
+MAX_RETRIES = 3             # Number of times to retry failed requests
+RETRY_DELAY = 2             # Wait time between retries (seconds)
+RATE_LIMIT = 0.25           # Throttle between requests (seconds)
+CONNECTION_POOL_SIZE = 10   # Maximum simultaneous connections
+MEMORY_BUFFER_SIZE = 16384  # Download buffer size (bytes)
+STATE_CDX_FILENAME = '.cdx.json'       # Stores snapshot listing
+STATE_DB_FILENAME = '.downloaded.txt'  # Tracks completed downloads
 ```
 
 ## 🛠️ Advanced usage
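As a rough sketch of how the retry and throttle constants interact (illustrative only, not the gem's actual download loop; the request here is a plain `Net::HTTP` GET rather than the gem's pooled connections):

```ruby
require 'net/http'

MAX_RETRIES = 3    # give up after this many failed attempts
RETRY_DELAY = 2    # seconds to wait before retrying a failed request
RATE_LIMIT  = 0.25 # seconds to sleep between consecutive requests

def fetch_snapshot(url)
  attempts = 0
  begin
    sleep(RATE_LIMIT)                  # throttle every outgoing request
    Net::HTTP.get_response(URI(url))   # plain GET for illustration
  rescue StandardError
    attempts += 1
    raise if attempts >= MAX_RETRIES   # exhausted retries, bubble the error up
    sleep(RETRY_DELAY)                 # back off, then retry the same URL
    retry
  end
end
```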
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
     options[:rewritten] = true
   end
 
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
   opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
     options[:reset] = true
   end
@@ -70,6 +74,14 @@ option_parser = OptionParser.new do |opts|
     options[:keep] = true
   end
 
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
   opts.on("-v", "--version", "Display version") do |t|
     options[:version] = t
   end
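For orientation, a minimal sketch of driving the library directly with the settings these new flags enable (the parameter names come from the `initialize` and `attr_accessor` changes further down in this diff; the URL and directory values are placeholders):

```ruby
require 'wayback_machine_downloader'

# Roughly equivalent to:
#   wayback_machine_downloader https://example.com --local --recursive-subdomains --subdomain-depth 2
downloader = WaybackMachineDownloader.new(
  base_url: "https://example.com",
  directory: "websites/example.com",   # optional; now expanded to an absolute path
  rewrite: true,                       # --local
  recursive_subdomains: true,          # --recursive-subdomains
  subdomain_depth: 2                   # --subdomain-depth 2
)
downloader.download_files
```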
@@ -6,8 +6,10 @@ services:
     image: wayback_machine_downloader:latest
     container_name: wayback_machine_downloader
     environment:
-      - ENVIRONMENT=${ENVIRONMENT}
+      - ENVIRONMENT=${DEVELOPMENT:-production}
+      - OPTIONS=${OPTIONS:-""}
+      - TARGET_URL=${TARGET_URL}
     volumes:
       - .:/build:rw
-      - ./websites:/build/websites:rw
-    command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
+      - ./websites:/websites:rw
+    command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
@@ -14,6 +14,7 @@ require 'stringio'
 require_relative 'wayback_machine_downloader/tidy_bytes'
 require_relative 'wayback_machine_downloader/to_regex'
 require_relative 'wayback_machine_downloader/archive_api'
+require_relative 'wayback_machine_downloader/subdom_processor'
 
 class ConnectionPool
   MAX_AGE = 300
@@ -112,8 +113,9 @@ end
 class WaybackMachineDownloader
 
   include ArchiveAPI
+  include SubdomainProcessor
 
-  VERSION = "2.3.4"
+  VERSION = "2.3.11"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -123,15 +125,20 @@ class WaybackMachineDownloader
   STATE_CDX_FILENAME = ".cdx.json"
   STATE_DB_FILENAME = ".downloaded.txt"
 
 
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
-    :all, :maximum_pages, :threads_count, :logger, :reset, :keep
+    :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
 
   def initialize params
     validate_params(params)
     @base_url = params[:base_url]
     @exact_url = params[:exact_url]
-    @directory = params[:directory]
+    if params[:directory]
+      @directory = File.expand_path(params[:directory])
+    else
+      @directory = nil
+    end
     @all_timestamps = params[:all_timestamps]
     @from_timestamp = params[:from_timestamp].to_i
     @to_timestamp = params[:to_timestamp].to_i
@@ -148,27 +155,33 @@ class WaybackMachineDownloader
     @failed_downloads = Concurrent::Array.new
     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
     @db_mutex = Mutex.new
+    @rewrite = params[:rewrite] || false
+    @recursive_subdomains = params[:recursive_subdomains] || false
+    @subdomain_depth = params[:subdomain_depth] || 1
+
+    # URL for rejecting invalid/unencoded wayback urls
+    @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/
+
     handle_reset
   end
 
   def backup_name
-    if @base_url.include? '//'
-      @base_url.split('/')[2]
+    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+
+    if url_to_process.include? '//'
+      url_to_process.split('/')[2]
     else
-      @base_url
+      url_to_process
     end
   end
 
   def backup_path
     if @directory
-      if @directory[-1] == '/'
-        @directory
-      else
-        @directory + '/'
-      end
+      # because @directory is already an absolute path, we just ensure it exists
+      @directory
     else
-      'websites/' + backup_name + '/'
+      # ensure the default path is absolute and normalized
+      File.expand_path(File.join('websites', backup_name))
     end
   end
 
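For context, what the reworked `backup_name` yields for a wildcard URL (the input value is invented for the example):

```ruby
base_url = "example.com/*"

url_to_process = base_url.end_with?('/*') ? base_url.chomp('/*') : base_url
# => "example.com"

url_to_process.include?('//') ? url_to_process.split('/')[2] : url_to_process
# => "example.com"
# Previously the trailing "/*" was kept, so the default backup folder became "websites/example.com/*".
```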
@@ -191,7 +204,7 @@ class WaybackMachineDownloader
 
   def match_only_filter file_url
     if @only_filter
-      only_filter_regex = @only_filter.to_regex
+      only_filter_regex = @only_filter.to_regex(detect: true)
       if only_filter_regex
         only_filter_regex =~ file_url
       else
@@ -204,7 +217,7 @@ class WaybackMachineDownloader
 
   def match_exclude_filter file_url
     if @exclude_filter
-      exclude_filter_regex = @exclude_filter.to_regex
+      exclude_filter_regex = @exclude_filter.to_regex(detect: true)
       if exclude_filter_regex
         exclude_filter_regex =~ file_url
       else
@@ -240,6 +253,7 @@ class WaybackMachineDownloader
     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
       initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      initial_list ||= []
       mutex.synchronize do
         snapshot_list_to_consider.concat(initial_list)
         print "."
@@ -264,6 +278,7 @@ class WaybackMachineDownloader
       @connection_pool.with_connection do |connection|
         result = get_raw_list_from_api("#{@base_url}/*", page, connection)
       end
+      result ||= []
       [page, result]
     end
   end
@@ -283,7 +298,7 @@ class WaybackMachineDownloader
 
     # Process results and check for empty pages
     results.each do |page, result|
-      if result.empty?
+      if result.nil? || result.empty?
         continue_fetching = false
         break
       else
@@ -377,7 +392,7 @@ class WaybackMachineDownloader
       end
     else
       file_list_curated = get_file_list_curated
-      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse
       file_list_curated.map do |file_remote_info|
         file_remote_info[1][:file_id] = file_remote_info[0]
         file_remote_info[1]
@@ -476,8 +491,8 @@ class WaybackMachineDownloader
       begin
         @connection_pool.with_connection do |connection|
           result_message = download_file(file_remote_info, connection)
-          # for now, assume success if no exception and message doesn't indicate error/skip
-          if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
+          # assume download success if the result message contains ' -> '
+          if result_message && result_message.include?(' -> ')
             download_success = true
           end
           @download_mutex.synchronize do
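A small illustration of the new success check (both message strings are invented; the real ones are produced by `download_file`, shown further down):

```ruby
saved   = "https://example.com/style.css -> websites/example.com/style.css (12/340)"
skipped = "Skipped (not found): https://example.com/missing.gif (13/340)"

saved.include?(' -> ')    # => true  (counted as a successful download)
skipped.include?(' -> ')  # => false (not counted, unlike the old substring heuristics)
```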
@@ -506,6 +521,16 @@ class WaybackMachineDownloader
 
     end_time = Time.now
     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
+
+    # process subdomains if enabled
+    if @recursive_subdomains
+      subdomain_start_time = Time.now
+      process_subdomains
+      subdomain_end_time = Time.now
+      subdomain_time = (subdomain_end_time - subdomain_start_time).round(2)
+      puts "Subdomain processing finished in #{subdomain_time}s."
+    end
+
     puts "Results saved in #{backup_path}"
     cleanup
   end
@@ -533,27 +558,135 @@ class WaybackMachineDownloader
     end
   end
 
+  def rewrite_urls_to_relative(file_path)
+    return unless File.exist?(file_path)
+
+    file_ext = File.extname(file_path).downcase
+
+    begin
+      content = File.binread(file_path)
+
+      if file_ext == '.html' || file_ext == '.htm'
+        encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
+        content.force_encoding(encoding) rescue content.force_encoding('UTF-8')
+      else
+        content.force_encoding('UTF-8')
+      end
+
+      # URLs in HTML attributes
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        prefix, url, suffix = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{prefix}#{path}#{suffix}"
+          rescue
+            "#{prefix}#{url}#{suffix}"
+          end
+        elsif url.start_with?('/')
+          "#{prefix}./#{url[1..-1]}#{suffix}"
+        else
+          "#{prefix}#{url}#{suffix}"
+        end
+      end
+
+      # URLs in CSS
+      content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
+        url = $1
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "url(\"#{path}\")"
+          rescue
+            "url(\"#{url}\")"
+          end
+        elsif url.start_with?('/')
+          "url(\"./#{url[1..-1]}\")"
+        else
+          "url(\"#{url}\")"
+        end
+      end
+
+      # URLs in JavaScript
+      content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
+        quote_start, url, quote_end = $1, $2, $3
+
+        if url.start_with?('http')
+          begin
+            uri = URI.parse(url)
+            path = uri.path
+            path = path[1..-1] if path.start_with?('/')
+            "#{quote_start}#{path}#{quote_end}"
+          rescue
+            "#{quote_start}#{url}#{quote_end}"
+          end
+        elsif url.start_with?('/')
+          "#{quote_start}./#{url[1..-1]}#{quote_end}"
+        else
+          "#{quote_start}#{url}#{quote_end}"
+        end
+      end
+
+      # for URLs in HTML attributes that start with a single slash
+      content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
+        prefix, path, suffix = $1, $2, $3
+        "#{prefix}./#{path}#{suffix}"
+      end
+
+      # for URLs in CSS that start with a single slash
+      content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
+        path = $1
+        "url(\"./#{path}\")"
+      end
+
+      # save the modified content back to the file
+      File.binwrite(file_path, content)
+      puts "Rewrote URLs in #{file_path} to be relative."
+    rescue Errno::ENOENT => e
+      @logger.warn("Error reading file #{file_path}: #{e.message}")
+    end
+  end
+
   def download_file (file_remote_info, http)
     current_encoding = "".encoding
     file_url = file_remote_info[:file_url].encode(current_encoding)
     file_id = file_remote_info[:file_id]
     file_timestamp = file_remote_info[:timestamp]
-    original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
-    file_path_elements = original_file_id.split('/')
 
-    if original_file_id == ""
-      dir_path = backup_path
-      file_path = backup_path + 'index.html'
-    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
-      dir_path = backup_path + file_path_elements[0..-1].join('/')
-      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
-    else
-      dir_path = backup_path + file_path_elements[0..-2].join('/')
-      file_path = backup_path + file_path_elements[0..-1].join('/')
-    end
-    if Gem.win_platform?
-      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
-      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+    # sanitize file_id to ensure it is a valid path component
+    raw_path_elements = file_id.split('/')
+
+    sanitized_path_elements = raw_path_elements.map do |element|
+      if Gem.win_platform?
+        # for Windows, we need to sanitize path components to avoid invalid characters
+        # this prevents issues with file names that contain characters not allowed in
+        # Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
+        element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase }
+      else
+        element
+      end
+    end
+
+    current_backup_path = backup_path
+
+    if file_id == ""
+      dir_path = current_backup_path
+      file_path = File.join(dir_path, 'index.html')
+    elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.'))
+      # if file_id is a directory, we treat it as such
+      dir_path = File.join(current_backup_path, *sanitized_path_elements)
+      file_path = File.join(dir_path, 'index.html')
+    else
+      # if file_id is a file, we treat it as such
+      filename = sanitized_path_elements.pop
+      dir_path = File.join(current_backup_path, *sanitized_path_elements)
+      file_path = File.join(dir_path, filename)
     end
 
     # check existence *before* download attempt
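To make the effect of `rewrite_urls_to_relative` concrete, here is a standalone sketch of the HTML-attribute case only, using the same regex as the hunk above on an invented snippet (the absolute-URL branch is the one exercised; the other branches are omitted):

```ruby
require 'uri'

html = %(<a href="https://web.archive.org/web/20130101000000id_/http://example.com/about/team.html">Team</a>)

rewritten = html.gsub(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
  prefix, url, suffix = $1, $2, $3
  path = URI.parse(url).path              # "/about/team.html"
  path = path[1..-1] if path.start_with?('/')
  "#{prefix}#{path}#{suffix}"
end

puts rewritten
# => <a href="about/team.html">Team</a>
```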
@@ -564,11 +697,24 @@ class WaybackMachineDownloader
 
     begin
       structure_dir_path dir_path
-      download_with_retry(file_path, file_url, file_timestamp, http)
-      "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      status = download_with_retry(file_path, file_url, file_timestamp, http)
+
+      case status
+      when :saved
+        if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
+          rewrite_urls_to_relative(file_path)
+        end
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
+      when :skipped_not_found
+        "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
+      else
+        # ideally, this case should not be reached if download_with_retry behaves as expected.
+        @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
+        "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
+      end
     rescue StandardError => e
       msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
-      if not @all and File.exist?(file_path) and File.size(file_path) == 0
+      if File.exist?(file_path) and File.size(file_path) == 0
         File.delete(file_path)
         msg += "\n#{file_path} was empty and was removed."
       end
@@ -609,6 +755,15 @@ class WaybackMachineDownloader
       "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
     end
 
+    # Escape square brackets because they are not valid in URI()
+    wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
+
+    # reject invalid/unencoded wayback_url, behaving as if the resource weren't found
+    if not @url_regexp.match?(wayback_url)
+      @logger.warn("Skipped #{file_url}: invalid URL")
+      return :skipped_not_found
+    end
+
     request = Net::HTTP::Get.new(URI(wayback_url))
     request["Connection"] = "keep-alive"
     request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
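For example, the bracket escaping above turns a snapshot URL that `URI()` would reject into a parseable one (URL invented):

```ruby
wayback_url = "https://web.archive.org/web/20130101000000id_/http://example.com/img/photo[1].jpg"
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
# => "https://web.archive.org/web/20130101000000id_/http://example.com/img/photo%5B1%5D.jpg"
# URI(wayback_url) now parses instead of raising URI::InvalidURIError.
```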
@@ -616,8 +771,7 @@ class WaybackMachineDownloader
 
     response = connection.request(request)
 
-    case response
-    when Net::HTTPSuccess
+    save_response_body = lambda do
       File.open(file_path, "wb") do |file|
         body = response.body
         if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -627,26 +781,48 @@ class WaybackMachineDownloader
           gz.close
           file.write(decompressed_body)
         rescue Zlib::GzipFile::Error => e
-          @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
+          @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
           file.write(body)
         end
         else
           file.write(body) if body
         end
       end
-    when Net::HTTPRedirection
-      raise "Too many redirects for #{file_url}" if redirect_count >= 2
-      location = response['location']
-      @logger.warn("Redirect found for #{file_url} -> #{location}")
-      return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
-    when Net::HTTPTooManyRequests
-      sleep(RATE_LIMIT * 2)
-      raise "Rate limited, retrying..."
-    when Net::HTTPNotFound
-      @logger.warn("File not found, skipping: #{file_url}")
-      return
-    else
-      raise "HTTP Error: #{response.code} #{response.message}"
+    end
+
+    if @all
+      case response
+      when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
+        save_response_body.call
+        if response.is_a?(Net::HTTPRedirection)
+          @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
+        elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
+          @logger.info("Saved error page for #{file_url} (status #{response.code}).")
+        end
+        return :saved
+      else
+        # for any other response type when --all is true, treat as an error to be retried or failed
+        raise "Unhandled HTTP response: #{response.code} #{response.message}"
+      end
+    else # not @all (our default behavior)
+      case response
+      when Net::HTTPSuccess
+        save_response_body.call
+        return :saved
+      when Net::HTTPRedirection
+        raise "Too many redirects for #{file_url}" if redirect_count >= 2
+        location = response['location']
+        @logger.warn("Redirect found for #{file_url} -> #{location}")
+        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+      when Net::HTTPTooManyRequests
+        sleep(RATE_LIMIT * 2)
+        raise "Rate limited, retrying..."
+      when Net::HTTPNotFound
+        @logger.warn("File not found, skipping: #{file_url}")
+        return :skipped_not_found
+      else
+        raise "HTTP Error: #{response.code} #{response.message}"
+      end
     end
 
   rescue StandardError => e
@@ -4,6 +4,14 @@ require 'uri'
 module ArchiveAPI
 
   def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    # But don't do this when exact_url flag is set
+    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
+      url = "#{url}/*"
+    end
+
     request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)
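A quick sketch of the CDX request this builds once the `/*` has been appended (the domain is a placeholder, and only the two query parameters visible in the hunk are included here):

```ruby
require 'uri'

url = "example.com"                                 # no path after the domain, --exact-url not set
url = "#{url}/*" unless url.match(/^https?:\/\/.*\//i)

request_url = URI("https://web.archive.org/cdx/search/cdx")
request_url.query = URI.encode_www_form([["output", "json"], ["url", url]])

puts request_url
# => https://web.archive.org/cdx/search/cdx?output=json&url=example.com%2F*
```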
lib/wayback_machine_downloader/subdom_processor.rb (new file, 238 lines)

@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+
+module SubdomainProcessor
+  def process_subdomains
+    return unless @recursive_subdomains
+
+    puts "Starting subdomain processing..."
+
+    # extract base domain from the URL for comparison
+    base_domain = extract_base_domain(@base_url)
+    @processed_domains = Set.new([base_domain])
+    @subdomain_queue = Queue.new
+
+    # scan downloaded files for subdomain links
+    initial_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    puts "Scanning #{initial_files.size} downloaded files for subdomain links..."
+
+    subdomains_found = scan_files_for_subdomains(initial_files, base_domain)
+
+    if subdomains_found.empty?
+      puts "No subdomains found in downloaded content."
+      return
+    end
+
+    puts "Found #{subdomains_found.size} subdomains to process: #{subdomains_found.join(', ')}"
+
+    # add found subdomains to the queue
+    subdomains_found.each do |subdomain|
+      full_domain = "#{subdomain}.#{base_domain}"
+      @subdomain_queue << "https://#{full_domain}/"
+    end
+
+    # process the subdomain queue
+    download_subdomains(base_domain)
+
+    # after all downloads, rewrite all URLs to make local references
+    rewrite_subdomain_links(base_domain) if @rewrite
+  end
+
+  private
+
+  def extract_base_domain(url)
+    uri = URI.parse(url.gsub(/^https?:\/\//, '').split('/').first) rescue nil
+    return nil unless uri
+
+    host = uri.host || uri.path.split('/').first
+    host = host.downcase
+
+    # extract the base domain (e.g., "example.com" from "sub.example.com")
+    parts = host.split('.')
+    return host if parts.size <= 2
+
+    # for domains like co.uk, we want to keep the last 3 parts
+    if parts[-2].length <= 3 && parts[-1].length <= 3 && parts.size > 2
+      parts.last(3).join('.')
+    else
+      parts.last(2).join('.')
+    end
+  end
+
+  def scan_files_for_subdomains(files, base_domain)
+    return [] unless base_domain
+
+    subdomains = Set.new
+
+    files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+
+        # extract URLs from HTML href/src attributes
+        content.scan(/(?:href|src|action|data-src)=["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from CSS
+        content.scan(/url\(["']?https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+
+        # extract URLs from JavaScript strings
+        content.scan(/["']https?:\/\/([^\/."']+)\.#{Regexp.escape(base_domain)}[\/"]/) do |match|
+          subdomain = match[0].downcase
+          next if subdomain == 'www' # skip www subdomain
+          subdomains.add(subdomain)
+        end
+      rescue => e
+        puts "Error scanning file #{file_path}: #{e.message}"
+      end
+    end
+
+    subdomains.to_a
+  end
+
+  def download_subdomains(base_domain)
+    puts "Starting subdomain downloads..."
+    depth = 0
+    max_depth = @subdomain_depth || 1
+
+    while depth < max_depth && !@subdomain_queue.empty?
+      current_batch = []
+
+      # get all subdomains at current depth
+      while !@subdomain_queue.empty?
+        current_batch << @subdomain_queue.pop
+      end
+
+      puts "Processing #{current_batch.size} subdomains at depth #{depth + 1}..."
+
+      # download each subdomain
+      current_batch.each do |subdomain_url|
+        download_subdomain(subdomain_url, base_domain)
+      end
+
+      # if we need to go deeper, scan the newly downloaded files
+      if depth + 1 < max_depth
+        # get all files in the subdomains directory
+        new_files = Dir.glob(File.join(backup_path, "subdomains", "**/*.{html,htm,css,js}"))
+        new_subdomains = scan_files_for_subdomains(new_files, base_domain)
+
+        # filter out already processed subdomains
+        new_subdomains.each do |subdomain|
+          full_domain = "#{subdomain}.#{base_domain}"
+          unless @processed_domains.include?(full_domain)
+            @processed_domains.add(full_domain)
+            @subdomain_queue << "https://#{full_domain}/"
+          end
+        end
+
+        puts "Found #{@subdomain_queue.size} new subdomains at depth #{depth + 1}" if !@subdomain_queue.empty?
+      end
+
+      depth += 1
+    end
+  end
+
+  def download_subdomain(subdomain_url, base_domain)
+    begin
+      uri = URI.parse(subdomain_url)
+      subdomain_host = uri.host
+
+      # skip if already processed
+      if @processed_domains.include?(subdomain_host)
+        puts "Skipping already processed subdomain: #{subdomain_host}"
+        return
+      end
+
+      @processed_domains.add(subdomain_host)
+      puts "Downloading subdomain: #{subdomain_url}"
+
+      # create the directory for this subdomain
+      subdomain_dir = File.join(backup_path, "subdomains", subdomain_host)
+      FileUtils.mkdir_p(subdomain_dir)
+
+      # create subdomain downloader with appropriate options
+      subdomain_options = {
+        base_url: subdomain_url,
+        directory: subdomain_dir,
+        from_timestamp: @from_timestamp,
+        to_timestamp: @to_timestamp,
+        all: @all,
+        threads_count: @threads_count,
+        maximum_pages: [@maximum_pages / 2, 10].max,
+        rewrite: @rewrite,
+        # don't recursively process subdomains from here
+        recursive_subdomains: false
+      }
+
+      # download the subdomain content
+      subdomain_downloader = WaybackMachineDownloader.new(subdomain_options)
+      subdomain_downloader.download_files
+
+      puts "Completed download of subdomain: #{subdomain_host}"
+    rescue => e
+      puts "Error downloading subdomain #{subdomain_url}: #{e.message}"
+    end
+  end
+
+  def rewrite_subdomain_links(base_domain)
+    puts "Rewriting all files to use local subdomain references..."
+
+    all_files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js}"))
+    subdomains = @processed_domains.reject { |domain| domain == base_domain }
+
+    puts "Found #{all_files.size} files to check for rewriting"
+    puts "Will rewrite links for subdomains: #{subdomains.join(', ')}"
+
+    rewritten_count = 0
+
+    all_files.each do |file_path|
+      next unless File.exist?(file_path)
+
+      begin
+        content = File.read(file_path)
+        original_content = content.dup
+
+        # replace subdomain URLs with local paths
+        subdomains.each do |subdomain_host|
+          # for HTML attributes (href, src, etc.)
+          content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            prefix, path, suffix = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{prefix}../subdomains/#{subdomain_host}#{path}#{suffix}"
+          end
+
+          # for CSS url()
+          content.gsub!(/url\(\s*["']?https?:\/\/#{Regexp.escape(subdomain_host)}([^"'\)]*?)["']?\s*\)/i) do
+            path = $1
+            path = "/index.html" if path.empty? || path == "/"
+            "url(\"../subdomains/#{subdomain_host}#{path}\")"
+          end
+
+          # for JavaScript strings
+          content.gsub!(/(["'])https?:\/\/#{Regexp.escape(subdomain_host)}([^"']*)(["'])/i) do
+            quote_start, path, quote_end = $1, $2, $3
+            path = "/index.html" if path.empty? || path == "/"
+            "#{quote_start}../subdomains/#{subdomain_host}#{path}#{quote_end}"
+          end
+        end
+
+        # save if modified
+        if content != original_content
+          File.write(file_path, content)
+          rewritten_count += 1
+        end
+      rescue => e
+        puts "Error rewriting file #{file_path}: #{e.message}"
+      end
+    end
+
+    puts "Rewrote links in #{rewritten_count} files"
+  end
+end
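For reference, the expected behaviour of `extract_base_domain` (inputs invented; assumes the snippet is run from the repository root so the `require_relative` path resolves, and uses `send` because the helper is private):

```ruby
require 'uri'
require_relative 'lib/wayback_machine_downloader/subdom_processor'

demo = Object.new.extend(SubdomainProcessor)

demo.send(:extract_base_domain, "https://blog.example.com/post/1")
# => "example.com"
demo.send(:extract_base_domain, "https://shop.example.co.uk/")
# => "example.co.uk"  (keeps three labels when the last two look like a short TLD, e.g. co.uk)
```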
@@ -1,8 +1,6 @@
-require './lib/wayback_machine_downloader'
-
 Gem::Specification.new do |s|
   s.name = "wayback_machine_downloader_straw"
-  s.version = WaybackMachineDownloader::VERSION
+  s.version = "2.3.11"
   s.executables << "wayback_machine_downloader"
   s.summary = "Download an entire website from the Wayback Machine."
   s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
@@ -11,7 +9,7 @@ Gem::Specification.new do |s|
   s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
   s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
   s.license = "MIT"
-  s.required_ruby_version = ">= 1.9.2"
+  s.required_ruby_version = ">= 3.4.3"
   s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
   s.add_development_dependency "rake", "~> 12.2"
   s.add_development_dependency "minitest", "~> 5.2"