mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
21 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ab4324c0eb | ||
|
|
e28d7d578b | ||
|
|
a7a25574cf | ||
|
|
23cc3d69b1 | ||
|
|
01fa1f8c9f | ||
|
|
d2f98d9428 | ||
|
|
c7a5381eaf | ||
|
|
9709834e20 | ||
|
|
77998372cb | ||
|
|
2c789b7df6 | ||
|
|
1ef8c14c48 | ||
|
|
780e45343f | ||
|
|
42e6d62284 | ||
|
|
543161d7fb | ||
|
|
99a6de981e | ||
|
|
d85c880d23 | ||
|
|
917f4f8798 | ||
|
|
787bc2e535 | ||
|
|
4db13a7792 | ||
|
|
13e88ce04a | ||
|
|
c7fc7c7b58 |
10
Dockerfile
10
Dockerfile
@@ -1,11 +1,15 @@
|
|||||||
FROM ruby:3.1.6-alpine
|
FROM ruby:3.4.3-alpine
|
||||||
USER root
|
USER root
|
||||||
WORKDIR /build
|
WORKDIR /build
|
||||||
COPY . /build
|
|
||||||
|
COPY Gemfile /build/
|
||||||
|
COPY *.gemspec /build/
|
||||||
|
|
||||||
RUN gem update \
|
RUN gem update \
|
||||||
&& gem install concurrent-ruby \
|
&& bundle config set jobs $(nproc) \
|
||||||
&& bundle install
|
&& bundle install
|
||||||
|
|
||||||
|
COPY . /build
|
||||||
|
|
||||||
WORKDIR /
|
WORKDIR /
|
||||||
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
||||||
46
README.md
46
README.md
@@ -62,6 +62,12 @@ docker build -t wayback_machine_downloader .
|
|||||||
docker run -it --rm wayback_machine_downloader [options] URL
|
docker run -it --rm wayback_machine_downloader [options] URL
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Or, as an example that does not require cloning the repo, fetch smallrockets.com up to the year 2013:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
|
||||||
|
```
|
||||||
|
|
||||||
### 🐳 Using Docker Compose
|
### 🐳 Using Docker Compose
|
||||||
|
|
||||||
We can also use it with Docker Compose, which provides a lot of benefits for extending more functionalities (such as implementing storing previous downloads in a database):
|
We can also use it with Docker Compose, which provides a lot of benefits for extending more functionalities (such as implementing storing previous downloads in a database):
|
||||||
@@ -74,21 +80,49 @@ services:
|
|||||||
tty: true
|
tty: true
|
||||||
image: wayback_machine_downloader:latest
|
image: wayback_machine_downloader:latest
|
||||||
container_name: wayback_machine_downloader
|
container_name: wayback_machine_downloader
|
||||||
|
environment:
|
||||||
|
- ENVIRONMENT=${ENVIRONMENT:-development}
|
||||||
|
- OPTIONS=${OPTIONS:-""}
|
||||||
|
- TARGET_URL=${TARGET_URL}
|
||||||
volumes:
|
volumes:
|
||||||
- .:/build:rw
|
- .:/build:rw
|
||||||
- ./websites:/build/websites:rw
|
- ./websites:/build/websites:rw
|
||||||
|
command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
|
||||||
|
```
|
||||||
|
#### Usage:
|
||||||
|
Now you can create a Docker image named "wayback_machine_downloader" with the following command:
|
||||||
|
```bash
|
||||||
|
docker compose up -d --build
|
||||||
|
```
|
||||||
|
|
||||||
|
After that, you must set the TARGET_URL environment variable:
|
||||||
|
```bash
|
||||||
|
export TARGET_URL="https://example.com/"
|
||||||
|
```
|
||||||
|
|
||||||
|
The **OPTIONS** environment variable is optional; it may include additional settings, which are described in the "**Advanced usage**" section below.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
```bash
|
||||||
|
export OPTIONS="--list -f 20060121"
|
||||||
|
```
|
||||||
|
|
||||||
|
After that, you can run the existing container with the following command:
|
||||||
|
```bash
|
||||||
|
docker compose run --rm wayback_machine_downloader https://example.com
|
||||||
```
|
```
|
||||||
|
|
||||||
## ⚙️ Configuration
|
## ⚙️ Configuration
|
||||||
There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
|
There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
|
||||||
|
|
||||||
```ruby
|
```ruby
|
||||||
DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
|
DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
|
||||||
MAX_RETRIES = 3 # Failed request retries
|
MAX_RETRIES = 3 # Number of times to retry failed requests
|
||||||
RETRY_DELAY = 2 # Wait between retries
|
RETRY_DELAY = 2 # Wait time between retries (seconds)
|
||||||
RATE_LIMIT = 0.25 # Throttle between requests
|
RATE_LIMIT = 0.25 # Throttle between requests (seconds)
|
||||||
CONNECTION_POOL_SIZE = 10 # No. of simultaneous connections
|
CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
|
||||||
MEMORY_BUFFER_SIZE = 16384 # Size of download buffer
|
MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
|
||||||
|
STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
|
||||||
|
STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🛠️ Advanced usage
|
## 🛠️ Advanced usage
|
||||||
|
|||||||
@@ -62,6 +62,10 @@ option_parser = OptionParser.new do |opts|
|
|||||||
options[:rewritten] = true
|
options[:rewritten] = true
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
|
||||||
|
options[:rewrite] = true
|
||||||
|
end
|
||||||
|
|
||||||
opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
|
opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
|
||||||
options[:reset] = true
|
options[:reset] = true
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -6,8 +6,10 @@ services:
|
|||||||
image: wayback_machine_downloader:latest
|
image: wayback_machine_downloader:latest
|
||||||
container_name: wayback_machine_downloader
|
container_name: wayback_machine_downloader
|
||||||
environment:
|
environment:
|
||||||
- ENVIRONMENT=${ENVIRONMENT}
|
- ENVIRONMENT=${DEVELOPMENT:-production}
|
||||||
|
- OPTIONS=${OPTIONS:-""}
|
||||||
|
- TARGET_URL=${TARGET_URL}
|
||||||
volumes:
|
volumes:
|
||||||
- .:/build:rw
|
- .:/build:rw
|
||||||
- ./websites:/build/websites:rw
|
- ./websites:/websites:rw
|
||||||
command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
|
command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
|
||||||
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
|
|
||||||
VERSION = "2.3.4"
|
VERSION = "2.3.6"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -125,7 +125,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep
|
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
validate_params(params)
|
validate_params(params)
|
||||||
@@ -148,6 +148,7 @@ class WaybackMachineDownloader
|
|||||||
@failed_downloads = Concurrent::Array.new
|
@failed_downloads = Concurrent::Array.new
|
||||||
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
|
@connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
|
||||||
@db_mutex = Mutex.new
|
@db_mutex = Mutex.new
|
||||||
|
@rewrite = params[:rewrite] || false
|
||||||
|
|
||||||
handle_reset
|
handle_reset
|
||||||
end
|
end
|
||||||
@@ -533,15 +534,109 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Rewrites Wayback Machine URLs inside a downloaded file so they point at
# relative paths, then saves the file in place.
#
# Five passes are applied, in this order:
#   1. archived URLs in HTML attributes (href, src, action, data-src, data-url)
#   2. archived URLs in CSS `url(...)` references
#   3. any remaining quoted archived URLs (e.g. in JavaScript strings)
#   4. HTML attributes whose value starts with a single slash
#   5. CSS `url(...)` references whose value starts with a single slash
#
# The bytes of the file are never transcoded: the content is only *labelled*
# with an encoding so the ASCII-only patterns below can be matched against it.
#
# Rewriting is a best-effort post-processing step. Any failure is logged as a
# warning through @logger instead of being raised, so a file that was already
# downloaded is not reported as a failed download by the caller.
#
# @param file_path [String] path of the HTML/CSS/JS file to rewrite
# @return [void]
def rewrite_urls_to_relative(file_path)
  return unless File.exist?(file_path)

  content = File.binread(file_path)
  apply_content_encoding(content, File.extname(file_path).downcase)

  # URLs in HTML attributes
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}#{relativize_archived_url(m[2])}#{m[3]}"
  end

  # URLs in CSS
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
    "url(\"#{relativize_archived_url(Regexp.last_match(1))}\")"
  end

  # URLs in JavaScript
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}#{relativize_archived_url(m[2])}#{m[3]}"
  end

  # for URLs in HTML attributes that start with a single slash
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do
    m = Regexp.last_match
    "#{m[1]}./#{m[2]}#{m[3]}"
  end

  # for URLs in CSS that start with a single slash
  content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do
    "url(\"./#{Regexp.last_match(1)}\")"
  end

  # save the modified content back to the file
  File.binwrite(file_path, content)
  puts "Rewrote URLs in #{file_path} to be relative."
rescue StandardError => e
  # Covers a file that vanished or is unreadable/unwritable as well as any
  # unexpected matching error; the downloaded file is left as it was.
  @logger.warn("Error rewriting URLs in #{file_path}: #{e.message}")
end

# Internal helper: labels +content+ (mutated in place) with the encoding used
# for pattern matching.
#
# HTML files use the charset declared in a `<meta charset=...>` tag when one
# is present; everything else is treated as UTF-8. If the declared name is
# unknown, UTF-8 is used. If the resulting label cannot be matched safely
# (the bytes are invalid for it, or the encoding is not ASCII-compatible,
# e.g. UTF-16), the content is labelled as binary instead, which lets the
# ASCII-only patterns run without raising.
#
# @param content [String] raw file bytes
# @param file_ext [String] lower-cased file extension including the dot
# @return [String] the same +content+ object
def apply_content_encoding(content, file_ext)
  declared = 'UTF-8'
  if file_ext == '.html' || file_ext == '.htm'
    declared = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8'
  end

  begin
    content.force_encoding(declared)
  rescue ArgumentError
    # unknown encoding name in the meta tag
    content.force_encoding('UTF-8')
  end

  unless content.encoding.ascii_compatible? && content.valid_encoding?
    content.force_encoding(Encoding::BINARY)
  end
  content
end

# Internal helper: maps the part of an archived URL that follows the
# `web.archive.org/web/<timestamp>/` prefix to a relative reference.
#
# * absolute `http...` URL -> its path without the leading slash
#   (the URL itself, unchanged, if it cannot be parsed)
# * root-relative `/path`  -> `./path`
# * anything else          -> returned unchanged
#
# @param url [String] captured remainder of the archived URL
# @return [String] replacement text for that URL
def relativize_archived_url(url)
  if url.start_with?('http')
    begin
      path = URI.parse(url).path
      path.start_with?('/') ? path[1..-1] : path
    rescue StandardError
      url
    end
  elsif url.start_with?('/')
    "./#{url[1..-1]}"
  else
    url
  end
end
|
||||||
|
|
||||||
def download_file (file_remote_info, http)
|
def download_file (file_remote_info, http)
|
||||||
current_encoding = "".encoding
|
current_encoding = "".encoding
|
||||||
file_url = file_remote_info[:file_url].encode(current_encoding)
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
||||||
file_id = file_remote_info[:file_id]
|
file_id = file_remote_info[:file_id]
|
||||||
file_timestamp = file_remote_info[:timestamp]
|
file_timestamp = file_remote_info[:timestamp]
|
||||||
original_file_id = @all_timestamps ? file_id.split('/', 2)[1] : file_id
|
file_path_elements = file_id.split('/')
|
||||||
file_path_elements = original_file_id.split('/')
|
|
||||||
|
|
||||||
if original_file_id == ""
|
if file_id == ""
|
||||||
dir_path = backup_path
|
dir_path = backup_path
|
||||||
file_path = backup_path + 'index.html'
|
file_path = backup_path + 'index.html'
|
||||||
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
|
||||||
@@ -565,10 +660,13 @@ class WaybackMachineDownloader
|
|||||||
begin
|
begin
|
||||||
structure_dir_path dir_path
|
structure_dir_path dir_path
|
||||||
download_with_retry(file_path, file_url, file_timestamp, http)
|
download_with_retry(file_path, file_url, file_timestamp, http)
|
||||||
|
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
||||||
|
rewrite_urls_to_relative(file_path)
|
||||||
|
end
|
||||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
rescue StandardError => e
|
rescue StandardError => e
|
||||||
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
if File.exist?(file_path) and File.size(file_path) == 0
|
||||||
File.delete(file_path)
|
File.delete(file_path)
|
||||||
msg += "\n#{file_path} was empty and was removed."
|
msg += "\n#{file_path} was empty and was removed."
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
require './lib/wayback_machine_downloader'
|
|
||||||
|
|
||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = WaybackMachineDownloader::VERSION
|
s.version = "2.3.6"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
@@ -11,7 +9,7 @@ Gem::Specification.new do |s|
|
|||||||
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
|
s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
|
||||||
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
|
||||||
s.license = "MIT"
|
s.license = "MIT"
|
||||||
s.required_ruby_version = ">= 1.9.2"
|
s.required_ruby_version = ">= 3.4.3"
|
||||||
s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
|
s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
|
||||||
s.add_development_dependency "rake", "~> 12.2"
|
s.add_development_dependency "rake", "~> 12.2"
|
||||||
s.add_development_dependency "minitest", "~> 5.2"
|
s.add_development_dependency "minitest", "~> 5.2"
|
||||||
|
|||||||
Reference in New Issue
Block a user