Merge pull request #27 from adampweb/master

Refactored huge functions & cleanup
This commit is contained in:
Felipe 2025-07-29 18:09:51 -03:00 committed by GitHub
commit 754df6b8d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 117 additions and 112 deletions

View File

@ -6,10 +6,9 @@ COPY Gemfile /build/
COPY *.gemspec /build/ COPY *.gemspec /build/
RUN bundle config set jobs "$(nproc)" \ RUN bundle config set jobs "$(nproc)" \
&& bundle config set without 'development test' \
&& bundle install && bundle install
COPY . /build COPY . /build
WORKDIR / WORKDIR /build
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ] ENTRYPOINT [ "/build/bin/wayback_machine_downloader", "--directory", "/build/websites" ]

View File

@ -81,36 +81,19 @@ services:
tty: true tty: true
image: wayback_machine_downloader:latest image: wayback_machine_downloader:latest
container_name: wayback_machine_downloader container_name: wayback_machine_downloader
environment:
- ENVIRONMENT=${ENVIRONMENT:-development}
- OPTIONS=${OPTIONS:-""}
- TARGET_URL=${TARGET_URL}
volumes: volumes:
- .:/build:rw - .:/build:rw
- ./websites:/build/websites:rw - ./websites:/build/websites:rw
command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
``` ```
#### Usage: #### Usage:
Now You can create a Docker image as named "wayback_machine_downloader" with the following command: Now you can create a Docker image as named "wayback_machine_downloader" with the following command:
```bash ```bash
docker compose up -d --build docker compose up -d --build
``` ```
After that you must set TARGET_URL environment variable:
```bash
export TARGET_URL="https://example.com/"
```
The **OPTIONS** env. variable is optional this may include additional settings which are found in the "**Advanced usage**" section below.
Example:
```bash
export OPTIONS="--list -f 20060121"
```
After that you can run the existing container with the following command: After that you can run the existing container with the following command:
```bash ```bash
docker compose run --rm wayback_machine_downloader https://example.com docker compose run --rm wayback_machine_downloader https://example.com [options]
``` ```
## ⚙️ Configuration ## ⚙️ Configuration

View File

@ -5,11 +5,6 @@ services:
tty: true tty: true
image: wayback_machine_downloader:latest image: wayback_machine_downloader:latest
container_name: wayback_machine_downloader container_name: wayback_machine_downloader
environment:
- ENVIRONMENT=${DEVELOPMENT:-production}
- OPTIONS=${OPTIONS:-""}
- TARGET_URL=${TARGET_URL}
volumes: volumes:
- .:/build:rw - .:/build:rw
- ./websites:/websites:rw - ./websites:/build/websites:rw
command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}

View File

@ -15,6 +15,7 @@ require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex' require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api' require_relative 'wayback_machine_downloader/archive_api'
require_relative 'wayback_machine_downloader/subdom_processor' require_relative 'wayback_machine_downloader/subdom_processor'
require_relative 'wayback_machine_downloader/url_rewrite'
class ConnectionPool class ConnectionPool
MAX_AGE = 300 MAX_AGE = 300
@ -474,6 +475,39 @@ class WaybackMachineDownloader
end end
end end
# Enqueues one download job per remote file onto the given thread pool.
#
# pool             - a Concurrent::FixedThreadPool (or compatible) that accepts #post
# files_to_process - enumerable of file-info hashes; each is read for :file_id
#                    and :file_url (other keys are passed through to download_file)
#
# Each job checks out a connection from @connection_pool, downloads the file,
# and updates the shared progress counter under @download_mutex. The file id is
# recorded via append_to_db only after a successful download, and only after the
# pooled connection has been released. Errors are logged and still advance the
# progress counter so the total stays consistent.
# NOTE(review): success is inferred from the ' -> ' marker in download_file's
# result message — confirm that contract against download_file.
def processing_files(pool, files_to_process)
  files_to_process.each do |file_remote_info|
    pool.post do
      download_success = false
      begin
        @connection_pool.with_connection do |connection|
          result_message = download_file(file_remote_info, connection)
          # assume download success if the result message contains ' -> '
          if result_message && result_message.include?(' -> ')
            download_success = true
          end
          @download_mutex.synchronize do
            @processed_file_count += 1
            # adjust progress message to reflect remaining files
            progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
            puts progress_message if progress_message
          end
        end
        # append to DB only after successful download, outside the connection block
        if download_success
          append_to_db(file_remote_info[:file_id])
        end
      rescue => e
        @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
        @download_mutex.synchronize do
          @processed_file_count += 1
        end
      end
      # throttle requests between jobs to stay polite to the archive servers
      sleep(RATE_LIMIT)
    end
  end
end
def download_files def download_files
start_time = Time.now start_time = Time.now
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives." puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
@ -520,36 +554,7 @@ class WaybackMachineDownloader
thread_count = [@threads_count, CONNECTION_POOL_SIZE].min thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
pool = Concurrent::FixedThreadPool.new(thread_count) pool = Concurrent::FixedThreadPool.new(thread_count)
files_to_process.each do |file_remote_info| processing_files(pool, files_to_process)
pool.post do
download_success = false
begin
@connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection)
# assume download success if the result message contains ' -> '
if result_message && result_message.include?(' -> ')
download_success = true
end
@download_mutex.synchronize do
@processed_file_count += 1
# adjust progress message to reflect remaining files
progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
puts progress_message if progress_message
end
end
# sppend to DB only after successful download outside the connection block
if download_success
append_to_db(file_remote_info[:file_id])
end
rescue => e
@logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
@download_mutex.synchronize do
@processed_file_count += 1
end
end
sleep(RATE_LIMIT)
end
end
pool.shutdown pool.shutdown
pool.wait_for_termination pool.wait_for_termination
@ -609,64 +614,13 @@ class WaybackMachineDownloader
end end
# URLs in HTML attributes # URLs in HTML attributes
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do rewrite_html_attr_urls(content)
prefix, url, suffix = $1, $2, $3
if url.start_with?('http')
begin
uri = URI.parse(url)
path = uri.path
path = path[1..-1] if path.start_with?('/')
"#{prefix}#{path}#{suffix}"
rescue
"#{prefix}#{url}#{suffix}"
end
elsif url.start_with?('/')
"#{prefix}./#{url[1..-1]}#{suffix}"
else
"#{prefix}#{url}#{suffix}"
end
end
# URLs in CSS # URLs in CSS
content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do rewrite_css_urls(content)
url = $1
if url.start_with?('http')
begin
uri = URI.parse(url)
path = uri.path
path = path[1..-1] if path.start_with?('/')
"url(\"#{path}\")"
rescue
"url(\"#{url}\")"
end
elsif url.start_with?('/')
"url(\"./#{url[1..-1]}\")"
else
"url(\"#{url}\")"
end
end
# URLs in JavaScript # URLs in JavaScript
content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do rewrite_js_urls(content)
quote_start, url, quote_end = $1, $2, $3
if url.start_with?('http')
begin
uri = URI.parse(url)
path = uri.path
path = path[1..-1] if path.start_with?('/')
"#{quote_start}#{path}#{quote_end}"
rescue
"#{quote_start}#{url}#{quote_end}"
end
elsif url.start_with?('/')
"#{quote_start}./#{url[1..-1]}#{quote_end}"
else
"#{quote_start}#{url}#{quote_end}"
end
end
# for URLs in HTML attributes that start with a single slash # for URLs in HTML attributes that start with a single slash
content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do

View File

@ -0,0 +1,74 @@
# frozen_string_literal: true
# Rewrites Wayback Machine snapshot URLs found in HTML attributes
# (href/src/action/data-src/data-url) into local relative paths.
# Mutates +content+ in place and returns it. Absolute originals keep only
# their URI path (query strings are dropped); unparsable URLs are left as-is.
def rewrite_html_attr_urls(content)
  content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    attr_open  = Regexp.last_match(1)
    target     = Regexp.last_match(2)
    attr_close = Regexp.last_match(3)
    rewritten =
      if target.start_with?('http')
        begin
          local = URI.parse(target).path
          local.start_with?('/') ? local[1..-1] : local
        rescue
          # malformed URL — keep the captured text untouched
          target
        end
      elsif target.start_with?('/')
        # root-relative link: make it explicitly relative to the current dir
        "./#{target[1..-1]}"
      else
        target
      end
    "#{attr_open}#{rewritten}#{attr_close}"
  end
  content
end
# Rewrites Wayback Machine snapshot URLs inside CSS url(...) expressions
# into local relative paths. Mutates +content+ in place and returns it.
# Output is always normalized to double-quoted form: url("...").
def rewrite_css_urls(content)
  content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do
    target = Regexp.last_match(1)
    local =
      if target.start_with?('http')
        begin
          stripped = URI.parse(target).path
          stripped.start_with?('/') ? stripped[1..-1] : stripped
        rescue
          # malformed URL — keep the captured text untouched
          target
        end
      elsif target.start_with?('/')
        # root-relative reference: make it explicitly relative
        "./#{target[1..-1]}"
      else
        target
      end
    "url(\"#{local}\")"
  end
  content
end
# Rewrites Wayback Machine snapshot URLs appearing in quoted JavaScript
# string literals into local relative paths. Mutates +content+ in place and
# returns it; the original surrounding quote characters are preserved.
def rewrite_js_urls(content)
  content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do
    open_quote  = Regexp.last_match(1)
    target      = Regexp.last_match(2)
    close_quote = Regexp.last_match(3)
    rewritten =
      if target.start_with?('http')
        begin
          local = URI.parse(target).path
          local.start_with?('/') ? local[1..-1] : local
        rescue
          # malformed URL — keep the captured text untouched
          target
        end
      elsif target.start_with?('/')
        # root-relative reference: make it explicitly relative
        "./#{target[1..-1]}"
      else
        target
      end
    "#{open_quote}#{rewritten}#{close_quote}"
  end
  content
end