6 Commits

Author SHA1 Message Date
Felipe
61e22cfe25 Bump versions 2025-05-27 18:10:09 +00:00
Felipe
183ed61104 Attempt at fixing --all
I honestly don't recall if this was implemented in the original code, and I'm guessing this worked at *some point* during this fork. It seems to work correctly now, however. See #6 and #11
2025-05-27 17:17:34 +00:00
Felipe
e6ecf32a43 Dockerfile test 2
I really should not be using deprecated parameters.
2025-05-21 21:34:36 -03:00
Felipe
375c6314ad Dockerfile test
...again
2025-05-21 21:26:37 -03:00
Felipe
6e2739f5a8 Testing 2025-05-18 18:00:10 +00:00
Felipe
caba6a665f Rough attempt to make this more efficient 2025-05-18 17:52:28 +00:00
3 changed files with 59 additions and 28 deletions

View File

@@ -1,15 +1,15 @@
FROM ruby:3.4.3-alpine FROM ruby:3.4.4-alpine
USER root USER root
WORKDIR /build WORKDIR /build
COPY Gemfile /build/ COPY Gemfile /build/
COPY *.gemspec /build/ COPY *.gemspec /build/
RUN gem update \ RUN bundle config set jobs "$(nproc)" \
&& bundle config set jobs $(nproc) \ && bundle config set without 'development test' \
&& bundle install && bundle install
COPY . /build COPY . /build
WORKDIR / WORKDIR /
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ] ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]

View File

@@ -113,7 +113,7 @@ class WaybackMachineDownloader
include ArchiveAPI include ArchiveAPI
VERSION = "2.3.6" VERSION = "2.3.7"
DEFAULT_TIMEOUT = 30 DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3 MAX_RETRIES = 3
RETRY_DELAY = 2 RETRY_DELAY = 2
@@ -477,8 +477,8 @@ class WaybackMachineDownloader
begin begin
@connection_pool.with_connection do |connection| @connection_pool.with_connection do |connection|
result_message = download_file(file_remote_info, connection) result_message = download_file(file_remote_info, connection)
# for now, assume success if no exception and message doesn't indicate error/skip # assume download success if the result message contains ' -> '
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists') if result_message && result_message.include?(' -> ')
download_success = true download_success = true
end end
@download_mutex.synchronize do @download_mutex.synchronize do
@@ -659,11 +659,21 @@ class WaybackMachineDownloader
begin begin
structure_dir_path dir_path structure_dir_path dir_path
download_with_retry(file_path, file_url, file_timestamp, http) status = download_with_retry(file_path, file_url, file_timestamp, http)
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
rewrite_urls_to_relative(file_path) case status
when :saved
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
rewrite_urls_to_relative(file_path)
end
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
when :skipped_not_found
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
else
# ideally, this case should not be reached if download_with_retry behaves as expected.
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
end end
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
rescue StandardError => e rescue StandardError => e
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})" msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
if File.exist?(file_path) and File.size(file_path) == 0 if File.exist?(file_path) and File.size(file_path) == 0
@@ -714,8 +724,7 @@ class WaybackMachineDownloader
response = connection.request(request) response = connection.request(request)
case response save_response_body = lambda do
when Net::HTTPSuccess
File.open(file_path, "wb") do |file| File.open(file_path, "wb") do |file|
body = response.body body = response.body
if response['content-encoding'] == 'gzip' && body && !body.empty? if response['content-encoding'] == 'gzip' && body && !body.empty?
@@ -725,26 +734,48 @@ class WaybackMachineDownloader
gz.close gz.close
file.write(decompressed_body) file.write(decompressed_body)
rescue Zlib::GzipFile::Error => e rescue Zlib::GzipFile::Error => e
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}") @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
file.write(body) file.write(body)
end end
else else
file.write(body) if body file.write(body) if body
end end
end end
when Net::HTTPRedirection end
raise "Too many redirects for #{file_url}" if redirect_count >= 2
location = response['location'] if @all
@logger.warn("Redirect found for #{file_url} -> #{location}") case response
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1) when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
when Net::HTTPTooManyRequests save_response_body.call
sleep(RATE_LIMIT * 2) if response.is_a?(Net::HTTPRedirection)
raise "Rate limited, retrying..." @logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
when Net::HTTPNotFound elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
@logger.warn("File not found, skipping: #{file_url}") @logger.info("Saved error page for #{file_url} (status #{response.code}).")
return end
else return :saved
raise "HTTP Error: #{response.code} #{response.message}" else
# for any other response type when --all is true, treat as an error to be retried or failed
raise "Unhandled HTTP response: #{response.code} #{response.message}"
end
else # not @all (our default behavior)
case response
when Net::HTTPSuccess
save_response_body.call
return :saved
when Net::HTTPRedirection
raise "Too many redirects for #{file_url}" if redirect_count >= 2
location = response['location']
@logger.warn("Redirect found for #{file_url} -> #{location}")
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
when Net::HTTPTooManyRequests
sleep(RATE_LIMIT * 2)
raise "Rate limited, retrying..."
when Net::HTTPNotFound
@logger.warn("File not found, skipping: #{file_url}")
return :skipped_not_found
else
raise "HTTP Error: #{response.code} #{response.message}"
end
end end
rescue StandardError => e rescue StandardError => e

View File

@@ -1,6 +1,6 @@
Gem::Specification.new do |s| Gem::Specification.new do |s|
s.name = "wayback_machine_downloader_straw" s.name = "wayback_machine_downloader_straw"
s.version = "2.3.6" s.version = "2.3.7"
s.executables << "wayback_machine_downloader" s.executables << "wayback_machine_downloader"
s.summary = "Download an entire website from the Wayback Machine." s.summary = "Download an entire website from the Wayback Machine."
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)" s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"