mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
61e22cfe25 | ||
|
|
183ed61104 | ||
|
|
e6ecf32a43 | ||
|
|
375c6314ad | ||
|
|
6e2739f5a8 | ||
|
|
caba6a665f |
@@ -1,12 +1,12 @@
|
||||
FROM ruby:3.4.3-alpine
|
||||
FROM ruby:3.4.4-alpine
|
||||
USER root
|
||||
WORKDIR /build
|
||||
|
||||
COPY Gemfile /build/
|
||||
COPY *.gemspec /build/
|
||||
|
||||
RUN gem update \
|
||||
&& bundle config set jobs $(nproc) \
|
||||
RUN bundle config set jobs "$(nproc)" \
|
||||
&& bundle config set without 'development test' \
|
||||
&& bundle install
|
||||
|
||||
COPY . /build
|
||||
|
||||
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
|
||||
VERSION = "2.3.6"
|
||||
VERSION = "2.3.7"
|
||||
DEFAULT_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 2
|
||||
@@ -477,8 +477,8 @@ class WaybackMachineDownloader
|
||||
begin
|
||||
@connection_pool.with_connection do |connection|
|
||||
result_message = download_file(file_remote_info, connection)
|
||||
# for now, assume success if no exception and message doesn't indicate error/skip
|
||||
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
|
||||
# assume download success if the result message contains ' -> '
|
||||
if result_message && result_message.include?(' -> ')
|
||||
download_success = true
|
||||
end
|
||||
@download_mutex.synchronize do
|
||||
@@ -659,11 +659,21 @@ class WaybackMachineDownloader
|
||||
|
||||
begin
|
||||
structure_dir_path dir_path
|
||||
download_with_retry(file_path, file_url, file_timestamp, http)
|
||||
status = download_with_retry(file_path, file_url, file_timestamp, http)
|
||||
|
||||
case status
|
||||
when :saved
|
||||
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
||||
rewrite_urls_to_relative(file_path)
|
||||
end
|
||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
when :skipped_not_found
|
||||
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
else
|
||||
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
||||
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
||||
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
end
|
||||
rescue StandardError => e
|
||||
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||
if File.exist?(file_path) and File.size(file_path) == 0
|
||||
@@ -714,8 +724,7 @@ class WaybackMachineDownloader
|
||||
|
||||
response = connection.request(request)
|
||||
|
||||
case response
|
||||
when Net::HTTPSuccess
|
||||
save_response_body = lambda do
|
||||
File.open(file_path, "wb") do |file|
|
||||
body = response.body
|
||||
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
||||
@@ -725,13 +734,34 @@ class WaybackMachineDownloader
|
||||
gz.close
|
||||
file.write(decompressed_body)
|
||||
rescue Zlib::GzipFile::Error => e
|
||||
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
|
||||
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
|
||||
file.write(body)
|
||||
end
|
||||
else
|
||||
file.write(body) if body
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if @all
|
||||
case response
|
||||
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
|
||||
save_response_body.call
|
||||
if response.is_a?(Net::HTTPRedirection)
|
||||
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
|
||||
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
|
||||
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
|
||||
end
|
||||
return :saved
|
||||
else
|
||||
# for any other response type when --all is true, treat as an error to be retried or failed
|
||||
raise "Unhandled HTTP response: #{response.code} #{response.message}"
|
||||
end
|
||||
else # not @all (our default behavior)
|
||||
case response
|
||||
when Net::HTTPSuccess
|
||||
save_response_body.call
|
||||
return :saved
|
||||
when Net::HTTPRedirection
|
||||
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
||||
location = response['location']
|
||||
@@ -742,10 +772,11 @@ class WaybackMachineDownloader
|
||||
raise "Rate limited, retrying..."
|
||||
when Net::HTTPNotFound
|
||||
@logger.warn("File not found, skipping: #{file_url}")
|
||||
return
|
||||
return :skipped_not_found
|
||||
else
|
||||
raise "HTTP Error: #{response.code} #{response.message}"
|
||||
end
|
||||
end
|
||||
|
||||
rescue StandardError => e
|
||||
if retries < MAX_RETRIES
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
Gem::Specification.new do |s|
|
||||
s.name = "wayback_machine_downloader_straw"
|
||||
s.version = "2.3.6"
|
||||
s.version = "2.3.7"
|
||||
s.executables << "wayback_machine_downloader"
|
||||
s.summary = "Download an entire website from the Wayback Machine."
|
||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||
|
||||
Reference in New Issue
Block a user