mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3fdfd70fc1 | ||
|
|
2bf74b4173 | ||
|
|
79cbb639e7 | ||
|
|
071d208b31 | ||
|
|
1681a12579 | ||
|
|
f38756dd76 | ||
|
|
9452411e32 | ||
|
|
61e22cfe25 | ||
|
|
183ed61104 | ||
|
|
e6ecf32a43 | ||
|
|
375c6314ad | ||
|
|
6e2739f5a8 | ||
|
|
caba6a665f |
@@ -1,15 +1,15 @@
|
|||||||
FROM ruby:3.4.3-alpine
|
FROM ruby:3.4.4-alpine
|
||||||
USER root
|
USER root
|
||||||
WORKDIR /build
|
WORKDIR /build
|
||||||
|
|
||||||
COPY Gemfile /build/
|
COPY Gemfile /build/
|
||||||
COPY *.gemspec /build/
|
COPY *.gemspec /build/
|
||||||
|
|
||||||
RUN gem update \
|
RUN bundle config set jobs "$(nproc)" \
|
||||||
&& bundle config set jobs $(nproc) \
|
&& bundle config set without 'development test' \
|
||||||
&& bundle install
|
&& bundle install
|
||||||
|
|
||||||
COPY . /build
|
COPY . /build
|
||||||
|
|
||||||
WORKDIR /
|
WORKDIR /
|
||||||
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ]
|
||||||
|
|||||||
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
|
|
||||||
VERSION = "2.3.6"
|
VERSION = "2.3.8"
|
||||||
DEFAULT_TIMEOUT = 30
|
DEFAULT_TIMEOUT = 30
|
||||||
MAX_RETRIES = 3
|
MAX_RETRIES = 3
|
||||||
RETRY_DELAY = 2
|
RETRY_DELAY = 2
|
||||||
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
if @base_url.include? '//'
|
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
||||||
@base_url.split('/')[2]
|
|
||||||
|
if url_to_process.include? '//'
|
||||||
|
url_to_process.split('/')[2]
|
||||||
else
|
else
|
||||||
@base_url
|
url_to_process
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
|
|||||||
# Fetch the initial set of snapshots, sequentially
|
# Fetch the initial set of snapshots, sequentially
|
||||||
@connection_pool.with_connection do |connection|
|
@connection_pool.with_connection do |connection|
|
||||||
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
||||||
|
initial_list ||= []
|
||||||
mutex.synchronize do
|
mutex.synchronize do
|
||||||
snapshot_list_to_consider.concat(initial_list)
|
snapshot_list_to_consider.concat(initial_list)
|
||||||
print "."
|
print "."
|
||||||
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
|
|||||||
@connection_pool.with_connection do |connection|
|
@connection_pool.with_connection do |connection|
|
||||||
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
||||||
end
|
end
|
||||||
|
result ||= []
|
||||||
[page, result]
|
[page, result]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
# Process results and check for empty pages
|
# Process results and check for empty pages
|
||||||
results.each do |page, result|
|
results.each do |page, result|
|
||||||
if result.empty?
|
if result.nil? || result.empty?
|
||||||
continue_fetching = false
|
continue_fetching = false
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
@@ -477,8 +481,8 @@ class WaybackMachineDownloader
|
|||||||
begin
|
begin
|
||||||
@connection_pool.with_connection do |connection|
|
@connection_pool.with_connection do |connection|
|
||||||
result_message = download_file(file_remote_info, connection)
|
result_message = download_file(file_remote_info, connection)
|
||||||
# for now, assume success if no exception and message doesn't indicate error/skip
|
# assume download success if the result message contains ' -> '
|
||||||
if result_message && !result_message.downcase.include?('error') && !result_message.downcase.include?('failed') && !result_message.downcase.include?('skipping') && !result_message.downcase.include?('already exists')
|
if result_message && result_message.include?(' -> ')
|
||||||
download_success = true
|
download_success = true
|
||||||
end
|
end
|
||||||
@download_mutex.synchronize do
|
@download_mutex.synchronize do
|
||||||
@@ -659,11 +663,21 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
begin
|
begin
|
||||||
structure_dir_path dir_path
|
structure_dir_path dir_path
|
||||||
download_with_retry(file_path, file_url, file_timestamp, http)
|
status = download_with_retry(file_path, file_url, file_timestamp, http)
|
||||||
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
|
||||||
rewrite_urls_to_relative(file_path)
|
case status
|
||||||
|
when :saved
|
||||||
|
if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i
|
||||||
|
rewrite_urls_to_relative(file_path)
|
||||||
|
end
|
||||||
|
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
|
when :skipped_not_found
|
||||||
|
"Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
|
else
|
||||||
|
# ideally, this case should not be reached if download_with_retry behaves as expected.
|
||||||
|
@logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}")
|
||||||
|
"Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
end
|
end
|
||||||
"#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})"
|
|
||||||
rescue StandardError => e
|
rescue StandardError => e
|
||||||
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})"
|
||||||
if File.exist?(file_path) and File.size(file_path) == 0
|
if File.exist?(file_path) and File.size(file_path) == 0
|
||||||
@@ -707,6 +721,9 @@ class WaybackMachineDownloader
|
|||||||
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Escape square brackets because they are not valid in URI()
|
||||||
|
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
||||||
|
|
||||||
request = Net::HTTP::Get.new(URI(wayback_url))
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
||||||
request["Connection"] = "keep-alive"
|
request["Connection"] = "keep-alive"
|
||||||
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
||||||
@@ -714,8 +731,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
response = connection.request(request)
|
response = connection.request(request)
|
||||||
|
|
||||||
case response
|
save_response_body = lambda do
|
||||||
when Net::HTTPSuccess
|
|
||||||
File.open(file_path, "wb") do |file|
|
File.open(file_path, "wb") do |file|
|
||||||
body = response.body
|
body = response.body
|
||||||
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
if response['content-encoding'] == 'gzip' && body && !body.empty?
|
||||||
@@ -725,26 +741,48 @@ class WaybackMachineDownloader
|
|||||||
gz.close
|
gz.close
|
||||||
file.write(decompressed_body)
|
file.write(decompressed_body)
|
||||||
rescue Zlib::GzipFile::Error => e
|
rescue Zlib::GzipFile::Error => e
|
||||||
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}")
|
@logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.")
|
||||||
file.write(body)
|
file.write(body)
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
file.write(body) if body
|
file.write(body) if body
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
when Net::HTTPRedirection
|
end
|
||||||
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
|
||||||
location = response['location']
|
if @all
|
||||||
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
case response
|
||||||
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError
|
||||||
when Net::HTTPTooManyRequests
|
save_response_body.call
|
||||||
sleep(RATE_LIMIT * 2)
|
if response.is_a?(Net::HTTPRedirection)
|
||||||
raise "Rate limited, retrying..."
|
@logger.info("Saved redirect page for #{file_url} (status #{response.code}).")
|
||||||
when Net::HTTPNotFound
|
elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError)
|
||||||
@logger.warn("File not found, skipping: #{file_url}")
|
@logger.info("Saved error page for #{file_url} (status #{response.code}).")
|
||||||
return
|
end
|
||||||
else
|
return :saved
|
||||||
raise "HTTP Error: #{response.code} #{response.message}"
|
else
|
||||||
|
# for any other response type when --all is true, treat as an error to be retried or failed
|
||||||
|
raise "Unhandled HTTP response: #{response.code} #{response.message}"
|
||||||
|
end
|
||||||
|
else # not @all (our default behavior)
|
||||||
|
case response
|
||||||
|
when Net::HTTPSuccess
|
||||||
|
save_response_body.call
|
||||||
|
return :saved
|
||||||
|
when Net::HTTPRedirection
|
||||||
|
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
||||||
|
location = response['location']
|
||||||
|
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
||||||
|
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
||||||
|
when Net::HTTPTooManyRequests
|
||||||
|
sleep(RATE_LIMIT * 2)
|
||||||
|
raise "Rate limited, retrying..."
|
||||||
|
when Net::HTTPNotFound
|
||||||
|
@logger.warn("File not found, skipping: #{file_url}")
|
||||||
|
return :skipped_not_found
|
||||||
|
else
|
||||||
|
raise "HTTP Error: #{response.code} #{response.message}"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
rescue StandardError => e
|
rescue StandardError => e
|
||||||
|
|||||||
@@ -4,6 +4,13 @@ require 'uri'
|
|||||||
module ArchiveAPI
|
module ArchiveAPI
|
||||||
|
|
||||||
def get_raw_list_from_api(url, page_index, http)
|
def get_raw_list_from_api(url, page_index, http)
|
||||||
|
# Automatically append /* if the URL doesn't contain a path after the domain
|
||||||
|
# This is a workaround for an issue with the API and *some* domains.
|
||||||
|
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
||||||
|
if url && !url.match(/^https?:\/\/.*\//i)
|
||||||
|
url = "#{url}/*"
|
||||||
|
end
|
||||||
|
|
||||||
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
||||||
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
||||||
request_url.query = URI.encode_www_form(params)
|
request_url.query = URI.encode_www_form(params)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
Gem::Specification.new do |s|
|
Gem::Specification.new do |s|
|
||||||
s.name = "wayback_machine_downloader_straw"
|
s.name = "wayback_machine_downloader_straw"
|
||||||
s.version = "2.3.6"
|
s.version = "2.3.8"
|
||||||
s.executables << "wayback_machine_downloader"
|
s.executables << "wayback_machine_downloader"
|
||||||
s.summary = "Download an entire website from the Wayback Machine."
|
s.summary = "Download an entire website from the Wayback Machine."
|
||||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||||
|
|||||||
Reference in New Issue
Block a user