mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3fdfd70fc1 | ||
|
|
2bf74b4173 | ||
|
|
79cbb639e7 | ||
|
|
071d208b31 | ||
|
|
1681a12579 | ||
|
|
f38756dd76 | ||
|
|
9452411e32 |
@@ -113,7 +113,7 @@ class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
|
||||
VERSION = "2.3.7"
|
||||
VERSION = "2.3.8"
|
||||
DEFAULT_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 2
|
||||
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def backup_name
|
||||
if @base_url.include? '//'
|
||||
@base_url.split('/')[2]
|
||||
url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
|
||||
|
||||
if url_to_process.include? '//'
|
||||
url_to_process.split('/')[2]
|
||||
else
|
||||
@base_url
|
||||
url_to_process
|
||||
end
|
||||
end
|
||||
|
||||
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
|
||||
# Fetch the initial set of snapshots, sequentially
|
||||
@connection_pool.with_connection do |connection|
|
||||
initial_list = get_raw_list_from_api(@base_url, nil, connection)
|
||||
initial_list ||= []
|
||||
mutex.synchronize do
|
||||
snapshot_list_to_consider.concat(initial_list)
|
||||
print "."
|
||||
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
|
||||
@connection_pool.with_connection do |connection|
|
||||
result = get_raw_list_from_api("#{@base_url}/*", page, connection)
|
||||
end
|
||||
result ||= []
|
||||
[page, result]
|
||||
end
|
||||
end
|
||||
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
|
||||
|
||||
# Process results and check for empty pages
|
||||
results.each do |page, result|
|
||||
if result.empty?
|
||||
if result.nil? || result.empty?
|
||||
continue_fetching = false
|
||||
break
|
||||
else
|
||||
@@ -717,6 +721,9 @@ class WaybackMachineDownloader
|
||||
"https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
|
||||
end
|
||||
|
||||
# Escape square brackets because they are not valid in URI()
|
||||
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
||||
|
||||
request = Net::HTTP::Get.new(URI(wayback_url))
|
||||
request["Connection"] = "keep-alive"
|
||||
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
||||
|
||||
@@ -4,6 +4,13 @@ require 'uri'
|
||||
module ArchiveAPI
|
||||
|
||||
def get_raw_list_from_api(url, page_index, http)
|
||||
# Automatically append /* if the URL doesn't contain a path after the domain
|
||||
# This is a workaround for an issue with the API and *some* domains.
|
||||
# See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
|
||||
if url && !url.match(/^https?:\/\/.*\//i)
|
||||
url = "#{url}/*"
|
||||
end
|
||||
|
||||
request_url = URI("https://web.archive.org/cdx/search/cdx")
|
||||
params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
|
||||
request_url.query = URI.encode_www_form(params)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
Gem::Specification.new do |s|
|
||||
s.name = "wayback_machine_downloader_straw"
|
||||
s.version = "2.3.7"
|
||||
s.version = "2.3.8"
|
||||
s.executables << "wayback_machine_downloader"
|
||||
s.summary = "Download an entire website from the Wayback Machine."
|
||||
s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
|
||||
|
||||
Reference in New Issue
Block a user