7 Commits

Author SHA1 Message Date
Felipe
3fdfd70fc1 Bump version 2025-06-05 22:34:40 +00:00
Felipe
2bf74b4173 Merge pull request #14 from elidickinson/fix-bracket-urls
Fix bug with archive urls containing square brackets
2025-06-03 23:12:07 -03:00
Eli Dickinson
79cbb639e7 Fix bug with archive urls containing square brackets 2025-06-03 16:36:03 -04:00
Felipe
071d208b31 Merge pull request #13 from elidickinson/master
workaround for API only showing html files for some domains (fixes #6)
2025-05-30 14:34:32 -03:00
Eli Dickinson
1681a12579 workaround for API only showing html files for some domains
See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
2025-05-30 12:50:48 -04:00
Felipe
f38756dd76 Correction for downloaded data folder
if you downloaded content from example.org/*, it would be listed in a folder titled * instead of the sitename. See #6 (and thanks to elidickinson for pointing it out!)
2025-05-30 14:00:32 +00:00
Felipe
9452411e32 Added nil checks 2025-05-30 13:52:25 +00:00
3 changed files with 20 additions and 6 deletions

View File

@@ -113,7 +113,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
-  VERSION = "2.3.7"
+  VERSION = "2.3.8"
   DEFAULT_TIMEOUT = 30
   MAX_RETRIES = 3
   RETRY_DELAY = 2
@@ -154,10 +154,12 @@ class WaybackMachineDownloader
   end

   def backup_name
-    if @base_url.include? '//'
-      @base_url.split('/')[2]
+    url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url
+
+    if url_to_process.include? '//'
+      url_to_process.split('/')[2]
     else
-      @base_url
+      url_to_process
     end
   end
@@ -241,6 +243,7 @@ class WaybackMachineDownloader
     # Fetch the initial set of snapshots, sequentially
     @connection_pool.with_connection do |connection|
       initial_list = get_raw_list_from_api(@base_url, nil, connection)
+      initial_list ||= []
       mutex.synchronize do
         snapshot_list_to_consider.concat(initial_list)
         print "."
@@ -265,6 +268,7 @@ class WaybackMachineDownloader
           @connection_pool.with_connection do |connection|
             result = get_raw_list_from_api("#{@base_url}/*", page, connection)
           end
+          result ||= []
           [page, result]
         end
       end
@@ -284,7 +288,7 @@ class WaybackMachineDownloader
       # Process results and check for empty pages
       results.each do |page, result|
-        if result.empty?
+        if result.nil? || result.empty?
           continue_fetching = false
           break
         else
@@ -717,6 +721,9 @@ class WaybackMachineDownloader
           "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
         end
+
+        # Escape square brackets because they are not valid in URI()
+        wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
         request = Net::HTTP::Get.new(URI(wayback_url))
         request["Connection"] = "keep-alive"
         request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"

View File

@@ -4,6 +4,13 @@ require 'uri'
 module ArchiveAPI

   def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    if url && !url.match(/^https?:\/\/.*\//i)
+      url = "#{url}/*"
+    end
+
     request_url = URI("https://web.archive.org/cdx/search/cdx")
     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
     request_url.query = URI.encode_www_form(params)

View File

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = "wayback_machine_downloader_straw"
-  s.version = "2.3.7"
+  s.version = "2.3.8"
   s.executables << "wayback_machine_downloader"
   s.summary = "Download an entire website from the Wayback Machine."
   s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks a built-in export functionality; this gem does just that, allowing you to download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"