mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00
Using Net::HTTP and decompressing gzip content
see https://github.com/ShiftaDeband/wayback-machine-downloader and bf6e33c2fe
This commit is contained in:
parent
466228fee4
commit
cff30f529e
@ -6,6 +6,8 @@ require 'open-uri'
|
|||||||
require 'fileutils'
|
require 'fileutils'
|
||||||
require 'cgi'
|
require 'cgi'
|
||||||
require 'json'
|
require 'json'
|
||||||
|
require 'zlib'
|
||||||
|
require 'stringio'
|
||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
require_relative 'wayback_machine_downloader/to_regex'
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
require_relative 'wayback_machine_downloader/archive_api'
|
require_relative 'wayback_machine_downloader/archive_api'
|
||||||
@ -14,7 +16,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
include ArchiveAPI
|
include ArchiveAPI
|
||||||
|
|
||||||
VERSION = "2.3.1"
|
VERSION = "2.3.2"
|
||||||
|
|
||||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
@ -81,22 +83,33 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
|
|
||||||
def get_all_snapshots_to_consider
|
def get_all_snapshots_to_consider
|
||||||
# Note: Passing a page index parameter allow us to get more snapshots,
|
http = Net::HTTP.new("web.archive.org", 443)
|
||||||
# but from a less fresh index
|
http.use_ssl = true
|
||||||
print "Getting snapshot pages"
|
|
||||||
snapshot_list_to_consider = []
|
snapshot_list_to_consider = []
|
||||||
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
|
||||||
|
http.start do
|
||||||
|
puts "Getting snapshot pages"
|
||||||
|
|
||||||
|
# Fetch the initial set of snapshots
|
||||||
|
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
|
||||||
print "."
|
print "."
|
||||||
|
|
||||||
|
# Fetch additional pages if the exact URL flag is not set
|
||||||
unless @exact_url
|
unless @exact_url
|
||||||
@maximum_pages.times do |page_index|
|
@maximum_pages.times do |page_index|
|
||||||
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
|
snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, http)
|
||||||
break if snapshot_list.empty?
|
break if snapshot_list.empty?
|
||||||
|
|
||||||
snapshot_list_to_consider += snapshot_list
|
snapshot_list_to_consider += snapshot_list
|
||||||
print "."
|
print "."
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
puts " found #{snapshot_list_to_consider.length} snaphots to consider."
|
end
|
||||||
|
|
||||||
|
puts " found #{snapshot_list_to_consider.length} snapshots to consider."
|
||||||
puts
|
puts
|
||||||
|
|
||||||
snapshot_list_to_consider
|
snapshot_list_to_consider
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -206,11 +219,15 @@ class WaybackMachineDownloader
|
|||||||
@processed_file_count = 0
|
@processed_file_count = 0
|
||||||
@threads_count = 1 unless @threads_count != 0
|
@threads_count = 1 unless @threads_count != 0
|
||||||
@threads_count.times do
|
@threads_count.times do
|
||||||
|
http = Net::HTTP.new("web.archive.org", 443)
|
||||||
|
http.use_ssl = true
|
||||||
|
http.start()
|
||||||
threads << Thread.new do
|
threads << Thread.new do
|
||||||
until file_queue.empty?
|
until file_queue.empty?
|
||||||
file_remote_info = file_queue.pop(true) rescue nil
|
file_remote_info = file_queue.pop(true) rescue nil
|
||||||
download_file(file_remote_info) if file_remote_info
|
download_file(file_remote_info, http) if file_remote_info
|
||||||
end
|
end
|
||||||
|
http.finish()
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -243,7 +260,7 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def download_file file_remote_info
|
def download_file (file_remote_info, http)
|
||||||
current_encoding = "".encoding
|
current_encoding = "".encoding
|
||||||
file_url = file_remote_info[:file_url].encode(current_encoding)
|
file_url = file_remote_info[:file_url].encode(current_encoding)
|
||||||
file_id = file_remote_info[:file_id]
|
file_id = file_remote_info[:file_id]
|
||||||
@ -268,8 +285,19 @@ class WaybackMachineDownloader
|
|||||||
structure_dir_path dir_path
|
structure_dir_path dir_path
|
||||||
open(file_path, "wb") do |file|
|
open(file_path, "wb") do |file|
|
||||||
begin
|
begin
|
||||||
URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
|
http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
|
||||||
file.write(uri.read)
|
file.write(body)
|
||||||
|
|
||||||
|
if file_path.include? '.gz'
|
||||||
|
file_path_temp = file_path + '.temp'
|
||||||
|
File.rename(file_path, file_path_temp)
|
||||||
|
Zlib::GzipReader.open(file_path_temp) do |gz|
|
||||||
|
File.open(file_path, 'wb') do |f|
|
||||||
|
f.write gz.read
|
||||||
|
end
|
||||||
|
end
|
||||||
|
File.delete(file_path_temp)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
rescue OpenURI::HTTPError => e
|
rescue OpenURI::HTTPError => e
|
||||||
puts "#{file_url} # #{e}"
|
puts "#{file_url} # #{e}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user