mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-18 02:06:35 +00:00
Merge branch 'pr/89'
# Conflicts: # lib/wayback_machine_downloader.rb
This commit is contained in:
commit
a7c3d9b6c1
@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
|
|||||||
options[:directory] = t
|
options[:directory] = t
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
|
||||||
|
options[:all_timestamps] = true
|
||||||
|
end
|
||||||
|
|
||||||
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
||||||
options[:from_timestamp] = t
|
options[:from_timestamp] = t
|
||||||
end
|
end
|
||||||
|
|||||||
@ -16,7 +16,7 @@ class WaybackMachineDownloader
|
|||||||
|
|
||||||
VERSION = "2.1.1"
|
VERSION = "2.1.1"
|
||||||
|
|
||||||
attr_accessor :base_url, :exact_url, :directory,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
:all, :maximum_pages, :threads_count
|
:all, :maximum_pages, :threads_count
|
||||||
|
|
||||||
@ -24,6 +24,7 @@ class WaybackMachineDownloader
|
|||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]
|
||||||
@exact_url = params[:exact_url]
|
@exact_url = params[:exact_url]
|
||||||
@directory = params[:directory]
|
@directory = params[:directory]
|
||||||
|
@all_timestamps = params[:all_timestamps]
|
||||||
@from_timestamp = params[:from_timestamp].to_i
|
@from_timestamp = params[:from_timestamp].to_i
|
||||||
@to_timestamp = params[:to_timestamp].to_i
|
@to_timestamp = params[:to_timestamp].to_i
|
||||||
@only_filter = params[:only_filter]
|
@only_filter = params[:only_filter]
|
||||||
@ -127,12 +128,49 @@ class WaybackMachineDownloader
|
|||||||
file_list_curated
|
file_list_curated
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Builds the curated file list for "all timestamps" mode: every snapshot of
# every file is kept, instead of one entry per file.
#
# Each line of get_all_snapshots_to_consider is read as a 14-character
# timestamp, one separator character, then the snapshot URL. Lines without a
# '/' are skipped. Entries are keyed by "<timestamp>/<file id>", where the
# file id is the URL minus its first three '/'-separated segments (for
# "scheme://host/path" that leaves "path").
#
# @return [Hash{String => Hash}] key => { file_url: String, timestamp: Integer }
def get_file_list_all_timestamps
  file_list_curated = {}
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')

    file_timestamp = line[0..13].to_i
    # chomp (rather than dropping the last character unconditionally) keeps
    # the URL intact when the final line has no trailing newline; to_s covers
    # lines too short to hold a URL, for which the slice is nil.
    file_url = line[15..-1].to_s.chomp

    # The slice is nil when the URL has fewer than three segments. Check it
    # before joining so the malformed-URL message is actually reachable
    # instead of the loop dying with NoMethodError on nil.
    path_segments = file_url.split('/')[3..-1]
    if path_segments.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end

    file_id = path_segments.join('/')
    # The key always contains the timestamp and a '/', so it is never empty
    # and tidy_bytes can be applied unconditionally.
    file_id_and_timestamp = CGI.unescape("#{file_timestamp}/#{file_id}").tidy_bytes

    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif file_list_curated.key?(file_id_and_timestamp)
      puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
    else
      file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
    end
  end
  puts "file_list_curated: #{file_list_curated.count}"
  file_list_curated
end
|
||||||
|
|
||||||
|
|
||||||
def get_file_list_by_timestamp
|
def get_file_list_by_timestamp
|
||||||
file_list_curated = get_file_list_curated
|
if @all_timestamps
|
||||||
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
file_list_curated = get_file_list_all_timestamps
|
||||||
file_list_curated.map do |file_remote_info|
|
file_list_curated.map do |file_remote_info|
|
||||||
file_remote_info[1][:file_id] = file_remote_info[0]
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||||
file_remote_info[1]
|
file_remote_info[1]
|
||||||
|
end
|
||||||
|
else
|
||||||
|
file_list_curated = get_file_list_curated
|
||||||
|
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
||||||
|
file_list_curated.map do |file_remote_info|
|
||||||
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||||
|
file_remote_info[1]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@ -85,6 +85,11 @@ class WaybackMachineDownloaderTest < Minitest::Test
|
|||||||
assert_includes linux_page.read, "Linux Games"
|
assert_includes linux_page.read, "Linux Games"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Enables all_timestamps on the downloader and checks the size of the curated
# file list.
# NOTE(review): in the visible part of this commit @all_timestamps is read by
# get_file_list_by_timestamp, which then calls get_file_list_all_timestamps;
# the body of get_file_list_curated is not shown here. Confirm that this
# assertion really exercises the flag and that 68 is the all-timestamps count.
def test_all_timestamps_being_respected
|
||||||
|
@wayback_machine_downloader.all_timestamps = true
|
||||||
|
assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
|
||||||
|
end
|
||||||
|
|
||||||
def test_from_timestamp_being_respected
|
def test_from_timestamp_being_respected
|
||||||
@wayback_machine_downloader.from_timestamp = 20050716231334
|
@wayback_machine_downloader.from_timestamp = 20050716231334
|
||||||
file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
|
file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user