mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-18 18:26:43 +00:00
Update wayback_machine_downloader.rb
This commit is contained in:
parent
330c1888fb
commit
5bd9fbffdd
@ -26,6 +26,7 @@ class WaybackMachineDownloader
|
|||||||
@only_filter = params[:only_filter]
|
@only_filter = params[:only_filter]
|
||||||
@exclude_filter = params[:exclude_filter]
|
@exclude_filter = params[:exclude_filter]
|
||||||
@all = params[:all]
|
@all = params[:all]
|
||||||
|
@all_timestamps = params[:all_timestamps]
|
||||||
@list = params[:list]
|
@list = params[:list]
|
||||||
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
||||||
@threads_count = params[:threads_count].to_i
|
@threads_count = params[:threads_count].to_i
|
||||||
@ -123,8 +124,47 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
file_list_curated
|
file_list_curated
|
||||||
end
|
end
|
||||||
|
# Builds the file list used when every captured timestamp should be kept:
# entries are keyed by "<timestamp>/<path>" instead of by path alone, so two
# snapshots of the same path taken at different times yield two entries.
#
# Each line of the snapshot index is read positionally: characters 0-13 are
# the timestamp, and everything from character 15 to the end of the line is
# the original URL. Lines without a '/' are echoed (prefixed with "**") and
# skipped. URLs rejected by match_exclude_filter / match_only_filter are
# reported and skipped. When the same "<timestamp>/<path>" id appears more
# than once, the first occurrence wins.
#
# @return [Hash{String => Hash}] id => { file_url: String, timestamp: Integer }
def get_file_list_ALL
  file_list_curated = {}
  # Fetch the index once and reuse it for both the count and the iteration,
  # instead of invoking the getter a second time only to print a number.
  snapshots = get_all_snapshots_to_consider
  puts "snapshots_to_consider: " + snapshots.lines.count.to_s
  snapshots.each_line do |line|
    unless line.include?('/')
      print "**" + line
      next
    end
    file_timestamp = line[0..13].to_i
    # chomp (rather than slicing off the last character) keeps the full URL
    # on a final line that has no trailing newline.
    file_url = line.chomp[15..-1]
    # Drop the "scheme:", "" and "host" parts. The slice is nil when the URL
    # has fewer than three '/'-separated parts (e.g. no scheme), and file_url
    # itself is nil when the line is shorter than 15 characters.
    path_segments = file_url && file_url.split('/')[3..-1]
    if path_segments.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end
    # The id always contains the timestamp and a '/', so it is never empty
    # and can be passed to tidy_bytes unconditionally.
    file_id = CGI.unescape([file_timestamp, path_segments.join('/')].join('/'))
    file_id = file_id.tidy_bytes
    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif !file_list_curated.key?(file_id)
      file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
    end
  end
  puts "file_list_curated: " + file_list_curated.count.to_s
  file_list_curated
end
|
||||||
|
|
||||||
|
|
||||||
def get_file_list_by_timestamp
|
def get_file_list_by_timestamp
|
||||||
|
if @all_timestamps
|
||||||
|
file_list_curated = get_file_list_ALL
|
||||||
|
file_list_curated.map do |file_remote_info|
|
||||||
|
file_remote_info[1][:file_id] = file_remote_info[0]
|
||||||
|
file_remote_info[1]
|
||||||
|
end
|
||||||
|
else
|
||||||
file_list_curated = get_file_list_curated
|
file_list_curated = get_file_list_curated
|
||||||
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
||||||
file_list_curated.map do |file_remote_info|
|
file_list_curated.map do |file_remote_info|
|
||||||
@ -132,6 +172,7 @@ class WaybackMachineDownloader
|
|||||||
file_remote_info[1]
|
file_remote_info[1]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def list_files
|
def list_files
|
||||||
puts "["
|
puts "["
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user