Ensure latest version in case of name conflict

This commit is contained in:
hartator 2015-08-15 15:37:37 -05:00
parent e3ef984fb6
commit 12be377640

View File

@ -21,9 +21,11 @@ class WaybackMachineDownloader
end
def get_file_list_curated
file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
file_list_curated = Hash.new
file_list_raw.each_line do |line|
[index_file_list_raw, all_file_list_raw].each do |file|
file.each_line do |line|
line = line.split(' ')
file_timestamp = line[1].to_i
file_url = line[2]
@ -39,17 +41,28 @@ class WaybackMachineDownloader
end
end
end
end
file_list_curated
end
# Returns the curated file entries as an Array of info Hashes, ordered from
# the highest :timestamp to the lowest.
#
# Each info Hash is the value object taken from get_file_list_curated and is
# updated in place: the key it was stored under is written to :file_id.
def file_list_by_timestamp
  # Ascending sort followed by reverse (rather than sorting on a negated key)
  # so that the relative order of entries is exactly that of sort_by.reverse.
  newest_first = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
  newest_first.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end
def download_files
puts "Downlading #{@base_url} from Wayback Machine..."
puts
file_list_curated = get_file_list_curated
count = 0
file_list_curated.each do |file_id, file_remote_info|
file_list_by_timestamp.each do |file_remote_info|
count += 1
file_url = file_remote_info[:file_url]
file_id = file_remote_info[:file_id]
file_path_elements = file_id.split('/')
if file_id == ""
dir_path = backup_path