Ensure latest version in case of name conflict

This commit is contained in:
hartator 2015-08-15 15:37:37 -05:00
parent e3ef984fb6
commit 12be377640

View File

@ -21,35 +21,48 @@ class WaybackMachineDownloader
end end
def get_file_list_curated def get_file_list_curated
file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
file_list_curated = Hash.new file_list_curated = Hash.new
file_list_raw.each_line do |line| [index_file_list_raw, all_file_list_raw].each do |file|
line = line.split(' ') file.each_line do |line|
file_timestamp = line[1].to_i line = line.split(' ')
file_url = line[2] file_timestamp = line[1].to_i
file_id = file_url.split('/')[3..-1].join('/') file_url = line[2]
file_id = URI.unescape file_id file_id = file_url.split('/')[3..-1].join('/')
if @timestamp == 0 or file_timestamp <= @timestamp file_id = URI.unescape file_id
if file_list_curated[file_id] if @timestamp == 0 or file_timestamp <= @timestamp
unless file_list_curated[file_id][:timestamp] > file_timestamp if file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end end
else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end end
end end
end end
file_list_curated file_list_curated
end end
# Returns the curated file list as an Array of info hashes ordered from the
# highest :timestamp to the lowest.
#
# Each entry is the value hash produced by get_file_list_curated
# (:file_url and :timestamp) with its key added under :file_id, so callers
# can iterate over a flat list instead of id => info pairs.
def file_list_by_timestamp
  oldest_first = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }
  # Walk the ascending sort backwards so the newest snapshot comes first.
  oldest_first.reverse_each.map do |file_id, info|
    # The id is stored on the info hash itself (in place), not on a copy.
    info[:file_id] = file_id
    info
  end
end
def download_files def download_files
puts "Downlading #{@base_url} from Wayback Machine..." puts "Downlading #{@base_url} from Wayback Machine..."
puts puts
file_list_curated = get_file_list_curated file_list_curated = get_file_list_curated
count = 0 count = 0
file_list_curated.each do |file_id, file_remote_info| file_list_by_timestamp.each do |file_remote_info|
count += 1 count += 1
file_url = file_remote_info[:file_url] file_url = file_remote_info[:file_url]
file_id = file_remote_info[:file_id]
file_path_elements = file_id.split('/') file_path_elements = file_id.split('/')
if file_id == "" if file_id == ""
dir_path = backup_path dir_path = backup_path