Add more precise error handling for URLs containing malformed UTF-8

This commit is contained in:
hartator 2015-08-17 19:42:37 -05:00
parent be6fe01cf0
commit c0024cbcb5

View File

@@ -1,5 +1,6 @@
require 'open-uri' require 'open-uri'
require 'fileutils' require 'fileutils'
require_relative 'tidy_bytes'
class WaybackMachineDownloader class WaybackMachineDownloader
@@ -31,7 +32,10 @@ class WaybackMachineDownloader
file_url = line[2] file_url = line[2]
file_id = file_url.split('/')[3..-1].join('/') file_id = file_url.split('/')[3..-1].join('/')
file_id = URI.unescape file_id file_id = URI.unescape file_id
if @timestamp == 0 or file_timestamp <= @timestamp file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
elsif @timestamp == 0 or file_timestamp <= @timestamp
if file_list_curated[file_id] if file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
@@ -55,7 +59,7 @@ class WaybackMachineDownloader
end end
def download_files def download_files
puts "Downlading #{@base_url} from Wayback Machine..." puts "Downlading #{@base_url} to #{backup_path} from Wayback Machine..."
puts puts
file_list_curated = get_file_list_curated file_list_curated = get_file_list_curated
count = 0 count = 0
@@ -84,7 +88,7 @@ class WaybackMachineDownloader
rescue OpenURI::HTTPError => e rescue OpenURI::HTTPError => e
puts "#{file_url} # #{e}" puts "#{file_url} # #{e}"
file.write(e.io.read) file.write(e.io.read)
rescue Exception => e rescue StandardError => e
puts "#{file_url} # #{e}" puts "#{file_url} # #{e}"
end end
end end