From c0024cbcb5ea6a0947d90bc5030c6c2d30988932 Mon Sep 17 00:00:00 2001 From: hartator Date: Mon, 17 Aug 2015 19:42:37 -0500 Subject: [PATCH] Add more precise error handling of UTF-8 malformed urls --- lib/wayback_machine_downloader.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 550e5bf..414ece3 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -1,5 +1,6 @@ require 'open-uri' require 'fileutils' +require_relative 'tidy_bytes' class WaybackMachineDownloader @@ -31,7 +32,10 @@ class WaybackMachineDownloader file_url = line[2] file_id = file_url.split('/')[3..-1].join('/') file_id = URI.unescape file_id - if @timestamp == 0 or file_timestamp <= @timestamp + file_id = file_id.tidy_bytes unless file_id == "" + if file_id.nil? + puts "Malformed file url, ignoring: #{file_url}" + elsif @timestamp == 0 or file_timestamp <= @timestamp if file_list_curated[file_id] unless file_list_curated[file_id][:timestamp] > file_timestamp file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} @@ -55,7 +59,7 @@ class WaybackMachineDownloader end def download_files - puts "Downlading #{@base_url} from Wayback Machine..." + puts "Downlading #{@base_url} to #{backup_path} from Wayback Machine..." puts file_list_curated = get_file_list_curated count = 0 @@ -84,7 +88,7 @@ class WaybackMachineDownloader rescue OpenURI::HTTPError => e puts "#{file_url} # #{e}" file.write(e.io.read) - rescue Exception => e + rescue StandardError => e puts "#{file_url} # #{e}" end end