From 7b69f3c2360aa899d3770836767342d3de82dea9 Mon Sep 17 00:00:00 2001 From: hartator Date: Fri, 16 Sep 2016 12:09:58 -0500 Subject: [PATCH] Clean up data earlier from UTF-8 bad bytes --- lib/wayback_machine_downloader.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index ef82bc8..bdc721f 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -78,12 +78,11 @@ class WaybackMachineDownloader file_list_curated = Hash.new [index_file_list_raw, all_file_list_raw].each do |file| file.each_line do |line| - line = line.split(' ') + line = line.tidy_bytes.split(' ') file_timestamp = line[0].to_i file_url = line[1] file_id = file_url.split('/')[3..-1].join('/') file_id = CGI::unescape file_id - file_id = file_id.tidy_bytes unless file_id == "" if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" else