From bdf611bce9ed70d9057a67a8ae9e79d5ea0a55e5 Mon Sep 17 00:00:00 2001 From: hartator Date: Fri, 16 Sep 2016 15:52:18 -0500 Subject: [PATCH] Set file id to nil when encounter UTF-8 issues --- .gitignore | 2 ++ lib/wayback_machine_downloader.rb | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2d88f6b..adbec99 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ tmp ## RUBINIUS *.rbc + +test.rb diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 57a2ded..776f67e 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -78,10 +78,9 @@ class WaybackMachineDownloader file_list_curated = Hash.new [index_file_list_raw, all_file_list_raw].each do |file| file.each_line do |line| - next if line.size < 20 file_timestamp = line[0..13].to_i file_url = line[15..-2] - file_id = file_url.split('/')[3..-1].join('/') + file_id = file_url.split('/')[3..-1].join('/') rescue nil file_id = CGI::unescape file_id file_id = file_id.tidy_bytes unless file_id == "" if file_id.nil?