Clean up bad UTF-8 bytes from the data earlier

This commit is contained in:
hartator 2016-09-16 12:09:58 -05:00
parent cfa236a4d1
commit 7b69f3c236

View File

@@ -78,12 +78,11 @@ class WaybackMachineDownloader
file_list_curated = Hash.new file_list_curated = Hash.new
[index_file_list_raw, all_file_list_raw].each do |file| [index_file_list_raw, all_file_list_raw].each do |file|
file.each_line do |line| file.each_line do |line|
line = line.split(' ') line = line.tidy_bytes.split(' ')
file_timestamp = line[0].to_i file_timestamp = line[0].to_i
file_url = line[1] file_url = line[1]
file_id = file_url.split('/')[3..-1].join('/') file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil? if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}" puts "Malformed file url, ignoring: #{file_url}"
else else