From 2fd50480eae3f6c9528237b008345890c3521beb Mon Sep 17 00:00:00 2001 From: Ian Kirker Date: Wed, 15 Feb 2017 10:50:12 +0000 Subject: [PATCH 1/4] Added prospective test for encoding conflict errors --- test/test_wayback_machine_downloader.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index 0e4ceb4..6125042 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -89,5 +89,14 @@ class WaybackMachineDownloaderTest < Minitest::Test @wayback_machine_downloader.all = true assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size end + + # Testing encoding conflicts needs a different base_url + def test_nonascii_suburls_download + @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + # Once for the downloading... + @wayback_machine_downloader.download_files + # ... and once for the "is already present" + @wayback_machine_downloader.download_files + end end From 0a324016b85755ce27a054074ea656eef8730d60 Mon Sep 17 00:00:00 2001 From: Ian Kirker Date: Wed, 15 Feb 2017 10:58:40 +0000 Subject: [PATCH 2/4] Split encoding test into two tests (One for downloading, one for when files are already present) --- test/test_wayback_machine_downloader.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/test_wayback_machine_downloader.rb b/test/test_wayback_machine_downloader.rb index 6125042..cd5b822 100644 --- a/test/test_wayback_machine_downloader.rb +++ b/test/test_wayback_machine_downloader.rb @@ -93,9 +93,14 @@ class WaybackMachineDownloaderTest < Minitest::Test # Testing encoding conflicts needs a different base_url def test_nonascii_suburls_download @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' - # Once for the downloading... + # Once just for the downloading... + @wayback_machine_downloader.download_files + end + + def test_nonascii_suburls_already_present + @wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84' + # ... twice to test the "is already present" case @wayback_machine_downloader.download_files - # ... and once for the "is already present" @wayback_machine_downloader.download_files end From 132e3fa5f89ef28798811bec2e6be5a4951bb18c Mon Sep 17 00:00:00 2001 From: Ian Kirker Date: Wed, 15 Feb 2017 11:01:26 +0000 Subject: [PATCH 3/4] Alters encoding of file_url to fix encoding incompatibilities --- lib/wayback_machine_downloader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 924d7ac..7d2f37e 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -201,7 +201,7 @@ class WaybackMachineDownloader end def download_file file_remote_info - file_url = file_remote_info[:file_url] + file_url = file_remote_info[:file_url].encode(''.encoding) file_id = file_remote_info[:file_id] file_timestamp = file_remote_info[:timestamp] file_path_elements = file_id.split('/') From 4830913ed3a9da3a66aae039fa5ead4f65b2ed9c Mon Sep 17 00:00:00 2001 From: hartator Date: Fri, 17 Feb 2017 12:54:12 -0600 Subject: [PATCH 4/4] Add explicit variable current encoding --- lib/wayback_machine_downloader.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 7d2f37e..401ba7a 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -201,7 +201,8 @@ class WaybackMachineDownloader end def download_file file_remote_info - file_url = file_remote_info[:file_url].encode(''.encoding) + current_encoding = "".encoding + file_url = file_remote_info[:file_url].encode(current_encoding) file_id = file_remote_info[:file_id] file_timestamp = file_remote_info[:timestamp] file_path_elements = file_id.split('/')