From f03d92a3c473eed248ec9fb3f6c719dd9b3684cc Mon Sep 17 00:00:00 2001 From: underarchiver <216701687+underarchiver@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:27:10 +0200 Subject: [PATCH] Prevent fetching of non-RFC3986-compliant URLs --- lib/wayback_machine_downloader.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 4a16c67..9212360 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -123,6 +123,7 @@ class WaybackMachineDownloader STATE_CDX_FILENAME = ".cdx.json" STATE_DB_FILENAME = ".downloaded.txt" + attr_accessor :base_url, :exact_url, :directory, :all_timestamps, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite @@ -150,6 +151,9 @@ class WaybackMachineDownloader @db_mutex = Mutex.new @rewrite = params[:rewrite] || false + # URL for rejecting invalid/unencoded wayback urls + @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/ + handle_reset end @@ -724,6 +728,12 @@ class WaybackMachineDownloader # Escape square brackets because they are not valid in URI() wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D') + # 
reject invalid/unencoded wayback_url, behaving as if the resource weren't found + if not @url_regexp.match?(wayback_url) + @logger.warn("Skipped #{file_url}: invalid URL") + return :skipped_not_found + end + request = Net::HTTP::Get.new(URI(wayback_url)) request["Connection"] = "keep-alive" request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"