mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 17:56:44 +00:00
Prevent fetching of non-RFC 3986-compliant URLs
This commit is contained in:
parent
3fdfd70fc1
commit
f03d92a3c4
@ -123,6 +123,7 @@ class WaybackMachineDownloader
|
|||||||
STATE_CDX_FILENAME = ".cdx.json"
|
STATE_CDX_FILENAME = ".cdx.json"
|
||||||
STATE_DB_FILENAME = ".downloaded.txt"
|
STATE_DB_FILENAME = ".downloaded.txt"
|
||||||
|
|
||||||
|
|
||||||
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
||||||
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
||||||
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
:all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite
|
||||||
@ -150,6 +151,9 @@ class WaybackMachineDownloader
|
|||||||
@db_mutex = Mutex.new
|
@db_mutex = Mutex.new
|
||||||
@rewrite = params[:rewrite] || false
|
@rewrite = params[:rewrite] || false
|
||||||
|
|
||||||
|
# Regexp for rejecting invalid/unencoded Wayback URLs
|
||||||
|
# Whole-string URI validator: scheme ":" hier-part [ "?" query ] [ "#" fragment ],
# allowing only unreserved, percent-encoded and sub-delimiter characters (plus ":" / "@"
# in path segments, and "/" / "?" in query and fragment).
# Anchored with \A and \z rather than ^ and $: in Ruby ^/$ match at every line
# boundary, so a string with an embedded or trailing newline would otherwise pass
# validation on its first line alone.
@url_regexp = /\A(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)\z/
|
||||||
|
|
||||||
handle_reset
|
handle_reset
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -724,6 +728,12 @@ class WaybackMachineDownloader
|
|||||||
# Escape square brackets because they are not valid in URI()
|
# Escape square brackets because they are not valid in URI()
|
||||||
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D')
|
||||||
|
|
||||||
|
# reject invalid/unencoded wayback_url, behaving as if the resource weren't found
|
||||||
|
if not @url_regexp.match?(wayback_url)
|
||||||
|
@logger.warn("Skipped #{file_url}: invalid URL")
|
||||||
|
return :skipped_not_found
|
||||||
|
end
|
||||||
|
|
||||||
request = Net::HTTP::Get.new(URI(wayback_url))
|
request = Net::HTTP::Get.new(URI(wayback_url))
|
||||||
request["Connection"] = "keep-alive"
|
request["Connection"] = "keep-alive"
|
||||||
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user