mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Improve only_filter to accept both strings and regexes
This commit is contained in:
@@ -3,17 +3,18 @@
|
|||||||
require 'open-uri'
|
require 'open-uri'
|
||||||
require 'fileutils'
|
require 'fileutils'
|
||||||
require_relative 'wayback_machine_downloader/tidy_bytes'
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
||||||
|
require_relative 'wayback_machine_downloader/to_regex'
|
||||||
|
|
||||||
class WaybackMachineDownloader
|
class WaybackMachineDownloader
|
||||||
|
|
||||||
VERSION = "0.1.18"
|
VERSION = "0.1.18"
|
||||||
|
|
||||||
attr_accessor :base_url, :timestamp
|
attr_accessor :base_url, :timestamp, :only_filter
|
||||||
|
|
||||||
def initialize params
|
def initialize params
|
||||||
@base_url = params[:base_url]
|
@base_url = params[:base_url]
|
||||||
@timestamp = params[:timestamp].to_i
|
@timestamp = params[:timestamp].to_i
|
||||||
@only_filter = Regexp.new params[:only_filter]
|
@only_filter = params[:only_filter]
|
||||||
end
|
end
|
||||||
|
|
||||||
def backup_name
|
def backup_name
|
||||||
@@ -24,6 +25,19 @@ class WaybackMachineDownloader
|
|||||||
'websites/' + backup_name + '/'
|
'websites/' + backup_name + '/'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def match_only_filter file_url
|
||||||
|
if @only_filter
|
||||||
|
only_filter_regex = @only_filter.to_regex
|
||||||
|
if only_filter_regex
|
||||||
|
only_filter_regex =~ file_url
|
||||||
|
else
|
||||||
|
file_url.downcase.include? @only_filter.downcase
|
||||||
|
end
|
||||||
|
else
|
||||||
|
true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def get_file_list_curated
|
def get_file_list_curated
|
||||||
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
||||||
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
||||||
@@ -39,7 +53,9 @@ class WaybackMachineDownloader
|
|||||||
if file_id.nil?
|
if file_id.nil?
|
||||||
puts "Malformed file url, ignoring: #{file_url}"
|
puts "Malformed file url, ignoring: #{file_url}"
|
||||||
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
||||||
if file_list_curated[file_id]
|
if not match_only_filter(file_url)
|
||||||
|
puts "File url not in supplied only filter, ignoring: #{file_url}"
|
||||||
|
elsif file_list_curated[file_id]
|
||||||
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
||||||
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
||||||
end
|
end
|
||||||
@@ -49,21 +65,7 @@ class WaybackMachineDownloader
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
file_list_curated
|
||||||
# if accept_regex not defined, just return the file_list_curated
|
|
||||||
if @accept_regex.nil?
|
|
||||||
return file_list_curated
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
# accept_regex defined. now we need to create a filtered list.
|
|
||||||
filtered_file_list_curated = Hash.new
|
|
||||||
file_list_curated.each do |file_id, fileinfo|
|
|
||||||
if fileinfo[:file_url].match @accept_regex
|
|
||||||
filtered_file_list_curated[file_id] = fileinfo
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return filtered_file_list_curated
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_file_list_by_timestamp
|
def get_file_list_by_timestamp
|
||||||
|
|||||||
Reference in New Issue
Block a user