# Post-patch form of the methods touched by a change to
# lib/wayback_machine_downloader.rb (class WaybackMachineDownloader).
#
# The same change also adds the assignment
#   @all_timestamps = params[:all_timestamps]
# next to the other option assignments (@only_filter, @exclude_filter, @all,
# @list, ...). That assignment lives in a method that is not shown here.
#
# Collaborators referenced below but defined elsewhere:
# get_all_snapshots_to_consider, get_file_list_curated, match_exclude_filter,
# match_only_filter and String#tidy_bytes.
class WaybackMachineDownloader
  # Builds the list of files to download, keeping every snapshot timestamp.
  #
  # Each snapshot line is read as a 14-character timestamp, one separator
  # character, then the file URL. The resulting file id is
  # "<timestamp>/<url path after the host>", so the same URL captured at two
  # different timestamps yields two distinct entries.
  #
  # Lines without a "/" are echoed with a "**" prefix and skipped. URLs that
  # have no path part after the host, or that match the exclude filter or miss
  # the only filter, are reported and skipped. Repeated ids are skipped
  # silently (the first occurrence wins).
  #
  # @return [Hash{String => Hash}] file id => { file_url:, timestamp: }
  def get_file_list_ALL
    # Fetch once: the snapshot list is needed both for the count and the loop.
    snapshots = get_all_snapshots_to_consider
    puts "snapshots_to_consider: #{snapshots.lines.count}"

    file_list_curated = {}
    snapshots.each_line do |line|
      unless line.include?('/')
        print "**" + line
        next
      end

      file_timestamp = line[0..13].to_i
      # chomp instead of dropping the last character blindly, so a final line
      # without a trailing newline keeps its full URL.
      file_url = line.chomp[15..-1]
      # Segments 0..2 are the scheme, the empty string between the slashes and
      # the host. The slice is nil when the URL has fewer than three segments.
      path_segments = file_url && file_url.split('/')[3..-1]
      if path_segments.nil?
        puts "Malformed file url, ignoring: #{file_url}"
        next
      end

      file_id = CGI.unescape("#{file_timestamp}/#{path_segments.join('/')}").tidy_bytes

      if match_exclude_filter(file_url)
        puts "File url matches exclude filter, ignoring: #{file_url}"
      elsif !match_only_filter(file_url)
        puts "File url doesn't match only filter, ignoring: #{file_url}"
      elsif !file_list_curated.key?(file_id)
        file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
      end
    end

    puts "file_list_curated: #{file_list_curated.count}"
    file_list_curated
  end

  # Returns the file list as an array of info hashes, each extended with its
  # :file_id.
  #
  # With @all_timestamps set, every snapshot from get_file_list_ALL is returned
  # in the order it was collected. Otherwise the curated list is returned
  # sorted by timestamp, newest first.
  #
  # @return [Array<Hash>] entries shaped like { file_url:, timestamp:, file_id: }
  def get_file_list_by_timestamp
    file_list =
      if @all_timestamps
        get_file_list_ALL
      else
        get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
      end

    file_list.map do |file_id, info|
      info[:file_id] = file_id
      info
    end
  end
end