# encoding: UTF-8
require 'cgi'
require 'fileutils'
require 'open-uri'

require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'

# Downloads a full snapshot of a website from the Internet Archive's
# Wayback Machine, using the CDX search API to enumerate captured files
# and saving each one (newest capture first) under websites/<host>/.
class WaybackMachineDownloader

  VERSION = '0.2.4'

  attr_accessor :base_url, :timestamp, :only_filter

  # params:
  #   :base_url    - the site to back up, e.g. "http://example.com"
  #   :timestamp   - optional Wayback timestamp upper bound; only captures
  #                  taken at or before it are kept (0 / missing = no limit)
  #   :only_filter - optional String; either a regex literal (e.g. "/\\.html$/")
  #                  or a plain substring to restrict which file urls are saved
  def initialize(params)
    @base_url = params[:base_url]
    @timestamp = params[:timestamp].to_i # nil.to_i == 0 means "no timestamp limit"
    @only_filter = params[:only_filter]
  end

  # Host part of the base url, used as the backup directory name.
  def backup_name
    @base_url.split('/')[2]
  end

  def backup_path
    'websites/' + backup_name + '/'
  end

  # Returns truthy when file_url passes the :only_filter restriction
  # (always true when no filter was given). The filter string is first
  # interpreted as a regex literal via String#to_regex (from
  # wayback_machine_downloader/to_regex); when it is not one, it is used
  # as a case-insensitive substring match instead.
  def match_only_filter(file_url)
    return true unless @only_filter
    only_filter_regex = @only_filter.to_regex
    if only_filter_regex
      only_filter_regex =~ file_url
    else
      file_url.downcase.include? @only_filter.downcase
    end
  end

  # Queries the Wayback Machine CDX API for every capture of @base_url and
  # returns a Hash mapping file id (path relative to the site root) to
  # { file_url:, timestamp: } of the newest capture passing @timestamp and
  # @only_filter.
  def get_file_list_curated
    # Two queries: one for the site root itself, one for everything under it.
    index_file_list_raw = URI.open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
    all_file_list_raw = URI.open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
    file_list_curated = {}
    [index_file_list_raw, all_file_list_raw].each do |file|
      file.each_line do |line|
        # CDX line format: urlkey timestamp original mimetype statuscode digest length
        line = line.split(' ')
        file_timestamp = line[1].to_i
        file_url = line[2]
        # Drop the scheme and host ("http:", "", "host") to get the site-relative path.
        file_id = file_url.split('/')[3..-1].join('/')
        # URI.unescape was removed in Ruby 3.0; CGI.unescape is its stdlib replacement.
        file_id = CGI.unescape file_id
        # Repair invalid byte sequences so the id is usable as a file name.
        file_id = file_id.tidy_bytes unless file_id == ""
        if file_id.nil?
          puts "Malformed file url, ignoring: #{file_url}"
        elsif @timestamp == 0 or file_timestamp <= @timestamp
          unless match_only_filter(file_url)
            puts "File url not in supplied only filter, ignoring: #{file_url}"
          else
            # Keep only the newest capture per file id.
            if file_list_curated[file_id]
              unless file_list_curated[file_id][:timestamp] > file_timestamp
                file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
              end
            else
              file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
            end
          end
        end
      end
    end
    file_list_curated
  end

  # Curated file list flattened into an Array of
  # { file_url:, timestamp:, file_id: } hashes, newest capture first.
  def get_file_list_by_timestamp
    file_list_curated = get_file_list_curated
    file_list_curated = file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
    file_list_curated.map do |file_remote_info|
      file_remote_info[1][:file_id] = file_remote_info[0]
      file_remote_info[1]
    end
  end

  # Downloads every curated file into backup_path. Files that already exist
  # locally are skipped, so an interrupted run can be resumed.
  def download_files
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
    puts
    file_list_by_timestamp = get_file_list_by_timestamp
    if file_list_by_timestamp.count == 0
      # NOTE: previous message interpolated @accept_regex, which is never set
      # in this class; report the actual :only_filter option instead.
      puts "No files to download. Possible reasons:\n\t* Only filter didn't let any files through (Only Filter: \"#{@only_filter}\")\n\t* Site is not in wayback machine."
      return
    end
    count = 0
    file_list_by_timestamp.each do |file_remote_info|
      count += 1
      file_url = file_remote_info[:file_url]
      file_id = file_remote_info[:file_id]
      file_timestamp = file_remote_info[:timestamp]
      file_path_elements = file_id.split('/')
      if file_id == ""
        dir_path = backup_path
        file_path = backup_path + 'index.html'
      elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
        # Trailing slash or no file extension: treat as a directory index page.
        dir_path = backup_path + file_path_elements[0..-1].join('/')
        file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
      else
        dir_path = backup_path + file_path_elements[0..-2].join('/')
        file_path = backup_path + file_path_elements[0..-1].join('/')
      end
      if Gem.win_platform?
        # Percent-encode characters that are invalid in Windows file names.
        file_path = file_path.gsub(/[:*?<>\\|]/) { |s| '%' + s.ord.to_s(16) }
      end
      unless File.exist? file_path
        begin
          structure_dir_path dir_path
          File.open(file_path, "wb") do |file|
            begin
              # "id_" asks Wayback for the original capture, without its toolbar.
              URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
                file.write(uri.read)
              end
            rescue OpenURI::HTTPError => e
              puts "#{file_url} # #{e}"
              # Save the error body anyway so the file exists and isn't retried.
              file.write(e.io.read)
            rescue StandardError => e
              puts "#{file_url} # #{e}"
            end
          end
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
        puts "#{file_url} -> #{file_path} (#{count}/#{file_list_by_timestamp.size})"
      else
        puts "#{file_url} # #{file_path} already exists. (#{count}/#{file_list_by_timestamp.size})"
      end
    end
    puts
    puts "Download complete, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  # mkdir_p for dir_path, handling the case where an ancestor path component
  # already exists as a plain file (a page saved earlier without extension):
  # that file is moved to <dir>/index.html and directory creation is retried.
  def structure_dir_path(dir_path)
    begin
      FileUtils.mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      # The colliding path is only available inside the message text; its
      # exact wording differs between Ruby versions.
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils.mv file_already_existing, file_already_existing_temporary
      FileUtils.mkdir_p file_already_existing
      FileUtils.mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      # Retry now that the blocking file has become a directory.
      structure_dir_path dir_path
    end
  end

end