Merge pull request #188 from pabs3/fixes

Fix various issues
This commit is contained in:
hartator
2021-06-06 19:53:43 -05:00
committed by GitHub
6 changed files with 42 additions and 31 deletions

1
.gitignore vendored
View File

@@ -6,6 +6,7 @@ rdoc
log log
websites websites
.DS_Store .DS_Store
.rake_tasks~
## BUNDLER ## BUNDLER
*.gem *.gem

View File

@@ -42,7 +42,7 @@ It will download the last version of every file present on Wayback Machine to `.
-x, --exclude EXCLUDE_FILTER Skip downloading of urls that match this filter -x, --exclude EXCLUDE_FILTER Skip downloading of urls that match this filter
(use // notation for the filter to be treated as a regex) (use // notation for the filter to be treated as a regex)
-a, --all Expand downloading to error files (40x and 50x) and redirections (30x) -a, --all Expand downloading to error files (40x and 50x) and redirections (30x)
-c, --concurrency NUMBER Number of multiple files to dowload at a time -c, --concurrency NUMBER Number of multiple files to download at a time
Default is one file at a time (ie. 20) Default is one file at a time (ie. 20)
-p, --maximum-snapshot NUMBER Maximum snapshot pages to consider (Default is 100) -p, --maximum-snapshot NUMBER Maximum snapshot pages to consider (Default is 100)
Count an average of 150,000 snapshots per page Count an average of 150,000 snapshots per page
@@ -62,7 +62,7 @@ Example:
-s, --all-timestamps -s, --all-timestamps
Optional. This option will download all timestamps/snapshots for a given website. It will use the timepstamp of each snapshot as directory. Optional. This option will download all timestamps/snapshots for a given website. It will use the timestamp of each snapshot as directory.
Example: Example:
@@ -78,7 +78,7 @@ Example:
-f, --from TIMESTAMP -f, --from TIMESTAMP
Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination of To Timestamp. Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination of To Timestamp.
Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified. Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.
Example: Example:
@@ -89,7 +89,7 @@ Example:
-t, --to TIMESTAMP -t, --to TIMESTAMP
Optional. You may want to supply a to timestamp to lock your backup to a specifc version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination of From Timestamp. Optional. You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination of From Timestamp.
Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified. Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.
Example: Example:
@@ -169,7 +169,7 @@ Example:
-c, --concurrency NUMBER -c, --concurrency NUMBER
Optional. Specify the number of multiple files you want to download at the same time. Allows to speed up the download of a website significantly. Default is to download one file at a time. Optional. Specify the number of multiple files you want to download at the same time. Allows one to speed up the download of a website significantly. Default is to download one file at a time.
Example: Example:

View File

@@ -46,7 +46,7 @@ option_parser = OptionParser.new do |opts|
options[:all] = true options[:all] = true
end end
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t| opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
options[:threads_count] = t options[:threads_count] = t
end end

View File

@@ -14,7 +14,7 @@ class WaybackMachineDownloader
include ArchiveAPI include ArchiveAPI
VERSION = "2.2.1" VERSION = "2.3.0"
attr_accessor :base_url, :exact_url, :directory, :all_timestamps, attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
@@ -84,7 +84,7 @@ class WaybackMachineDownloader
# Note: Passing a page index parameter allows us to get more snapshots, # Note: Passing a page index parameter allows us to get more snapshots,
# but from a less fresh index # but from a less fresh index
print "Getting snapshot pages" print "Getting snapshot pages"
snapshot_list_to_consider = "" snapshot_list_to_consider = []
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil) snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
print "." print "."
unless @exact_url unless @exact_url
@@ -95,17 +95,15 @@ class WaybackMachineDownloader
print "." print "."
end end
end end
puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider." puts " found #{snapshot_list_to_consider.length} snapshots to consider."
puts puts
snapshot_list_to_consider snapshot_list_to_consider
end end
def get_file_list_curated def get_file_list_curated
file_list_curated = Hash.new file_list_curated = Hash.new
get_all_snapshots_to_consider.each_line do |line| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless line.include?('/') next unless file_url.include?('/')
file_timestamp = line[0..13].to_i
file_url = line[15..-2]
file_id = file_url.split('/')[3..-1].join('/') file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == "" file_id = file_id.tidy_bytes unless file_id == ""
@@ -130,10 +128,8 @@ class WaybackMachineDownloader
def get_file_list_all_timestamps def get_file_list_all_timestamps
file_list_curated = Hash.new file_list_curated = Hash.new
get_all_snapshots_to_consider.each_line do |line| get_all_snapshots_to_consider.each do |file_timestamp, file_url|
next unless line.include?('/') next unless file_url.include?('/')
file_timestamp = line[0..13].to_i
file_url = line[15..-2]
file_id = file_url.split('/')[3..-1].join('/') file_id = file_url.split('/')[3..-1].join('/')
file_id_and_timestamp = [file_timestamp, file_id].join('/') file_id_and_timestamp = [file_timestamp, file_id].join('/')
file_id_and_timestamp = CGI::unescape file_id_and_timestamp file_id_and_timestamp = CGI::unescape file_id_and_timestamp
@@ -176,11 +172,15 @@ class WaybackMachineDownloader
def list_files def list_files
# retrieval produces its own output # retrieval produces its own output
@orig_stdout = $stdout
$stdout = $stderr
files = get_file_list_by_timestamp files = get_file_list_by_timestamp
$stdout = @orig_stdout
puts "[" puts "["
files.each do |file| files[0...-1].each do |file|
puts file.to_json + "," puts file.to_json + ","
end end
puts files[-1].to_json
puts "]" puts "]"
end end

View File

@@ -1,28 +1,38 @@
require 'json'
require 'uri'
module ArchiveAPI module ArchiveAPI
def get_raw_list_from_api url, page_index def get_raw_list_from_api url, page_index
request_url = "https://web.archive.org/cdx/search/xd?url=" request_url = URI("https://web.archive.org/cdx/search/xd")
request_url += url params = [["output", "json"], ["url", url]]
request_url += parameters_for_api page_index params += parameters_for_api page_index
request_url.query = URI.encode_www_form(params)
URI.open(request_url).read begin
json = JSON.parse(URI(request_url).open.read)
if (json[0] <=> ["timestamp","original"]) == 0
json.shift
end
json
rescue JSON::ParserError
[]
end
end end
def parameters_for_api page_index def parameters_for_api page_index
parameters = "&fl=timestamp,original&collapse=digest&gzip=false" parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
if @all if !@all
parameters += "" parameters.push(["filter", "statuscode:200"])
else
parameters += "&filter=statuscode:200"
end end
if @from_timestamp and @from_timestamp != 0 if @from_timestamp and @from_timestamp != 0
parameters += "&from=" + @from_timestamp.to_s parameters.push(["from", @from_timestamp.to_s])
end end
if @to_timestamp and @to_timestamp != 0 if @to_timestamp and @to_timestamp != 0
parameters += "&to=" + @to_timestamp.to_s parameters.push(["to", @to_timestamp.to_s])
end end
if page_index if page_index
parameters += "&page=#{page_index}" parameters.push(["page", page_index])
end end
parameters parameters
end end

View File

@@ -70,7 +70,7 @@ module TibyBytes
if is_unused || is_restricted if is_unused || is_restricted
bytes[i] = tidy_byte(byte) bytes[i] = tidy_byte(byte)
elsif is_cont elsif is_cont
# Not expecting contination byte? Clean up. Otherwise, now expect one less. # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
else else
if conts_expected > 0 if conts_expected > 0