mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-17 09:46:05 +00:00
Merge branch 'master' of https://github.com/p/wayback-machine-downloader into p-master
This commit is contained in:
commit
62f424b6d1
@ -58,7 +58,7 @@ end.parse!
|
||||
if (base_url = ARGV[-1])
|
||||
options[:base_url] = base_url
|
||||
wayback_machine_downloader = WaybackMachineDownloader.new options
|
||||
if wayback_machine_downloader.list
|
||||
if options[:list]
|
||||
wayback_machine_downloader.list_files
|
||||
else
|
||||
wayback_machine_downloader.download_files
|
||||
|
||||
@ -16,17 +16,19 @@ class WaybackMachineDownloader
|
||||
|
||||
VERSION = "2.0.0"
|
||||
|
||||
attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
|
||||
attr_accessor :base_url, :exact_match, :directory,
|
||||
:from_timestamp, :to_timestamp,
|
||||
:only_filter, :exclude_filter, :all, :maximum_pages, :threads_count
|
||||
|
||||
# Builds a downloader from a params hash, copying each recognised key into
# an instance variable of the same name.
#
# params keys read here: :base_url, :exact_match, :directory,
# :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all,
# :list, :maximum_pages, :threads_count.
def initialize params
|
||||
@base_url = params[:base_url]
|
||||
@exact_match = params[:exact_match]
|
||||
@directory = params[:directory]
|
||||
# to_i coercion: a missing key yields nil, and nil.to_i is 0.
@from_timestamp = params[:from_timestamp].to_i
|
||||
@to_timestamp = params[:to_timestamp].to_i
|
||||
@only_filter = params[:only_filter]
|
||||
@exclude_filter = params[:exclude_filter]
|
||||
@all = params[:all]
|
||||
# NOTE(review): the multi-line attr_accessor declaration preceding this
# method does not list :list — presumably this assignment is the removed
# side of the diff hunk; confirm against the real file.
@list = params[:list]
|
||||
# Falls back to 100 when :maximum_pages is missing or falsy.
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
||||
@threads_count = params[:threads_count].to_i
|
||||
end
|
||||
@ -78,19 +80,20 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def get_all_snapshots_to_consider
|
||||
# Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
|
||||
# Note: Passing a page index parameter allows us to get more snapshots,
|
||||
# but from a less fresh index
|
||||
print "Getting snapshot pages"
|
||||
snapshot_list_to_consider = ""
|
||||
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
||||
print "."
|
||||
snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
|
||||
print "."
|
||||
unless @exact_match
|
||||
@maximum_pages.times do |page_index|
|
||||
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
|
||||
break if snapshot_list.empty?
|
||||
snapshot_list_to_consider += snapshot_list
|
||||
print "."
|
||||
end
|
||||
end
|
||||
puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
|
||||
puts
|
||||
snapshot_list_to_consider
|
||||
@ -134,8 +137,11 @@ class WaybackMachineDownloader
|
||||
end
|
||||
|
||||
def list_files
|
||||
# retrieval produces its own output
|
||||
files = get_file_list_by_timestamp
|
||||
# ... hence delay printing the opening bracket
|
||||
puts "["
|
||||
get_file_list_by_timestamp.each do |file|
|
||||
files.each do |file|
|
||||
puts file.to_json + ","
|
||||
end
|
||||
puts "]"
|
||||
@ -179,7 +185,7 @@ class WaybackMachineDownloader
|
||||
|
||||
def structure_dir_path dir_path
|
||||
begin
|
||||
FileUtils::mkdir_p dir_path unless File.exists? dir_path
|
||||
FileUtils::mkdir_p dir_path unless File.exist? dir_path
|
||||
rescue Errno::EEXIST => e
|
||||
error_to_string = e.to_s
|
||||
puts "# #{error_to_string}"
|
||||
@ -219,7 +225,7 @@ class WaybackMachineDownloader
|
||||
if Gem.win_platform?
|
||||
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
||||
end
|
||||
unless File.exists? file_path
|
||||
unless File.exist? file_path
|
||||
begin
|
||||
structure_dir_path dir_path
|
||||
open(file_path, "wb") do |file|
|
||||
@ -240,7 +246,7 @@ class WaybackMachineDownloader
|
||||
rescue StandardError => e
|
||||
puts "#{file_url} # #{e}"
|
||||
ensure
|
||||
if not @all and File.exists?(file_path) and File.size(file_path) == 0
|
||||
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
||||
File.delete(file_path)
|
||||
puts "#{file_path} was empty and was removed."
|
||||
end
|
||||
|
||||
@ -60,7 +60,7 @@ module TibyBytes
|
||||
bytes.each_index do |i|
|
||||
|
||||
byte = bytes[i]
|
||||
is_ascii = byte < 128
|
||||
_is_ascii = byte < 128
|
||||
is_cont = byte > 127 && byte < 192
|
||||
is_lead = byte > 191 && byte < 245
|
||||
is_unused = byte > 240
|
||||
@ -78,7 +78,7 @@ module TibyBytes
|
||||
# the leading byte.
|
||||
begin
|
||||
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
||||
rescue NoMethodError => e
|
||||
rescue NoMethodError
|
||||
next
|
||||
end
|
||||
conts_expected = 0
|
||||
@ -98,7 +98,7 @@ module TibyBytes
|
||||
end
|
||||
begin
|
||||
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
||||
rescue ArgumentError => e
|
||||
rescue ArgumentError
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
@ -25,7 +25,7 @@ module ToRegex
|
||||
# @option options [true,false] :lang /foo/[nesu]
|
||||
def to_regex(options = {})
|
||||
# as_regexp(options) supplies the argument list for Regexp.new; when it
# returns nil/false the `if` has no else branch, so the method returns nil.
if args = as_regexp(options)
|
||||
# NOTE(review): the next two statements are the same call written without
# and with parentheses — presumably the before/after lines of a diff hunk;
# confirm against the real file.
::Regexp.new *args
|
||||
::Regexp.new(*args)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@ -4,7 +4,8 @@ require 'wayback_machine_downloader'
|
||||
class WaybackMachineDownloaderTest < Minitest::Test
|
||||
|
||||
# Per-test fixture: creates a downloader for a fixed base_url and replaces
# $stdout with an in-memory StringIO.
def setup
|
||||
# NOTE(review): the single-line and multi-line assignments below build the
# same object with the same base_url — presumably the before/after lines of
# a diff hunk; confirm against the real file.
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'http://www.onlyfreegames.net'
|
||||
@wayback_machine_downloader = WaybackMachineDownloader.new(
|
||||
base_url: 'http://www.onlyfreegames.net')
|
||||
# Anything written to $stdout from here on is collected in the StringIO.
$stdout = StringIO.new
|
||||
end
|
||||
|
||||
@ -38,6 +39,16 @@ class WaybackMachineDownloaderTest < Minitest::Test
|
||||
assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
|
||||
end
|
||||
|
||||
# With exact_match set to false, the curated file list is expected to hold
# more than one entry.
def test_without_exact_match
|
||||
@wayback_machine_downloader.exact_match = false
|
||||
assert @wayback_machine_downloader.get_file_list_curated.size > 1
|
||||
end
|
||||
|
||||
# With exact_match set to true, the curated file list is expected to hold
# exactly one entry.
def test_exact_match
|
||||
@wayback_machine_downloader.exact_match = true
|
||||
assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
|
||||
end
|
||||
|
||||
def test_file_list_only_filter_without_matches
|
||||
@wayback_machine_downloader.only_filter = 'abc123'
|
||||
assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
|
||||
@ -85,20 +96,22 @@ class WaybackMachineDownloaderTest < Minitest::Test
|
||||
assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
|
||||
end
|
||||
|
||||
def test_file_list_exclude_filter_with_a_regex
|
||||
def test_all_get_file_list_curated_size
|
||||
@wayback_machine_downloader.all = true
|
||||
assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
|
||||
end
|
||||
|
||||
# Testing encoding conflicts needs a different base_url
|
||||
# Replaces the fixture downloader with one whose base_url contains a
# percent-encoded non-ASCII character (%C3%84), then runs download_files once.
# There is no explicit assertion; presumably the test passes as long as
# download_files does not raise — confirm intent.
def test_nonascii_suburls_download
|
||||
# NOTE(review): the single-line and multi-line assignments below build the
# same object with the same base_url — presumably the before/after lines of
# a diff hunk; confirm against the real file.
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
|
||||
@wayback_machine_downloader = WaybackMachineDownloader.new(
|
||||
base_url: 'https://en.wikipedia.org/wiki/%C3%84')
|
||||
# Once just for the downloading...
|
||||
@wayback_machine_downloader.download_files
|
||||
end
|
||||
|
||||
def test_nonascii_suburls_already_present
|
||||
@wayback_machine_downloader = WaybackMachineDownloader.new base_url: 'https://en.wikipedia.org/wiki/%C3%84'
|
||||
@wayback_machine_downloader = WaybackMachineDownloader.new(
|
||||
base_url: 'https://en.wikipedia.org/wiki/%C3%84')
|
||||
# ... twice to test the "is already present" case
|
||||
@wayback_machine_downloader.download_files
|
||||
@wayback_machine_downloader.download_files
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user