mirror of
https://github.com/StrawberryMaster/wayback-machine-downloader.git
synced 2025-12-29 16:16:06 +00:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2243958643 | ||
|
|
e25732e19c | ||
|
|
46450d7c20 | ||
|
|
019534794c | ||
|
|
7142be5c16 |
@@ -1,6 +1,6 @@
|
||||
# Wayback Machine Downloader
|
||||
|
||||
[wayback_machine_downloader on RubyGems](https://rubygems.org/gems/wayback_machine_downloader/)
|
||||

|
||||
|
||||
This is a fork of the [Wayback Machine Downloader](https://github.com/hartator/wayback-machine-downloader). With this, you can download a website from the Internet Archive Wayback Machine.
|
||||
|
||||
|
||||
@@ -111,7 +111,7 @@ class WaybackMachineDownloader
|
||||
|
||||
include ArchiveAPI
|
||||
|
||||
VERSION = "2.3.2"
|
||||
VERSION = "2.3.3"
|
||||
DEFAULT_TIMEOUT = 30
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 2
|
||||
@@ -426,7 +426,7 @@ class WaybackMachineDownloader
|
||||
logger
|
||||
end
|
||||
|
||||
def download_with_retry(file_path, file_url, file_timestamp, connection)
|
||||
def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
|
||||
retries = 0
|
||||
begin
|
||||
wayback_url = if @rewritten
|
||||
@@ -450,9 +450,17 @@ class WaybackMachineDownloader
|
||||
file.write(response.body)
|
||||
end
|
||||
end
|
||||
when Net::HTTPRedirection
|
||||
raise "Too many redirects for #{file_url}" if redirect_count >= 2
|
||||
location = response['location']
|
||||
@logger.warn("Redirect found for #{file_url} -> #{location}")
|
||||
return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
|
||||
when Net::HTTPTooManyRequests
|
||||
sleep(RATE_LIMIT * 2)
|
||||
raise "Rate limited, retrying..."
|
||||
when Net::HTTPNotFound
|
||||
@logger.warn("File not found, skipping: #{file_url}")
|
||||
return
|
||||
else
|
||||
raise "HTTP Error: #{response.code} #{response.message}"
|
||||
end
|
||||
|
||||
@@ -10,7 +10,9 @@ module ArchiveAPI
|
||||
|
||||
begin
|
||||
response = http.get(request_url)
|
||||
json = JSON.parse(response.body)
|
||||
body = response.body.to_s.strip
|
||||
return [] if body.empty?
|
||||
json = JSON.parse(body)
|
||||
|
||||
# Check if the response contains the header ["timestamp", "original"]
|
||||
json.shift if json.first == ["timestamp", "original"]
|
||||
|
||||
@@ -1,136 +1,69 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
module TidyBytes
|
||||
# Using a frozen array so we have a O(1) lookup time
|
||||
CP1252_MAP = Array.new(160) do |i|
|
||||
case i
|
||||
when 128 then [226, 130, 172]
|
||||
when 130 then [226, 128, 154]
|
||||
when 131 then [198, 146]
|
||||
when 132 then [226, 128, 158]
|
||||
when 133 then [226, 128, 166]
|
||||
when 134 then [226, 128, 160]
|
||||
when 135 then [226, 128, 161]
|
||||
when 136 then [203, 134]
|
||||
when 137 then [226, 128, 176]
|
||||
when 138 then [197, 160]
|
||||
when 139 then [226, 128, 185]
|
||||
when 140 then [197, 146]
|
||||
when 142 then [197, 189]
|
||||
when 145 then [226, 128, 152]
|
||||
when 146 then [226, 128, 153]
|
||||
when 147 then [226, 128, 156]
|
||||
when 148 then [226, 128, 157]
|
||||
when 149 then [226, 128, 162]
|
||||
when 150 then [226, 128, 147]
|
||||
when 151 then [226, 128, 148]
|
||||
when 152 then [203, 156]
|
||||
when 153 then [226, 132, 162]
|
||||
when 154 then [197, 161]
|
||||
when 155 then [226, 128, 186]
|
||||
when 156 then [197, 147]
|
||||
when 158 then [197, 190]
|
||||
when 159 then [197, 184]
|
||||
# precomputing CP1252 to UTF-8 mappings for bytes 128-159
|
||||
CP1252_MAP = (128..159).map do |byte|
|
||||
case byte
|
||||
when 128 then [226, 130, 172] # EURO SIGN
|
||||
when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
|
||||
when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
|
||||
when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
|
||||
when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
|
||||
when 134 then [226, 128, 160] # DAGGER
|
||||
when 135 then [226, 128, 161] # DOUBLE DAGGER
|
||||
when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
when 137 then [226, 128, 176] # PER MILLE SIGN
|
||||
when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
|
||||
when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
|
||||
when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
|
||||
when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
|
||||
when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
|
||||
when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
|
||||
when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
|
||||
when 149 then [226, 128, 162] # BULLET
|
||||
when 150 then [226, 128, 147] # EN DASH
|
||||
when 151 then [226, 128, 148] # EM DASH
|
||||
when 152 then [203, 156] # SMALL TILDE
|
||||
when 153 then [226, 132, 162] # TRADE MARK SIGN
|
||||
when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
|
||||
when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
when 156 then [197, 147] # LATIN SMALL LIGATURE OE
|
||||
when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
|
||||
when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
|
||||
end
|
||||
end.freeze
|
||||
|
||||
# precomputing all possible byte conversions
|
||||
# Precomputed lookup table: byte value (0..255) -> UTF-8 String, treating the
# byte as CP-1252 / ISO-8859-1.
#
#   0..127    the ASCII character itself
#   128..159  the byte sequence listed in CP1252_MAP; bytes that CP1252_MAP
#             leaves unmapped (nil) become an empty string, so an entry can
#             always be appended or substituted without a nil check
#   160..191  the two-byte sequence 194, byte
#   192..255  the two-byte sequence 195, byte - 64
#
# Every entry is a frozen String tagged as UTF-8, so it can be appended to a
# UTF-8 buffer or returned from a String#scrub block without raising an
# encoding-compatibility error.
CP1252_TO_UTF8 = Array.new(256) do |b|
  utf8_bytes =
    if b < 128
      [b]
    elsif b < 160
      # nil (undefined in CP-1252) -> no output bytes
      CP1252_MAP[b - 128] || []
    elsif b < 192
      [194, b]
    else
      [195, b - 64]
    end

  utf8_bytes.pack('C*').force_encoding(Encoding::UTF_8).freeze
end.freeze
|
||||
|
||||
def self.included(base)
|
||||
base.class_eval do
|
||||
private
|
||||
|
||||
def tidy_byte(byte)
|
||||
if byte < 160
|
||||
CP1252_MAP[byte]
|
||||
else
|
||||
byte < 192 ? [194, byte] : [195, byte - 64]
|
||||
end
|
||||
end
|
||||
|
||||
public
|
||||
|
||||
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
||||
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
||||
# CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
||||
# always work.
|
||||
#
|
||||
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
||||
# encoding is CP-1252 or ISO-8859-1.
|
||||
|
||||
def tidy_bytes(force = false)
|
||||
return nil if empty?
|
||||
|
||||
if force
|
||||
buffer = String.new(capacity: bytesize)
|
||||
each_byte do |b|
|
||||
cleaned = tidy_byte(b)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
end
|
||||
each_byte { |b| buffer << CP1252_TO_UTF8[b] }
|
||||
return buffer.force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
|
||||
buffer = String.new(capacity: bytesize)
|
||||
bytes = each_byte.to_a
|
||||
conts_expected = 0
|
||||
last_lead = 0
|
||||
|
||||
bytes.each_with_index do |byte, i|
|
||||
if byte < 128 # ASCII
|
||||
buffer << byte
|
||||
next
|
||||
end
|
||||
|
||||
if byte > 244 || byte > 240 # invalid bytes
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
next
|
||||
end
|
||||
|
||||
is_cont = byte > 127 && byte < 192
|
||||
is_lead = byte > 191 && byte < 245
|
||||
|
||||
if is_cont
|
||||
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
||||
if conts_expected == 0
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
else
|
||||
buffer << byte
|
||||
conts_expected -= 1
|
||||
end
|
||||
else
|
||||
if conts_expected > 0
|
||||
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
||||
# the leading byte.
|
||||
(1..(i - last_lead)).each do |j|
|
||||
back_byte = bytes[i - j]
|
||||
cleaned = tidy_byte(back_byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
end
|
||||
conts_expected = 0
|
||||
end
|
||||
|
||||
if is_lead
|
||||
# Final byte is leading? Clean it.
|
||||
if i == bytes.length - 1
|
||||
cleaned = tidy_byte(byte)
|
||||
buffer << cleaned.pack("C*") if cleaned
|
||||
else
|
||||
# Valid leading byte? Expect continuations determined by position of
|
||||
# first zero bit, with max of 3.
|
||||
buffer << byte
|
||||
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
||||
last_lead = i
|
||||
end
|
||||
end
|
||||
end
|
||||
begin
|
||||
encode('UTF-8')
|
||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
||||
buffer = String.new(capacity: bytesize)
|
||||
scrub { |b| CP1252_TO_UTF8[b.ord] }
|
||||
end
|
||||
|
||||
buffer.force_encoding(Encoding::UTF_8)
|
||||
rescue
|
||||
nil
|
||||
end
|
||||
|
||||
# Tidy bytes in place.
|
||||
def tidy_bytes!(force = false)
|
||||
result = tidy_bytes(force)
|
||||
result ? replace(result) : self
|
||||
|
||||
Reference in New Issue
Block a user