From d572207122def28b48ec550a466b2c2a8072d704 Mon Sep 17 00:00:00 2001 From: Felipe <41008398+StrawberryMaster@users.noreply.github.com> Date: Thu, 18 Dec 2025 14:41:35 +0000 Subject: [PATCH] Improve URL detection for timestamps + fix retry download links --- lib/wayback_machine_downloader.rb | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 3c8bf30..4700f2d 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -717,6 +717,19 @@ class WaybackMachineDownloader # resolve full URL (handles relative paths like "../img/logo.png") resolved_uri = base_uri + asset_rel_url + # detect if the asset URL is already a Wayback "/web/TIMESTAMP/https://..." embed (TIMESTAMP: 4+ digits plus optional non-slash suffix) + asset_timestamp = parent_timestamp + if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z} + embedded_ts = $1 + begin + orig_uri = URI($2) + resolved_uri = orig_uri + asset_timestamp = embedded_ts.to_i + rescue URI::InvalidURIError + # fall back to original resolved_uri and parent timestamp + end + end + # filter out navigation links (pages) vs assets # skip if extension is empty or looks like an HTML page path = resolved_uri.path @@ -753,7 +766,7 @@ class WaybackMachineDownloader new_file_info = { file_url: asset_wbm_url, - timestamp: parent_timestamp, + timestamp: asset_timestamp, file_id: asset_id }