feat: self-observe — record bots that visit the API directly

Add selfObserve middleware that detects bot/scanner User-Agents (or
requests with no UA) hitting any path except /api/v1/health and
/api/v1/submit, and logs them to the bots table as site_id='self',
action='observed'.

Dashboard shows these with a cyan [LOCAL] badge and colours 'observed'
action in cyan to distinguish them from WordPress-reported blocks.
Geo-enrichment runs async on self-observed entries too.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 08:37:52 +02:00
parent a3920cacd5
commit a4464214af
2 changed files with 40 additions and 2 deletions

View File

@@ -251,6 +251,10 @@ main {
.feed-action { font-weight: bold; } .feed-action { font-weight: bold; }
.feed-action.blocked { color: var(--red); } .feed-action.blocked { color: var(--red); }
.feed-action.rate_limited { color: var(--amber); } .feed-action.rate_limited { color: var(--amber); }
/* 'observed' action label is drawn in cyan (blocked is red, rate_limited is amber). */
.feed-action.observed { color: var(--cyan2); }
/* Small pill badge: background-coloured text on a cyan fill, used for the LOCAL tag. */
.feed-local { font-size: 9px; font-weight: 700; letter-spacing: .5px;
color: var(--bg); background: var(--cyan2); border-radius: 3px;
padding: 1px 5px; margin-left: 4px; vertical-align: middle; }
.feed-reason { color: var(--dim); font-size: 10px; } .feed-reason { color: var(--dim); font-size: 10px; }
.feed-geo { color: var(--dim); font-size: 10px; } .feed-geo { color: var(--dim); font-size: 10px; }
@@ -661,6 +665,7 @@ function addRow(row) {
el.className = 'feed-row'; el.className = 'feed-row';
const f = flag(row.country||''); const f = flag(row.country||'');
const action = row.action||'blocked'; const action = row.action||'blocked';
const isLocal = row.site_id === 'self';
el.innerHTML = ` el.innerHTML = `
<span class="feed-ts">${fmtTime(row.received_at)}</span> <span class="feed-ts">${fmtTime(row.received_at)}</span>
<span class="feed-ip">${esc(row.ip_masked||row.ip||'?')}</span> <span class="feed-ip">${esc(row.ip_masked||row.ip||'?')}</span>
@@ -668,6 +673,7 @@ function addRow(row) {
${f?`<span class="feed-geo">${f} ${esc(row.country||'')}</span><br>`:''} ${f?`<span class="feed-geo">${f} ${esc(row.country||'')}</span><br>`:''}
<span class="feed-bot">${esc(row.bot_type||'?')}</span> <span class="feed-bot">${esc(row.bot_type||'?')}</span>
<span class="feed-action ${action}"> [${esc(action)}]</span> <span class="feed-action ${action}"> [${esc(action)}]</span>
${isLocal?'<span class="feed-local">LOCAL</span>':''}
<br><span class="feed-reason">${esc(row.reason||row.ua_family||'')}</span> <br><span class="feed-reason">${esc(row.reason||row.ua_family||'')}</span>
</span>`; </span>`;
feedEl.prepend(el); feedEl.prepend(el);

View File

@@ -191,7 +191,7 @@ function getStats() {
GROUP BY ua_family ORDER BY hits DESC LIMIT 8 GROUP BY ua_family ORDER BY hits DESC LIMIT 8
`).all(now - 2592000), `).all(now - 2592000),
recent: DB.prepare(` recent: DB.prepare(`
SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
FROM bots ORDER BY id DESC LIMIT 40 FROM bots ORDER BY id DESC LIMIT 40
`).all(), `).all(),
hourly: DB.prepare(` hourly: DB.prepare(`
@@ -211,7 +211,7 @@ let lastId = DB.prepare('SELECT MAX(id) id FROM bots').get().id || 0;
setInterval(() => { setInterval(() => {
if (!sseClients.size) return; if (!sseClients.size) return;
const rows = DB.prepare('SELECT * FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20').all(lastId); const rows = DB.prepare('SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20').all(lastId);
if (!rows.length) return; if (!rows.length) return;
lastId = rows.at(-1).id; lastId = rows.at(-1).id;
const msg = `data: ${JSON.stringify(rows)}\n\n`; const msg = `data: ${JSON.stringify(rows)}\n\n`;
@@ -252,9 +252,41 @@ const insertBatch = DB.transaction((siteId, bots) => {
return ids; return ids;
}); });
// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Matches any request whose UA looks like a bot/scanner/tool, or has no UA.
// Skips /health (Docker probe) and /submit (WP plugin).
// Logged as site_id='self', action='observed' so they're visually distinct.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);
function selfObserve(req, res, next) {
if (SKIP_SELF.has(req.path)) return next();
const ua = req.headers['user-agent'] || '';
if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip
const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
|| req.socket.remoteAddress || '?';
const now = Math.floor(Date.now() / 1000);
const fam = parseUA(ua);
try {
const r = stmtIns.run(
now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', ''
);
_cache = null;
setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
} catch {}
next();
}
// ── Routes ──────────────────────────────────────────────────────────────────── // ── Routes ────────────────────────────────────────────────────────────────────
app.use(express.json({ limit: '128kb' })); app.use(express.json({ limit: '128kb' }));
// Mounted ahead of express.static so requests for static files pass through selfObserve too.
app.use(selfObserve);
app.use(express.static(path.join(__dirname, 'public'))); app.use(express.static(path.join(__dirname, 'public')));
app.post('/api/v1/submit', requireToken, (req, res) => { app.post('/api/v1/submit', requireToken, (req, res) => {