feat: self-observe — record bots that visit the API directly

Add selfObserve middleware that detects bot/scanner User-Agents (or
requests with no UA) hitting any endpoint except /health and /submit,
and logs them to the bots table as site_id='self', action='observed'.

The dashboard shows these with a cyan [LOCAL] badge and colours the
'observed' action in cyan to distinguish them from WordPress-reported blocks.
Geo-enrichment runs async on self-observed entries too.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 08:37:52 +02:00
parent a3920cacd5
commit a4464214af
2 changed files with 40 additions and 2 deletions

View File

@@ -191,7 +191,7 @@ function getStats() {
GROUP BY ua_family ORDER BY hits DESC LIMIT 8
`).all(now - 2592000),
recent: DB.prepare(`
SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family
SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
FROM bots ORDER BY id DESC LIMIT 40
`).all(),
hourly: DB.prepare(`
@@ -211,7 +211,7 @@ let lastId = DB.prepare('SELECT MAX(id) id FROM bots').get().id || 0;
setInterval(() => {
if (!sseClients.size) return;
const rows = DB.prepare('SELECT * FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20').all(lastId);
const rows = DB.prepare('SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20').all(lastId);
if (!rows.length) return;
lastId = rows.at(-1).id;
const msg = `data: ${JSON.stringify(rows)}\n\n`;
@@ -252,9 +252,41 @@ const insertBatch = DB.transaction((siteId, bots) => {
return ids;
});
// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Matches any request whose UA looks like a bot/scanner/tool, or has no UA.
// Skips /health (Docker probe) and /submit (WP plugin).
// Logged as site_id='self', action='observed' so they're visually distinct.
// User-Agent substrings (case-insensitive) that mark a request as a bot, scanner or HTTP tool.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
// Exact request paths that are never self-observed.
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);

/**
 * Express middleware: records bot-like visitors that hit this server directly.
 *
 * A request is recorded when its User-Agent is missing/empty or matches
 * BOT_UA_RE, unless its path is listed in SKIP_SELF. Recording is strictly
 * best-effort: any failure while classifying or inserting is logged and the
 * request continues normally, so `next()` is always called exactly once.
 *
 * @param {object} req - Express request; reads `path`, `headers` and `socket.remoteAddress`.
 * @param {object} res - Express response (unused).
 * @param {Function} next - Express continuation callback.
 * @returns {*} Whatever `next()` returns on the early-exit paths, otherwise undefined.
 */
function selfObserve(req, res, next) {
  if (SKIP_SELF.has(req.path)) return next();

  const ua = req.headers['user-agent'] || '';
  if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip

  // Prefer the first X-Forwarded-For hop, then the socket peer address,
  // then a '?' placeholder when neither is available.
  const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '?';
  const now = Math.floor(Date.now() / 1000);

  try {
    // parseUA lives inside the try so that a classification failure cannot
    // abort the request any more than a failed insert can.
    const fam = parseUA(ua);
    // NOTE(review): argument order must match stmtIns' placeholders, which are
    // defined elsewhere; the UA family is passed twice (4th and 7th values).
    const r = stmtIns.run(
      now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', ''
    );
    // Presumably invalidates cached stats so the new row shows up — confirm
    // against where _cache is read.
    _cache = null;
    // Enrichment is deferred so it does not delay this request.
    setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
  } catch (err) {
    // Best-effort logging: report the failure instead of hiding it, but never
    // let it affect the response.
    console.error('selfObserve: failed to record visitor:', err);
  }
  next();
}
// ── Routes ────────────────────────────────────────────────────────────────────
// Parse JSON request bodies, rejecting payloads larger than 128kb.
app.use(express.json({ limit: '128kb' }));
// Registered ahead of the static handler and the routes below, so selfObserve
// runs for requests to static files as well as for API endpoints.
app.use(selfObserve);
// Serve files from the "public" directory located next to this script.
app.use(express.static(path.join(__dirname, 'public')));
app.post('/api/v1/submit', requireToken, (req, res) => {