fix: disable self-observe by default, fix crash causes

- ENABLE_SELF_OBSERVE env var (default false) gates self-observe
- Fix UA condition: empty UA no longer triggers logging
- Add per-IP dedup (5min) to prevent 1M+ row storms
- Remove _cache=null from selfObserve (was busting cache on every hit)
- Add 90-day row pruning on startup + every 6h
- Add enrichCache TTL cleanup every 5min to prevent unbounded memory growth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 10:00:06 +02:00
parent 379e993384
commit 5a823501f2
3 changed files with 56 additions and 5 deletions

View File

@@ -4,3 +4,6 @@ NODE_ENV=production
# Set a strong random token — all WP sites must send this as: Authorization: Bearer <token>
# Leave empty to run in open mode (dev only)
API_TOKEN=change-me-to-a-long-random-string
# Set to 'true' to log bots that hit the API dashboard directly.
# Disabled by default — enabling it caused DB bloat from legitimate crawlers.
ENABLE_SELF_OBSERVE=false

View File

@@ -12,6 +12,7 @@ services:
- DB_PATH=/data/bots.db
- NODE_ENV=production
- API_TOKEN=${API_TOKEN:-change-me-to-a-long-random-string}
# Interpolated like API_TOKEN above so the value can be set from the environment/.env; stays 'false' when unset.
- ENABLE_SELF_OBSERVE=${ENABLE_SELF_OBSERVE:-false}
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3091/api/v1/health"]
interval: 30s

View File

@@ -48,6 +48,22 @@ DB.exec(`
try { DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`); } catch {}
});
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────
// Cached stats value (null = nothing cached) and the Date.now() reading taken when it was stored.
let _cache = null;
let _cacheTs = 0;
// ── Row pruning (90 days) ─────────────────────────────────────────────────────
const PRUNE_AGE = 90 * 86400; // retention window: 90 days, expressed in seconds

/**
 * Delete every row of `bots` whose `received_at` is older than the retention
 * window, then drop the cached stats value.
 *
 * `received_at` is compared against a Unix timestamp in seconds.
 */
function pruneOldRows() {
  const nowSeconds = Math.floor(Date.now() / 1000);
  const oldestKept = nowSeconds - PRUNE_AGE;
  const deleteStmt = DB.prepare('DELETE FROM bots WHERE received_at < ?');
  deleteStmt.run(oldestKept);
  // Reset the cache so it is not served with pre-prune contents.
  _cache = null;
}
// Prune once when this module loads, then again every 6 hours.
pruneOldRows();
// unref(): this housekeeping timer must not be the only thing keeping the
// process alive (e.g. after the HTTP server has been closed for shutdown).
setInterval(pruneOldRows, 6 * 3600 * 1000).unref();
// ── Auth ──────────────────────────────────────────────────────────────────────
// Token read from the environment with surrounding whitespace removed; '' when the variable is unset.
const API_TOKEN = (process.env.API_TOKEN ?? '').trim();
@@ -98,6 +114,14 @@ function parseUA(ua = '') {
// Prepared statement: sets the country and asn columns on the bots row with the given id.
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
// Keyed by IP. The sweeper below treats each value as an expiry time in
// milliseconds (NOTE(review): confirm against the code that writes entries).
const enrichCache = new Map();

/**
 * Remove every enrichCache entry whose stored expiry time has passed.
 * An entry is expired when `now` is strictly greater than its stored value.
 *
 * @param {number} [now=Date.now()] - Current time in milliseconds.
 * @returns {number} How many entries were removed.
 */
function sweepEnrichCache(now = Date.now()) {
  let removed = 0;
  // Deleting the current entry while iterating a Map is safe in JavaScript.
  for (const [ip, expiry] of enrichCache) {
    if (now > expiry) {
      enrichCache.delete(ip);
      removed += 1;
    }
  }
  return removed;
}

// Sweep every 5 minutes so the map cannot grow without bound. unref() keeps
// this housekeeping timer from holding the process open on its own.
setInterval(() => sweepEnrichCache(), 5 * 60 * 1000).unref();
/**
 * Report whether an address string is loopback or in a private range:
 * 10/8, 172.16/12, 192.168/16, 127/8, ::1, or an IPv6 address starting with
 * fc/fd (unique-local).
 *
 * Also accepts the IPv4-mapped IPv6 form ("::ffff:a.b.c.d"), which dual-stack
 * listeners commonly report for IPv4 peers, and matches the hex prefixes
 * case-insensitively.
 *
 * @param {string} ip - Textual IP address.
 * @returns {boolean} true when the address is private or loopback.
 */
function isPrivateIP(ip) {
  // Strip the IPv4-mapped prefix so the embedded IPv4 address is what gets tested.
  const addr = String(ip).replace(/^::ffff:/i, '');
  return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|::1$|fc|fd)/i.test(addr);
}
@@ -151,8 +175,7 @@ function allowed(ip, max = 30, win = 60_000) {
}
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
let _cache = null, _cacheTs = 0;
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
function getStats() {
if (_cache && Date.now() - _cacheTs < 30_000) return _cache;
@@ -260,21 +283,46 @@ const insertBatch = DB.transaction((siteId, bots) => {
// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Matches any request whose UA looks like a bot/scanner/tool, or has no UA.
// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true').
// When enabled: only logs requests where UA is present AND matches bot patterns.
// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat.
// Skips /health (Docker probe) and /submit (WP plugin).
// Logged as site_id='self', action='observed' so they're visually distinct.
// Case-insensitive User-Agent substrings treated as bot / crawler / scanner / HTTP-tool traffic.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;

// Request paths that self-observation never logs.
const SKIP_SELF = new Set([
  '/api/v1/health',
  '/api/v1/submit',
]);
// Per-IP dedup map for selfObserve: IP -> Date.now() timestamp of the last logged hit.
const selfSeen = new Map();

// Entries older than this are dropped by the sweeper (10 minutes).
const SELF_SEEN_MAX_AGE_MS = 10 * 60 * 1000;

/**
 * Remove selfSeen entries whose timestamp is more than 10 minutes before `now`.
 *
 * @param {number} [now=Date.now()] - Current time in milliseconds.
 * @returns {number} How many entries were removed.
 */
function sweepSelfSeen(now = Date.now()) {
  const cutoff = now - SELF_SEEN_MAX_AGE_MS;
  let removed = 0;
  // Deleting the current entry while iterating a Map is safe in JavaScript.
  for (const [ip, seenAt] of selfSeen) {
    if (seenAt < cutoff) {
      selfSeen.delete(ip);
      removed += 1;
    }
  }
  return removed;
}

// Sweep every 5 minutes. unref() keeps this housekeeping timer from holding
// the process open on its own.
setInterval(() => sweepSelfSeen(), 5 * 60 * 1000).unref();
function selfObserve(req, res, next) {
// Skip entirely if ENABLE_SELF_OBSERVE is not explicitly 'true'
if (!process.env.ENABLE_SELF_OBSERVE || process.env.ENABLE_SELF_OBSERVE === 'false') {
return next();
}
if (SKIP_SELF.has(req.path)) return next();
const ua = req.headers['user-agent'] || '';
if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip
// Only log when UA is present AND matches known bot patterns
if (!ua || !BOT_UA_RE.test(ua)) return next();
const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
|| req.socket.remoteAddress || '?';
// Per-IP dedup: skip if this IP was already logged in the last 5 minutes
const lastSeen = selfSeen.get(ip);
if (lastSeen && Date.now() - lastSeen < 300_000) return next();
selfSeen.set(ip, Date.now());
const now = Math.floor(Date.now() / 1000);
const fam = parseUA(ua);
@@ -282,7 +330,6 @@ function selfObserve(req, res, next) {
const r = stmtIns.run(
now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', '', ua.slice(0, 300)
);
_cache = null;
setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
} catch {}