diff --git a/.env.example b/.env.example index a2bc966..362037a 100644 --- a/.env.example +++ b/.env.example @@ -4,3 +4,6 @@ NODE_ENV=production # Set a strong random token — all WP sites must send this as: Authorization: Bearer # Leave empty to run in open mode (dev only) API_TOKEN=change-me-to-a-long-random-string +# Set to 'true' to log bots that hit the API dashboard directly. +# Disabled by default — enabling it caused DB bloat from legitimate crawlers. +ENABLE_SELF_OBSERVE=false diff --git a/docker-compose.yml b/docker-compose.yml index a43ce66..c43bd66 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,7 @@ services: - DB_PATH=/data/bots.db - NODE_ENV=production - API_TOKEN=${API_TOKEN:-change-me-to-a-long-random-string} + - ENABLE_SELF_OBSERVE=${ENABLE_SELF_OBSERVE:-false} healthcheck: test: ["CMD", "wget", "-qO-", "http://localhost:3091/api/v1/health"] interval: 30s diff --git a/server.js b/server.js index 46944fd..52628b7 100644 --- a/server.js +++ b/server.js @@ -48,6 +48,22 @@ DB.exec(` try { DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`); } catch {} }); +// ── Stats cache (declared early so pruneOldRows can reference it) ───────────── + +let _cache = null, _cacheTs = 0; + +// ── Row pruning (90 days) ───────────────────────────────────────────────────── + +const PRUNE_AGE = 90 * 86400; // 90 days in seconds + +function pruneOldRows() { + const cutoff = Math.floor(Date.now() / 1000) - PRUNE_AGE; + DB.prepare('DELETE FROM bots WHERE received_at < ?').run(cutoff); + _cache = null; +} +pruneOldRows(); // on startup +setInterval(pruneOldRows, 6 * 3600 * 1000); // every 6 hours + // ── Auth ────────────────────────────────────────────────────────────────────── const API_TOKEN = (process.env.API_TOKEN || '').trim(); @@ -98,6 +114,14 @@ function parseUA(ua = '') { const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? 
WHERE id=?'); const enrichCache = new Map(); +// Clean expired enrichCache entries every 5 min (NOTE(review): assumes map values are expiry timestamps — confirm enrichIP stores them that way) +setInterval(() => { + const now = Date.now(); + for (const [ip, expiry] of enrichCache) { + if (now > expiry) enrichCache.delete(ip); + } +}, 5 * 60 * 1000); + function isPrivateIP(ip) { return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|::1$|fc|fd)/.test(ip); } @@ -151,8 +175,7 @@ function allowed(ip, max = 30, win = 60_000) { } // ── Stats cache (30s TTL) ───────────────────────────────────────────────────── - -let _cache = null, _cacheTs = 0; +// (_cache and _cacheTs are declared earlier, before pruneOldRows) function getStats() { if (_cache && Date.now() - _cacheTs < 30_000) return _cache; @@ -260,21 +283,46 @@ const insertBatch = DB.transaction((siteId, bots) => { // ── Self-observation (log bots that visit the API directly) ─────────────────── // -// Matches any request whose UA looks like a bot/scanner/tool, or has no UA. +// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true'). +// When enabled: only logs requests where UA is present AND matches bot patterns. +// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat. // Skips /health (Docker probe) and /submit (WP plugin). // Logged as site_id='self', action='observed' so they're visually distinct. 
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i; const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']); +// Per-IP dedup map for selfObserve: IP -> timestamp of last log +const selfSeen = new Map(); + +// Clean selfSeen entries older than 10 minutes every 5 minutes +setInterval(() => { + const cutoff = Date.now() - 10 * 60 * 1000; + for (const [ip, t] of selfSeen) { + if (t < cutoff) selfSeen.delete(ip); + } +}, 5 * 60 * 1000); + function selfObserve(req, res, next) { + // Skip entirely if ENABLE_SELF_OBSERVE is not explicitly 'true' + if (process.env.ENABLE_SELF_OBSERVE !== 'true') { + return next(); + } + if (SKIP_SELF.has(req.path)) return next(); const ua = req.headers['user-agent'] || ''; - if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip + // Only log when UA is present AND matches known bot patterns + if (!ua || !BOT_UA_RE.test(ua)) return next(); const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim() || req.socket.remoteAddress || '?'; + + // Per-IP dedup: skip if this IP was already logged in the last 5 minutes + const lastSeen = selfSeen.get(ip); + if (lastSeen && Date.now() - lastSeen < 300_000) return next(); + selfSeen.set(ip, Date.now()); + const now = Math.floor(Date.now() / 1000); const fam = parseUA(ua); @@ -282,7 +330,6 @@ function selfObserve(req, res, next) { const r = stmtIns.run( now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', '', ua.slice(0, 300) ); - _cache = null; setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip)); } catch {}