fix: disable self-observe by default, fix crash causes
- ENABLE_SELF_OBSERVE env var (default false) gates self-observe
- Fix UA condition: empty UA no longer triggers logging
- Add per-IP dedup (5min) to prevent 1M+ row storms
- Remove _cache=null from selfObserve (was busting cache on every hit)
- Add 90-day row pruning on startup + every 6h
- Add enrichCache TTL cleanup every 5min to prevent unbounded memory growth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,3 +4,6 @@ NODE_ENV=production
|
|||||||
# Set a strong random token — all WP sites must send this as: Authorization: Bearer <token>
|
# Set a strong random token — all WP sites must send this as: Authorization: Bearer <token>
|
||||||
# Leave empty to run in open mode (dev only)
|
# Leave empty to run in open mode (dev only)
|
||||||
API_TOKEN=change-me-to-a-long-random-string
|
API_TOKEN=change-me-to-a-long-random-string
|
||||||
|
# Set to 'true' to log bots that hit the API dashboard directly.
|
||||||
|
# Disabled by default — enabling it caused DB bloat from legitimate crawlers.
|
||||||
|
ENABLE_SELF_OBSERVE=false
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ services:
|
|||||||
- DB_PATH=/data/bots.db
|
- DB_PATH=/data/bots.db
|
||||||
- NODE_ENV=production
|
- NODE_ENV=production
|
||||||
- API_TOKEN=${API_TOKEN:-change-me-to-a-long-random-string}
|
- API_TOKEN=${API_TOKEN:-change-me-to-a-long-random-string}
|
||||||
|
# Opt-in: value is interpolated from the host environment / .env, defaulting to off.
- ENABLE_SELF_OBSERVE=${ENABLE_SELF_OBSERVE:-false}
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "wget", "-qO-", "http://localhost:3091/api/v1/health"]
|
test: ["CMD", "wget", "-qO-", "http://localhost:3091/api/v1/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
|||||||
57
server.js
57
server.js
@@ -48,6 +48,22 @@ DB.exec(`
|
|||||||
try { DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`); } catch {}
|
try { DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`); } catch {}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────
|
||||||
|
|
||||||
|
let _cache = null, _cacheTs = 0;
|
||||||
|
|
||||||
|
// ── Row pruning (90 days) ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const PRUNE_AGE = 90 * 86400; // 90 days in seconds
|
||||||
|
|
||||||
|
/**
 * Delete rows from `bots` whose `received_at` (epoch seconds) is older than
 * PRUNE_AGE seconds.
 *
 * Best-effort: a failure is logged instead of thrown, because this function is
 * also invoked from a timer callback, where an uncaught exception would take
 * the whole process down.
 *
 * The stats cache is dropped only when the DELETE reports removed rows, so a
 * no-op prune does not force a needless stats recomputation.
 */
function pruneOldRows() {
  const cutoff = Math.floor(Date.now() / 1000) - PRUNE_AGE;
  try {
    const { changes } = DB.prepare('DELETE FROM bots WHERE received_at < ?').run(cutoff);
    // `!== 0` (rather than `> 0`) still invalidates if the driver does not report a count.
    if (changes !== 0) _cache = null;
  } catch (err) {
    console.error('pruneOldRows failed:', err);
  }
}
|
||||||
|
pruneOldRows(); // on startup
|
||||||
|
setInterval(pruneOldRows, 6 * 3600 * 1000); // every 6 hours
|
||||||
|
|
||||||
// ── Auth ──────────────────────────────────────────────────────────────────────
|
// ── Auth ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const API_TOKEN = (process.env.API_TOKEN || '').trim();
|
const API_TOKEN = (process.env.API_TOKEN || '').trim();
|
||||||
@@ -98,6 +114,14 @@ function parseUA(ua = '') {
|
|||||||
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
|
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
|
||||||
const enrichCache = new Map();
|
const enrichCache = new Map();
|
||||||
|
|
||||||
|
// Clean enrichCache entries whose TTL has expired (runs every 5 minutes)
|
||||||
|
// Periodic sweep (every 5 minutes): evict enrichCache entries whose stored
// value — compared here as a millisecond timestamp — is already in the past.
setInterval(function sweepEnrichCache() {
  const sweepTime = Date.now();
  enrichCache.forEach((expiresAt, addr) => {
    if (expiresAt < sweepTime) enrichCache.delete(addr);
  });
}, 300_000);
|
||||||
|
|
||||||
/**
 * Report whether an address is loopback or in a private range:
 * 10/8, 172.16/12, 192.168/16, 127/8, ::1, or IPv6 unique-local (fc00::/7).
 *
 * IPv4-mapped IPv6 forms (e.g. "::ffff:10.0.0.1") are unwrapped first, since
 * dual-stack sockets can report IPv4 peers that way. Matching is
 * case-insensitive so upper-case hex is handled.
 *
 * @param {string} ip - Textual IP address.
 * @returns {boolean} true when the address is private/loopback.
 */
function isPrivateIP(ip) {
  const addr = String(ip)
    .trim()
    .toLowerCase()
    // Strip the IPv4-mapped prefix only when a dotted quad follows it.
    .replace(/^::ffff:(?=\d+\.\d+\.\d+\.\d+$)/, '');
  return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|::1$|f[cd][0-9a-f]{2}:)/.test(addr);
}
|
||||||
@@ -151,8 +175,7 @@ function allowed(ip, max = 30, win = 60_000) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
|
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
|
||||||
|
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
|
||||||
let _cache = null, _cacheTs = 0;
|
|
||||||
|
|
||||||
function getStats() {
|
function getStats() {
|
||||||
if (_cache && Date.now() - _cacheTs < 30_000) return _cache;
|
if (_cache && Date.now() - _cacheTs < 30_000) return _cache;
|
||||||
@@ -260,21 +283,46 @@ const insertBatch = DB.transaction((siteId, bots) => {
|
|||||||
|
|
||||||
// ── Self-observation (log bots that visit the API directly) ───────────────────
|
// ── Self-observation (log bots that visit the API directly) ───────────────────
|
||||||
//
|
//
|
||||||
// Matches any request whose UA looks like a bot/scanner/tool, or has no UA.
|
// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true').
|
||||||
|
// When enabled: only logs requests where UA is present AND matches bot patterns.
|
||||||
|
// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat.
|
||||||
// Skips /health (Docker probe) and /submit (WP plugin).
|
// Skips /health (Docker probe) and /submit (WP plugin).
|
||||||
// Logged as site_id='self', action='observed' so they're visually distinct.
|
// Logged as site_id='self', action='observed' so they're visually distinct.
|
||||||
|
|
||||||
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
|
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
|
||||||
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);
|
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);
|
||||||
|
|
||||||
|
// Per-IP dedup map for selfObserve: IP -> timestamp of last log
|
||||||
|
const selfSeen = new Map();
|
||||||
|
|
||||||
|
// Clean selfSeen entries older than 10 minutes every 5 minutes
|
||||||
|
// Periodic sweep (every 5 minutes): forget selfSeen entries whose recorded
// timestamp is more than 10 minutes old.
setInterval(function sweepSelfSeen() {
  const oldestKept = Date.now() - 600_000;
  selfSeen.forEach((loggedAt, addr) => {
    if (loggedAt < oldestKept) selfSeen.delete(addr);
  });
}, 300_000);
|
||||||
|
|
||||||
function selfObserve(req, res, next) {
|
function selfObserve(req, res, next) {
|
||||||
|
// Skip entirely if ENABLE_SELF_OBSERVE is not explicitly 'true'
|
||||||
|
if (!process.env.ENABLE_SELF_OBSERVE || process.env.ENABLE_SELF_OBSERVE === 'false') {
|
||||||
|
return next();
|
||||||
|
}
|
||||||
|
|
||||||
if (SKIP_SELF.has(req.path)) return next();
|
if (SKIP_SELF.has(req.path)) return next();
|
||||||
|
|
||||||
const ua = req.headers['user-agent'] || '';
|
const ua = req.headers['user-agent'] || '';
|
||||||
if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip
|
// Only log when UA is present AND matches known bot patterns
|
||||||
|
if (!ua || !BOT_UA_RE.test(ua)) return next();
|
||||||
|
|
||||||
const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
|
const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
|
||||||
|| req.socket.remoteAddress || '?';
|
|| req.socket.remoteAddress || '?';
|
||||||
|
|
||||||
|
// Per-IP dedup: skip if this IP was already logged in the last 5 minutes
|
||||||
|
const lastSeen = selfSeen.get(ip);
|
||||||
|
if (lastSeen && Date.now() - lastSeen < 300_000) return next();
|
||||||
|
selfSeen.set(ip, Date.now());
|
||||||
|
|
||||||
const now = Math.floor(Date.now() / 1000);
|
const now = Math.floor(Date.now() / 1000);
|
||||||
const fam = parseUA(ua);
|
const fam = parseUA(ua);
|
||||||
|
|
||||||
@@ -282,7 +330,6 @@ function selfObserve(req, res, next) {
|
|||||||
const r = stmtIns.run(
|
const r = stmtIns.run(
|
||||||
now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', '', ua.slice(0, 300)
|
now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', '', ua.slice(0, 300)
|
||||||
);
|
);
|
||||||
_cache = null;
|
|
||||||
setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
|
setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
|
||||||
} catch {}
|
} catch {}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user