fix: disable self-observe by default, fix crash causes
- ENABLE_SELF_OBSERVE env var (default false) gates self-observe
- Fix UA condition: empty UA no longer triggers logging
- Add per-IP dedup (5 min) to prevent 1M+ row storms
- Remove _cache=null from selfObserve (was busting cache on every hit)
- Add 90-day row pruning on startup + every 6h
- Add enrichCache TTL cleanup every 5 min to prevent unbounded memory growth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,3 +4,6 @@ NODE_ENV=production
|
||||
# Set a strong random token — all WP sites must send this as: Authorization: Bearer <token>
|
||||
# Leave empty to run in open mode (dev only)
|
||||
API_TOKEN=change-me-to-a-long-random-string
|
||||
# Set to 'true' to log bots that hit the API dashboard directly.
|
||||
# Disabled by default — enabling it caused DB bloat from legitimate crawlers.
|
||||
ENABLE_SELF_OBSERVE=false
|
||||
|
||||
@@ -12,6 +12,7 @@ services:
|
||||
- DB_PATH=/data/bots.db
|
||||
- NODE_ENV=production
|
||||
- API_TOKEN=${API_TOKEN:-change-me-to-a-long-random-string}
|
||||
- ENABLE_SELF_OBSERVE=false
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:3091/api/v1/health"]
|
||||
interval: 30s
|
||||
|
||||
57
server.js
57
server.js
@@ -48,6 +48,22 @@ DB.exec(`
|
||||
try { DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`); } catch {}
|
||||
});
|
||||
|
||||
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────

let _cache = null, _cacheTs = 0;

// ── Row pruning (90 days) ─────────────────────────────────────────────────────

const PRUNE_AGE = 90 * 86400; // 90 days in seconds

/**
 * Delete bot rows older than PRUNE_AGE and invalidate the 30s stats cache
 * so the next stats request reflects the pruned data.
 * Runs once at startup and then every 6 hours.
 */
function pruneOldRows() {
  const cutoff = Math.floor(Date.now() / 1000) - PRUNE_AGE;
  const { changes } = DB.prepare('DELETE FROM bots WHERE received_at < ?').run(cutoff);
  // Only bust the cache when rows were actually removed — avoids needlessly
  // recomputing stats every 6 hours on an already-pruned database.
  if (changes > 0) _cache = null;
}
pruneOldRows(); // on startup
// unref() so this housekeeping timer never keeps the process alive on shutdown.
setInterval(pruneOldRows, 6 * 3600 * 1000).unref(); // every 6 hours
|
||||
|
||||
// ── Auth ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
const API_TOKEN = (process.env.API_TOKEN || '').trim();
|
||||
@@ -98,6 +114,14 @@ function parseUA(ua = '') {
|
||||
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
// ip -> expiry timestamp (ms); presumably populated by enrichIP — the sweep
// below treats values as expiry times (NOTE(review): confirm against enrichIP).
const enrichCache = new Map();

// Clean enrichCache entries whose TTL has expired (runs every 5 minutes)
// to keep memory bounded. unref() so this housekeeping timer never
// blocks process exit.
setInterval(() => {
  const now = Date.now();
  for (const [ip, expiry] of enrichCache) {
    if (now > expiry) enrichCache.delete(ip);
  }
}, 5 * 60 * 1000).unref();
|
||||
|
||||
/**
 * Heuristic check for private / loopback / unique-local addresses.
 * Covers the RFC 1918 IPv4 ranges, IPv4 and IPv6 loopback, and IPv6
 * fc00::/7 (the fc/fd checks are bare prefix tests, as in the original).
 */
function isPrivateIP(ip) {
  const privateRanges = [
    /^10\./,                      // 10.0.0.0/8
    /^192\.168\./,                // 192.168.0.0/16
    /^172\.(1[6-9]|2\d|3[01])\./, // 172.16.0.0/12
    /^127\./,                     // IPv4 loopback
    /^::1$/,                      // IPv6 loopback
    /^fc/,                        // IPv6 unique-local (loose prefix match)
    /^fd/,
  ];
  return privateRanges.some((re) => re.test(ip));
}
|
||||
@@ -151,8 +175,7 @@ function allowed(ip, max = 30, win = 60_000) {
|
||||
}
|
||||
|
||||
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
|
||||
|
||||
let _cache = null, _cacheTs = 0;
|
||||
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
|
||||
|
||||
function getStats() {
|
||||
if (_cache && Date.now() - _cacheTs < 30_000) return _cache;
|
||||
@@ -260,21 +283,46 @@ const insertBatch = DB.transaction((siteId, bots) => {
|
||||
|
||||
// ── Self-observation (log bots that visit the API directly) ───────────────────
|
||||
//
|
||||
// Logs only requests whose UA matches known bot/scanner/tool patterns (an empty UA is skipped).
|
||||
// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true').
|
||||
// When enabled: only logs requests where UA is present AND matches bot patterns.
|
||||
// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat.
|
||||
// Skips /health (Docker probe) and /submit (WP plugin).
|
||||
// Logged as site_id='self', action='observed' so they're visually distinct.
|
||||
|
||||
// UA substrings identifying crawlers, scanners, and HTTP tooling.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
// Paths self-observation must never log: Docker healthcheck probe and the WP plugin endpoint.
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);

// Per-IP dedup map for selfObserve: IP -> timestamp (ms) of last log
const selfSeen = new Map();

// Sweep selfSeen entries older than 10 minutes every 5 minutes so the map
// stays bounded. unref() so this housekeeping timer never blocks process exit.
setInterval(() => {
  const cutoff = Date.now() - 10 * 60 * 1000;
  for (const [ip, t] of selfSeen) {
    if (t < cutoff) selfSeen.delete(ip);
  }
}, 5 * 60 * 1000).unref();
|
||||
|
||||
function selfObserve(req, res, next) {
|
||||
// Skip entirely if ENABLE_SELF_OBSERVE is not explicitly 'true'
|
||||
if (!process.env.ENABLE_SELF_OBSERVE || process.env.ENABLE_SELF_OBSERVE === 'false') {
|
||||
return next();
|
||||
}
|
||||
|
||||
if (SKIP_SELF.has(req.path)) return next();
|
||||
|
||||
const ua = req.headers['user-agent'] || '';
|
||||
if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip
|
||||
// Only log when UA is present AND matches known bot patterns
|
||||
if (!ua || !BOT_UA_RE.test(ua)) return next();
|
||||
|
||||
const ip = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
|
||||
|| req.socket.remoteAddress || '?';
|
||||
|
||||
// Per-IP dedup: skip if this IP was already logged in the last 5 minutes
|
||||
const lastSeen = selfSeen.get(ip);
|
||||
if (lastSeen && Date.now() - lastSeen < 300_000) return next();
|
||||
selfSeen.set(ip, Date.now());
|
||||
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const fam = parseUA(ua);
|
||||
|
||||
@@ -282,7 +330,6 @@ function selfObserve(req, res, next) {
|
||||
const r = stmtIns.run(
|
||||
now, 'self', ip, fam, 'observed', 'Direct API visitor', fam, req.path, '', '', ua.slice(0, 300)
|
||||
);
|
||||
_cache = null;
|
||||
setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
|
||||
} catch {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user