Files
bot-api/server.js
Malin 5a823501f2 fix: disable self-observe by default, fix crash causes
- ENABLE_SELF_OBSERVE env var (default false) gates self-observe
- Fix UA condition: empty UA no longer triggers logging
- Add per-IP dedup (5min) to prevent 1M+ row storms
- Remove _cache=null from selfObserve (was busting cache on every hit)
- Add 90-day row pruning on startup + every 6h
- Add enrichCache TTL cleanup every 5min to prevent unbounded memory growth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 10:00:06 +02:00

393 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
'use strict';
const express = require('express');
const Database = require('better-sqlite3');
const path = require('path');
const http = require('http');
const { timingSafeEqual } = require('crypto');
const app = express();
const PORT = Number(process.env.PORT) || 3001;
const DB = new Database(process.env.DB_PATH || '/data/bots.db');
// ── Database ──────────────────────────────────────────────────────────────────
// Connection tuning: write-ahead logging, synchronous=NORMAL, and a negative
// cache_size (SQLite interprets negative values as KiB, i.e. roughly 8 MB here).
DB.pragma('journal_mode = WAL');
DB.pragma('synchronous = NORMAL');
DB.pragma('cache_size = -8000');
// Schema bootstrap; every statement uses IF NOT EXISTS so it is safe to re-run.
//   bots  – one row per reported or observed hit
//   sites – one row per site_id with first/last seen timestamps and a counter
// Note: the user_agent column used by the INSERT statement further down is not
// part of this CREATE TABLE; it is added by the ALTER TABLE loop that follows.
DB.exec(`
CREATE TABLE IF NOT EXISTS bots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
received_at INTEGER NOT NULL DEFAULT (unixepoch()),
site_id TEXT NOT NULL DEFAULT '',
ip_masked TEXT NOT NULL DEFAULT '',
bot_type TEXT NOT NULL DEFAULT '',
action TEXT NOT NULL DEFAULT 'blocked',
reason TEXT NOT NULL DEFAULT '',
ua_family TEXT NOT NULL DEFAULT '',
request_uri TEXT NOT NULL DEFAULT '',
country TEXT NOT NULL DEFAULT '',
asn TEXT NOT NULL DEFAULT ''
);
CREATE TABLE IF NOT EXISTS sites (
site_id TEXT PRIMARY KEY,
first_seen INTEGER NOT NULL DEFAULT (unixepoch()),
last_seen INTEGER NOT NULL DEFAULT (unixepoch()),
block_count INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_recv ON bots(received_at DESC);
CREATE INDEX IF NOT EXISTS idx_ip ON bots(ip_masked);
CREATE INDEX IF NOT EXISTS idx_site ON bots(site_id);
CREATE INDEX IF NOT EXISTS idx_bot_type ON bots(bot_type);
CREATE INDEX IF NOT EXISTS idx_action ON bots(action);
`);
// Migrations: add columns that databases created by older versions may lack.
// SQLite reports "duplicate column name" when the column is already there; that
// case is expected and ignored. Any other failure is logged (not thrown) so the
// cause is visible instead of surfacing later as an unrelated prepare() error.
for (const col of ['country', 'asn', 'request_uri', 'user_agent']) {
  try {
    DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`);
  } catch (err) {
    const message = String((err && err.message) || err);
    if (!/duplicate column name/i.test(message)) {
      console.error(`[migrate] could not add column ${col}: ${message}`);
    }
  }
}
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────
let _cache = null;
let _cacheTs = 0;
// ── Row pruning (90 days) ─────────────────────────────────────────────────────
const PRUNE_AGE = 90 * 86400; // 90 days in seconds
/**
 * Delete bots rows whose received_at is older than PRUNE_AGE seconds.
 *
 * The stats cache is invalidated only when rows were actually removed. Pruning
 * is best-effort maintenance: a failing DELETE is logged rather than thrown,
 * because an exception escaping a timer callback (or the startup call) would
 * terminate the whole server.
 */
function pruneOldRows() {
  const cutoff = Math.floor(Date.now() / 1000) - PRUNE_AGE;
  try {
    const { changes } = DB.prepare('DELETE FROM bots WHERE received_at < ?').run(cutoff);
    if (changes > 0) _cache = null;
  } catch (err) {
    console.error('[prune]', err.message);
  }
}
pruneOldRows(); // on startup
setInterval(pruneOldRows, 6 * 3600 * 1000); // every 6 hours
// ── Auth ──────────────────────────────────────────────────────────────────────
const API_TOKEN = (process.env.API_TOKEN || '').trim();
if (!API_TOKEN) {
  // Make the fail-open configuration visible in the logs.
  console.warn('[auth] API_TOKEN is not set; token-protected routes accept unauthenticated requests');
}
/**
 * Express middleware enforcing the shared bearer token.
 *
 * When API_TOKEN is empty the check is skipped entirely. Otherwise the
 * Authorization header must carry the exact token, optionally prefixed with
 * the "Bearer" scheme (matched case-insensitively, as auth scheme names are
 * case-insensitive). Mismatches get a 403 JSON response.
 *
 * @param {object} req  - request; only req.headers.authorization is read
 * @param {object} res  - response; used only to send the 403
 * @param {Function} next - called when the request is allowed through
 */
function requireToken(req, res, next) {
  if (!API_TOKEN) return next();
  const token = (req.headers['authorization'] || '').replace(/^Bearer\s+/i, '');
  // Compare fixed-size, zero-padded copies so the timing-safe comparison never
  // sees buffers of different length.
  const a = Buffer.alloc(128); Buffer.from(token, 'utf8').copy(a, 0, 0, 128);
  const b = Buffer.alloc(128); Buffer.from(API_TOKEN, 'utf8').copy(b, 0, 0, 128);
  // The exact string check runs only after the padded buffers matched; it
  // rejects tokens that differ solely beyond byte 128 or in zero padding.
  if (!timingSafeEqual(a, b) || token !== API_TOKEN) {
    return res.status(403).json({ error: 'Forbidden' });
  }
  next();
}
// ── UA families ───────────────────────────────────────────────────────────────
// Ordered [pattern, label] pairs; the FIRST matching pattern wins, so specific
// patterns must come before generic ones:
//   - named crawlers (Googlebot, bingbot, …) before the generic bot/crawler rule,
//     otherwise "bot"/"spider" would swallow all of them;
//   - headless browsers before Chrome; Chrome before Safari (Chrome UAs also
//     contain "Safari").
const UA_MAP = [
  [/curl\//i, 'curl'],
  [/python-requests|python\//i, 'Python'],
  [/go-http-client/i, 'Go'],
  [/wget\//i, 'Wget'],
  [/java\//i, 'Java'],
  [/scrapy/i, 'Scrapy'],
  [/axios/i, 'Axios'],
  [/headlesschrome|phantomjs/i, 'Headless Browser'],
  [/GPTBot|ChatGPT/i, 'OpenAI Bot'],
  [/Googlebot/i, 'Googlebot'],
  [/bingbot/i, 'Bingbot'],
  [/YandexBot/i, 'YandexBot'],
  [/Baiduspider/i, 'Baiduspider'],
  [/DuckDuckBot/i, 'DuckDuckBot'],
  [/AhrefsBot/i, 'AhrefsBot'],
  [/SemrushBot/i, 'SemrushBot'],
  [/(bot|crawler|spider|slurp)/i, 'Bot/Crawler'],
  [/chrome/i, 'Chrome'],
  [/firefox/i, 'Firefox'],
  [/safari/i, 'Safari'],
];
/**
 * Classify a User-Agent header into a coarse family label.
 *
 * @param {string} [ua=''] - raw User-Agent value; non-strings are stringified,
 *   null/undefined are treated as empty
 * @returns {string} the label of the first matching UA_MAP entry, 'Other' for
 *   a non-empty UA that matches nothing, or 'No UA' for an empty one
 */
function parseUA(ua = '') {
  const text = typeof ua === 'string' ? ua : String(ua ?? '');
  for (const [re, label] of UA_MAP) {
    if (re.test(text)) return label;
  }
  return text.length ? 'Other' : 'No UA';
}
// ── IP geo-enrichment ─────────────────────────────────────────────────────────
// Writes a looked-up country / ASN pair back onto one bots row (by id).
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
// ip -> epoch-ms timestamp until which no new lookup is started for that ip.
const enrichCache = new Map();
// Every 5 minutes, drop enrichCache entries whose expiry has passed so the map
// cannot grow without bound.
setInterval(() => {
  const sweepTime = Date.now();
  enrichCache.forEach((expiresAt, addr) => {
    if (sweepTime > expiresAt) enrichCache.delete(addr);
  });
}, 5 * 60 * 1000);
/**
 * Tell whether an address is non-public and therefore pointless to send to an
 * external geo-IP service.
 *
 * Recognised: RFC 1918 ranges (10/8, 172.16/12, 192.168/16), loopback
 * (127/8, ::1), IPv4 link-local (169.254/16), IPv6 unique-local (fc00::/7)
 * and IPv6 link-local (fe80::/10). IPv4-mapped IPv6 addresses such as
 * "::ffff:10.0.0.1" are unwrapped first, and matching is case-insensitive.
 *
 * @param {string} ip - textual IPv4 or IPv6 address
 * @returns {boolean} true when the address falls in one of the ranges above
 */
function isPrivateIP(ip) {
  const addr = String(ip)
    .trim()
    .toLowerCase()
    .replace(/^::ffff:(?=\d+\.\d+\.\d+\.\d+$)/, '');
  return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|169\.254\.|::1$|fc|fd|fe[89ab])/.test(addr);
}
/**
 * Look up country and ASN for an IP via ip-api.com and store them on one row.
 *
 * Fire-and-forget: returns immediately; the row is updated when (and only if)
 * the service answers with status "success". Lookups are skipped for empty,
 * '?' and private addresses, and for any IP already attempted within the last
 * hour (tracked in enrichCache). A transport failure or timeout removes the IP
 * from enrichCache so a later call may retry.
 *
 * @param {number} rowId - id of the bots row to update
 * @param {string} ip    - address to look up
 */
function enrichIP(rowId, ip) {
  if (!ip || ip === '?' || isPrivateIP(ip)) return;
  const now = Date.now();
  if ((enrichCache.get(ip) || 0) > now) return;
  enrichCache.set(ip, now + 3_600_000);
  const forget = () => enrichCache.delete(ip);
  const req = http.get(
    `http://ip-api.com/json/${encodeURIComponent(ip)}?fields=status,countryCode,as`,
    { timeout: 5000 },
    res => {
      let data = '';
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8');
      res.on('data', d => { data += d; });
      res.on('error', forget);
      res.on('end', () => {
        try {
          const j = JSON.parse(data);
          if (j.status === 'success') {
            stmtEnrich.run(
              (j.countryCode || '').slice(0, 2),
              (j.as || '').slice(0, 50),
              rowId
            );
          }
        } catch {
          // Best-effort: an unparseable body or failed UPDATE leaves the row unenriched.
        }
      });
    }
  );
  // The `timeout` option only emits 'timeout'; the request has to be destroyed
  // explicitly, which in turn triggers the 'error' handler below.
  req.on('timeout', () => req.destroy(new Error('ip-api lookup timed out')));
  req.on('error', forget);
}
// Background enrichment: every 20 seconds, pick up to five rows that still have
// an empty country and a usable ip_masked, and hand each to enrichIP.
// NOTE(review): the query has no ORDER BY and enrichIP never marks a row as
// "attempted", so rows whose lookup cannot succeed (private or failing IPs)
// presumably keep being re-selected and may starve later rows — confirm.
const stmtUnenriched = DB.prepare(
  "SELECT id, ip_masked FROM bots WHERE country='' AND ip_masked != '' AND ip_masked != '?' LIMIT 5"
);
setInterval(() => {
  stmtUnenriched.all().forEach(({ id, ip_masked }) => {
    enrichIP(id, ip_masked);
  });
}, 20_000);
// ── Rate limiter ──────────────────────────────────────────────────────────────
// key -> { c: requests counted in the current window, r: epoch-ms at which the
// window resets }. Entries are created and updated by allowed().
const rl = new Map();
// Every 30 seconds, discard windows whose reset time has already passed.
setInterval(() => {
  const sweepTime = Date.now();
  rl.forEach((entry, key) => {
    if (sweepTime > entry.r) rl.delete(key);
  });
}, 30_000);
/**
 * Fixed-window rate check: count one request for `ip` and report whether the
 * caller is still within its budget.
 *
 * @param {string} ip   - bucket key
 * @param {number} [max=30]     - requests permitted per window
 * @param {number} [win=60_000] - window length in milliseconds
 * @returns {boolean} true while the count (including this call) is <= max
 */
function allowed(ip, max = 30, win = 60_000) {
  const nowMs = Date.now();
  let bucket = rl.get(ip);
  const needsFreshWindow = !bucket || nowMs > bucket.r;
  if (needsFreshWindow) {
    bucket = { c: 0, r: nowMs + win };
    rl.set(ip, bucket);
  }
  bucket.c += 1;
  return bucket.c <= max;
}
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
/**
 * Return the dashboard statistics snapshot, recomputing it when the cached
 * copy is missing or older than 30 seconds.
 *
 * Windows: `today` and `hourly` cover the last 24 hours, `last_7d` the last
 * 7 days, and every `top_*` list plus `last_30d` / `rate_limited` the last
 * 30 days. `total`, `total_sites` and `recent` are not time-limited.
 *
 * `top_ips` aggregates country/asn with MAX(): rows that have not been
 * enriched yet hold '' for both, and a bare column in a GROUP BY query would
 * take its value from an arbitrary row of the group, often the empty one.
 *
 * @returns {object} snapshot with keys total, today, last_7d, last_30d,
 *   rate_limited, total_sites, top_ips, top_bot_types, top_actions,
 *   top_reasons, top_ua, top_user_agents, recent, hourly
 */
function getStats() {
  if (_cache && Date.now() - _cacheTs < 30_000) return _cache;
  const now = Math.floor(Date.now() / 1000);
  const since24h = now - 86400;
  const since7d = now - 604800;
  const since30d = now - 2592000;
  const countSince = DB.prepare('SELECT COUNT(*) n FROM bots WHERE received_at > ?');
  // Top-8 histogram over one column for the 30-day window. `column` is always
  // one of the hard-coded identifiers below, never request data.
  const topBy = column => DB.prepare(`
    SELECT ${column}, COUNT(*) hits
    FROM bots WHERE received_at > ?
    GROUP BY ${column} ORDER BY hits DESC LIMIT 8
  `).all(since30d);
  _cache = {
    total: DB.prepare('SELECT COUNT(*) n FROM bots').get().n,
    today: countSince.get(since24h).n,
    last_7d: countSince.get(since7d).n,
    last_30d: countSince.get(since30d).n,
    rate_limited: DB.prepare(
      "SELECT COUNT(*) n FROM bots WHERE action='rate_limited' AND received_at > ?"
    ).get(since30d).n,
    total_sites: DB.prepare('SELECT COUNT(*) n FROM sites').get().n,
    top_ips: DB.prepare(`
      SELECT ip_masked ip, MAX(country) country, MAX(asn) asn, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY ip_masked ORDER BY hits DESC LIMIT 10
    `).all(since30d),
    top_bot_types: topBy('bot_type'),
    top_actions: topBy('action'),
    top_reasons: topBy('reason'),
    top_ua: topBy('ua_family'),
    top_user_agents: DB.prepare(`
      SELECT user_agent ua, COUNT(*) hits
      FROM bots WHERE received_at > ? AND user_agent != ''
      GROUP BY user_agent ORDER BY hits DESC LIMIT 15
    `).all(since30d),
    recent: DB.prepare(`
      SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
      FROM bots ORDER BY id DESC LIMIT 40
    `).all(),
    hourly: DB.prepare(`
      SELECT (received_at / 3600) * 3600 h, COUNT(*) n
      FROM bots WHERE received_at > ?
      GROUP BY h ORDER BY h ASC
    `).all(since24h),
  };
  _cacheTs = Date.now();
  return _cache;
}
// ── SSE live stream ───────────────────────────────────────────────────────────
// Open SSE responses; routes add and remove entries.
const sseClients = new Set();
// Highest bots.id already pushed to clients (starts at the current maximum).
let lastId = DB.prepare('SELECT MAX(id) id FROM bots').get().id || 0;
// Every 2 seconds, while at least one client is connected, send up to 20 rows
// newer than lastId as a single SSE "data:" frame.
// NOTE(review): lastId does not advance while nobody is connected, so the
// first client after an idle period presumably receives the backlog in
// 20-row steps — confirm this is intended.
setInterval(() => {
  if (sseClients.size === 0) return;
  const fresh = DB.prepare('SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20').all(lastId);
  if (fresh.length === 0) return;
  lastId = fresh[fresh.length - 1].id;
  const frame = `data: ${JSON.stringify(fresh)}\n\n`;
  sseClients.forEach(client => {
    try {
      client.write(frame);
    } catch {
      sseClients.delete(client);
    }
  });
}, 2000);
// ── Prepared statements ───────────────────────────────────────────────────────
// stmtIns: insert one bots row. The eleven placeholders are bound positionally
// in the order of the column list below.
const stmtIns = DB.prepare(`
INSERT INTO bots (received_at, site_id, ip_masked, bot_type, action, reason, ua_family, request_uri, country, asn, user_agent)
VALUES (?,?,?,?,?,?,?,?,?,?,?)
`);
// stmtSite: upsert a sites row. A new site_id is inserted as given; for an
// existing one, last_seen is overwritten and block_count is incremented by the
// supplied count (first_seen is left untouched).
const stmtSite = DB.prepare(`
INSERT INTO sites (site_id, first_seen, last_seen, block_count) VALUES (?,?,?,?)
ON CONFLICT(site_id) DO UPDATE SET
last_seen = excluded.last_seen,
block_count = block_count + excluded.block_count
`);
/**
 * Insert a batch of reported bot hits for one site inside a single transaction
 * and bump that site's counters.
 *
 * Each entry's fields are stringified and truncated to fixed maximum lengths.
 * `logged_at` is parsed with `new Date(...)`; when it is missing or cannot be
 * parsed, the current time is used instead, so one malformed timestamp no
 * longer makes the whole batch fail. country/asn are stored empty and filled
 * in asynchronously by the caller.
 *
 * @param {string} siteId - site identifier stored on every row
 * @param {object[]} bots - entries with optional logged_at, ip, bot_type,
 *   action, reason, user_agent, request_uri
 * @returns {{id: number, ip: string}[]} inserted row ids with their stored IP
 */
const insertBatch = DB.transaction((siteId, bots) => {
  const now = Math.floor(Date.now() / 1000);
  const ids = [];
  for (const b of bots) {
    const parsed = b.logged_at ? Math.floor(new Date(b.logged_at).getTime() / 1000) : now;
    // An invalid date yields NaN, which is not a storable timestamp.
    const ts = Number.isFinite(parsed) ? parsed : now;
    const ip = String(b.ip || '').trim().slice(0, 45) || '?';
    // Classify the same string that gets stored, so both columns agree.
    const ua = String(b.user_agent || '');
    const r = stmtIns.run(
      ts, siteId, ip,
      String(b.bot_type || '').slice(0, 100),
      String(b.action || 'blocked').slice(0, 20),
      String(b.reason || '').slice(0, 255),
      parseUA(ua),
      String(b.request_uri || '').slice(0, 500),
      '', '', // country/asn filled async
      ua.slice(0, 300)
    );
    ids.push({ id: Number(r.lastInsertRowid), ip });
  }
  stmtSite.run(siteId, now, now, bots.length);
  return ids;
});
// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Opt-in via the ENABLE_SELF_OBSERVE environment variable (selfObserve holds
// the exact check). When active, a request is logged only if it carries a
// User-Agent matching BOT_UA_RE, at most once per IP per 5 minutes, and never
// for the paths in SKIP_SELF (/health is the Docker probe, /submit the WP
// plugin). Rows are written with site_id='self' and action='observed' so they
// stand out from submitted reports.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);
// Dedup state for selfObserve: IP -> epoch-ms of the most recent logged hit.
const selfSeen = new Map();
// Every 5 minutes, forget IPs whose last logged hit is more than 10 minutes old.
setInterval(() => {
  const oldestKept = Date.now() - 10 * 60 * 1000;
  selfSeen.forEach((loggedAt, addr) => {
    if (loggedAt < oldestKept) selfSeen.delete(addr);
  });
}, 5 * 60 * 1000);
/**
 * Express middleware that records bot-like visitors hitting this API directly.
 *
 * Active only when ENABLE_SELF_OBSERVE is explicitly 'true' (or '1'), compared
 * case-insensitively; any other value, including '0', 'no' or unset, disables
 * it. A request is logged when its path is not in SKIP_SELF, its User-Agent is
 * present and matches BOT_UA_RE, and the same IP was not logged within the
 * last 5 minutes. The middleware never blocks: next() is always called.
 *
 * @param {object} req  - request (path, headers, socket.remoteAddress are read)
 * @param {object} res  - response (unused)
 * @param {Function} next - continuation, invoked exactly once
 */
function selfObserve(req, res, next) {
  const flag = String(process.env.ENABLE_SELF_OBSERVE || '').trim().toLowerCase();
  if (flag !== 'true' && flag !== '1') return next();
  if (SKIP_SELF.has(req.path)) return next();
  const ua = req.headers['user-agent'] || '';
  // Only log when UA is present AND matches known bot patterns
  if (!ua || !BOT_UA_RE.test(ua)) return next();
  // The forwarded header is client-controlled, so cap its length (same limit
  // as submitted IPs) before using it as a map key and a column value.
  const ip = ((req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '?').slice(0, 45);
  // Per-IP dedup: skip if this IP was already logged in the last 5 minutes
  const lastSeen = selfSeen.get(ip);
  if (lastSeen && Date.now() - lastSeen < 300_000) return next();
  selfSeen.set(ip, Date.now());
  const now = Math.floor(Date.now() / 1000);
  const fam = parseUA(ua);
  try {
    const r = stmtIns.run(
      now, 'self', ip, fam, 'observed', 'Direct API visitor', fam,
      String(req.path).slice(0, 500), '', '', ua.slice(0, 300)
    );
    setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
  } catch (err) {
    // Observation is best-effort; report the failure but keep serving.
    console.error('[self-observe]', err.message);
  }
  next();
}
// ── Routes ────────────────────────────────────────────────────────────────────
// Registration order is significant: JSON body parsing (bodies capped at
// 128kb) comes first, then selfObserve, then the static file handler for
// ./public. selfObserve is registered before static so it also sees requests
// that end up being answered with a static file.
app.use(express.json({ limit: '128kb' }));
app.use(selfObserve);
app.use(express.static(path.join(__dirname, 'public')));
/**
 * POST /api/v1/submit — ingest a batch of bot reports from a site.
 *
 * Body: { site_hash: string (>= 8 chars, first 20 stored), bots: object[] (1-50) }.
 * Responses: 200 { ok, received } · 400 on invalid payload · 429 when the
 * caller exceeds the rate limit · 500 when the insert fails.
 *
 * NOTE(review): the rate-limit key is taken from X-Forwarded-For, which a
 * client can set itself unless a trusted proxy overwrites it — confirm the
 * deployment always sits behind such a proxy.
 */
app.post('/api/v1/submit', requireToken, (req, res) => {
  const clientIP = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '';
  if (!allowed(clientIP)) return res.status(429).json({ error: 'Rate limit exceeded' });
  const { site_hash, bots } = req.body || {};
  if (!site_hash || typeof site_hash !== 'string' || site_hash.length < 8) {
    return res.status(400).json({ error: 'Invalid site_hash' });
  }
  if (!Array.isArray(bots) || !bots.length || bots.length > 50) {
    return res.status(400).json({ error: 'bots must be array of 1-50 items' });
  }
  // A null or primitive entry is a client error; reject it up front instead of
  // letting the insert throw and answering 500.
  if (bots.some(b => b === null || typeof b !== 'object')) {
    return res.status(400).json({ error: 'bots entries must be objects' });
  }
  try {
    const ids = insertBatch(site_hash.slice(0, 20), bots);
    _cache = null; // new rows: force the next stats request to recompute
    setImmediate(() => ids.forEach(({ id, ip }) => enrichIP(id, ip)));
    res.json({ ok: true, received: bots.length });
  } catch (e) {
    console.error('[submit]', e.message);
    res.status(500).json({ error: 'Internal error' });
  }
});
// GET /api/v1/stats — respond with the (possibly cached) statistics snapshot.
app.get('/api/v1/stats', (_req, res) => {
  res.json(getStats());
});
// GET /api/v1/stream — open a Server-Sent Events connection. The response is
// registered in sseClients until the request closes.
app.get('/api/v1/stream', (req, res) => {
  const sseHeaders = {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'X-Accel-Buffering': 'no',
  };
  res.writeHead(200, sseHeaders);
  // Initial SSE comment line so the client receives bytes right away.
  res.write(':\n\n');
  sseClients.add(res);
  req.on('close', () => {
    sseClients.delete(res);
  });
});
// GET /api/v1/health — liveness probe reporting process uptime and the number
// of currently connected SSE clients.
app.get('/api/v1/health', (_req, res) => {
  const payload = { ok: true, uptime: process.uptime(), sse_clients: sseClients.size };
  res.json(payload);
});
// Start the HTTP server on all interfaces and log the port and database path.
app.listen(PORT, '0.0.0.0', () => {
  const dbPath = process.env.DB_PATH || '/data/bots.db';
  console.log(`[bot-api] listening on :${PORT}`);
  console.log(`[bot-api] db: ${dbPath}`);
});