- ENABLE_SELF_OBSERVE env var (default false) gates self-observe
- Fix UA condition: empty UA no longer triggers logging
- Add per-IP dedup (5min) to prevent 1M+ row storms
- Remove _cache=null from selfObserve (was busting cache on every hit)
- Add 90-day row pruning on startup + every 6h
- Add enrichCache TTL cleanup every 5min to prevent unbounded memory growth

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
393 lines
16 KiB
JavaScript
393 lines
16 KiB
JavaScript
'use strict';
|
||
|
||
const express = require('express');
|
||
const Database = require('better-sqlite3');
|
||
const path = require('path');
|
||
const http = require('http');
|
||
const { timingSafeEqual } = require('crypto');
|
||
|
||
// Listen port: PORT from the environment, falling back to 3001 when it is
// unset, empty, zero or not numeric.
const PORT = Number(process.env.PORT) || 3001;

// SQLite handle, opened at DB_PATH (default: /data/bots.db).
const DB = new Database(process.env.DB_PATH || '/data/bots.db');

// Express application; middleware and routes are attached further down.
const app = express();
|
||
|
||
// ── Database ──────────────────────────────────────────────────────────────────
|
||
|
||
// Connection tuning applied once at startup.
DB.pragma('journal_mode = WAL');
DB.pragma('synchronous = NORMAL');
DB.pragma('cache_size = -8000');

// Schema: `bots` holds one row per reported/observed request, `sites` holds
// per-site counters. Everything is IF NOT EXISTS so restarts are harmless.
DB.exec(`
  CREATE TABLE IF NOT EXISTS bots (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    received_at INTEGER NOT NULL DEFAULT (unixepoch()),
    site_id     TEXT    NOT NULL DEFAULT '',
    ip_masked   TEXT    NOT NULL DEFAULT '',
    bot_type    TEXT    NOT NULL DEFAULT '',
    action      TEXT    NOT NULL DEFAULT 'blocked',
    reason      TEXT    NOT NULL DEFAULT '',
    ua_family   TEXT    NOT NULL DEFAULT '',
    request_uri TEXT    NOT NULL DEFAULT '',
    country     TEXT    NOT NULL DEFAULT '',
    asn         TEXT    NOT NULL DEFAULT ''
  );
  CREATE TABLE IF NOT EXISTS sites (
    site_id     TEXT PRIMARY KEY,
    first_seen  INTEGER NOT NULL DEFAULT (unixepoch()),
    last_seen   INTEGER NOT NULL DEFAULT (unixepoch()),
    block_count INTEGER NOT NULL DEFAULT 0
  );
  CREATE INDEX IF NOT EXISTS idx_recv     ON bots(received_at DESC);
  CREATE INDEX IF NOT EXISTS idx_ip       ON bots(ip_masked);
  CREATE INDEX IF NOT EXISTS idx_site     ON bots(site_id);
  CREATE INDEX IF NOT EXISTS idx_bot_type ON bots(bot_type);
  CREATE INDEX IF NOT EXISTS idx_action   ON bots(action);
`);

// Migrations: add columns that older database files may lack. A
// "duplicate column name" error only means the column is already there and
// is ignored; any other failure is rethrown so it surfaces at startup
// instead of later as a confusing error from a prepared statement.
for (const col of ['country', 'asn', 'request_uri', 'user_agent']) {
  try {
    DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`);
  } catch (err) {
    if (!/duplicate column name/i.test(String(err && err.message))) throw err;
  }
}
|
||
|
||
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────
|
||
|
||
// Cached stats payload and the wall-clock time (ms) it was built.
let _cache = null;
let _cacheTs = 0;

// ── Row pruning (90 days) ─────────────────────────────────────────────────────

// Maximum age of a bots row, in seconds (90 days).
const PRUNE_AGE = 90 * 86400;

// Deletes every bots row whose received_at is older than PRUNE_AGE and drops
// the cached stats so the next read is recomputed.
function pruneOldRows() {
  const nowSeconds = Math.floor(Date.now() / 1000);
  const oldestKept = nowSeconds - PRUNE_AGE;
  DB.prepare('DELETE FROM bots WHERE received_at < ?').run(oldestKept);
  _cache = null;
}

// Prune once at startup, then again every 6 hours.
pruneOldRows();
setInterval(pruneOldRows, 6 * 3600 * 1000);
|
||
|
||
// ── Auth ──────────────────────────────────────────────────────────────────────
|
||
|
||
// Shared secret for write endpoints. Empty/unset means authentication is off.
const API_TOKEN = (process.env.API_TOKEN || '').trim();

// Express middleware: lets the request through when no API_TOKEN is
// configured, otherwise requires `Authorization: Bearer <API_TOKEN>` and
// answers 403 {error:'Forbidden'} on mismatch.
//
// The "Bearer" scheme name is matched case-insensitively, since HTTP auth
// scheme names are case-insensitive. Both values are copied into fixed
// 128-byte buffers so timingSafeEqual always gets equal-length inputs; the
// trailing string comparison catches tokens that differ only past byte 128.
function requireToken(req, res, next) {
  if (!API_TOKEN) return next();

  const header = String(req.headers['authorization'] || '');
  const token = header.replace(/^Bearer\s+/i, '');

  const presented = Buffer.alloc(128);
  Buffer.from(token, 'utf8').copy(presented, 0, 0, 128);
  const expected = Buffer.alloc(128);
  Buffer.from(API_TOKEN, 'utf8').copy(expected, 0, 0, 128);

  if (!timingSafeEqual(presented, expected) || token !== API_TOKEN) {
    return res.status(403).json({ error: 'Forbidden' });
  }
  next();
}
|
||
|
||
// ── UA families ───────────────────────────────────────────────────────────────
|
||
|
||
// Ordered [pattern, label] table; the first matching pattern wins, so order
// matters. Named crawlers are listed BEFORE the generic bot/crawler/spider
// rule: their names all contain "bot" or "spider", and the generic rule
// would otherwise shadow every one of them.
const UA_MAP = [
  [/curl\//i, 'curl'],
  [/python-requests|python\//i, 'Python'],
  [/go-http-client/i, 'Go'],
  [/wget\//i, 'Wget'],
  [/java\//i, 'Java'],
  [/scrapy/i, 'Scrapy'],
  [/axios/i, 'Axios'],
  [/headlesschrome|phantomjs/i, 'Headless Browser'],
  [/GPTBot|ChatGPT/i, 'OpenAI Bot'],
  [/Googlebot/i, 'Googlebot'],
  [/bingbot/i, 'Bingbot'],
  [/YandexBot/i, 'YandexBot'],
  [/Baiduspider/i, 'Baiduspider'],
  [/DuckDuckBot/i, 'DuckDuckBot'],
  [/AhrefsBot/i, 'AhrefsBot'],
  [/SemrushBot/i, 'SemrushBot'],
  // Generic catch-all for anything else that calls itself a bot.
  [/(bot|crawler|spider|slurp)/i, 'Bot/Crawler'],
  [/chrome/i, 'Chrome'],
  [/firefox/i, 'Firefox'],
  [/safari/i, 'Safari'],
];

// Maps a raw User-Agent header to a coarse family label.
//   ua – User-Agent text; null/undefined count as empty, other non-string
//        values are stringified.
// Returns the label of the first matching UA_MAP entry, otherwise 'Other'
// for a non-empty string and 'No UA' for an empty one.
function parseUA(ua = '') {
  const text = typeof ua === 'string' ? ua : String(ua ?? '');
  const match = UA_MAP.find(([pattern]) => pattern.test(text));
  if (match) return match[1];
  return text.length ? 'Other' : 'No UA';
}
|
||
|
||
// ── IP geo-enrichment ─────────────────────────────────────────────────────────
|
||
|
||
// Writes the looked-up country/ASN onto a single bots row.
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');

// ip -> expiry timestamp (ms). While an entry is live, enrichIP will not
// issue another lookup for that address.
const enrichCache = new Map();

// Every 5 minutes, evict entries whose expiry time has passed so the map
// cannot grow without bound.
setInterval(() => {
  const nowMs = Date.now();
  enrichCache.forEach((expiresAt, addr) => {
    if (nowMs > expiresAt) enrichCache.delete(addr);
  });
}, 5 * 60 * 1000);
|
||
|
||
// Returns true when `ip` is an address that must not be sent to the external
// geo-lookup service because it is not publicly routable:
//   IPv4: 0/8, 10/8, 127/8, 169.254/16, 172.16/12, 192.168/16, 100.64/10
//   IPv6: ::, ::1, fc00::/7 (unique local), fe80::/10 (link-local)
// IPv4-mapped IPv6 addresses ("::ffff:10.0.0.1", the form Node reports for
// IPv4 peers on a dual-stack socket) are unwrapped first, and matching is
// case-insensitive. Anything else, including empty or non-IP text, is false.
function isPrivateIP(ip) {
  const addr = String(ip || '').trim().toLowerCase().replace(/^::ffff:/, '');

  if (addr.includes(':')) {
    return addr === '::1'
      || addr === '::'
      || /^f[cd][0-9a-f]{2}:/.test(addr)
      || /^fe[89ab][0-9a-f]:/.test(addr);
  }

  return /^(0\.|10\.|127\.|169\.254\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\.)/.test(addr);
}
|
||
|
||
// Best-effort geo lookup for one bots row.
//   rowId – id of the bots row to update
//   ip    – address stored on that row
// Does nothing for empty / '?' / private addresses, or when the same address
// was looked up within the last hour (enrichCache). Otherwise it queries
// ip-api.com and, on a 'success' answer, stores the country code (2 chars)
// and AS string (50 chars) on the row. Failures never throw; a transport
// error or timeout releases the cache entry so the address can be retried.
function enrichIP(rowId, ip) {
  if (!ip || ip === '?' || isPrivateIP(ip)) return;

  const now = Date.now();
  if ((enrichCache.get(ip) || 0) > now) return;
  enrichCache.set(ip, now + 3_600_000);

  const forget = () => enrichCache.delete(ip);

  const request = http.get(
    `http://ip-api.com/json/${encodeURIComponent(ip)}?fields=status,countryCode,as`,
    { timeout: 5000 },
    res => {
      let body = '';
      // Decode as a stream so multi-byte characters split across chunks
      // are not corrupted.
      res.setEncoding('utf8');
      res.on('data', chunk => { body += chunk; });
      res.on('error', forget);
      res.on('end', () => {
        try {
          const j = JSON.parse(body);
          if (j.status === 'success') {
            stmtEnrich.run(
              String(j.countryCode || '').slice(0, 2),
              String(j.as || '').slice(0, 50),
              rowId
            );
          }
        } catch {
          // Deliberately ignored: enrichment is optional, and a malformed
          // answer or failed UPDATE must not affect request handling.
        }
      });
    }
  );

  // The `timeout` option only emits 'timeout'; the request has to be torn
  // down explicitly, which then reports through 'error' below.
  request.on('timeout', () => request.destroy(new Error('ip-api.com lookup timed out')));
  request.on('error', forget);
}
|
||
|
||
// Background enrichment of unenriched rows
|
||
// Background enrichment of rows that still have no country.
//
// Rows are visited in id order, five at a time, starting after a moving
// cursor. Without the cursor the query keeps returning the same first five
// rows; if those can never be enriched (private address, failed lookup,
// address still in enrichCache) every later row would be starved forever.
// When a pass reaches the end of the table the cursor wraps to 0, so rows
// that failed earlier get another chance.
const stmtUnenriched = DB.prepare(
  "SELECT id, ip_masked FROM bots WHERE id > ? AND country='' AND ip_masked != '' AND ip_masked != '?' ORDER BY id ASC LIMIT 5"
);
let enrichCursor = 0;

setInterval(() => {
  const pending = stmtUnenriched.all(enrichCursor);
  enrichCursor = pending.length ? pending[pending.length - 1].id : 0;
  for (const row of pending) enrichIP(row.id, row.ip_masked);
}, 20_000);
|
||
|
||
// ── Rate limiter ──────────────────────────────────────────────────────────────
|
||
|
||
// Rate-limit buckets: key -> { c: hits in the current window, r: reset time (ms) }.
const rl = new Map();

// Every 30 seconds, drop buckets whose window has already ended.
setInterval(() => {
  const nowMs = Date.now();
  rl.forEach((bucket, key) => {
    if (nowMs > bucket.r) rl.delete(key);
  });
}, 30_000);
|
||
|
||
// Fixed-window rate limiter.
//   ip  – bucket key (the caller passes the client address)
//   max – hits allowed per window (default 30)
//   win – window length in ms (default 60 000)
// Counts this call and returns true while the bucket is still within `max`.
// A missing or expired bucket is replaced by a fresh one starting now.
function allowed(ip, max = 30, win = 60_000) {
  const nowMs = Date.now();
  const existing = rl.get(ip);
  const stillOpen = Boolean(existing) && nowMs <= existing.r;

  let bucket = existing;
  if (!stillOpen) {
    bucket = { c: 0, r: nowMs + win };
    rl.set(ip, bucket);
  }

  bucket.c += 1;
  return bucket.c <= max;
}
|
||
|
||
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
|
||
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
|
||
|
||
// Builds (or returns the cached copy of) the dashboard statistics object.
// The cached value is reused for 30 seconds unless something sets _cache to
// null. Time windows are relative to "now": 24 h, 7 d and 30 d in seconds.
function getStats() {
  const cacheIsFresh = _cache && Date.now() - _cacheTs < 30_000;
  if (cacheIsFresh) return _cache;

  const now = Math.floor(Date.now() / 1000);
  const since24h = now - 86400;
  const since7d = now - 604800;
  const since30d = now - 2592000;

  // Small helpers so each entry below reads as "query + parameters".
  const countOf = (sql, ...params) => DB.prepare(sql).get(...params).n;
  const rowsOf = (sql, ...params) => DB.prepare(sql).all(...params);

  _cache = {
    total: countOf('SELECT COUNT(*) n FROM bots'),
    today: countOf('SELECT COUNT(*) n FROM bots WHERE received_at > ?', since24h),
    last_7d: countOf('SELECT COUNT(*) n FROM bots WHERE received_at > ?', since7d),
    last_30d: countOf('SELECT COUNT(*) n FROM bots WHERE received_at > ?', since30d),
    rate_limited: countOf("SELECT COUNT(*) n FROM bots WHERE action='rate_limited' AND received_at > ?", since30d),
    total_sites: countOf('SELECT COUNT(*) n FROM sites'),
    top_ips: rowsOf(`
      SELECT ip_masked ip, country, asn, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY ip_masked ORDER BY hits DESC LIMIT 10
    `, since30d),
    top_bot_types: rowsOf(`
      SELECT bot_type, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY bot_type ORDER BY hits DESC LIMIT 8
    `, since30d),
    top_actions: rowsOf(`
      SELECT action, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY action ORDER BY hits DESC LIMIT 8
    `, since30d),
    top_reasons: rowsOf(`
      SELECT reason, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY reason ORDER BY hits DESC LIMIT 8
    `, since30d),
    top_ua: rowsOf(`
      SELECT ua_family, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY ua_family ORDER BY hits DESC LIMIT 8
    `, since30d),
    top_user_agents: rowsOf(`
      SELECT user_agent ua, COUNT(*) hits
      FROM bots WHERE received_at > ? AND user_agent != ''
      GROUP BY user_agent ORDER BY hits DESC LIMIT 15
    `, since30d),
    recent: rowsOf(`
      SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
      FROM bots ORDER BY id DESC LIMIT 40
    `),
    hourly: rowsOf(`
      SELECT (received_at / 3600) * 3600 h, COUNT(*) n
      FROM bots WHERE received_at > ?
      GROUP BY h ORDER BY h ASC
    `, since24h),
  };

  _cacheTs = Date.now();
  return _cache;
}
|
||
|
||
// ── SSE live stream ───────────────────────────────────────────────────────────
|
||
|
||
// Open SSE responses.
const sseClients = new Set();

// Prepared once; the poller below runs every 2 seconds.
const stmtMaxBotId = DB.prepare('SELECT MAX(id) id FROM bots');
const stmtBotsAfter = DB.prepare('SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20');

// Highest bots.id already broadcast (or skipped while nobody was listening).
let lastId = stmtMaxBotId.get().id || 0;

// Every 2 seconds, push up to 20 new rows to all connected clients as one
// SSE `data:` frame containing a JSON array.
setInterval(() => {
  if (!sseClients.size) {
    // Keep the cursor moving while idle. Otherwise the first client to
    // connect after a quiet period would be fed the whole stale backlog,
    // 20 rows per tick, before seeing anything live.
    lastId = Math.max(lastId, stmtMaxBotId.get().id || 0);
    return;
  }

  const rows = stmtBotsAfter.all(lastId);
  if (!rows.length) return;
  lastId = rows.at(-1).id;

  const msg = `data: ${JSON.stringify(rows)}\n\n`;
  for (const client of sseClients) {
    try {
      client.write(msg);
    } catch {
      // A client whose write throws is dropped.
      sseClients.delete(client);
    }
  }
}, 2000);
|
||
|
||
// ── Prepared statements ───────────────────────────────────────────────────────
|
||
|
||
// Inserts one bots row (country/asn are filled in later by enrichIP).
const stmtIns = DB.prepare(`
  INSERT INTO bots (received_at, site_id, ip_masked, bot_type, action, reason, ua_family, request_uri, country, asn, user_agent)
  VALUES (?,?,?,?,?,?,?,?,?,?,?)
`);

// Creates the site row on first contact, otherwise bumps last_seen and adds
// to block_count.
const stmtSite = DB.prepare(`
  INSERT INTO sites (site_id, first_seen, last_seen, block_count) VALUES (?,?,?,?)
  ON CONFLICT(site_id) DO UPDATE SET
    last_seen   = excluded.last_seen,
    block_count = block_count + excluded.block_count
`);

// Stores a batch of reports for one site in a single transaction.
//   siteId – site identifier to record on every row
//   bots   – array of report objects; the fields read are logged_at, ip,
//            bot_type, action, reason, user_agent and request_uri
// Returns [{ id, ip }] for the inserted rows so the caller can schedule
// enrichment. Text fields are stringified and length-capped before storing.
const insertBatch = DB.transaction((siteId, bots) => {
  const now = Math.floor(Date.now() / 1000);
  const ids = [];

  for (const b of bots) {
    // An unparsable logged_at produces NaN, which the NOT NULL received_at
    // column rejects and which would roll back the entire batch; fall back
    // to the receive time instead.
    const parsed = b.logged_at ? Math.floor(new Date(b.logged_at) / 1000) : now;
    const ts = Number.isFinite(parsed) ? parsed : now;

    const ip = String(b.ip || '').trim().slice(0, 45) || '?';
    // Coerce once so the family label and the stored text agree even when
    // the client sent a non-string value.
    const ua = String(b.user_agent || '');

    const r = stmtIns.run(
      ts, siteId, ip,
      String(b.bot_type || '').slice(0, 100),
      String(b.action || 'blocked').slice(0, 20),
      String(b.reason || '').slice(0, 255),
      parseUA(ua),
      String(b.request_uri || '').slice(0, 500),
      '', '', // country/asn filled async
      ua.slice(0, 300)
    );
    ids.push({ id: Number(r.lastInsertRowid), ip });
  }

  stmtSite.run(siteId, now, now, bots.length);
  return ids;
});
|
||
|
||
// ── Self-observation (log bots that visit the API directly) ───────────────────
|
||
//
|
||
// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true').
|
||
// When enabled: only logs requests where UA is present AND matches bot patterns.
|
||
// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat.
|
||
// Skips /health (Docker probe) and /submit (WP plugin).
|
||
// Logged as site_id='self', action='observed' so they're visually distinct.
|
||
|
||
// User-Agent fragments that mark a request as automated for self-observation.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;

// Paths that are never self-observed.
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);

// Dedup state for selfObserve: IP -> time (ms) it was last logged.
const selfSeen = new Map();

// Every 5 minutes, forget IPs whose last log is more than 10 minutes old.
setInterval(() => {
  const oldestKept = Date.now() - 10 * 60 * 1000;
  selfSeen.forEach((loggedAt, addr) => {
    if (loggedAt < oldestKept) selfSeen.delete(addr);
  });
}, 5 * 60 * 1000);
|
||
|
||
// Express middleware that records bot-looking requests made to this API
// itself. It always calls next(); logging is a side effect only.
//
// Active only when ENABLE_SELF_OBSERVE is exactly "true" (case-insensitive,
// surrounding whitespace ignored). Values such as "0", "no" or "off" leave
// it disabled. When active, a request is logged if its path is not in
// SKIP_SELF, its User-Agent is present and matches BOT_UA_RE, and the same
// client IP has not been logged during the previous 5 minutes. The row is
// stored with site_id 'self' and action 'observed'.
function selfObserve(req, res, next) {
  const flag = String(process.env.ENABLE_SELF_OBSERVE || '').trim().toLowerCase();
  if (flag !== 'true') return next();

  if (SKIP_SELF.has(req.path)) return next();

  const ua = req.headers['user-agent'] || '';
  if (!ua || !BOT_UA_RE.test(ua)) return next();

  // First X-Forwarded-For hop if present, else the socket peer. Capped at
  // 45 characters, the same limit insertBatch applies to stored addresses.
  const forwarded = (req.headers['x-forwarded-for'] || '').split(',')[0].trim();
  const ip = (forwarded || req.socket.remoteAddress || '?').slice(0, 45);

  const nowMs = Date.now();
  const lastSeen = selfSeen.get(ip);
  if (lastSeen && nowMs - lastSeen < 300_000) return next();
  selfSeen.set(ip, nowMs);

  const fam = parseUA(ua);
  try {
    const r = stmtIns.run(
      Math.floor(nowMs / 1000), 'self', ip, fam, 'observed', 'Direct API visitor',
      fam, String(req.path).slice(0, 500), '', '', ua.slice(0, 300)
    );
    setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
  } catch (err) {
    // Observation is best-effort: report the failure but keep serving.
    console.error('[self-observe]', err.message);
  }

  next();
}
|
||
|
||
// ── Routes ────────────────────────────────────────────────────────────────────
|
||
|
||
// Global middleware, in order: JSON bodies (128 kB cap), self-observation,
// then static files from ./public.
app.use(express.json({ limit: '128kb' }));
app.use(selfObserve);
app.use(express.static(path.join(__dirname, 'public')));

// POST /api/v1/submit – ingest a batch of bot reports from one site.
// Body: { site_hash: string (>= 8 chars), bots: array of 1–50 objects }.
// Replies 429 when the client exceeds the rate limit, 400 for a malformed
// body, 500 if storing fails, otherwise { ok: true, received: <count> }.
app.post('/api/v1/submit', requireToken, (req, res) => {
  const clientIP = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '';

  if (!allowed(clientIP)) return res.status(429).json({ error: 'Rate limit exceeded' });

  const { site_hash, bots } = req.body || {};

  if (!site_hash || typeof site_hash !== 'string' || site_hash.length < 8) {
    return res.status(400).json({ error: 'Invalid site_hash' });
  }
  if (!Array.isArray(bots) || !bots.length || bots.length > 50) {
    return res.status(400).json({ error: 'bots must be array of 1–50 items' });
  }
  // Reject non-object items up front: null would crash the insert with a
  // TypeError (reported as a 500), and strings/numbers would be stored as
  // empty junk rows.
  const allObjects = bots.every(b => b !== null && typeof b === 'object' && !Array.isArray(b));
  if (!allObjects) {
    return res.status(400).json({ error: 'bots items must be objects' });
  }

  try {
    const ids = insertBatch(site_hash.slice(0, 20), bots);
    _cache = null; // new rows: force the next stats read to recompute
    setImmediate(() => ids.forEach(({ id, ip }) => enrichIP(id, ip)));
    res.json({ ok: true, received: bots.length });
  } catch (e) {
    console.error('[submit]', e.message);
    res.status(500).json({ error: 'Internal error' });
  }
});
|
||
|
||
// GET /api/v1/stats – the (possibly cached) statistics object as JSON.
app.get('/api/v1/stats', (_req, res) => {
  res.json(getStats());
});

// GET /api/v1/stream – Server-Sent Events feed. The response is registered
// in sseClients until the request closes.
app.get('/api/v1/stream', (req, res) => {
  const sseHeaders = {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'X-Accel-Buffering': 'no',
  };
  res.writeHead(200, sseHeaders);

  // Initial SSE comment line so the client receives bytes immediately.
  res.write(':\n\n');

  sseClients.add(res);
  req.on('close', () => {
    sseClients.delete(res);
  });
});

// GET /api/v1/health – liveness probe with uptime and SSE client count.
app.get('/api/v1/health', (_req, res) => {
  const status = {
    ok: true,
    uptime: process.uptime(),
    sse_clients: sseClients.size,
  };
  res.json(status);
});
|
||
|
||
// Logs the listen port and the database path once the server is accepting
// connections.
function announceStartup() {
  console.log(`[bot-api] listening on :${PORT}`);
  console.log(`[bot-api] db: ${process.env.DB_PATH || '/data/bots.db'}`);
}

// Bind on all IPv4 interfaces.
app.listen(PORT, '0.0.0.0', announceStartup);
|