'use strict';

const express = require('express');
const Database = require('better-sqlite3');
const path = require('path');
const http = require('http');
const { timingSafeEqual } = require('crypto');

const app = express();
const PORT = Number(process.env.PORT) || 3001;
const DB = new Database(process.env.DB_PATH || '/data/bots.db');

// ── Database ──────────────────────────────────────────────────────────────────
DB.pragma('journal_mode = WAL');
DB.pragma('synchronous = NORMAL');
DB.pragma('cache_size = -8000');

DB.exec(`
  CREATE TABLE IF NOT EXISTS bots (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    received_at INTEGER NOT NULL DEFAULT (unixepoch()),
    site_id     TEXT NOT NULL DEFAULT '',
    ip_masked   TEXT NOT NULL DEFAULT '',
    bot_type    TEXT NOT NULL DEFAULT '',
    action      TEXT NOT NULL DEFAULT 'blocked',
    reason      TEXT NOT NULL DEFAULT '',
    ua_family   TEXT NOT NULL DEFAULT '',
    request_uri TEXT NOT NULL DEFAULT '',
    country     TEXT NOT NULL DEFAULT '',
    asn         TEXT NOT NULL DEFAULT ''
  );
  CREATE TABLE IF NOT EXISTS sites (
    site_id     TEXT PRIMARY KEY,
    first_seen  INTEGER NOT NULL DEFAULT (unixepoch()),
    last_seen   INTEGER NOT NULL DEFAULT (unixepoch()),
    block_count INTEGER NOT NULL DEFAULT 0
  );
  CREATE INDEX IF NOT EXISTS idx_recv     ON bots(received_at DESC);
  CREATE INDEX IF NOT EXISTS idx_ip       ON bots(ip_masked);
  CREATE INDEX IF NOT EXISTS idx_site     ON bots(site_id);
  CREATE INDEX IF NOT EXISTS idx_bot_type ON bots(bot_type);
  CREATE INDEX IF NOT EXISTS idx_action   ON bots(action);
`);

// Migrations – add columns that may be missing from an older `bots` table.
// A "duplicate column name" error means the column is already there and is
// ignored; any other failure is logged instead of being swallowed.
for (const col of ['country', 'asn', 'request_uri']) {
  try {
    DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`);
  } catch (err) {
    if (!/duplicate column name/i.test(String(err?.message))) {
      console.warn(`[migrate] could not add column ${col}:`, err?.message);
    }
  }
}

// ── Auth ──────────────────────────────────────────────────────────────────────
const API_TOKEN = (process.env.API_TOKEN || '').trim();

/**
 * Express middleware guarding a route with the shared API token.
 *
 * When API_TOKEN is empty the check is disabled and every request passes.
 * Otherwise the `Authorization: Bearer <token>` header must carry exactly
 * API_TOKEN; anything else is answered with 403 `{ error: 'Forbidden' }`.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Express continuation.
 */
function requireToken(req, res, next) {
  if (!API_TOKEN) return next();
  // The auth scheme name is matched case-insensitively ("bearer" == "Bearer").
  const token = (req.headers['authorization'] || '').replace(/^Bearer\s+/i, '');
  // timingSafeEqual needs equal-length buffers, so both values are copied into
  // zero-padded 128-byte buffers; the exact string comparison afterwards
  // rejects candidates that only agree on that padded/truncated form.
  const a = Buffer.alloc(128);
  Buffer.from(token, 'utf8').copy(a, 0, 0, 128);
  const b = Buffer.alloc(128);
  Buffer.from(API_TOKEN, 'utf8').copy(b, 0, 0, 128);
  if (!timingSafeEqual(a, b) || token !== API_TOKEN) {
    return res.status(403).json({ error: 'Forbidden' });
  }
  next();
}

// ── UA families ───────────────────────────────────────────────────────────────
// First match wins, so order matters: named crawlers must come BEFORE the
// generic bot/crawler/spider pattern (their names contain "bot"/"spider"),
// and Chrome before Safari (Chrome user agents also mention Safari).
const UA_MAP = [
  [/curl\//i, 'curl'],
  [/python-requests|python\//i, 'Python'],
  [/go-http-client/i, 'Go'],
  [/wget\//i, 'Wget'],
  [/java\//i, 'Java'],
  [/scrapy/i, 'Scrapy'],
  [/axios/i, 'Axios'],
  [/headlesschrome|phantomjs/i, 'Headless Browser'],
  [/GPTBot|ChatGPT/i, 'OpenAI Bot'],
  [/Googlebot/i, 'Googlebot'],
  [/bingbot/i, 'Bingbot'],
  [/YandexBot/i, 'YandexBot'],
  [/Baiduspider/i, 'Baiduspider'],
  [/DuckDuckBot/i, 'DuckDuckBot'],
  [/AhrefsBot/i, 'AhrefsBot'],
  [/SemrushBot/i, 'SemrushBot'],
  [/(bot|crawler|spider|slurp)/i, 'Bot/Crawler'],
  [/chrome/i, 'Chrome'],
  [/firefox/i, 'Firefox'],
  [/safari/i, 'Safari'],
];

/**
 * Map a raw User-Agent string to a coarse family label.
 *
 * @param {string} [ua=''] - Raw User-Agent header value.
 * @returns {string} The label of the first matching UA_MAP entry, 'Other' for
 *   a non-empty string that matches nothing, or 'No UA' for an empty one.
 */
function parseUA(ua = '') {
  // Coerce so a non-string value from a JSON payload cannot throw on .length.
  const text = String(ua ?? '');
  for (const [re, label] of UA_MAP) {
    if (re.test(text)) return label;
  }
  return text.length ? 'Other' : 'No UA';
}

// ── IP geo-enrichment ─────────────────────────────────────────────────────────
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
const ENRICH_TTL_MS = 3_600_000;
// ip -> { until, done, country, asn }. `done` is false while a lookup is in
// flight or after it returned no usable result.
const enrichCache = new Map();

/**
 * Tell whether an address should be excluded from external lookups.
 *
 * @param {string} ip - Address text.
 * @returns {boolean} True for 10.x, 192.168.x, 172.16–31.x, 127.x, `::1`, or
 *   anything starting with `fc`/`fd`.
 */
function isPrivateIP(ip) {
  return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|::1$|fc|fd)/.test(ip);
}

/**
 * Fill in country/ASN for one `bots` row by querying ip-api.com.
 *
 * Fire-and-forget: returns immediately and updates the row when the lookup
 * completes. Each IP is looked up at most once per ENRICH_TTL_MS; a
 * successful result is cached and applied directly to later rows that share
 * the IP. Empty, '?' and private addresses are skipped.
 *
 * @param {number} rowId - `bots.id` of the row to update.
 * @param {string} ip - Address stored on that row.
 */
function enrichIP(rowId, ip) {
  if (!ip || ip === '?' || isPrivateIP(ip)) return;

  const now = Date.now();
  const hit = enrichCache.get(ip);
  if (hit && hit.until > now) {
    if (hit.done) {
      try {
        stmtEnrich.run(hit.country, hit.asn, rowId);
      } catch (err) {
        console.error('[enrich]', err.message);
      }
    }
    return; // lookup in flight or recently attempted – do not hit the API again
  }

  const entry = { until: now + ENRICH_TTL_MS, done: false, country: '', asn: '' };
  enrichCache.set(ip, entry);

  const request = http.get(
    `http://ip-api.com/json/${encodeURIComponent(ip)}?fields=status,countryCode,as`,
    { timeout: 5000 },
    res => {
      let data = '';
      res.setEncoding('utf8'); // decode across chunk boundaries
      res.on('data', chunk => { data += chunk; });
      res.on('end', () => {
        try {
          const j = JSON.parse(data);
          if (j.status === 'success') {
            entry.country = String(j.countryCode || '').slice(0, 2);
            entry.asn = String(j.as || '').slice(0, 50);
            entry.done = true;
            stmtEnrich.run(entry.country, entry.asn, rowId);
          }
        } catch {
          // Best effort: an unparsable body or a failed UPDATE leaves the row
          // unenriched; the cache entry suppresses retries until it expires.
        }
      });
    }
  );
  // The `timeout` option only emits 'timeout'; the request has to be torn
  // down explicitly, which then surfaces through the 'error' handler below.
  request.on('timeout', () => request.destroy(new Error('ip lookup timed out')));
  request.on('error', () => {
    // Forget the failed attempt so the IP can be retried on the next call.
    if (!entry.done && enrichCache.get(ip) === entry) enrichCache.delete(ip);
  });
}

// Background enrichment of unenriched rows. The id cursor walks through the
// table five rows at a time and wraps around, so rows that can never be
// enriched (private IPs, failed lookups) do not block the rows after them.
const stmtUnenriched = DB.prepare(
  "SELECT id, ip_masked FROM bots WHERE country='' AND ip_masked != '' AND ip_masked != '?' AND id > ? ORDER BY id ASC LIMIT 5"
);
let backfillCursor = 0;
setInterval(() => {
  const now = Date.now();
  // Drop expired cache entries so the map does not grow without bound.
  for (const [ip, entry] of enrichCache) {
    if (entry.until <= now) enrichCache.delete(ip);
  }
  const rows = stmtUnenriched.all(backfillCursor);
  backfillCursor = rows.length ? rows.at(-1).id : 0;
  for (const row of rows) enrichIP(row.id, row.ip_masked);
}, 20_000);

// ── Rate limiter ──────────────────────────────────────────────────────────────
// key -> { c: requests counted in the current window, r: window end (ms) }
const rl = new Map();
setInterval(() => {
  const n = Date.now();
  for (const [k, v] of rl) {
    if (n > v.r) rl.delete(k);
  }
}, 30_000);

/**
 * Fixed-window rate limiter.
 *
 * @param {string} ip - Key to count requests under.
 * @param {number} [max=30] - Requests permitted per window.
 * @param {number} [win=60000] - Window length in milliseconds.
 * @returns {boolean} True while the key is within its allowance; every call
 *   counts as one request.
 */
function allowed(ip, max = 30, win = 60_000) {
  const n = Date.now();
  let e = rl.get(ip);
  if (!e || n > e.r) {
    e = { c: 0, r: n + win };
    rl.set(ip, e);
  }
  return ++e.c <= max;
}

// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
const DAY_S = 86400;
const WEEK_S = 604800;
const MONTH_S = 2592000;

// Statements are prepared once here rather than on every cache refresh.
// `col` only ever comes from the fixed list below, never from request data.
const prepareTop = col => DB.prepare(
  `SELECT ${col}, COUNT(*) hits FROM bots WHERE received_at > ? GROUP BY ${col} ORDER BY hits DESC LIMIT 8`
);
const statsQ = {
  total: DB.prepare('SELECT COUNT(*) n FROM bots'),
  since: DB.prepare('SELECT COUNT(*) n FROM bots WHERE received_at > ?'),
  rateLimited: DB.prepare("SELECT COUNT(*) n FROM bots WHERE action='rate_limited' AND received_at > ?"),
  sites: DB.prepare('SELECT COUNT(*) n FROM sites'),
  topIps: DB.prepare(`
    SELECT ip_masked ip, country, asn, COUNT(*) hits
    FROM bots WHERE received_at > ?
    GROUP BY ip_masked ORDER BY hits DESC LIMIT 10
  `),
  topBotTypes: prepareTop('bot_type'),
  topActions: prepareTop('action'),
  topReasons: prepareTop('reason'),
  topUa: prepareTop('ua_family'),
  recent: DB.prepare(`
    SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
    FROM bots ORDER BY id DESC LIMIT 40
  `),
  hourly: DB.prepare(`
    SELECT (received_at / 3600) * 3600 h, COUNT(*) n
    FROM bots WHERE received_at > ?
    GROUP BY h ORDER BY h ASC
  `),
};

// Writers reset `_cache` to null to force a refresh on the next read.
let _cache = null;
let _cacheTs = 0;

/**
 * Build (or return the cached) dashboard statistics.
 *
 * The result is reused for 30 seconds unless `_cache` has been reset to null.
 * `today` covers the trailing 24 hours; the `top_*` lists and `rate_limited`
 * cover the trailing 30 days.
 *
 * @returns {object} Counters, top-N lists, the 40 most recent rows and hourly
 *   buckets for the last 24 hours.
 */
function getStats() {
  if (_cache && Date.now() - _cacheTs < 30_000) return _cache;

  const now = Math.floor(Date.now() / 1000);
  const monthAgo = now - MONTH_S;
  _cache = {
    total: statsQ.total.get().n,
    today: statsQ.since.get(now - DAY_S).n,
    last_7d: statsQ.since.get(now - WEEK_S).n,
    last_30d: statsQ.since.get(monthAgo).n,
    rate_limited: statsQ.rateLimited.get(monthAgo).n,
    total_sites: statsQ.sites.get().n,
    top_ips: statsQ.topIps.all(monthAgo),
    top_bot_types: statsQ.topBotTypes.all(monthAgo),
    top_actions: statsQ.topActions.all(monthAgo),
    top_reasons: statsQ.topReasons.all(monthAgo),
    top_ua: statsQ.topUa.all(monthAgo),
    recent: statsQ.recent.all(),
    hourly: statsQ.hourly.all(now - DAY_S),
  };
  _cacheTs = Date.now();
  return _cache;
}

// ── SSE live stream ───────────────────────────────────────────────────────────
const sseClients = new Set();
let lastId = DB.prepare('SELECT MAX(id) id FROM bots').get().id || 0;
const stmtNewRows = DB.prepare(
  'SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20'
);

// Every 2 s, push rows inserted since `lastId` to all connected clients.
// NOTE(review): `lastId` only advances while at least one client is connected,
// so the first client after an idle period receives the backlog in 20-row
// batches – confirm that this replay is intended.
setInterval(() => {
  if (!sseClients.size) return;
  const rows = stmtNewRows.all(lastId);
  if (!rows.length) return;
  lastId = rows.at(-1).id;
  const msg = `data: ${JSON.stringify(rows)}\n\n`;
  for (const client of sseClients) {
    try {
      client.write(msg);
    } catch {
      sseClients.delete(client); // a client that cannot be written to is dropped
    }
  }
}, 2000);

// ── Prepared statements ───────────────────────────────────────────────────────
const stmtIns = DB.prepare(`
  INSERT INTO bots
    (received_at, site_id, ip_masked, bot_type, action, reason, ua_family, request_uri, country, asn)
  VALUES (?,?,?,?,?,?,?,?,?,?)
`);

const stmtSite = DB.prepare(`
  INSERT INTO sites (site_id, first_seen, last_seen, block_count)
  VALUES (?,?,?,?)
  ON CONFLICT(site_id) DO UPDATE SET
    last_seen   = excluded.last_seen,
    block_count = block_count + excluded.block_count
`);

/**
 * Insert one submitted batch inside a single transaction and bump the
 * per-site counters.
 *
 * Entries that are not plain objects are skipped, and a missing or
 * unparseable `logged_at` falls back to the current time, so one malformed
 * entry no longer fails the whole batch.
 *
 * @param {string} siteId - Site identifier stored on every row.
 * @param {Array<object>} bots - Entries; reads `logged_at`, `ip`, `bot_type`,
 *   `action`, `reason`, `user_agent` and `request_uri`.
 * @returns {Array<{id: number, ip: string}>} Inserted row ids with their IPs,
 *   for asynchronous enrichment.
 */
const insertBatch = DB.transaction((siteId, bots) => {
  const now = Math.floor(Date.now() / 1000);
  const ids = [];
  for (const b of bots) {
    if (b === null || typeof b !== 'object' || Array.isArray(b)) continue;

    const parsed = b.logged_at ? Math.floor(new Date(b.logged_at).getTime() / 1000) : now;
    const ts = Number.isFinite(parsed) ? parsed : now;
    const ip = String(b.ip || '').trim().slice(0, 45) || '?';
    const r = stmtIns.run(
      ts,
      siteId,
      ip,
      String(b.bot_type || '').slice(0, 100),
      String(b.action || 'blocked').slice(0, 20),
      String(b.reason || '').slice(0, 255),
      parseUA(b.user_agent || ''),
      String(b.request_uri || '').slice(0, 500),
      '', '' // country/asn filled async
    );
    ids.push({ id: Number(r.lastInsertRowid), ip });
  }
  stmtSite.run(siteId, now, now, ids.length);
  return ids;
});

// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Matches any request whose UA looks like a bot/scanner/tool, or has no UA.
// Skips /health (Docker probe) and /submit (WP plugin).
// Logged as site_id='self', action='observed' so they're visually distinct.
// User agents that look like a bot, scanner or HTTP tool.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
// Paths that are never self-observed.
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);

/**
 * Resolve the caller's address: first entry of X-Forwarded-For, else the
 * socket address, else `fallback`. Capped at 45 characters (the same limit
 * applied to submitted IPs) so a crafted header cannot store or key an
 * arbitrarily long string.
 *
 * @param {object} req - Express request.
 * @param {string} fallback - Value used when no address is available.
 * @returns {string} Address text, at most 45 characters.
 */
function clientAddress(req, fallback) {
  const forwarded = String(req.headers['x-forwarded-for'] || '').split(',')[0].trim();
  return String(forwarded || req.socket.remoteAddress || fallback).slice(0, 45);
}

/**
 * Express middleware that records direct visits by bot-like clients.
 *
 * A request is logged when its path is not in SKIP_SELF and its User-Agent is
 * empty or matches BOT_UA_RE. The row is stored with site_id 'self' and
 * action 'observed'; the stats cache is invalidated and enrichment is
 * scheduled. Logging is best effort: a failure is reported on stderr and the
 * request continues either way.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response (unused).
 * @param {Function} next - Express continuation; always called.
 */
function selfObserve(req, res, next) {
  if (SKIP_SELF.has(req.path)) return next();

  const ua = req.headers['user-agent'] || '';
  if (ua && !BOT_UA_RE.test(ua)) return next(); // normal browser — skip

  const ip = clientAddress(req, '?');
  const now = Math.floor(Date.now() / 1000);
  const fam = parseUA(ua);
  try {
    const r = stmtIns.run(
      now, 'self', ip, fam, 'observed', 'Direct API visitor', fam,
      String(req.path).slice(0, 500), // same cap as submitted request_uri values
      '', ''
    );
    _cache = null;
    setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
  } catch (err) {
    console.error('[self-observe]', err.message);
  }
  next();
}

// ── Routes ────────────────────────────────────────────────────────────────────
app.use(express.json({ limit: '128kb' }));
app.use(selfObserve);
app.use(express.static(path.join(__dirname, 'public')));

// Accepts `{ site_hash, bots: [...] }`, token-protected and rate-limited per
// client address.
app.post('/api/v1/submit', requireToken, (req, res) => {
  const clientIP = clientAddress(req, '');
  if (!allowed(clientIP)) {
    return res.status(429).json({ error: 'Rate limit exceeded' });
  }

  const { site_hash, bots } = req.body || {};
  if (!site_hash || typeof site_hash !== 'string' || site_hash.length < 8) {
    return res.status(400).json({ error: 'Invalid site_hash' });
  }
  if (!Array.isArray(bots) || !bots.length || bots.length > 50) {
    return res.status(400).json({ error: 'bots must be array of 1–50 items' });
  }

  try {
    const ids = insertBatch(site_hash.slice(0, 20), bots);
    _cache = null; // force the next stats read to recompute
    setImmediate(() => ids.forEach(({ id, ip }) => enrichIP(id, ip)));
    res.json({ ok: true, received: bots.length });
  } catch (e) {
    console.error('[submit]', e.message);
    res.status(500).json({ error: 'Internal error' });
  }
});

app.get('/api/v1/stats', (_, res) => res.json(getStats()));
// Response headers sent when a client opens the event stream.
const SSE_HEADERS = {
  'Content-Type': 'text/event-stream',
  'Cache-Control': 'no-cache',
  'Connection': 'keep-alive',
  'X-Accel-Buffering': 'no',
};

/**
 * Open a server-sent-events connection: send the headers plus an initial
 * comment frame, register the response in `sseClients`, and unregister it
 * when the request closes.
 */
function openStream(req, res) {
  res.writeHead(200, SSE_HEADERS);
  res.write(':\n\n');
  sseClients.add(res);
  req.on('close', () => {
    sseClients.delete(res);
  });
}

/** Report liveness, process uptime and the number of connected SSE clients. */
function reportHealth(_req, res) {
  const payload = {
    ok: true,
    uptime: process.uptime(),
    sse_clients: sseClients.size,
  };
  return res.json(payload);
}

/** Log the listening port and the database path once the server is up. */
function announceStartup() {
  const dbPath = process.env.DB_PATH || '/data/bots.db';
  console.log(`[bot-api] listening on :${PORT}`);
  console.log(`[bot-api] db: ${dbPath}`);
}

app.get('/api/v1/stream', openStream);
app.get('/api/v1/health', reportHealth);

app.listen(PORT, '0.0.0.0', announceStartup);