// 2026-04-09 18:32:02 +02:00
'use strict' ;
const express = require ( 'express' ) ;
const Database = require ( 'better-sqlite3' ) ;
const path = require ( 'path' ) ;
const http = require ( 'http' ) ;
const { timingSafeEqual } = require ( 'crypto' ) ;
// Express application instance that every route below is registered on.
const app = express();
// Listening port: the PORT env var when it parses to a non-zero number, else 3001.
const PORT = Number(process.env.PORT) || 3001;
// SQLite handle; the DB_PATH env var overrides the default database file.
const DB = new Database(process.env.DB_PATH || '/data/bots.db');
// ── Database ──────────────────────────────────────────────────────────────────
// Connection tuning: write-ahead journal, relaxed fsync, and a negative
// cache_size (SQLite reads negative values as a size in KiB, not a page count).
DB.pragma('journal_mode = WAL');
DB.pragma('synchronous = NORMAL');
DB.pragma('cache_size = -8000');

// Schema bootstrap. Every statement is idempotent (IF NOT EXISTS), so this is
// safe to run on each start. `bots` holds one row per reported event; `sites`
// holds one row per reporting site with a running block counter.
DB.exec(`
  CREATE TABLE IF NOT EXISTS bots (
    id          INTEGER PRIMARY KEY AUTOINCREMENT,
    received_at INTEGER NOT NULL DEFAULT (unixepoch()),
    site_id     TEXT NOT NULL DEFAULT '',
    ip_masked   TEXT NOT NULL DEFAULT '',
    bot_type    TEXT NOT NULL DEFAULT '',
    action      TEXT NOT NULL DEFAULT 'blocked',
    reason      TEXT NOT NULL DEFAULT '',
    ua_family   TEXT NOT NULL DEFAULT '',
    request_uri TEXT NOT NULL DEFAULT '',
    country     TEXT NOT NULL DEFAULT '',
    asn         TEXT NOT NULL DEFAULT ''
  );
  CREATE TABLE IF NOT EXISTS sites (
    site_id     TEXT PRIMARY KEY,
    first_seen  INTEGER NOT NULL DEFAULT (unixepoch()),
    last_seen   INTEGER NOT NULL DEFAULT (unixepoch()),
    block_count INTEGER NOT NULL DEFAULT 0
  );
  CREATE INDEX IF NOT EXISTS idx_recv     ON bots (received_at DESC);
  CREATE INDEX IF NOT EXISTS idx_ip       ON bots (ip_masked);
  CREATE INDEX IF NOT EXISTS idx_site     ON bots (site_id);
  CREATE INDEX IF NOT EXISTS idx_bot_type ON bots (bot_type);
  CREATE INDEX IF NOT EXISTS idx_action   ON bots (action);
`);

// Migrations – add columns that an older database file may lack. A
// "duplicate column name" failure just means the column is already there and
// is ignored; any other failure is logged rather than silently discarded.
// (`user_agent` is not part of the CREATE TABLE above, so this loop is what
// adds it on a fresh database.)
for (const col of ['country', 'asn', 'request_uri', 'user_agent']) {
  try {
    DB.exec(`ALTER TABLE bots ADD COLUMN ${col} TEXT NOT NULL DEFAULT ''`);
  } catch (err) {
    const message = String(err?.message ?? err);
    if (!/duplicate column name/i.test(message)) {
      console.error('[migrate]', col, message);
    }
  }
}
// ── Stats cache (declared early so pruneOldRows can reference it) ─────────────
let _cache = null;
let _cacheTs = 0;

// ── Row pruning (90 days) ─────────────────────────────────────────────────────
const PRUNE_AGE = 90 * 86400; // 90 days in seconds
// Prepared once and reused by every pruning run.
const stmtPrune = DB.prepare('DELETE FROM bots WHERE received_at < ?');

/**
 * Deletes every `bots` row whose `received_at` is more than PRUNE_AGE seconds
 * in the past, then drops the cached stats snapshot so the next getStats()
 * call recomputes it.
 */
function pruneOldRows() {
  const cutoff = Math.floor(Date.now() / 1000) - PRUNE_AGE;
  stmtPrune.run(cutoff);
  _cache = null;
}

pruneOldRows(); // on startup
setInterval(pruneOldRows, 6 * 3600 * 1000); // every 6 hours
// ── Auth ──────────────────────────────────────────────────────────────────────
// Shared secret expected in the Authorization header. Empty or unset means
// the check is disabled and every request passes.
const API_TOKEN = (process.env.API_TOKEN || '').trim();

/**
 * Express middleware enforcing a static bearer token.
 *
 * When API_TOKEN is empty the request is passed through untouched. Otherwise
 * the `Authorization: Bearer <token>` value must equal API_TOKEN, or the
 * request is answered with 403 `{ error: 'Forbidden' }`.
 *
 * @param {object} req  Express request (reads `headers.authorization`).
 * @param {object} res  Express response (used only for the 403 reply).
 * @param {Function} next  Called with no arguments when the request is allowed.
 */
function requireToken(req, res, next) {
  if (!API_TOKEN) return next();
  // Authentication scheme names are case-insensitive, so "bearer" is accepted
  // as well as "Bearer".
  const token = String(req.headers['authorization'] || '').replace(/^Bearer\s+/i, '');
  // timingSafeEqual requires equal-length inputs: copy both values into
  // fixed 128-byte zero-padded buffers before comparing.
  const a = Buffer.alloc(128);
  Buffer.from(token, 'utf8').copy(a, 0, 0, 128);
  const b = Buffer.alloc(128);
  Buffer.from(API_TOKEN, 'utf8').copy(b, 0, 0, 128);
  // The exact string comparison only runs once the padded buffers already
  // match; it rejects values that differ beyond the first 128 bytes.
  if (!timingSafeEqual(a, b) || token !== API_TOKEN) {
    return res.status(403).json({ error: 'Forbidden' });
  }
  next();
}
// ── UA families ───────────────────────────────────────────────────────────────
// Ordered [pattern, label] pairs; parseUA returns the label of the FIRST
// pattern that matches, so order is significant:
//   1. HTTP libraries / tools,
//   2. named crawlers,
//   3. the generic bot/crawler/spider catch-all,
//   4. ordinary browsers.
// The catch-all must come after the named crawlers: names such as
// "Googlebot" or "Baiduspider" contain "bot"/"spider" and would otherwise
// never receive their specific label.
const UA_MAP = [
  [/curl\//i, 'curl'],
  [/python-requests|python\//i, 'Python'],
  [/go-http-client/i, 'Go'],
  [/wget\//i, 'Wget'],
  [/java\//i, 'Java'],
  [/scrapy/i, 'Scrapy'],
  [/axios/i, 'Axios'],
  [/headlesschrome|phantomjs/i, 'Headless Browser'],
  [/GPTBot|ChatGPT/i, 'OpenAI Bot'],
  [/Googlebot/i, 'Googlebot'],
  [/bingbot/i, 'Bingbot'],
  [/YandexBot/i, 'YandexBot'],
  [/Baiduspider/i, 'Baiduspider'],
  [/DuckDuckBot/i, 'DuckDuckBot'],
  [/AhrefsBot/i, 'AhrefsBot'],
  [/SemrushBot/i, 'SemrushBot'],
  [/(bot|crawler|spider|slurp)/i, 'Bot/Crawler'],
  [/chrome/i, 'Chrome'],
  [/firefox/i, 'Firefox'],
  [/safari/i, 'Safari'],
];

/**
 * Maps a raw User-Agent header to a coarse family label.
 *
 * @param {string} [ua='']  Raw User-Agent value; null/undefined count as empty.
 * @returns {string} The label of the first matching UA_MAP entry, 'Other' for
 *   a non-empty string that matches nothing, or 'No UA' for an empty one.
 */
function parseUA(ua = '') {
  // Coerce once so a null or non-string value cannot throw on `.length`.
  const text = String(ua ?? '');
  for (const [re, label] of UA_MAP) {
    if (re.test(text)) return label;
  }
  return text.length ? 'Other' : 'No UA';
}
// ── IP geo-enrichment ─────────────────────────────────────────────────────────
// Writes the looked-up country / ASN back onto a single bots row.
const stmtEnrich = DB.prepare('UPDATE bots SET country=?, asn=? WHERE id=?');
// ip -> epoch-ms before which that IP must not be looked up again.
const enrichCache = new Map();

// Every 5 minutes, drop enrichCache entries whose expiry has passed.
setInterval(function sweepEnrichCache() {
  const sweptAt = Date.now();
  enrichCache.forEach((expiresAt, key) => {
    if (expiresAt < sweptAt) enrichCache.delete(key);
  });
}, 300_000);
/**
 * Reports whether an address is loopback, private, or link-local and so must
 * not be sent to the external geo-lookup service.
 *
 * Matches, by prefix: 10/8, 172.16/12, 192.168/16, 127/8, 169.254/16, the
 * IPv6 loopback `::1`, anything starting with `fc` or `fd` (IPv6 unique-local),
 * and `fe80:` (IPv6 link-local). An IPv4-mapped prefix (`::ffff:`) is stripped
 * first, and matching is case-insensitive, so `::ffff:10.0.0.5` and `FD00::1`
 * are recognised too.
 *
 * @param {string} ip  Textual IP address.
 * @returns {boolean} true when the address is in one of the ranges above.
 */
function isPrivateIP(ip) {
  const addr = String(ip ?? '')
    .trim()
    .toLowerCase()
    .replace(/^::ffff:/, '');
  return /^(10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.|127\.|169\.254\.|::1$|fc|fd|fe80:)/.test(addr);
}
/**
 * Best-effort, fire-and-forget geo lookup for one stored row.
 *
 * Skips empty, '?', and private addresses, and any IP already looked up
 * within the last hour (tracked in enrichCache). On a `status: "success"`
 * reply from ip-api.com the row's `country` (first 2 chars of countryCode)
 * and `asn` (first 50 chars of `as`) are updated via stmtEnrich. On a
 * transport error or timeout the cache entry is removed so a later call can
 * retry; a non-success or unparsable reply leaves the cache entry in place.
 *
 * @param {number} rowId  `bots.id` of the row to update.
 * @param {string} ip     Address to look up.
 * @returns {void}
 */
function enrichIP(rowId, ip) {
  if (!ip || ip === '?' || isPrivateIP(ip)) return;
  const now = Date.now();
  if ((enrichCache.get(ip) || 0) > now) return;
  enrichCache.set(ip, now + 3_600_000);

  const forget = () => enrichCache.delete(ip);
  const url = `http://ip-api.com/json/${encodeURIComponent(ip)}?fields=status,countryCode,as`;

  const request = http.get(url, { timeout: 5000 }, res => {
    let data = '';
    res.setEncoding('utf8');
    res.on('data', chunk => { data += chunk; });
    res.on('error', forget);
    res.on('end', () => {
      try {
        const j = JSON.parse(data);
        if (j.status === 'success') {
          stmtEnrich.run(
            String(j.countryCode || '').slice(0, 2),
            String(j.as || '').slice(0, 50),
            rowId
          );
        }
      } catch {
        // Best-effort: an unparsable body or a failed UPDATE leaves the row
        // unenriched.
      }
    });
  });

  // The `timeout` option only makes the request emit 'timeout'; it does not
  // abort anything. Destroy the request explicitly so the socket is released
  // and the 'error' handler below clears the cache entry.
  request.on('timeout', () => request.destroy(new Error('ip-api lookup timed out')));
  request.on('error', forget);
}
// Background enrichment of unenriched rows: every 20 s, take up to five rows
// that still have an empty country and a usable IP, and hand them to enrichIP.
// NOTE(review): the query has no ORDER BY and does not exclude rows enrichIP
// declines (private IPs, IPs inside their cache window). Those rows keep
// country='' and can be selected again, so five such rows could starve the
// rest — confirm whether that matters in practice.
const stmtUnenriched = DB.prepare(
  "SELECT id, ip_masked FROM bots WHERE country='' AND ip_masked != '' AND ip_masked != '?' LIMIT 5"
);
setInterval(() => {
  for (const row of stmtUnenriched.all()) {
    enrichIP(row.id, row.ip_masked);
  }
}, 20_000);
// ── Rate limiter ──────────────────────────────────────────────────────────────
// key -> { c: hits counted in the current window, r: epoch-ms when it resets }
const rl = new Map();

// Every 30 s, evict buckets whose window has already elapsed.
setInterval(function sweepRateLimiter() {
  const sweptAt = Date.now();
  rl.forEach((bucket, key) => {
    if (bucket.r < sweptAt) rl.delete(key);
  });
}, 30_000);
/**
 * Fixed-window rate-limit check that also records the hit.
 *
 * @param {string} ip   Bucket key.
 * @param {number} [max=30]      Hits permitted per window.
 * @param {number} [win=60_000]  Window length in milliseconds.
 * @returns {boolean} true while the caller is within `max` hits for the
 *   current window (this call included), false once it has gone over.
 */
function allowed(ip, max = 30, win = 60_000) {
  const ts = Date.now();
  const existing = rl.get(ip);
  const needsFresh = !existing || existing.r < ts;
  const bucket = needsFresh ? { c: 0, r: ts + win } : existing;
  if (needsFresh) {
    rl.set(ip, bucket);
  }
  bucket.c += 1;
  return bucket.c <= max;
}
// ── Stats cache (30s TTL) ─────────────────────────────────────────────────────
// (_cache and _cacheTs are declared earlier, before pruneOldRows)
/**
 * Returns the dashboard statistics snapshot, recomputed at most once every
 * 30 seconds; within that window the cached object itself is returned.
 *
 * Snapshot fields:
 *   total, total_sites              – all-time row counts (bots / sites)
 *   today, last_7d, last_30d        – rows received in the trailing 24 h / 7 d / 30 d
 *   rate_limited                    – rows with action='rate_limited', trailing 30 d
 *   top_ips (10), top_bot_types, top_actions, top_reasons, top_ua (8 each),
 *   top_user_agents (15)            – hit-count rankings, trailing 30 d
 *   recent                          – the 40 newest rows by id
 *   hourly                          – per-hour counts for the trailing 24 h
 *
 * @returns {object} The snapshot described above.
 */
function getStats() {
  if (_cache && Date.now() - _cacheTs < 30_000) return _cache;

  const now = Math.floor(Date.now() / 1000);
  const since24h = now - 86400;
  const since7d = now - 604800;
  const since30d = now - 2592000;
  // One statement serves all three "rows since T" counters.
  const countSince = DB.prepare('SELECT COUNT(*) n FROM bots WHERE received_at > ?');

  _cache = {
    total: DB.prepare('SELECT COUNT(*) n FROM bots').get().n,
    today: countSince.get(since24h).n,
    last_7d: countSince.get(since7d).n,
    last_30d: countSince.get(since30d).n,
    rate_limited: DB.prepare(
      "SELECT COUNT(*) n FROM bots WHERE action='rate_limited' AND received_at > ?"
    ).get(since30d).n,
    total_sites: DB.prepare('SELECT COUNT(*) n FROM sites').get().n,
    top_ips: DB.prepare(`
      SELECT ip_masked ip, country, asn, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY ip_masked ORDER BY hits DESC LIMIT 10
    `).all(since30d),
    top_bot_types: DB.prepare(`
      SELECT bot_type, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY bot_type ORDER BY hits DESC LIMIT 8
    `).all(since30d),
    top_actions: DB.prepare(`
      SELECT action, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY action ORDER BY hits DESC LIMIT 8
    `).all(since30d),
    top_reasons: DB.prepare(`
      SELECT reason, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY reason ORDER BY hits DESC LIMIT 8
    `).all(since30d),
    top_ua: DB.prepare(`
      SELECT ua_family, COUNT(*) hits
      FROM bots WHERE received_at > ?
      GROUP BY ua_family ORDER BY hits DESC LIMIT 8
    `).all(since30d),
    top_user_agents: DB.prepare(`
      SELECT user_agent ua, COUNT(*) hits
      FROM bots WHERE received_at > ? AND user_agent != ''
      GROUP BY user_agent ORDER BY hits DESC LIMIT 15
    `).all(since30d),
    recent: DB.prepare(`
      SELECT received_at, ip_masked ip, country, bot_type, action, reason, ua_family, site_id
      FROM bots ORDER BY id DESC LIMIT 40
    `).all(),
    // Integer division floors received_at to the start of its hour.
    hourly: DB.prepare(`
      SELECT (received_at / 3600) * 3600 h, COUNT(*) n
      FROM bots WHERE received_at > ?
      GROUP BY h ORDER BY h ASC
    `).all(since24h),
  };
  _cacheTs = Date.now();
  return _cache;
}
// ── SSE live stream ───────────────────────────────────────────────────────────
// Open event-stream responses currently subscribed to new rows.
const sseClients = new Set();
// Highest bots.id already broadcast; starts at the current maximum (0 if empty).
let lastId = DB.prepare('SELECT MAX(id) id FROM bots').get().id || 0;
// Prepared once instead of on every 2-second tick.
const stmtStream = DB.prepare(
  'SELECT id, received_at, ip_masked, country, bot_type, action, reason, ua_family, site_id FROM bots WHERE id > ? ORDER BY id ASC LIMIT 20'
);

// Every 2 s: fetch up to 20 rows newer than lastId and push them to every
// subscriber as one SSE event whose data is a JSON array.
// NOTE(review): lastId only advances while at least one client is connected,
// so rows inserted while nobody is listening are replayed (20 per tick) to the
// next subscriber — confirm whether that backlog replay is intended.
setInterval(() => {
  if (!sseClients.size) return;
  const rows = stmtStream.all(lastId);
  if (!rows.length) return;
  lastId = rows.at(-1).id;
  // SSE framing: a "data:" line terminated by a blank line. No stray
  // whitespace is allowed between the two newlines.
  const msg = `data: ${JSON.stringify(rows)}\n\n`;
  for (const client of sseClients) {
    try {
      client.write(msg);
    } catch {
      // A response that cannot be written to is dropped from the set.
      sseClients.delete(client);
    }
  }
}, 2000);
// ── Prepared statements ───────────────────────────────────────────────────────
// Inserts one event row. Positional parameters, in order: received_at,
// site_id, ip_masked, bot_type, action, reason, ua_family, request_uri,
// country, asn, user_agent.
const stmtIns = DB.prepare(`
  INSERT INTO bots (received_at, site_id, ip_masked, bot_type, action, reason, ua_family, request_uri, country, asn, user_agent)
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);

// Upserts a site row: a new site is inserted as given; an existing one keeps
// its first_seen, takes the new last_seen, and has block_count incremented by
// the supplied amount.
const stmtSite = DB.prepare(`
  INSERT INTO sites (site_id, first_seen, last_seen, block_count) VALUES (?, ?, ?, ?)
  ON CONFLICT (site_id) DO UPDATE SET
    last_seen   = excluded.last_seen,
    block_count = block_count + excluded.block_count
`);
/**
 * Stores one submitted batch inside a single transaction: one `bots` row per
 * entry plus one upsert on `sites` that adds the batch size to block_count.
 *
 * Per entry: `logged_at` (when present and parsable by `Date`) becomes the
 * row timestamp, otherwise the current time is used; text fields are coerced
 * to strings and truncated (ip 45, bot_type 100, action 20, reason 255,
 * request_uri 500, user_agent 300); an empty ip is stored as '?'.
 *
 * @param {string} siteId  Site identifier to attribute the rows to.
 * @param {object[]} bots  Entries as submitted by the client.
 * @returns {{id: number, ip: string}[]} Inserted row ids with their stored
 *   IPs, for later enrichment.
 */
const insertBatch = DB.transaction((siteId, bots) => {
  const now = Math.floor(Date.now() / 1000);
  const ids = [];
  for (const b of bots) {
    // An unparsable logged_at produces NaN; fall back to `now` rather than
    // binding NaN to the NOT NULL received_at column and failing the batch.
    const parsed = b.logged_at ? Math.floor(new Date(b.logged_at).getTime() / 1000) : now;
    const ts = Number.isFinite(parsed) ? parsed : now;
    const ip = String(b.ip || '').trim().slice(0, 45) || '?';
    const r = stmtIns.run(
      ts, siteId, ip,
      String(b.bot_type || '').slice(0, 100),
      String(b.action || 'blocked').slice(0, 20),
      String(b.reason || '').slice(0, 255),
      parseUA(b.user_agent || ''),
      String(b.request_uri || '').slice(0, 500),
      '', '', // country/asn filled async
      String(b.user_agent || '').slice(0, 300)
    );
    ids.push({ id: Number(r.lastInsertRowid), ip });
  }
  stmtSite.run(siteId, now, now, bots.length);
  return ids;
});
// ── Self-observation (log bots that visit the API directly) ───────────────────
//
// Disabled by default (ENABLE_SELF_OBSERVE must be explicitly set to 'true').
// When enabled: only logs requests where UA is present AND matches bot patterns.
// Includes per-IP dedup (one log entry per IP per 5 minutes) to prevent DB bloat.
// Skips /health (Docker probe) and /submit (WP plugin).
// Logged as site_id='self', action='observed' so they're visually distinct.

// Case-insensitive substrings that mark a User-Agent as automated.
const BOT_UA_RE = /(bot|crawl|spider|scrap|scan|slurp|fetch|wget|curl|python|go-http|java\/|scrapy|axios|headless|phantom|gptbot|chatgpt|openai|yandex|baidu|semrush|ahrefs|mj12|dotbot|petalbot)/i;
// Request paths that are never self-observed.
const SKIP_SELF = new Set(['/api/v1/health', '/api/v1/submit']);
// Dedup state for selfObserve: IP -> epoch-ms of the last entry logged for it.
const selfSeen = new Map();

// Every 5 minutes, forget IPs whose last logged entry is over 10 minutes old.
setInterval(function sweepSelfSeen() {
  const oldestKept = Date.now() - 600_000;
  selfSeen.forEach((loggedAt, key) => {
    if (loggedAt < oldestKept) selfSeen.delete(key);
  });
}, 300_000);
/**
 * Express middleware that records bot-like clients hitting this API directly.
 *
 * A row is written (site_id 'self', action 'observed', reason
 * 'Direct API visitor') only when ALL of the following hold:
 *   - ENABLE_SELF_OBSERVE is exactly 'true';
 *   - the path is not listed in SKIP_SELF;
 *   - a User-Agent is present and matches BOT_UA_RE;
 *   - no row was logged for the same IP in the last 5 minutes.
 * The request always continues via next(); a failed insert is logged and
 * otherwise ignored.
 *
 * NOTE(review): the client IP is taken from the first X-Forwarded-For entry,
 * which is client-controlled unless a trusted proxy overwrites it — confirm
 * the deployment always sits behind one.
 *
 * @param {object} req  Express request.
 * @param {object} res  Express response (unused).
 * @param {Function} next  Continuation; always called exactly once.
 */
function selfObserve(req, res, next) {
  // Opt-in only: any value other than the literal 'true' (including '1',
  // 'yes', or an unset variable) leaves self-observation off.
  if (process.env.ENABLE_SELF_OBSERVE !== 'true') return next();
  if (SKIP_SELF.has(req.path)) return next();

  const ua = req.headers['user-agent'] || '';
  // Only log when UA is present AND matches known bot patterns
  if (!ua || !BOT_UA_RE.test(ua)) return next();

  // Same 45-char bound insertBatch applies to stored IPs; it also caps the
  // size of keys kept in selfSeen.
  const ip = ((req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '?').slice(0, 45);

  // Per-IP dedup: skip if this IP was already logged in the last 5 minutes
  const lastSeen = selfSeen.get(ip);
  if (lastSeen && Date.now() - lastSeen < 300_000) return next();
  selfSeen.set(ip, Date.now());

  const now = Math.floor(Date.now() / 1000);
  const fam = parseUA(ua);
  try {
    // The UA family doubles as bot_type; the path is bounded like
    // request_uri in insertBatch.
    const r = stmtIns.run(
      now, 'self', ip, fam, 'observed', 'Direct API visitor', fam,
      String(req.path).slice(0, 500), '', '', ua.slice(0, 300)
    );
    setImmediate(() => enrichIP(Number(r.lastInsertRowid), ip));
  } catch (err) {
    // Observation is best-effort and must never fail the request itself.
    console.error('[self-observe]', err?.message ?? err);
  }
  next();
}
// ── Routes ────────────────────────────────────────────────────────────────────
// Middleware order matters: JSON bodies (capped at 128 kB) are parsed first,
// selfObserve then sees every request, and static files from ./public are
// served before the API routes registered below.
app.use(express.json({ limit: '128kb' }));
app.use(selfObserve);
app.use(express.static(path.join(__dirname, 'public')));
/**
 * POST /api/v1/submit — ingest a batch of bot events from a reporting site.
 *
 * Guarded by requireToken and by the per-client rate limiter (`allowed`
 * defaults). Body: `{ site_hash: string (>= 8 chars), bots: object[] (1–50) }`.
 * Responses: 200 `{ ok, received }`, 400 on a malformed body, 429 when rate
 * limited, 500 when the insert fails.
 */
app.post('/api/v1/submit', requireToken, (req, res) => {
  // First X-Forwarded-For hop if present, else the socket peer address.
  const clientIP = (req.headers['x-forwarded-for'] || '').split(',')[0].trim()
    || req.socket.remoteAddress || '';
  if (!allowed(clientIP)) return res.status(429).json({ error: 'Rate limit exceeded' });

  const { site_hash, bots } = req.body || {};
  if (!site_hash || typeof site_hash !== 'string' || site_hash.length < 8) {
    return res.status(400).json({ error: 'Invalid site_hash' });
  }
  if (!Array.isArray(bots) || !bots.length || bots.length > 50) {
    return res.status(400).json({ error: 'bots must be array of 1–50 items' });
  }
  // insertBatch reads properties off each entry; reject non-object entries up
  // front so a malformed payload is a 400 rather than a 500 raised from
  // inside the transaction.
  if (bots.some(b => b === null || typeof b !== 'object' || Array.isArray(b))) {
    return res.status(400).json({ error: 'bots items must be objects' });
  }

  try {
    // Only the first 20 characters of site_hash are stored as the site id.
    const ids = insertBatch(site_hash.slice(0, 20), bots);
    _cache = null; // new rows invalidate the stats snapshot
    // Geo lookups run after the response has been queued.
    setImmediate(() => ids.forEach(({ id, ip }) => enrichIP(id, ip)));
    res.json({ ok: true, received: bots.length });
  } catch (e) {
    console.error('[submit]', e.message);
    res.status(500).json({ error: 'Internal error' });
  }
});
// GET /api/v1/stats — the (cached) statistics snapshot produced by getStats().
app.get('/api/v1/stats', (_, res) => res.json(getStats()));

// GET /api/v1/stream — Server-Sent Events feed of newly inserted rows.
// The response is registered in sseClients until the client disconnects.
app.get('/api/v1/stream', (req, res) => {
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'X-Accel-Buffering': 'no',
  });
  // An SSE comment line, sent immediately so the headers are flushed.
  res.write(':\n\n');
  sseClients.add(res);
  req.on('close', () => sseClients.delete(res));
});

// GET /api/v1/health — liveness probe reporting uptime and SSE subscriber count.
app.get('/api/v1/health', (_, res) =>
  res.json({ ok: true, uptime: process.uptime(), sse_clients: sseClients.size })
);
// Bind on all interfaces and log the effective port and database path.
app.listen(PORT, '0.0.0.0', () => {
  console.log(`[bot-api] listening on :${PORT}`);
  console.log(`[bot-api] db: ${process.env.DB_PATH || '/data/bots.db'}`);
});