Files
InformatiQ-Toolkit/includes/class-itk-bot-blocker.php

313 lines
13 KiB
PHP
Raw Normal View History

<?php
if (!defined('ABSPATH')) exit;
/**
* ITK Bot Blocker
*
* Handles detection and blocking of malicious bots, bad referrers, and bad
* networks. Good/legitimate bots are rate-limited instead of blocked.
*
* Deactivation bug fix: every check method reads options at call time so
* toggling a setting via AJAX takes effect immediately without any hook
* re-registration.
*/
class ITK_Bot_Blocker {
// Paths to the plain-text rule lists; all four are built from ITK_PATH in the constructor.
// Bad-bot user-agent fragments (config/badbots.conf).
private string $badbots_file;
// Blocked referrer fragments (config/referrers.conf).
private string $referrers_file;
// Blocked single IPs and CIDR ranges (config/networks.conf).
private string $networks_file;
// Known good bots with optional per-minute limits (config/goodbots.conf).
private string $goodbots_file;
/**
 * Resolves the four rule-list paths under ITK_PATH and registers the hooks.
 *
 * Hooks are registered unconditionally; each handler consults the
 * `itk_security` option itself when it runs.
 */
public function __construct() {
    $config_dir = ITK_PATH . 'config/';

    $this->badbots_file   = $config_dir . 'badbots.conf';
    $this->referrers_file = $config_dir . 'referrers.conf';
    $this->networks_file  = $config_dir . 'networks.conf';
    $this->goodbots_file  = $config_dir . 'goodbots.conf';

    // Priority 1 so request screening runs early on `init`.
    add_action('init', [$this, 'check_request'], 1);
    // Two accepted arguments: the robots.txt output and the "public" flag.
    add_filter('robots_txt', [$this, 'modify_robots_txt'], 10, 2);
}
/* ── Main entry point ─────────────────────────────────────── */
/**
 * Screens the current request; hooked on `init`.
 *
 * Order of evaluation: admin/whitelist bypass, good-bot rate limiting, then
 * the OpenAI, malicious-bot, bad-referrer and bad-network checks. Each check
 * only runs when its flag in the `itk_security` option is non-empty, and the
 * option is read on every call.
 */
public function check_request(): void {
    // Admin-context requests and users with manage_options are never screened.
    // NOTE(review): is_admin() reports the request context, not the login state,
    // so anonymous hits on admin endpoints appear to bypass screening too — confirm intended.
    if (is_admin()) {
        return;
    }
    if (function_exists('current_user_can') && current_user_can('manage_options')) {
        return;
    }

    // Whitelisted IPs/UAs skip every check below.
    if (ITK_Whitelist::allowed()) {
        return;
    }

    $options  = get_option('itk_security', []);
    $ua       = $_SERVER['HTTP_USER_AGENT'] ?? '';
    $referrer = $_SERVER['HTTP_REFERER'] ?? '';
    $ip       = $this->get_client_ip();
    $uri      = $_SERVER['REQUEST_URI'] ?? '';

    // 1. Known good bots are rate-limited and then exempt from the block checks.
    if (!empty($options['rate_limit_good_bots'])) {
        $good_bot = $this->identify_good_bot($ua);
        if ($good_bot !== null) {
            $this->handle_good_bot($good_bot, $ua, $ip, $uri);
            return;
        }
    }

    // 2-5. Block checks, evaluated in this order. Detectors are wrapped in
    // closures so a detector only runs when its option is switched on.
    $checks = [
        ['block_openai_bots',    fn (): bool => $this->is_openai_bot($ua),          'OpenAI bot detected',    'openai'],
        ['block_malicious_bots', fn (): bool => $this->is_malicious_bot($ua),       'Malicious bot detected', 'malicious_bot'],
        ['block_bad_referrers',  fn (): bool => $this->is_bad_referrer($referrer),  'Bad referrer detected',  'bad_referrer'],
        ['block_bad_networks',   fn (): bool => $this->is_bad_network($ip),         'IP in blocked network',  'bad_network'],
    ];

    foreach ($checks as [$option_key, $detector, $reason, $bot_type]) {
        if (empty($options[$option_key])) {
            continue;
        }
        if (!$detector()) {
            continue;
        }
        $this->block($reason, $bot_type, $ua, $referrer, $ip, $uri, $options);
    }
}
/* ── Good-bot rate limiting ───────────────────────────────── */
/**
 * Looks up the user agent in the good-bot list.
 *
 * The list is read from goodbots.conf, one entry per line in the form `Name`
 * or `Name|limit`; blank lines and lines starting with `#` are ignored. The
 * parsed list is cached in the `itk_goodbots_list` transient for 300 seconds.
 *
 * @param string $ua Raw User-Agent header value.
 * @return array|null ['name' => string, 'limit' => int] of the first entry whose
 *                    name occurs case-insensitively in $ua, or null if none does.
 *                    A limit of 0 means "never allow" (treat as blocked); entries
 *                    without a limit default to 30.
 */
private function identify_good_bot(string $ua): ?array {
    if (empty($ua)) return null;

    $cache_key = 'itk_goodbots_list';
    $list      = get_transient($cache_key);

    if (!is_array($list)) {
        $list  = [];
        $lines = file_exists($this->goodbots_file)
            ? file($this->goodbots_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
            : false;

        // file() returns false when the file cannot be read; treat that as an empty list.
        foreach (($lines === false ? [] : $lines) as $line) {
            $line = trim($line);
            if ($line === '' || $line[0] === '#') continue;

            $parts = explode('|', $line, 2);
            $name  = trim($parts[0]);

            // A nameless entry (e.g. "|30") would be an empty needle, which
            // stripos() reports as found at offset 0 on PHP 8 — matching every UA.
            if ($name === '') continue;

            $list[] = ['name' => $name, 'limit' => isset($parts[1]) ? (int)$parts[1] : 30];
        }
        set_transient($cache_key, $list, 300);
    }

    foreach ($list as $entry) {
        // Re-check the name here too: a list cached before this guard existed
        // may still hold empty names until the transient expires.
        $name = is_array($entry) ? (string)($entry['name'] ?? '') : '';
        if ($name !== '' && stripos($ua, $name) !== false) {
            return $entry;
        }
    }
    return null;
}
/**
 * Applies the per-minute request limit to a recognised good bot.
 *
 * Hits are counted per bot name (not per IP) in a transient keyed by the
 * current minute, i.e. a fixed one-minute window. Under the limit the counter
 * is incremented and the request continues; at or over the limit a 429 is
 * sent and the script exits. A limit of 0 hands the request to block().
 *
 * @param array  $bot ['name' => string, 'limit' => int] as returned by identify_good_bot().
 * @param string $ua  Raw User-Agent header value.
 * @param string $ip  Client IP as resolved by get_client_ip().
 * @param string $uri Request URI.
 */
private function handle_good_bot(array $bot, string $ua, string $ip, string $uri): void {
    $options = get_option('itk_security', []);
    $name    = $bot['name'];
    $limit   = (int)$bot['limit'];

    // Limit 0: this "good" bot is always blocked (e.g. GPTBot still listed in goodbots.conf).
    if ($limit === 0) {
        $this->block("Good bot with limit 0: {$name}", $name, $ua, '', $ip, $uri, $options);
        return;
    }

    // Counter key changes every minute; the transient outlives its window (120 s TTL).
    $minute_bucket = (int)(time() / 60);
    $counter_key   = 'itk_rl_' . md5($name) . '_' . $minute_bucket;
    $hits          = (int)get_transient($counter_key);

    // Still within budget: record the hit and let the request through.
    if ($hits < $limit) {
        set_transient($counter_key, $hits + 1, 120);
        return;
    }

    // Budget exhausted: optionally log, then answer 429 and stop.
    if (!empty($options['log_blocked_attempts'])) {
        $event = [
            'ip'       => $ip,
            'ua'       => $ua,
            'referrer' => '',
            'uri'      => $uri,
            'bot_type' => $name,
            'reason'   => "Rate limited: {$hits}/{$limit} req/min",
            'action'   => 'rate_limited',
        ];
        ITK_Database::log_bot($event);
        ITK_Bot_API::queue($event);
    }

    status_header(429);
    header('Retry-After: 60');
    header('X-ITK-Rate-Limit: ' . $limit);
    echo 'Too Many Requests. Crawl-delay: 60';
    exit;
}
/* ── Blocking ─────────────────────────────────────────────── */
/**
 * Logs (optionally) and terminates a blocked request.
 *
 * Sends a 301 redirect when `response_code` is `301_custom` and a usable
 * `redirect_url` is configured. Otherwise it sends the configured status code
 * with the escaped `custom_message` as the body. Always exits.
 *
 * @param string $reason   Human-readable reason stored in the log.
 * @param string $bot_type Category stored in the log (e.g. 'openai', 'bad_network').
 * @param string $ua       Raw User-Agent header value.
 * @param string $referrer Raw Referer header value.
 * @param string $ip       Client IP as resolved by get_client_ip().
 * @param string $uri      Request URI.
 * @param array  $options  The `itk_security` option array.
 */
private function block(
    string $reason,
    string $bot_type,
    string $ua,
    string $referrer,
    string $ip,
    string $uri,
    array $options
): void {
    if (!empty($options['log_blocked_attempts'])) {
        $event = [
            'ip'       => $ip,
            'ua'       => $ua,
            'referrer' => $referrer,
            'uri'      => $uri,
            'bot_type' => $bot_type,
            'reason'   => $reason,
            'action'   => 'blocked',
        ];
        ITK_Database::log_bot($event);
        ITK_Bot_API::queue($event);
    }

    $code    = $options['response_code'] ?? '403';
    $message = $options['custom_message'] ?? 'Access denied.';
    $redir   = $options['redirect_url'] ?? '';

    // Redirect only when a URL is configured AND survives sanitisation;
    // an empty Location header would be a broken response.
    $target = ($code === '301_custom' && !empty($redir)) ? (string)esc_url_raw($redir) : '';
    if ($target !== '') {
        header('Location: ' . $target, true, 301);
        exit;
    }

    // '301_custom' without a usable URL previously cast to 301 and produced a
    // redirect status with no Location. Fall back to 403 for that case and for
    // anything that is not a plausible HTTP status code.
    $status = is_numeric($code) ? (int)$code : 403;
    if ($status < 100 || $status > 599) {
        $status = 403;
    }

    status_header($status);
    echo esc_html($message);
    exit;
}
/* ── Detection helpers ────────────────────────────────────── */
/**
 * Tells whether the user agent contains one of the hard-coded OpenAI
 * signatures (case-insensitive substring match).
 *
 * @param string $ua Raw User-Agent header value.
 * @return bool True when any signature occurs in $ua.
 */
private function is_openai_bot(string $ua): bool {
    if (empty($ua)) {
        return false;
    }

    $signatures = ['GPTBot', 'ChatGPT-User', 'OAI-SearchBot', 'whisper'];
    $matched    = array_filter(
        $signatures,
        static fn (string $signature): bool => stripos($ua, $signature) !== false
    );

    return $matched !== [];
}
/**
 * Tells whether the user agent contains any fragment listed in badbots.conf
 * (case-insensitive substring match).
 *
 * @param string $ua Raw User-Agent header value.
 * @return bool True on the first matching fragment, false otherwise or when $ua is empty.
 */
private function is_malicious_bot(string $ua): bool {
    if (empty($ua)) {
        return false;
    }

    $signatures = $this->load_conf_list($this->badbots_file, 'itk_bots_list');
    foreach ($signatures as $signature) {
        $position = stripos($ua, $signature);
        if ($position !== false) {
            return true;
        }
    }

    return false;
}
/**
 * Tells whether the Referer value contains any fragment listed in
 * referrers.conf. The fragment may occur anywhere in the value, compared
 * case-insensitively.
 *
 * @param string $referrer Raw Referer header value.
 * @return bool True when a listed fragment is found; false otherwise or when $referrer is empty.
 */
private function is_bad_referrer(string $referrer): bool {
    if (empty($referrer)) {
        return false;
    }

    $is_listed = false;
    $fragments = $this->load_conf_list($this->referrers_file, 'itk_referrers_list');
    foreach ($fragments as $fragment) {
        if (stripos($referrer, $fragment) === false) {
            continue;
        }
        $is_listed = true;
        break;
    }

    return $is_listed;
}
/**
 * Tells whether the client IP is listed in networks.conf, either as a single
 * address or inside a CIDR range.
 *
 * Single addresses are compared in packed binary form, so IPv6 entries match
 * regardless of notation (upper/lower case, compressed or expanded).
 *
 * @param string $ip Client IP, or 'UNKNOWN' when none could be resolved.
 * @return bool True when the address matches an entry; false for empty,
 *              'UNKNOWN' or syntactically invalid addresses.
 */
private function is_bad_network(string $ip): bool {
    if (empty($ip) || $ip === 'UNKNOWN') return false;
    // An invalid address cannot equal a validated entry or fall inside a range.
    if (filter_var($ip, FILTER_VALIDATE_IP) === false) return false;

    $ip_packed = inet_pton($ip);

    foreach ($this->load_conf_list($this->networks_file, 'itk_networks_list') as $network) {
        if (filter_var($network, FILTER_VALIDATE_IP)) {
            $network_packed = $ip_packed === false ? false : inet_pton($network);
            // Fall back to the textual comparison if packing is unavailable.
            $same = ($network_packed !== false)
                ? $network_packed === $ip_packed
                : $ip === $network;
            if ($same) return true;
        } elseif (strpos($network, '/') !== false) {
            if ($this->ip_in_cidr($ip, $network)) return true;
        }
    }
    return false;
}
/* ── Robots.txt ───────────────────────────────────────────── */
/**
 * `robots_txt` filter: appends Disallow rules for the OpenAI crawlers when
 * `block_openai_bots` is enabled and the site is public.
 *
 * @param string $output Current robots.txt content.
 * @param string $public "Site is public" flag as passed by the filter.
 * @return string The possibly extended robots.txt content.
 */
public function modify_robots_txt(string $output, string $public): string {
    // Treat every falsy flag as "not public": '0' as stored in the option, and
    // '' which is what a boolean false becomes when coerced to this string parameter.
    // NOTE(review): recent WordPress versions appear to pass a bool here — confirm against do_robots().
    if (empty($public)) return $output;

    $options = get_option('itk_security', []);
    if (empty($options['block_openai_bots'])) return $output;

    $output .= "\n# InformatiQ Toolkit AI bot disallow\n";
    foreach (['GPTBot', 'ChatGPT-User', 'OAI-SearchBot'] as $bot) {
        $output .= "User-agent: {$bot}\nDisallow: /\n\n";
    }
    return $output;
}
/* ── Config file readers ──────────────────────────────────── */
/**
 * Loads a rule list from a .conf file, using a 300-second transient cache.
 *
 * Blank lines and lines starting with `#` are skipped. Entries longer than
 * 200 bytes or containing `<`, `>`, `"` or `'` are dropped. A missing,
 * unreadable or oversized (> 1 MiB) file yields an empty list that is NOT
 * cached, so the file is re-checked on the next call.
 *
 * @param string $file      Path of the .conf file.
 * @param string $cache_key Transient name used for the parsed list.
 * @return array List of trimmed entries (strings).
 */
private function load_conf_list(string $file, string $cache_key): array {
    $cached = get_transient($cache_key);
    // Only trust an array: any other stored value would violate the return type.
    if (is_array($cached)) return $cached;

    if (!file_exists($file) || filesize($file) > 1048576) return [];

    $lines = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    // file() returns false when the file cannot be read.
    if ($lines === false) return [];

    $list = [];
    foreach ($lines as $line) {
        $line = trim($line);
        if ($line === '' || $line[0] === '#') continue;
        if (strlen($line) <= 200 && !preg_match('/[<>"\']/', $line)) {
            $list[] = $line;
        }
    }
    set_transient($cache_key, $list, 300);
    return $list;
}
/**
 * Drops the four cached rule lists so the next lookup re-reads the .conf files.
 */
public function invalidate_cache(): void {
    $cache_keys = [
        'itk_bots_list',
        'itk_referrers_list',
        'itk_networks_list',
        'itk_goodbots_list',
    ];

    foreach ($cache_keys as $cache_key) {
        delete_transient($cache_key);
    }
}
/* ── IP utilities ─────────────────────────────────────────── */
/**
 * Resolves the client IP from the request.
 *
 * Forwarding headers are tried first, in the order listed; only the first
 * comma-separated element of each is considered, and it is accepted only if
 * it is a valid address outside private and reserved ranges. REMOTE_ADDR is
 * the last resort and is accepted when it is any valid address.
 *
 * NOTE(review): the forwarding headers are taken at face value; unless a
 * trusted proxy overwrites them they look client-controllable — confirm the
 * deployment guarantees that.
 *
 * @return string The resolved address, or 'UNKNOWN' when nothing validates.
 */
public function get_client_ip(): string {
    $forwarding_headers = [
        'HTTP_CLIENT_IP',
        'HTTP_X_FORWARDED_FOR',
        'HTTP_X_FORWARDED',
        'HTTP_X_CLUSTER_CLIENT_IP',
        'HTTP_FORWARDED_FOR',
        'HTTP_FORWARDED',
    ];
    $public_only = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;

    foreach ($forwarding_headers as $header) {
        if (empty($_SERVER[$header])) {
            continue;
        }
        $candidate = trim(explode(',', $_SERVER[$header])[0]);
        if (filter_var($candidate, FILTER_VALIDATE_IP, $public_only)) {
            return $candidate;
        }
    }

    if (!empty($_SERVER['REMOTE_ADDR'])) {
        $remote = trim(explode(',', $_SERVER['REMOTE_ADDR'])[0]);
        if (filter_var($remote, FILTER_VALIDATE_IP)) {
            return $remote;
        }
    }

    return 'UNKNOWN';
}
/**
 * Tells whether an address lies inside a CIDR range.
 *
 * Works for IPv4 and IPv6 by comparing the leading prefix bits of the packed
 * (inet_pton) forms. Address and range must belong to the same family.
 *
 * @param string $ip   Address to test.
 * @param string $cidr Range in `address/prefix` notation; the prefix must be a
 *                     plain non-negative integer no larger than the address
 *                     width (32 for IPv4, 128 for IPv6).
 * @return bool True when $ip is inside $cidr; false on any malformed input.
 */
private function ip_in_cidr(string $ip, string $cidr): bool {
    if (strpos($cidr, '/') === false) return false;

    [$subnet, $mask] = explode('/', $cidr, 2);
    $subnet = trim($subnet);
    $mask   = trim($mask);

    // Digits only: rejects '', '-1', '24.5' and '1e1'. The length cap avoids integer overflow.
    if ($mask === '' || !ctype_digit($mask) || strlen($mask) > 3) return false;

    if (filter_var($ip, FILTER_VALIDATE_IP) === false) return false;
    if (filter_var($subnet, FILTER_VALIDATE_IP) === false) return false;

    $ip_packed     = inet_pton($ip);
    $subnet_packed = inet_pton($subnet);
    if ($ip_packed === false || $subnet_packed === false) return false;

    // 4 bytes for IPv4, 16 for IPv6; a mismatch means different families.
    $length = strlen($ip_packed);
    if ($length !== strlen($subnet_packed)) return false;

    $prefix = (int)$mask;
    if ($prefix > $length * 8) return false;

    // Compare the whole bytes covered by the prefix first...
    $whole_bytes = intdiv($prefix, 8);
    if ($whole_bytes > 0 && substr($ip_packed, 0, $whole_bytes) !== substr($subnet_packed, 0, $whole_bytes)) {
        return false;
    }

    // ...then the remaining high bits of the next byte, if the prefix ends mid-byte.
    $remaining_bits = $prefix % 8;
    if ($remaining_bits === 0) return true;

    $byte_mask = (0xFF << (8 - $remaining_bits)) & 0xFF;
    return (ord($ip_packed[$whole_bytes]) & $byte_mask) === (ord($subnet_packed[$whole_bytes]) & $byte_mask);
}
/* ── Accessors for admin ──────────────────────────────────── */
/** @return string Path of the bad-bot list file. */
public function get_badbots_file(): string {
    return $this->badbots_file;
}

/** @return string Path of the bad-referrer list file. */
public function get_referrers_file(): string {
    return $this->referrers_file;
}

/** @return string Path of the blocked-networks list file. */
public function get_networks_file(): string {
    return $this->networks_file;
}

/** @return string Path of the good-bot list file. */
public function get_goodbots_file(): string {
    return $this->goodbots_file;
}
}