#!/usr/bin/env python3
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
from zoneinfo import ZoneInfo
import time

import yaml
@dataclass
class Config:
    """Configuration for the deception server.

    Field defaults below are the single source of truth: ``from_yaml`` falls
    back to the same values when a key is absent, so ``Config()`` and a
    minimal ``config.yaml`` behave identically.
    """

    # --- HTTP server ---
    port: int = 5000
    delay: int = 100  # artificial response delay, milliseconds
    server_header: str = ""  # value sent in the Server response header

    # --- Generated link settings ---
    links_length_range: Tuple[int, int] = (5, 15)     # (min, max) random path length
    links_per_page_range: Tuple[int, int] = (10, 15)  # (min, max) links per page
    char_space: str = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    max_counter: int = 10

    # --- Canary token settings ---
    canary_token_url: Optional[str] = None
    canary_token_tries: int = 10

    # --- Dashboard ---
    # Always populated by from_yaml(); auto-generated when not set in YAML.
    dashboard_secret_path: Optional[str] = None

    # --- Fake API endpoint ---
    api_server_url: Optional[str] = None
    api_server_port: int = 8080
    api_server_path: str = "/api/v2/users"

    probability_error_codes: int = 0  # Percentage (0-100)

    # Crawl limiting settings - for legitimate vs malicious crawlers
    max_pages_limit: int = 100  # Max pages limit for good crawlers and regular users (and bad crawlers/attackers if infinite_pages_for_malicious is False)
    infinite_pages_for_malicious: bool = True  # Infinite pages for malicious crawlers
    ban_duration_seconds: int = 600  # Ban duration in seconds for IPs exceeding limits

    # Database settings
    database_path: str = "data/krawl.db"
    database_retention_days: int = 30

    # Analyzer settings
    # NOTE(review): annotated ``float`` with ``None`` defaults. Optional[float]
    # would be more accurate, but override_config_from_env dispatches on the
    # raw annotation (``field_type == float``), so the annotation is kept as-is
    # to preserve env-var parsing behavior.
    http_risky_methods_threshold: float = None
    violated_robots_threshold: float = None
    uneven_request_timing_threshold: float = None
    uneven_request_timing_time_window_seconds: float = None
    user_agents_used_threshold: float = None
    attack_urls_threshold: float = None

    @classmethod
    def from_yaml(cls) -> 'Config':
        """Create configuration from a YAML file.

        The file location is taken from the CONFIG_LOCATION environment
        variable (default ``config.yaml``) and resolved relative to the
        project root (one directory above this module).

        Exits the process with status 1 when the file is missing or invalid,
        so a misconfigured deployment fails loudly at startup.
        """
        config_location = os.getenv('CONFIG_LOCATION', 'config.yaml')
        config_path = Path(__file__).parent.parent / config_location

        try:
            with open(config_path, 'r') as f:
                data = yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Configuration file '{config_path}' not found.", file=sys.stderr)
            print("Please create a config.yaml file or set CONFIG_LOCATION environment variable.", file=sys.stderr)
            sys.exit(1)
        except yaml.YAMLError as e:
            print(f"Error: Invalid YAML in configuration file '{config_path}': {e}", file=sys.stderr)
            sys.exit(1)

        # An empty YAML file parses to None; treat it as "all defaults".
        if data is None:
            data = {}

        # Extract nested sections with defaults.
        server = data.get('server', {})
        links = data.get('links', {})
        canary = data.get('canary', {})
        dashboard = data.get('dashboard', {})
        api = data.get('api', {})
        database = data.get('database', {})
        behavior = data.get('behavior', {})
        # ``analyzer:`` may be present but empty (parses to None), hence ``or {}``.
        analyzer = data.get('analyzer') or {}
        crawl = data.get('crawl', {})

        # Handle dashboard_secret_path - auto-generate if null/not set.
        dashboard_path = dashboard.get('secret_path')
        if dashboard_path is None:
            dashboard_path = f'/{os.urandom(16).hex()}'
        else:
            # Ensure the dashboard path starts with a /.
            if dashboard_path[:1] != "/":
                dashboard_path = f"/{dashboard_path}"

        return cls(
            port=server.get('port', 5000),
            delay=server.get('delay', 100),
            server_header=server.get('server_header', ""),
            links_length_range=(
                links.get('min_length', 5),
                links.get('max_length', 15)
            ),
            links_per_page_range=(
                links.get('min_per_page', 10),
                links.get('max_per_page', 15)
            ),
            char_space=links.get('char_space', 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'),
            max_counter=links.get('max_counter', 10),
            canary_token_url=canary.get('token_url'),
            canary_token_tries=canary.get('token_tries', 10),
            dashboard_secret_path=dashboard_path,
            api_server_url=api.get('server_url'),
            api_server_port=api.get('server_port', 8080),
            api_server_path=api.get('server_path', '/api/v2/users'),
            probability_error_codes=behavior.get('probability_error_codes', 0),
            database_path=database.get('path', 'data/krawl.db'),
            database_retention_days=database.get('retention_days', 30),
            http_risky_methods_threshold=analyzer.get('http_risky_methods_threshold', 0.1),
            violated_robots_threshold=analyzer.get('violated_robots_threshold', 0.1),
            uneven_request_timing_threshold=analyzer.get('uneven_request_timing_threshold', 0.5),  # coefficient of variation
            uneven_request_timing_time_window_seconds=analyzer.get('uneven_request_timing_time_window_seconds', 300),
            user_agents_used_threshold=analyzer.get('user_agents_used_threshold', 2),
            attack_urls_threshold=analyzer.get('attack_urls_threshold', 1),
            infinite_pages_for_malicious=crawl.get('infinite_pages_for_malicious', True),
            # Bug fix: fallbacks previously disagreed with the field defaults
            # (200 vs 100 and 60 vs 600), so a missing YAML key produced a
            # different configuration than Config().
            max_pages_limit=crawl.get('max_pages_limit', 100),
            ban_duration_seconds=crawl.get('ban_duration_seconds', 600)
        )


def __get_env_from_config(config: str) -> str:
    """Map a Config field name to its override environment variable.

    e.g. ``max_pages_limit`` -> ``KRAWL_MAX_PAGES_LIMIT``.
    """
    env = config.upper().replace('.', '_').replace('-', '__').replace(' ', '_')
    return f'KRAWL_{env}'


def override_config_from_env(config: "Config" = None):
    """Override fields of *config* in place from KRAWL_* environment variables.

    Each value is parsed according to the field's declared annotation:
    bool, int, float and Tuple[int, int] (spelled ``"min,max"``) are
    converted; any other annotation receives the raw string.
    """
    for field in config.__dataclass_fields__:

        env_var = __get_env_from_config(field)
        if env_var in os.environ:
            field_type = config.__dataclass_fields__[field].type
            env_value = os.environ[env_var]
            if field_type == bool:
                # Bug fix: bool fields previously fell through to the string
                # branch, so e.g. KRAWL_INFINITE_PAGES_FOR_MALICIOUS=false
                # assigned the truthy string "false" and enabled the flag.
                setattr(config, field, env_value.strip().lower() in ('1', 'true', 'yes', 'on'))
            elif field_type == int:
                setattr(config, field, int(env_value))
            elif field_type == float:
                setattr(config, field, float(env_value))
            elif field_type == Tuple[int, int]:
                parts = env_value.split(',')
                # Silently ignore malformed tuples (same as before the fix).
                if len(parts) == 2:
                    setattr(config, field, (int(parts[0]), int(parts[1])))
            else:
                setattr(config, field, env_value)


# Lazily-built module-wide singleton; access only through get_config().
_config_instance = None


def get_config() -> Config:
    """Return the shared :class:`Config`, creating it on first use.

    On first call the configuration is loaded from YAML and then overridden
    with any ``KRAWL_*`` environment variables; every later call returns the
    same cached instance.
    """
    global _config_instance
    if _config_instance is not None:
        return _config_instance
    _config_instance = Config.from_yaml()
    override_config_from_env(_config_instance)
    return _config_instance