Files
krawl.es/src/tracker.py
Phillip Tarrant f1c142c53d feat: add SQLite persistent storage for request logging
- Add SQLAlchemy-based database layer for persistent storage
  - Create models for access_logs, credential_attempts, attack_detections, ip_stats
  - Include fields for future GeoIP and reputation enrichment
  - Implement sanitization utilities to protect against malicious payloads
  - Fix XSS vulnerability in dashboard template (HTML escape all user data)
  - Add DATABASE_PATH and DATABASE_RETENTION_DAYS config options
  - Dual storage: in-memory for dashboard performance + SQLite for persistence

  New files:
  - src/models.py - SQLAlchemy ORM models
  - src/database.py - DatabaseManager singleton
  - src/sanitizer.py - Input sanitization and HTML escaping
  - requirements.txt - SQLAlchemy dependency

  Security protections:
  - Parameterized queries via SQLAlchemy ORM
  - Field length limits to prevent storage exhaustion
  - Null byte and control character stripping
  - HTML escaping on dashboard output
2025-12-28 10:43:32 -06:00

295 lines
11 KiB
Python

#!/usr/bin/env python3
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime
import re
import urllib.parse
from database import get_database, DatabaseManager
class AccessTracker:
"""
Track IP addresses and paths accessed.
Maintains in-memory structures for fast dashboard access and
persists data to SQLite for long-term storage and analysis.
"""
def __init__(self, db_manager: Optional[DatabaseManager] = None):
"""
Initialize the access tracker.
Args:
db_manager: Optional DatabaseManager for persistence.
If None, will use the global singleton.
"""
self.ip_counts: Dict[str, int] = defaultdict(int)
self.path_counts: Dict[str, int] = defaultdict(int)
self.user_agent_counts: Dict[str, int] = defaultdict(int)
self.access_log: List[Dict] = []
self.credential_attempts: List[Dict] = []
self.suspicious_patterns = [
'bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python-requests',
'scanner', 'nikto', 'sqlmap', 'nmap', 'masscan', 'nessus', 'acunetix',
'burp', 'zap', 'w3af', 'metasploit', 'nuclei', 'gobuster', 'dirbuster'
]
# Common attack types such as xss, shell injection, probes
self.attack_types = {
'path_traversal': r'\.\.',
'sql_injection': r"('|--|;|\bOR\b|\bUNION\b|\bSELECT\b|\bDROP\b)",
'xss_attempt': r'(<script|javascript:|onerror=|onload=)',
'common_probes': r'(wp-admin|phpmyadmin|\.env|\.git|/admin|/config)',
'shell_injection': r'(\||;|`|\$\(|&&)',
}
# Track IPs that accessed honeypot paths from robots.txt
self.honeypot_triggered: Dict[str, List[str]] = defaultdict(list)
# Database manager for persistence (lazily initialized)
self._db_manager = db_manager
@property
def db(self) -> Optional[DatabaseManager]:
"""
Get the database manager, lazily initializing if needed.
Returns:
DatabaseManager instance or None if not available
"""
if self._db_manager is None:
try:
self._db_manager = get_database()
except Exception:
# Database not initialized, persistence disabled
pass
return self._db_manager
def parse_credentials(self, post_data: str) -> Tuple[str, str]:
"""
Parse username and password from POST data.
Returns tuple (username, password) or (None, None) if not found.
"""
if not post_data:
return None, None
username = None
password = None
try:
# Parse URL-encoded form data
parsed = urllib.parse.parse_qs(post_data)
# Common username field names
username_fields = ['username', 'user', 'login', 'email', 'log', 'userid', 'account']
for field in username_fields:
if field in parsed and parsed[field]:
username = parsed[field][0]
break
# Common password field names
password_fields = ['password', 'pass', 'passwd', 'pwd', 'passphrase']
for field in password_fields:
if field in parsed and parsed[field]:
password = parsed[field][0]
break
except Exception:
# If parsing fails, try simple regex patterns
username_match = re.search(r'(?:username|user|login|email|log)=([^&\s]+)', post_data, re.IGNORECASE)
password_match = re.search(r'(?:password|pass|passwd|pwd)=([^&\s]+)', post_data, re.IGNORECASE)
if username_match:
username = urllib.parse.unquote_plus(username_match.group(1))
if password_match:
password = urllib.parse.unquote_plus(password_match.group(1))
return username, password
def record_credential_attempt(self, ip: str, path: str, username: str, password: str):
"""
Record a credential login attempt.
Stores in both in-memory list and SQLite database.
"""
# In-memory storage for dashboard
self.credential_attempts.append({
'ip': ip,
'path': path,
'username': username,
'password': password,
'timestamp': datetime.now().isoformat()
})
# Persist to database
if self.db:
try:
self.db.persist_credential(
ip=ip,
path=path,
username=username,
password=password
)
except Exception:
# Don't crash if database persistence fails
pass
def record_access(
self,
ip: str,
path: str,
user_agent: str = '',
body: str = '',
method: str = 'GET'
):
"""
Record an access attempt.
Stores in both in-memory structures and SQLite database.
Args:
ip: Client IP address
path: Requested path
user_agent: Client user agent string
body: Request body (for POST/PUT)
method: HTTP method
"""
self.ip_counts[ip] += 1
self.path_counts[path] += 1
if user_agent:
self.user_agent_counts[user_agent] += 1
# Path attack type detection
attack_findings = self.detect_attack_type(path)
# POST/PUT body attack detection
if len(body) > 0:
attack_findings.extend(self.detect_attack_type(body))
is_suspicious = (
self.is_suspicious_user_agent(user_agent) or
self.is_honeypot_path(path) or
len(attack_findings) > 0
)
is_honeypot = self.is_honeypot_path(path)
# Track if this IP accessed a honeypot path
if is_honeypot:
self.honeypot_triggered[ip].append(path)
# In-memory storage for dashboard
self.access_log.append({
'ip': ip,
'path': path,
'user_agent': user_agent,
'suspicious': is_suspicious,
'honeypot_triggered': is_honeypot,
'attack_types': attack_findings,
'timestamp': datetime.now().isoformat()
})
# Persist to database
if self.db:
try:
self.db.persist_access(
ip=ip,
path=path,
user_agent=user_agent,
method=method,
is_suspicious=is_suspicious,
is_honeypot_trigger=is_honeypot,
attack_types=attack_findings if attack_findings else None
)
except Exception:
# Don't crash if database persistence fails
pass
def detect_attack_type(self, data:str) -> list[str]:
"""
Returns a list of all attack types found in path data
"""
findings = []
for name, pattern in self.attack_types.items():
if re.search(pattern, data, re.IGNORECASE):
findings.append(name)
return findings
def is_honeypot_path(self, path: str) -> bool:
"""Check if path is one of the honeypot traps from robots.txt"""
honeypot_paths = [
'/admin',
'/admin/',
'/backup',
'/backup/',
'/config',
'/config/',
'/private',
'/private/',
'/database',
'/database/',
'/credentials.txt',
'/passwords.txt',
'/admin_notes.txt',
'/api_keys.json',
'/.env',
'/wp-admin',
'/wp-admin/',
'/phpmyadmin',
'/phpMyAdmin/'
]
return path in honeypot_paths or any(hp in path.lower() for hp in ['/backup', '/admin', '/config', '/private', '/database', 'phpmyadmin'])
def is_suspicious_user_agent(self, user_agent: str) -> bool:
"""Check if user agent matches suspicious patterns"""
if not user_agent:
return True
ua_lower = user_agent.lower()
return any(pattern in ua_lower for pattern in self.suspicious_patterns)
def get_top_ips(self, limit: int = 10) -> List[Tuple[str, int]]:
"""Get top N IP addresses by access count"""
return sorted(self.ip_counts.items(), key=lambda x: x[1], reverse=True)[:limit]
def get_top_paths(self, limit: int = 10) -> List[Tuple[str, int]]:
"""Get top N paths by access count"""
return sorted(self.path_counts.items(), key=lambda x: x[1], reverse=True)[:limit]
def get_top_user_agents(self, limit: int = 10) -> List[Tuple[str, int]]:
"""Get top N user agents by access count"""
return sorted(self.user_agent_counts.items(), key=lambda x: x[1], reverse=True)[:limit]
def get_suspicious_accesses(self, limit: int = 20) -> List[Dict]:
"""Get recent suspicious accesses"""
suspicious = [log for log in self.access_log if log.get('suspicious', False)]
return suspicious[-limit:]
def get_attack_type_accesses(self, limit: int = 20) -> List[Dict]:
"""Get recent accesses with detected attack types"""
attacks = [log for log in self.access_log if log.get('attack_types')]
return attacks[-limit:]
def get_honeypot_triggered_ips(self) -> List[Tuple[str, List[str]]]:
"""Get IPs that accessed honeypot paths"""
return [(ip, paths) for ip, paths in self.honeypot_triggered.items()]
def get_stats(self) -> Dict:
"""Get statistics summary"""
suspicious_count = sum(1 for log in self.access_log if log.get('suspicious', False))
honeypot_count = sum(1 for log in self.access_log if log.get('honeypot_triggered', False))
return {
'total_accesses': len(self.access_log),
'unique_ips': len(self.ip_counts),
'unique_paths': len(self.path_counts),
'suspicious_accesses': suspicious_count,
'honeypot_triggered': honeypot_count,
'honeypot_ips': len(self.honeypot_triggered),
'top_ips': self.get_top_ips(10),
'top_paths': self.get_top_paths(10),
'top_user_agents': self.get_top_user_agents(10),
'recent_suspicious': self.get_suspicious_accesses(20),
'honeypot_triggered_ips': self.get_honeypot_triggered_ips(),
'attack_types': self.get_attack_type_accesses(20),
'credential_attempts': self.credential_attempts[-50:] # Last 50 attempts
}