feat: add SQLite persistent storage for request logging

- Add SQLAlchemy-based database layer for persistent storage
  - Create models for access_logs, credential_attempts, attack_detections, ip_stats
  - Include fields for future GeoIP and reputation enrichment
  - Implement sanitization utilities to protect against malicious payloads
  - Fix XSS vulnerability in dashboard template (HTML escape all user data)
  - Add DATABASE_PATH and DATABASE_RETENTION_DAYS config options
  - Dual storage: in-memory for dashboard performance + SQLite for persistence

  New files:
  - src/models.py - SQLAlchemy ORM models
  - src/database.py - DatabaseManager singleton
  - src/sanitizer.py - Input sanitization and HTML escaping
  - requirements.txt - SQLAlchemy dependency

  Security protections:
  - Parameterized queries via SQLAlchemy ORM
  - Field length limits to prevent storage exhaustion
  - Null byte and control character stripping
  - HTML escaping on dashboard output
This commit is contained in:
Phillip Tarrant
2025-12-28 10:43:32 -06:00
parent 499760c939
commit f1c142c53d
11 changed files with 860 additions and 32 deletions

361
src/database.py Normal file
View File

@@ -0,0 +1,361 @@
#!/usr/bin/env python3
"""
Database singleton module for the Krawl honeypot.
Provides SQLAlchemy session management and database initialization.
"""
import os
import stat
from datetime import datetime
from typing import Optional, List, Dict, Any
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session, Session
from models import Base, AccessLog, CredentialAttempt, AttackDetection, IpStats
from sanitizer import (
sanitize_ip,
sanitize_path,
sanitize_user_agent,
sanitize_credential,
sanitize_attack_pattern,
)
class DatabaseManager:
"""
Singleton database manager for the Krawl honeypot.
Handles database initialization, session management, and provides
methods for persisting access logs, credentials, and attack detections.
"""
_instance: Optional["DatabaseManager"] = None
def __new__(cls) -> "DatabaseManager":
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._initialized = False
return cls._instance
def initialize(self, database_path: str = "data/krawl.db") -> None:
"""
Initialize the database connection and create tables.
Args:
database_path: Path to the SQLite database file
"""
if self._initialized:
return
# Create data directory if it doesn't exist
data_dir = os.path.dirname(database_path)
if data_dir and not os.path.exists(data_dir):
os.makedirs(data_dir, exist_ok=True)
# Create SQLite database with check_same_thread=False for multi-threaded access
database_url = f"sqlite:///{database_path}"
self._engine = create_engine(
database_url,
connect_args={"check_same_thread": False},
echo=False # Set to True for SQL debugging
)
# Create session factory with scoped_session for thread safety
session_factory = sessionmaker(bind=self._engine)
self._Session = scoped_session(session_factory)
# Create all tables
Base.metadata.create_all(self._engine)
# Set restrictive file permissions (owner read/write only)
if os.path.exists(database_path):
try:
os.chmod(database_path, stat.S_IRUSR | stat.S_IWUSR) # 600
except OSError:
# May fail on some systems, not critical
pass
self._initialized = True
@property
def session(self) -> Session:
"""Get a thread-local database session."""
if not self._initialized:
raise RuntimeError("DatabaseManager not initialized. Call initialize() first.")
return self._Session()
def close_session(self) -> None:
"""Close the current thread-local session."""
if self._initialized:
self._Session.remove()
def persist_access(
self,
ip: str,
path: str,
user_agent: str = "",
method: str = "GET",
is_suspicious: bool = False,
is_honeypot_trigger: bool = False,
attack_types: Optional[List[str]] = None,
matched_patterns: Optional[Dict[str, str]] = None
) -> Optional[int]:
"""
Persist an access log entry to the database.
Args:
ip: Client IP address
path: Requested path
user_agent: Client user agent string
method: HTTP method (GET, POST, HEAD)
is_suspicious: Whether the request was flagged as suspicious
is_honeypot_trigger: Whether a honeypot path was accessed
attack_types: List of detected attack types
matched_patterns: Dict mapping attack_type to matched pattern
Returns:
The ID of the created AccessLog record, or None on error
"""
session = self.session
try:
# Create access log with sanitized fields
access_log = AccessLog(
ip=sanitize_ip(ip),
path=sanitize_path(path),
user_agent=sanitize_user_agent(user_agent),
method=method[:10],
is_suspicious=is_suspicious,
is_honeypot_trigger=is_honeypot_trigger,
timestamp=datetime.utcnow()
)
session.add(access_log)
session.flush() # Get the ID before committing
# Add attack detections if any
if attack_types:
matched_patterns = matched_patterns or {}
for attack_type in attack_types:
detection = AttackDetection(
access_log_id=access_log.id,
attack_type=attack_type[:50],
matched_pattern=sanitize_attack_pattern(
matched_patterns.get(attack_type, "")
)
)
session.add(detection)
# Update IP stats
self._update_ip_stats(session, ip)
session.commit()
return access_log.id
except Exception as e:
session.rollback()
# Log error but don't crash - database persistence is secondary to honeypot function
print(f"Database error persisting access: {e}")
return None
finally:
self.close_session()
def persist_credential(
self,
ip: str,
path: str,
username: Optional[str] = None,
password: Optional[str] = None
) -> Optional[int]:
"""
Persist a credential attempt to the database.
Args:
ip: Client IP address
path: Login form path
username: Submitted username
password: Submitted password
Returns:
The ID of the created CredentialAttempt record, or None on error
"""
session = self.session
try:
credential = CredentialAttempt(
ip=sanitize_ip(ip),
path=sanitize_path(path),
username=sanitize_credential(username),
password=sanitize_credential(password),
timestamp=datetime.utcnow()
)
session.add(credential)
session.commit()
return credential.id
except Exception as e:
session.rollback()
print(f"Database error persisting credential: {e}")
return None
finally:
self.close_session()
def _update_ip_stats(self, session: Session, ip: str) -> None:
"""
Update IP statistics (upsert pattern).
Args:
session: Active database session
ip: IP address to update
"""
sanitized_ip = sanitize_ip(ip)
now = datetime.utcnow()
ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first()
if ip_stats:
ip_stats.total_requests += 1
ip_stats.last_seen = now
else:
ip_stats = IpStats(
ip=sanitized_ip,
total_requests=1,
first_seen=now,
last_seen=now
)
session.add(ip_stats)
def get_access_logs(
self,
limit: int = 100,
offset: int = 0,
ip_filter: Optional[str] = None,
suspicious_only: bool = False
) -> List[Dict[str, Any]]:
"""
Retrieve access logs with optional filtering.
Args:
limit: Maximum number of records to return
offset: Number of records to skip
ip_filter: Filter by IP address
suspicious_only: Only return suspicious requests
Returns:
List of access log dictionaries
"""
session = self.session
try:
query = session.query(AccessLog).order_by(AccessLog.timestamp.desc())
if ip_filter:
query = query.filter(AccessLog.ip == sanitize_ip(ip_filter))
if suspicious_only:
query = query.filter(AccessLog.is_suspicious == True)
logs = query.offset(offset).limit(limit).all()
return [
{
'id': log.id,
'ip': log.ip,
'path': log.path,
'user_agent': log.user_agent,
'method': log.method,
'is_suspicious': log.is_suspicious,
'is_honeypot_trigger': log.is_honeypot_trigger,
'timestamp': log.timestamp.isoformat(),
'attack_types': [d.attack_type for d in log.attack_detections]
}
for log in logs
]
finally:
self.close_session()
def get_credential_attempts(
self,
limit: int = 100,
offset: int = 0,
ip_filter: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Retrieve credential attempts with optional filtering.
Args:
limit: Maximum number of records to return
offset: Number of records to skip
ip_filter: Filter by IP address
Returns:
List of credential attempt dictionaries
"""
session = self.session
try:
query = session.query(CredentialAttempt).order_by(
CredentialAttempt.timestamp.desc()
)
if ip_filter:
query = query.filter(CredentialAttempt.ip == sanitize_ip(ip_filter))
attempts = query.offset(offset).limit(limit).all()
return [
{
'id': attempt.id,
'ip': attempt.ip,
'path': attempt.path,
'username': attempt.username,
'password': attempt.password,
'timestamp': attempt.timestamp.isoformat()
}
for attempt in attempts
]
finally:
self.close_session()
def get_ip_stats(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Retrieve IP statistics ordered by total requests.
Args:
limit: Maximum number of records to return
Returns:
List of IP stats dictionaries
"""
session = self.session
try:
stats = session.query(IpStats).order_by(
IpStats.total_requests.desc()
).limit(limit).all()
return [
{
'ip': s.ip,
'total_requests': s.total_requests,
'first_seen': s.first_seen.isoformat(),
'last_seen': s.last_seen.isoformat(),
'country_code': s.country_code,
'city': s.city,
'asn': s.asn,
'asn_org': s.asn_org,
'reputation_score': s.reputation_score,
'reputation_source': s.reputation_source
}
for s in stats
]
finally:
self.close_session()
# Module-level singleton instance
_db_manager = DatabaseManager()
def get_database() -> DatabaseManager:
"""Get the database manager singleton instance."""
return _db_manager
def initialize_database(database_path: str = "data/krawl.db") -> None:
"""Initialize the database system."""
_db_manager.initialize(database_path)