added raw request handling, enhanced attack detection for GET and POST requests, templatized suspicious activity to fetch from wordlists.json, aligned helm to load new wordlist config, added migration scripts from 1.0.0 to new krawl versions, removed old and unused functions, added test scripts

2026-02-08 16:02:18 +01:00
parent 594eae7447
commit 771174c6a9
26 changed files with 2312 additions and 867 deletions
--- a/src/database.py
+++ b/src/database.py
@@ -207,6 +207,7 @@ class DatabaseManager:
        is_honeypot_trigger: bool = False,
        attack_types: Optional[List[str]] = None,
        matched_patterns: Optional[Dict[str, str]] = None,
+        raw_request: Optional[str] = None,
    ) -> Optional[int]:
        """
        Persist an access log entry to the database.
@@ -220,6 +221,7 @@ class DatabaseManager:
            is_honeypot_trigger: Whether a honeypot path was accessed
            attack_types: List of detected attack types
            matched_patterns: Dict mapping attack_type to matched pattern
+            raw_request: Full raw HTTP request for forensic analysis

        Returns:
            The ID of the created AccessLog record, or None on error
@@ -235,6 +237,7 @@ class DatabaseManager:
                is_suspicious=is_suspicious,
                is_honeypot_trigger=is_honeypot_trigger,
                timestamp=datetime.now(),
+                raw_request=raw_request,
            )
            session.add(access_log)
            session.flush()  # Get the ID before committing
@@ -1606,7 +1609,10 @@ class DatabaseManager:
                sort_order.lower() if sort_order.lower() in {"asc", "desc"} else "desc"
            )

-            # Get all access logs with attack detections
+            # Count total attacks first (efficient)
+            total_attacks = session.query(AccessLog).join(AttackDetection).count()
+
+            # Get paginated access logs with attack detections
            query = session.query(AccessLog).join(AttackDetection)

            if sort_by == "timestamp":
@@ -1619,30 +1625,27 @@ class DatabaseManager:
                query = query.order_by(
                    AccessLog.ip.desc() if sort_order == "desc" else AccessLog.ip.asc()
                )
+            # Note: attack_type sorting requires loading all data, so we skip it for performance
+            # elif sort_by == "attack_type":
+            #     Can't efficiently sort by related table field

-            logs = query.all()
+            # Apply LIMIT and OFFSET at database level
+            logs = query.offset(offset).limit(page_size).all()

-            # Convert to attack list
-            attack_list = [
+            # Convert to attack list (raw_request is still included for backward compatibility)
+            paginated = [
                {
+                    "id": log.id,
                    "ip": log.ip,
                    "path": log.path,
                    "user_agent": log.user_agent,
                    "timestamp": log.timestamp.isoformat() if log.timestamp else None,
                    "attack_types": [d.attack_type for d in log.attack_detections],
+                    "raw_request": log.raw_request,  # Keep for backward compatibility
                }
                for log in logs
            ]

-            # Sort by attack_type if needed (this must be done post-fetch since it's in a related table)
-            if sort_by == "attack_type":
-                attack_list.sort(
-                    key=lambda x: x["attack_types"][0] if x["attack_types"] else "",
-                    reverse=(sort_order == "desc"),
-                )
-
-            total_attacks = len(attack_list)
-            paginated = attack_list[offset : offset + page_size]
            total_pages = (total_attacks + page_size - 1) // page_size

            return {
@@ -1657,6 +1660,60 @@ class DatabaseManager:
        finally:
            self.close_session()

+    def get_raw_request_by_id(self, log_id: int) -> Optional[str]:
+        """
+        Fetch the stored raw HTTP request of a single access log entry.
+
+        Args:
+            log_id: ID of the AccessLog record to look up
+
+        Returns:
+            The stored raw request text; None when no record has this ID
+            or when the record has no raw request stored
+        """
+        db = self.session
+        try:
+            # first() yields None when nothing matches the filter
+            entry = db.query(AccessLog).filter(AccessLog.id == log_id).first()
+            return entry.raw_request if entry else None
+        finally:
+            self.close_session()
+
+    def get_attack_types_stats(self, limit: int = 20) -> Dict[str, Any]:
+        """
+        Get aggregated statistics for attack types (efficient for large datasets).
+
+        Args:
+            limit: Maximum number of attack types to return
+
+        Returns:
+            Dictionary whose "attack_types" key holds a list of
+            {"type": ..., "count": ...} entries, most frequent first
+        """
+        session = self.session
+        try:
+            from sqlalchemy import func
+
+            detection_count = func.count(AttackDetection.id)
+            results = (
+                session.query(AttackDetection.attack_type, detection_count)
+                .group_by(AttackDetection.attack_type)
+                .order_by(detection_count.desc())
+                .limit(limit)
+                .all()
+            )
+
+            # Unpack rows positionally: `row.count` can resolve to the result
+            # row's sequence method `count()` rather than the labelled column.
+            return {
+                "attack_types": [
+                    {"type": attack_type, "count": count}
+                    for attack_type, count in results
+                ]
+            }
+        finally:
+            self.close_session()
+

 # Module-level singleton instance
 _db_manager = DatabaseManager()