krawl.es/src/tasks/analyze_ips.py

from collections import Counter
from database import get_database
from pathlib import Path
from datetime import datetime, timedelta
import re
import urllib.parse
from wordlists import get_wordlists
from config import get_config
from logger import get_app_logger

# ----------------------
# TASK CONFIG
# ----------------------

TASK_CONFIG = {
    "name": "analyze-ips",
    "cron": "*/1 * * * *",
    "enabled": True,
    "run_when_loaded": True,
}


def main():
    config = get_config()
    db_manager = get_database()
    app_logger = get_app_logger()

    http_risky_methods_threshold = config.http_risky_methods_threshold
    violated_robots_threshold = config.violated_robots_threshold
    uneven_request_timing_threshold = config.uneven_request_timing_threshold
    user_agents_used_threshold = config.user_agents_used_threshold
    attack_urls_threshold = config.attack_urls_threshold
    uneven_request_timing_time_window_seconds = (
        config.uneven_request_timing_time_window_seconds
    )
    app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}")
    score = {}
    score["attacker"] = {
        "risky_http_methods": False,
        "robots_violations": False,
        "uneven_request_timing": False,
        "different_user_agents": False,
        "attack_url": False,
    }
    score["good_crawler"] = {
        "risky_http_methods": False,
        "robots_violations": False,
        "uneven_request_timing": False,
        "different_user_agents": False,
        "attack_url": False,
    }
    score["bad_crawler"] = {
        "risky_http_methods": False,
        "robots_violations": False,
        "uneven_request_timing": False,
        "different_user_agents": False,
        "attack_url": False,
    }
    score["regular_user"] = {
        "risky_http_methods": False,
        "robots_violations": False,
        "uneven_request_timing": False,
        "different_user_agents": False,
        "attack_url": False,
    }

    # 1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
    weights = {
        "attacker": {
            "risky_http_methods": 6,
            "robots_violations": 4,
            "uneven_request_timing": 3,
            "different_user_agents": 2,
            "attack_url": 15,
        },
        "good_crawler": {
            "risky_http_methods": 1,
            "robots_violations": 0,
            "uneven_request_timing": 0,
            "different_user_agents": 0,
            "attack_url": 0,
        },
        "bad_crawler": {
            "risky_http_methods": 2,
            "robots_violations": 7,
            "uneven_request_timing": 0,
            "different_user_agents": 7,
            "attack_url": 5,
        },
        "regular_user": {
            "risky_http_methods": 0,
            "robots_violations": 0,
            "uneven_request_timing": 8,
            "different_user_agents": 3,
            "attack_url": 0,
        },
    }
    # Parse robots.txt once before the loop (it never changes during a run)
    robots_disallows = []
    robots_path = Path(__file__).parent.parent / "templates" / "html" / "robots.txt"
    with open(robots_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split(":")
            if parts[0] == "Disallow":
                parts[1] = parts[1].rstrip("/")
                robots_disallows.append(parts[1].strip())

    # Get IPs flagged for reevaluation (set when a suspicious request arrives)
    ips_to_analyze = set(db_manager.get_ips_needing_reevaluation())

    if not ips_to_analyze:
        app_logger.debug(
            "[Background Task] analyze-ips: No IPs need reevaluation, skipping"
        )
        return

    for ip in ips_to_analyze:
        # Get full history for this IP to perform accurate analysis
        ip_accesses = db_manager.get_access_logs(
            limit=10000, ip_filter=ip, since_minutes=1440 * 30
        )  # look back up to 30 days of history for better accuracy
        total_accesses_count = len(ip_accesses)
        if total_accesses_count <= 0:
            continue

        # --------------------- HTTP Methods ---------------------
        method_counts = Counter(item["method"] for item in ip_accesses)
        if total_accesses_count > http_risky_methods_threshold:
            risky_count = sum(
                method_counts.get(m, 0)
                for m in ("POST", "PUT", "DELETE", "OPTIONS", "PATCH")
            )
            http_method_attacker_score = risky_count / total_accesses_count
        else:
            http_method_attacker_score = 0
        # print(f"HTTP Method attacker score: {http_method_attacker_score}")
        if http_method_attacker_score >= http_risky_methods_threshold:
            score["attacker"]["risky_http_methods"] = True
            score["good_crawler"]["risky_http_methods"] = False
            score["bad_crawler"]["risky_http_methods"] = True
            score["regular_user"]["risky_http_methods"] = False
        else:
            score["attacker"]["risky_http_methods"] = False
            score["good_crawler"]["risky_http_methods"] = True
            score["bad_crawler"]["risky_http_methods"] = False
            score["regular_user"]["risky_http_methods"] = False
        # --------------------- Robots Violations ---------------------
        violated_robots_count = len(
            [
                item
                for item in ip_accesses
                if any(
                    item["path"].rstrip("/").startswith(disallow)
                    for disallow in robots_disallows
                )
            ]
        )
        # print(f"Violated robots count: {violated_robots_count}")
        if total_accesses_count > 0:
            violated_robots_ratio = violated_robots_count / total_accesses_count
        else:
            violated_robots_ratio = 0
        if violated_robots_ratio >= violated_robots_threshold:
            score["attacker"]["robots_violations"] = True
            score["good_crawler"]["robots_violations"] = False
            score["bad_crawler"]["robots_violations"] = True
            score["regular_user"]["robots_violations"] = False
        else:
            score["attacker"]["robots_violations"] = False
            score["good_crawler"]["robots_violations"] = False
            score["bad_crawler"]["robots_violations"] = False
            score["regular_user"]["robots_violations"] = False

        # --------------------- Requests Timing ---------------------
        # Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior
        timestamps = [datetime.fromisoformat(item["timestamp"]) for item in ip_accesses]
        now_utc = datetime.now()
        timestamps = [
            ts
            for ts in timestamps
            if now_utc - ts
            <= timedelta(seconds=uneven_request_timing_time_window_seconds)
        ]
        timestamps = sorted(timestamps, reverse=True)
        time_diffs = []
        for i in range(0, len(timestamps) - 1):
            diff = (timestamps[i] - timestamps[i + 1]).total_seconds()
            time_diffs.append(diff)

        mean = 0
        variance = 0
        std = 0
        cv = 0
        if time_diffs:
            mean = sum(time_diffs) / len(time_diffs)
            variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
            std = variance**0.5
            cv = std / mean
            app_logger.debug(
                f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}"
            )
        if cv >= uneven_request_timing_threshold:
            score["attacker"]["uneven_request_timing"] = True
            score["good_crawler"]["uneven_request_timing"] = False
            score["bad_crawler"]["uneven_request_timing"] = False
            score["regular_user"]["uneven_request_timing"] = True
        else:
            score["attacker"]["uneven_request_timing"] = False
            score["good_crawler"]["uneven_request_timing"] = False
            score["bad_crawler"]["uneven_request_timing"] = False
            score["regular_user"]["uneven_request_timing"] = False
        # --------------------- Different User Agents ---------------------
        # Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
        user_agents_used = [item["user_agent"] for item in ip_accesses]
        user_agents_used = list(dict.fromkeys(user_agents_used))
        # print(f"User agents used: {user_agents_used}")
        if len(user_agents_used) >= user_agents_used_threshold:
            score["attacker"]["different_user_agents"] = True
            score["good_crawler"]["different_user_agents"] = False
            score["bad_crawler"]["different_user_agents"] = True
            score["regular_user"]["different_user_agents"] = False
        else:
            score["attacker"]["different_user_agents"] = False
            score["good_crawler"]["different_user_agents"] = False
            score["bad_crawler"]["different_user_agents"] = False
            score["regular_user"]["different_user_agents"] = False
        # --------------------- Attack URLs ---------------------
        attack_urls_found_list = []
        wl = get_wordlists()
        if wl.attack_patterns:
            queried_paths = [item["path"] for item in ip_accesses]
            for queried_path in queried_paths:
                # URL decode the path to catch encoded attacks
                try:
                    decoded_path = urllib.parse.unquote(queried_path)
                    # Double decode to catch double-encoded attacks
                    decoded_path_twice = urllib.parse.unquote(decoded_path)
                except Exception:
                    decoded_path = queried_path
                    decoded_path_twice = queried_path

                for name, pattern in wl.attack_patterns.items():
                    # Check original, decoded, and double-decoded paths
                    if (
                        re.search(pattern, queried_path, re.IGNORECASE)
                        or re.search(pattern, decoded_path, re.IGNORECASE)
                        or re.search(pattern, decoded_path_twice, re.IGNORECASE)
                    ):
                        attack_urls_found_list.append(f"{name}: {pattern}")

            # remove duplicates
            attack_urls_found_list = set(attack_urls_found_list)
            attack_urls_found_list = list(attack_urls_found_list)

            if len(attack_urls_found_list) >= attack_urls_threshold:
                score["attacker"]["attack_url"] = True
                score["good_crawler"]["attack_url"] = False
                score["bad_crawler"]["attack_url"] = False
                score["regular_user"]["attack_url"] = False
            else:
                score["attacker"]["attack_url"] = False
                score["good_crawler"]["attack_url"] = False
                score["bad_crawler"]["attack_url"] = False
                score["regular_user"]["attack_url"] = False
        # --------------------- Calculate score ---------------------
        attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0
        attacker_score = (
            score["attacker"]["risky_http_methods"]
            * weights["attacker"]["risky_http_methods"]
        )
        attacker_score = (
            attacker_score
            + score["attacker"]["robots_violations"]
            * weights["attacker"]["robots_violations"]
        )
        attacker_score = (
            attacker_score
            + score["attacker"]["uneven_request_timing"]
            * weights["attacker"]["uneven_request_timing"]
        )
        attacker_score = (
            attacker_score
            + score["attacker"]["different_user_agents"]
            * weights["attacker"]["different_user_agents"]
        )
        attacker_score = (
            attacker_score
            + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
        )
        good_crawler_score = (
            score["good_crawler"]["risky_http_methods"]
            * weights["good_crawler"]["risky_http_methods"]
        )
        good_crawler_score = (
            good_crawler_score
            + score["good_crawler"]["robots_violations"]
            * weights["good_crawler"]["robots_violations"]
        )
        good_crawler_score = (
            good_crawler_score
            + score["good_crawler"]["uneven_request_timing"]
            * weights["good_crawler"]["uneven_request_timing"]
        )
        good_crawler_score = (
            good_crawler_score
            + score["good_crawler"]["different_user_agents"]
            * weights["good_crawler"]["different_user_agents"]
        )
        good_crawler_score = (
            good_crawler_score
            + score["good_crawler"]["attack_url"]
            * weights["good_crawler"]["attack_url"]
        )
        bad_crawler_score = (
            score["bad_crawler"]["risky_http_methods"]
            * weights["bad_crawler"]["risky_http_methods"]
        )
        bad_crawler_score = (
            bad_crawler_score
            + score["bad_crawler"]["robots_violations"]
            * weights["bad_crawler"]["robots_violations"]
        )
        bad_crawler_score = (
            bad_crawler_score
            + score["bad_crawler"]["uneven_request_timing"]
            * weights["bad_crawler"]["uneven_request_timing"]
        )
        bad_crawler_score = (
            bad_crawler_score
            + score["bad_crawler"]["different_user_agents"]
            * weights["bad_crawler"]["different_user_agents"]
        )
        bad_crawler_score = (
            bad_crawler_score
            + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
        )
        regular_user_score = (
            score["regular_user"]["risky_http_methods"]
            * weights["regular_user"]["risky_http_methods"]
        )
        regular_user_score = (
            regular_user_score
            + score["regular_user"]["robots_violations"]
            * weights["regular_user"]["robots_violations"]
        )
        regular_user_score = (
            regular_user_score
            + score["regular_user"]["uneven_request_timing"]
            * weights["regular_user"]["uneven_request_timing"]
        )
        regular_user_score = (
            regular_user_score
            + score["regular_user"]["different_user_agents"]
            * weights["regular_user"]["different_user_agents"]
        )
        regular_user_score = (
            regular_user_score
            + score["regular_user"]["attack_url"]
            * weights["regular_user"]["attack_url"]
        )
        score_details = f"""
        Attacker score: {attacker_score}
        Good Crawler score: {good_crawler_score}
        Bad Crawler score: {bad_crawler_score}
        Regular User score: {regular_user_score}
        """
        app_logger.debug(score_details)
        analyzed_metrics = {
            "risky_http_methods": http_method_attacker_score,
            "robots_violations": violated_robots_ratio,
            "uneven_request_timing": mean,
            "different_user_agents": user_agents_used,
            "attack_url": attack_urls_found_list,
        }
        category_scores = {
            "attacker": attacker_score,
            "good_crawler": good_crawler_score,
            "bad_crawler": bad_crawler_score,
            "regular_user": regular_user_score,
        }
        category = max(category_scores, key=category_scores.get)
        last_analysis = datetime.now()
        db_manager.update_ip_stats_analysis(
            ip, analyzed_metrics, category, category_scores, last_analysis
        )
    return