Linted code with the Black tool

Lorenzo Venerandi
2026-01-23 22:00:21 +01:00
parent 25384585d9
commit 4450d3a4e3
22 changed files with 1387 additions and 868 deletions

View File

@@ -20,7 +20,7 @@ TASK_CONFIG = {
"name": "analyze-ips",
"cron": "*/1 * * * *",
"enabled": True,
"run_when_loaded": True
"run_when_loaded": True,
}
@@ -34,48 +34,74 @@ def main():
uneven_request_timing_threshold = config.uneven_request_timing_threshold
user_agents_used_threshold = config.user_agents_used_threshold
attack_urls_threshold = config.attack_urls_threshold
uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds
uneven_request_timing_time_window_seconds = (
config.uneven_request_timing_time_window_seconds
)
app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}")
score = {}
score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["attacker"] = {
"risky_http_methods": False,
"robots_violations": False,
"uneven_request_timing": False,
"different_user_agents": False,
"attack_url": False,
}
score["good_crawler"] = {
"risky_http_methods": False,
"robots_violations": False,
"uneven_request_timing": False,
"different_user_agents": False,
"attack_url": False,
}
score["bad_crawler"] = {
"risky_http_methods": False,
"robots_violations": False,
"uneven_request_timing": False,
"different_user_agents": False,
"attack_url": False,
}
score["regular_user"] = {
"risky_http_methods": False,
"robots_violations": False,
"uneven_request_timing": False,
"different_user_agents": False,
"attack_url": False,
}
#1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
# 1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
weights = {
"attacker": {
"risky_http_methods": 6,
"robots_violations": 4,
"uneven_request_timing": 3,
"different_user_agents": 8,
"attack_url": 15
"attack_url": 15,
},
"good_crawler": {
"risky_http_methods": 1,
"robots_violations": 0,
"uneven_request_timing": 0,
"different_user_agents": 0,
"attack_url": 0
"attack_url": 0,
},
"bad_crawler": {
"risky_http_methods": 2,
"robots_violations": 7,
"uneven_request_timing": 0,
"different_user_agents": 5,
"attack_url": 5
"attack_url": 5,
},
"regular_user": {
"risky_http_methods": 0,
"robots_violations": 0,
"uneven_request_timing": 8,
"different_user_agents": 3,
"attack_url": 0
}
"attack_url": 0,
},
}
# Get IPs with recent activity (last minute to match cron schedule)
recent_accesses = db_manager.get_access_logs(limit=999999999, since_minutes=1)
ips_to_analyze = {item['ip'] for item in recent_accesses}
ips_to_analyze = {item["ip"] for item in recent_accesses}
if not ips_to_analyze:
app_logger.debug("[Background Task] analyze-ips: No recent activity, skipping")
@@ -92,23 +118,51 @@ def main():
if total_accesses_count < 3:
category = "unknown"
analyzed_metrics = {}
category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0}
category_scores = {
"attacker": 0,
"good_crawler": 0,
"bad_crawler": 0,
"regular_user": 0,
"unknown": 0,
}
last_analysis = datetime.now()
db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
db_manager.update_ip_stats_analysis(
ip, analyzed_metrics, category, category_scores, last_analysis
)
return 0
#--------------------- HTTP Methods ---------------------
get_accesses_count = len([item for item in ip_accesses if item["method"] == "GET"])
post_accesses_count = len([item for item in ip_accesses if item["method"] == "POST"])
put_accesses_count = len([item for item in ip_accesses if item["method"] == "PUT"])
delete_accesses_count = len([item for item in ip_accesses if item["method"] == "DELETE"])
head_accesses_count = len([item for item in ip_accesses if item["method"] == "HEAD"])
options_accesses_count = len([item for item in ip_accesses if item["method"] == "OPTIONS"])
patch_accesses_count = len([item for item in ip_accesses if item["method"] == "PATCH"])
# --------------------- HTTP Methods ---------------------
get_accesses_count = len(
[item for item in ip_accesses if item["method"] == "GET"]
)
post_accesses_count = len(
[item for item in ip_accesses if item["method"] == "POST"]
)
put_accesses_count = len(
[item for item in ip_accesses if item["method"] == "PUT"]
)
delete_accesses_count = len(
[item for item in ip_accesses if item["method"] == "DELETE"]
)
head_accesses_count = len(
[item for item in ip_accesses if item["method"] == "HEAD"]
)
options_accesses_count = len(
[item for item in ip_accesses if item["method"] == "OPTIONS"]
)
patch_accesses_count = len(
[item for item in ip_accesses if item["method"] == "PATCH"]
)
if total_accesses_count > http_risky_methods_threshold:
http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count
http_method_attacker_score = (
post_accesses_count
+ put_accesses_count
+ delete_accesses_count
+ options_accesses_count
+ patch_accesses_count
) / total_accesses_count
else:
http_method_attacker_score = 0
#print(f"HTTP Method attacker score: {http_method_attacker_score}")
# print(f"HTTP Method attacker score: {http_method_attacker_score}")
if http_method_attacker_score >= http_risky_methods_threshold:
score["attacker"]["risky_http_methods"] = True
score["good_crawler"]["risky_http_methods"] = False
@@ -119,8 +173,8 @@ def main():
score["good_crawler"]["risky_http_methods"] = True
score["bad_crawler"]["risky_http_methods"] = False
score["regular_user"]["risky_http_methods"] = False
#--------------------- Robots Violations ---------------------
#respect robots.txt and login/config pages access frequency
# --------------------- Robots Violations ---------------------
# respect robots.txt and login/config pages access frequency
robots_disallows = []
robots_path = Path(__file__).parent.parent / "templates" / "html" / "robots.txt"
with open(robots_path, "r") as f:
@@ -132,11 +186,20 @@ def main():
if parts[0] == "Disallow":
parts[1] = parts[1].rstrip("/")
#print(f"DISALLOW {parts[1]}")
# print(f"DISALLOW {parts[1]}")
robots_disallows.append(parts[1].strip())
#if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker
violated_robots_count = len([item for item in ip_accesses if any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)])
#print(f"Violated robots count: {violated_robots_count}")
# if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker
violated_robots_count = len(
[
item
for item in ip_accesses
if any(
item["path"].rstrip("/").startswith(disallow)
for disallow in robots_disallows
)
]
)
# print(f"Violated robots count: {violated_robots_count}")
if total_accesses_count > 0:
violated_robots_ratio = violated_robots_count / total_accesses_count
else:
@@ -152,15 +215,20 @@ def main():
score["bad_crawler"]["robots_violations"] = False
score["regular_user"]["robots_violations"] = False
#--------------------- Requests Timing ---------------------
# --------------------- Requests Timing ---------------------
# Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior
timestamps = [datetime.fromisoformat(item["timestamp"]) for item in ip_accesses]
now_utc = datetime.now()
timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)]
timestamps = [
ts
for ts in timestamps
if now_utc - ts
<= timedelta(seconds=uneven_request_timing_time_window_seconds)
]
timestamps = sorted(timestamps, reverse=True)
time_diffs = []
for i in range(0, len(timestamps)-1):
diff = (timestamps[i] - timestamps[i+1]).total_seconds()
for i in range(0, len(timestamps) - 1):
diff = (timestamps[i] - timestamps[i + 1]).total_seconds()
time_diffs.append(diff)
mean = 0
@@ -170,9 +238,11 @@ def main():
if time_diffs:
mean = sum(time_diffs) / len(time_diffs)
variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
std = variance ** 0.5
cv = std/mean
app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}")
std = variance**0.5
cv = std / mean
app_logger.debug(
f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}"
)
if cv >= uneven_request_timing_threshold:
score["attacker"]["uneven_request_timing"] = True
score["good_crawler"]["uneven_request_timing"] = False
@@ -183,11 +253,11 @@ def main():
score["good_crawler"]["uneven_request_timing"] = False
score["bad_crawler"]["uneven_request_timing"] = False
score["regular_user"]["uneven_request_timing"] = False
#--------------------- Different User Agents ---------------------
#Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
# --------------------- Different User Agents ---------------------
# Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
user_agents_used = [item["user_agent"] for item in ip_accesses]
user_agents_used = list(dict.fromkeys(user_agents_used))
#print(f"User agents used: {user_agents_used}")
# print(f"User agents used: {user_agents_used}")
if len(user_agents_used) >= user_agents_used_threshold:
score["attacker"]["different_user_agents"] = True
score["good_crawler"]["different_user_agents"] = False
@@ -198,7 +268,7 @@ def main():
score["good_crawler"]["different_user_agents"] = False
score["bad_crawler"]["different_user_agents"] = False
score["regular_user"]["different_user_agents"] = False
#--------------------- Attack URLs ---------------------
# --------------------- Attack URLs ---------------------
attack_urls_found_list = []
wl = get_wordlists()
if wl.attack_patterns:
@@ -215,12 +285,14 @@ def main():
for name, pattern in wl.attack_patterns.items():
# Check original, decoded, and double-decoded paths
if (re.search(pattern, queried_path, re.IGNORECASE) or
re.search(pattern, decoded_path, re.IGNORECASE) or
re.search(pattern, decoded_path_twice, re.IGNORECASE)):
if (
re.search(pattern, queried_path, re.IGNORECASE)
or re.search(pattern, decoded_path, re.IGNORECASE)
or re.search(pattern, decoded_path_twice, re.IGNORECASE)
):
attack_urls_found_list.append(f"{name}: {pattern}")
#remove duplicates
# remove duplicates
attack_urls_found_list = set(attack_urls_found_list)
attack_urls_found_list = list(attack_urls_found_list)
@@ -234,28 +306,102 @@ def main():
score["good_crawler"]["attack_url"] = False
score["bad_crawler"]["attack_url"] = False
score["regular_user"]["attack_url"] = False
#--------------------- Calculate score ---------------------
# --------------------- Calculate score ---------------------
attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0
attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"]
attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"]
attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"]
attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"]
attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"]
good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"]
good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"]
good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"]
good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"]
bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"]
regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"]
regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"]
regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"]
regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"]
attacker_score = (
score["attacker"]["risky_http_methods"]
* weights["attacker"]["risky_http_methods"]
)
attacker_score = (
attacker_score
+ score["attacker"]["robots_violations"]
* weights["attacker"]["robots_violations"]
)
attacker_score = (
attacker_score
+ score["attacker"]["uneven_request_timing"]
* weights["attacker"]["uneven_request_timing"]
)
attacker_score = (
attacker_score
+ score["attacker"]["different_user_agents"]
* weights["attacker"]["different_user_agents"]
)
attacker_score = (
attacker_score
+ score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
)
good_crawler_score = (
score["good_crawler"]["risky_http_methods"]
* weights["good_crawler"]["risky_http_methods"]
)
good_crawler_score = (
good_crawler_score
+ score["good_crawler"]["robots_violations"]
* weights["good_crawler"]["robots_violations"]
)
good_crawler_score = (
good_crawler_score
+ score["good_crawler"]["uneven_request_timing"]
* weights["good_crawler"]["uneven_request_timing"]
)
good_crawler_score = (
good_crawler_score
+ score["good_crawler"]["different_user_agents"]
* weights["good_crawler"]["different_user_agents"]
)
good_crawler_score = (
good_crawler_score
+ score["good_crawler"]["attack_url"]
* weights["good_crawler"]["attack_url"]
)
bad_crawler_score = (
score["bad_crawler"]["risky_http_methods"]
* weights["bad_crawler"]["risky_http_methods"]
)
bad_crawler_score = (
bad_crawler_score
+ score["bad_crawler"]["robots_violations"]
* weights["bad_crawler"]["robots_violations"]
)
bad_crawler_score = (
bad_crawler_score
+ score["bad_crawler"]["uneven_request_timing"]
* weights["bad_crawler"]["uneven_request_timing"]
)
bad_crawler_score = (
bad_crawler_score
+ score["bad_crawler"]["different_user_agents"]
* weights["bad_crawler"]["different_user_agents"]
)
bad_crawler_score = (
bad_crawler_score
+ score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
)
regular_user_score = (
score["regular_user"]["risky_http_methods"]
* weights["regular_user"]["risky_http_methods"]
)
regular_user_score = (
regular_user_score
+ score["regular_user"]["robots_violations"]
* weights["regular_user"]["robots_violations"]
)
regular_user_score = (
regular_user_score
+ score["regular_user"]["uneven_request_timing"]
* weights["regular_user"]["uneven_request_timing"]
)
regular_user_score = (
regular_user_score
+ score["regular_user"]["different_user_agents"]
* weights["regular_user"]["different_user_agents"]
)
regular_user_score = (
regular_user_score
+ score["regular_user"]["attack_url"]
* weights["regular_user"]["attack_url"]
)
score_details = f"""
Attacker score: {attacker_score}
Good Crawler score: {good_crawler_score}
@@ -263,9 +409,22 @@ def main():
Regular User score: {regular_user_score}
"""
app_logger.debug(score_details)
analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list}
category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score}
analyzed_metrics = {
"risky_http_methods": http_method_attacker_score,
"robots_violations": violated_robots_ratio,
"uneven_request_timing": mean,
"different_user_agents": user_agents_used,
"attack_url": attack_urls_found_list,
}
category_scores = {
"attacker": attacker_score,
"good_crawler": good_crawler_score,
"bad_crawler": bad_crawler_score,
"regular_user": regular_user_score,
}
category = max(category_scores, key=category_scores.get)
last_analysis = datetime.now()
db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
db_manager.update_ip_stats_analysis(
ip, analyzed_metrics, category, category_scores, last_analysis
)
return
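
Note on the scoring hunk above: the four *_score totals are each a weighted sum of boolean signals against the weights table. A minimal standalone sketch of that computation, assuming score and weights dicts shaped like the ones in this diff (the weighted_category_scores helper is hypothetical, not part of the project):

# Hedged sketch, not the project's implementation: compute each category's
# total as sum(signal_triggered * signal_weight) over its signals.
def weighted_category_scores(score: dict, weights: dict) -> dict:
    """Return {category: weighted sum of triggered signals} for every category."""
    return {
        category: sum(
            int(triggered) * weights[category][signal]
            for signal, triggered in signals.items()
        )
        for category, signals in score.items()
    }

# Example with a reduced signal set: an IP that hit an attack URL and rotated user agents.
score = {
    "attacker": {"attack_url": True, "different_user_agents": True},
    "regular_user": {"attack_url": False, "different_user_agents": True},
}
weights = {
    "attacker": {"attack_url": 15, "different_user_agents": 8},
    "regular_user": {"attack_url": 0, "different_user_agents": 3},
}
print(weighted_category_scores(score, weights))
# {'attacker': 23, 'regular_user': 3}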

View File

@@ -11,7 +11,7 @@ TASK_CONFIG = {
"name": "fetch-ip-rep",
"cron": "*/5 * * * *",
"enabled": True,
"run_when_loaded": True
"run_when_loaded": True,
}
@@ -21,7 +21,9 @@ def main():
# Only get IPs that haven't been enriched yet
unenriched_ips = db_manager.get_unenriched_ips(limit=50)
app_logger.info(f"{len(unenriched_ips)} IP's need to be have reputation enrichment.")
app_logger.info(
f"{len(unenriched_ips)} IP's need to be have reputation enrichment."
)
for ip in unenriched_ips:
try:
api_url = "https://iprep.lcrawl.com/api/iprep/"
@@ -43,8 +45,11 @@ def main():
sanitized_list_on = sanitize_dict(list_on, 100000)
db_manager.update_ip_rep_infos(
ip, sanitized_country_iso_code, sanitized_asn,
sanitized_asn_org, sanitized_list_on
ip,
sanitized_country_iso_code,
sanitized_asn,
sanitized_asn_org,
sanitized_list_on,
)
except requests.RequestException as e:
app_logger.warning(f"Failed to fetch IP rep for {ip}: {e}")

View File

@@ -17,24 +17,29 @@ TASK_CONFIG = {
"name": "export-malicious-ips",
"cron": "*/5 * * * *",
"enabled": True,
"run_when_loaded": True
"run_when_loaded": True,
}
EXPORTS_DIR = "exports"
OUTPUT_FILE = os.path.join(EXPORTS_DIR, "malicious_ips.txt")
# ----------------------
# TASK LOGIC
# ----------------------
def has_recent_honeypot_access(session, minutes: int = 5) -> bool:
"""Check if honeypot was accessed in the last N minutes."""
cutoff_time = datetime.now() - timedelta(minutes=minutes)
count = session.query(AccessLog).filter(
AccessLog.is_honeypot_trigger == True,
AccessLog.timestamp >= cutoff_time
).count()
count = (
session.query(AccessLog)
.filter(
AccessLog.is_honeypot_trigger == True, AccessLog.timestamp >= cutoff_time
)
.count()
)
return count > 0
def main():
"""
Export all IPs flagged as suspicious to a text file.
@@ -49,23 +54,29 @@ def main():
# Check for recent honeypot activity
if not has_recent_honeypot_access(session):
app_logger.info(f"[Background Task] {task_name} skipped - no honeypot access in last 5 minutes")
app_logger.info(
f"[Background Task] {task_name} skipped - no honeypot access in last 5 minutes"
)
return
# Query distinct suspicious IPs
results = session.query(distinct(AccessLog.ip)).filter(
AccessLog.is_suspicious == True
).all()
results = (
session.query(distinct(AccessLog.ip))
.filter(AccessLog.is_suspicious == True)
.all()
)
# Ensure exports directory exists
os.makedirs(EXPORTS_DIR, exist_ok=True)
# Write IPs to file (one per line)
with open(OUTPUT_FILE, 'w') as f:
with open(OUTPUT_FILE, "w") as f:
for (ip,) in results:
f.write(f"{ip}\n")
app_logger.info(f"[Background Task] {task_name} exported {len(results)} IPs to {OUTPUT_FILE}")
app_logger.info(
f"[Background Task] {task_name} exported {len(results)} IPs to {OUTPUT_FILE}"
)
except Exception as e:
app_logger.error(f"[Background Task] {task_name} failed: {e}")