diff --git a/.gitignore b/.gitignore index 70b93e4..63ae0e9 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,6 @@ data/ # Personal canary tokens or sensitive configs *canary*token*.yaml personal-values.yaml + +#exports dir (keeping .gitkeep so we have the dir) +/exports/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 2c7b954..4015c74 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,9 +14,10 @@ RUN pip install --no-cache-dir -r requirements.txt COPY src/ /app/src/ COPY wordlists.json /app/ COPY entrypoint.sh /app/ +COPY config.yaml /app/ RUN useradd -m -u 1000 krawl && \ - mkdir -p /app/logs /app/data && \ + mkdir -p /app/logs /app/data /app/exports && \ chown -R krawl:krawl /app && \ chmod +x /app/entrypoint.sh diff --git a/config.yaml b/config.yaml index 6e09f30..52daa09 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,7 @@ server: port: 5000 delay: 100 # Response delay in milliseconds - timezone: null # e.g., "America/New_York" or null for system default + timezone: null # e.g., "America/New_York", "Europe/Paris" or null for system default # manually set the server header, if null a random one will be used. 
server_header: null @@ -11,8 +11,8 @@ server: links: min_length: 5 max_length: 15 - min_per_page: 10 - max_per_page: 15 + min_per_page: 5 + max_per_page: 10 char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" max_counter: 10 @@ -38,9 +38,9 @@ behavior: probability_error_codes: 0 # 0-100 percentage analyzer: - # http_risky_methods_threshold: 0.1 - # violated_robots_threshold: 0.1 - # uneven_request_timing_threshold: 5 - # uneven_request_timing_time_window_seconds: 300 - # user_agents_used_threshold: 2 - # attack_urls_threshold: 1 \ No newline at end of file + http_risky_methods_threshold: 0.1 + violated_robots_threshold: 0.1 + uneven_request_timing_threshold: 2 + uneven_request_timing_time_window_seconds: 300 + user_agents_used_threshold: 2 + attack_urls_threshold: 1 diff --git a/docker-compose.yaml b/docker-compose.yaml index 02b6ae7..08bcec9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,6 +12,7 @@ services: - ./wordlists.json:/app/wordlists.json:ro - ./config.yaml:/app/config.yaml:ro - ./logs:/app/logs + - ./exports:/app/exports environment: - CONFIG_LOCATION=config.yaml restart: unless-stopped diff --git a/entrypoint.sh b/entrypoint.sh index 28b5fc0..fe3ef45 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,7 +2,7 @@ set -e # Fix ownership of mounted directories -chown -R krawl:krawl /app/logs /app/data 2>/dev/null || true +chown -R krawl:krawl /app/logs /app/data /app/exports 2>/dev/null || true # Drop to krawl user and run the application exec gosu krawl "$@" diff --git a/exports/.gitkeep b/exports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 8cb6dc5..cafbb7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ PyYAML>=6.0 # Database ORM SQLAlchemy>=2.0.0,<3.0.0 + +# Scheduling +APScheduler>=3.11.2 \ No newline at end of file diff --git a/src/analyzer.py b/src/analyzer.py index 85ce529..907529f 100644 --- a/src/analyzer.py +++ 
b/src/analyzer.py @@ -6,8 +6,10 @@ from zoneinfo import ZoneInfo from pathlib import Path from datetime import datetime, timedelta import re +import urllib.parse from wordlists import get_wordlists from config import get_config +from logger import get_app_logger import requests from sanitizer import sanitize_for_storage, sanitize_dict @@ -15,6 +17,8 @@ from sanitizer import sanitize_for_storage, sanitize_dict Functions for user activity analysis """ +app_logger = get_app_logger() + class Analyzer: """ Analyzes users activity and produces aggregated insights @@ -48,272 +52,299 @@ class Analyzer: pass return self._db_manager - def infer_user_category(self, ip: str) -> str: + # def infer_user_category(self, ip: str) -> str: - config = get_config() + # config = get_config() - http_risky_methods_threshold = config.http_risky_methods_threshold - violated_robots_threshold = config.violated_robots_threshold - uneven_request_timing_threshold = config.uneven_request_timing_threshold - user_agents_used_threshold = config.user_agents_used_threshold - attack_urls_threshold = config.attack_urls_threshold - uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds + # http_risky_methods_threshold = config.http_risky_methods_threshold + # violated_robots_threshold = config.violated_robots_threshold + # uneven_request_timing_threshold = config.uneven_request_timing_threshold + # user_agents_used_threshold = config.user_agents_used_threshold + # attack_urls_threshold = config.attack_urls_threshold + # uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds - print(f"http_risky_methods_threshold: {http_risky_methods_threshold}") + # app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}") - score = {} - score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - score["good_crawler"] = 
{"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score = {} + # score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme - weights = { - "attacker": { - "risky_http_methods": 6, - "robots_violations": 4, - "uneven_request_timing": 3, - "different_user_agents": 8, - "attack_url": 15 - }, - "good_crawler": { - "risky_http_methods": 1, - "robots_violations": 0, - "uneven_request_timing": 0, - "different_user_agents": 0, - "attack_url": 0 - }, - "bad_crawler": { - "risky_http_methods": 2, - "robots_violations": 7, - "uneven_request_timing": 0, - "different_user_agents": 5, - "attack_url": 5 - }, - "regular_user": { - "risky_http_methods": 0, - "robots_violations": 0, - "uneven_request_timing": 8, - "different_user_agents": 3, - "attack_url": 0 - } - } + # #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme + # weights = { + # "attacker": { + # "risky_http_methods": 6, + # "robots_violations": 4, + # 
"uneven_request_timing": 3, + # "different_user_agents": 8, + # "attack_url": 15 + # }, + # "good_crawler": { + # "risky_http_methods": 1, + # "robots_violations": 0, + # "uneven_request_timing": 0, + # "different_user_agents": 0, + # "attack_url": 0 + # }, + # "bad_crawler": { + # "risky_http_methods": 2, + # "robots_violations": 7, + # "uneven_request_timing": 0, + # "different_user_agents": 5, + # "attack_url": 5 + # }, + # "regular_user": { + # "risky_http_methods": 0, + # "robots_violations": 0, + # "uneven_request_timing": 8, + # "different_user_agents": 3, + # "attack_url": 0 + # } + # } - accesses = self.db.get_access_logs(ip_filter = ip, limit=1000) - total_accesses_count = len(accesses) - if total_accesses_count <= 0: - return + # accesses = self.db.get_access_logs(ip_filter = ip, limit=1000) + # total_accesses_count = len(accesses) + # if total_accesses_count <= 0: + # return + + # # Set category as "unknown" while fewer than 3 requests have been recorded + # if total_accesses_count < 3: + # category = "unknown" + # analyzed_metrics = {} + # category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0} + # last_analysis = datetime.now(tz=ZoneInfo('UTC')) + # self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + # return 0 - #--------------------- HTTP Methods --------------------- + # #--------------------- HTTP Methods --------------------- - get_accesses_count = len([item for item in accesses if item["method"] == "GET"]) - post_accesses_count = len([item for item in accesses if item["method"] == "POST"]) - put_accesses_count = len([item for item in accesses if item["method"] == "PUT"]) - delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"]) - head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"]) - options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"]) - patch_accesses_count = len([item 
for item in accesses if item["method"] == "PATCH"]) + # get_accesses_count = len([item for item in accesses if item["method"] == "GET"]) + # post_accesses_count = len([item for item in accesses if item["method"] == "POST"]) + # put_accesses_count = len([item for item in accesses if item["method"] == "PUT"]) + # delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"]) + # head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"]) + # options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"]) + # patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"]) - if total_accesses_count > http_risky_methods_threshold: - http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count - else: - http_method_attacker_score = 0 + # if total_accesses_count > http_risky_methods_threshold: + # http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count + # else: + # http_method_attacker_score = 0 - #print(f"HTTP Method attacker score: {http_method_attacker_score}") - if http_method_attacker_score >= http_risky_methods_threshold: - score["attacker"]["risky_http_methods"] = True - score["good_crawler"]["risky_http_methods"] = False - score["bad_crawler"]["risky_http_methods"] = True - score["regular_user"]["risky_http_methods"] = False - else: - score["attacker"]["risky_http_methods"] = False - score["good_crawler"]["risky_http_methods"] = True - score["bad_crawler"]["risky_http_methods"] = False - score["regular_user"]["risky_http_methods"] = False + # #print(f"HTTP Method attacker score: {http_method_attacker_score}") + # if http_method_attacker_score >= http_risky_methods_threshold: + # score["attacker"]["risky_http_methods"] = True + # 
score["good_crawler"]["risky_http_methods"] = False + # score["bad_crawler"]["risky_http_methods"] = True + # score["regular_user"]["risky_http_methods"] = False + # else: + # score["attacker"]["risky_http_methods"] = False + # score["good_crawler"]["risky_http_methods"] = True + # score["bad_crawler"]["risky_http_methods"] = False + # score["regular_user"]["risky_http_methods"] = False - #--------------------- Robots Violations --------------------- - #respect robots.txt and login/config pages access frequency - robots_disallows = [] - robots_path = Path(__file__).parent / "templates" / "html" / "robots.txt" - with open(robots_path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - parts = line.split(":") + # #--------------------- Robots Violations --------------------- + # #respect robots.txt and login/config pages access frequency + # robots_disallows = [] + # robots_path = Path(__file__).parent / "templates" / "html" / "robots.txt" + # with open(robots_path, "r") as f: + # for line in f: + # line = line.strip() + # if not line: + # continue + # parts = line.split(":") - if parts[0] == "Disallow": - parts[1] = parts[1].rstrip("/") - #print(f"DISALLOW {parts[1]}") - robots_disallows.append(parts[1].strip()) + # if parts[0] == "Disallow": + # parts[1] = parts[1].rstrip("/") + # #print(f"DISALLOW {parts[1]}") + # robots_disallows.append(parts[1].strip()) - #if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker - violated_robots_count = len([item for item in accesses if item["path"].rstrip("/") in tuple(robots_disallows)]) - #print(f"Violated robots count: {violated_robots_count}") - if total_accesses_count > 0: - violated_robots_ratio = violated_robots_count / total_accesses_count - else: - violated_robots_ratio = 0 + # #if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker + # violated_robots_count = len([item for item in accesses if 
any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)]) + # #print(f"Violated robots count: {violated_robots_count}") + # if total_accesses_count > 0: + # violated_robots_ratio = violated_robots_count / total_accesses_count + # else: + # violated_robots_ratio = 0 - if violated_robots_ratio >= violated_robots_threshold: - score["attacker"]["robots_violations"] = True - score["good_crawler"]["robots_violations"] = False - score["bad_crawler"]["robots_violations"] = True - score["regular_user"]["robots_violations"] = False - else: - score["attacker"]["robots_violations"] = False - score["good_crawler"]["robots_violations"] = False - score["bad_crawler"]["robots_violations"] = False - score["regular_user"]["robots_violations"] = False + # if violated_robots_ratio >= violated_robots_threshold: + # score["attacker"]["robots_violations"] = True + # score["good_crawler"]["robots_violations"] = False + # score["bad_crawler"]["robots_violations"] = True + # score["regular_user"]["robots_violations"] = False + # else: + # score["attacker"]["robots_violations"] = False + # score["good_crawler"]["robots_violations"] = False + # score["bad_crawler"]["robots_violations"] = False + # score["regular_user"]["robots_violations"] = False - #--------------------- Requests Timing --------------------- - #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior - timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses] - timestamps = [ts for ts in timestamps if datetime.utcnow() - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)] - timestamps = sorted(timestamps, reverse=True) + # #--------------------- Requests Timing --------------------- + # #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior + # timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses] + # now_utc = 
datetime.now(tz=ZoneInfo('UTC')) + # timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)] + # timestamps = sorted(timestamps, reverse=True) - time_diffs = [] - for i in range(0, len(timestamps)-1): - diff = (timestamps[i] - timestamps[i+1]).total_seconds() - time_diffs.append(diff) + # time_diffs = [] + # for i in range(0, len(timestamps)-1): + # diff = (timestamps[i] - timestamps[i+1]).total_seconds() + # time_diffs.append(diff) - mean = 0 - variance = 0 - std = 0 - cv = 0 - if time_diffs: - mean = sum(time_diffs) / len(time_diffs) - variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs) - std = variance ** 0.5 - cv = std/mean - print(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") + # mean = 0 + # variance = 0 + # std = 0 + # cv = 0 + # if time_diffs: + # mean = sum(time_diffs) / len(time_diffs) + # variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs) + # std = variance ** 0.5 + # cv = std/mean + # app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") - if cv >= uneven_request_timing_threshold: - score["attacker"]["uneven_request_timing"] = True - score["good_crawler"]["uneven_request_timing"] = False - score["bad_crawler"]["uneven_request_timing"] = False - score["regular_user"]["uneven_request_timing"] = True - else: - score["attacker"]["uneven_request_timing"] = False - score["good_crawler"]["uneven_request_timing"] = False - score["bad_crawler"]["uneven_request_timing"] = False - score["regular_user"]["uneven_request_timing"] = False + # if cv >= uneven_request_timing_threshold: + # score["attacker"]["uneven_request_timing"] = True + # score["good_crawler"]["uneven_request_timing"] = False + # score["bad_crawler"]["uneven_request_timing"] = False + # score["regular_user"]["uneven_request_timing"] = True + # else: + # 
score["attacker"]["uneven_request_timing"] = False + # score["good_crawler"]["uneven_request_timing"] = False + # score["bad_crawler"]["uneven_request_timing"] = False + # score["regular_user"]["uneven_request_timing"] = False - #--------------------- Different User Agents --------------------- - #Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers - user_agents_used = [item["user_agent"] for item in accesses] - user_agents_used = list(dict.fromkeys(user_agents_used)) - #print(f"User agents used: {user_agents_used}") + # #--------------------- Different User Agents --------------------- + # #Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers + # user_agents_used = [item["user_agent"] for item in accesses] + # user_agents_used = list(dict.fromkeys(user_agents_used)) + # #print(f"User agents used: {user_agents_used}") - if len(user_agents_used) >= user_agents_used_threshold: - score["attacker"]["different_user_agents"] = True - score["good_crawler"]["different_user_agents"] = False - score["bad_crawler"]["different_user_agentss"] = True - score["regular_user"]["different_user_agents"] = False - else: - score["attacker"]["different_user_agents"] = False - score["good_crawler"]["different_user_agents"] = False - score["bad_crawler"]["different_user_agents"] = False - score["regular_user"]["different_user_agents"] = False + # if len(user_agents_used) >= user_agents_used_threshold: + # score["attacker"]["different_user_agents"] = True + # score["good_crawler"]["different_user_agents"] = False + # score["bad_crawler"]["different_user_agentss"] = True + # score["regular_user"]["different_user_agents"] = False + # else: + # score["attacker"]["different_user_agents"] = False + # score["good_crawler"]["different_user_agents"] = False + # score["bad_crawler"]["different_user_agents"] = False + # 
score["regular_user"]["different_user_agents"] = False - #--------------------- Attack URLs --------------------- + # #--------------------- Attack URLs --------------------- - attack_urls_found_list = [] + # attack_urls_found_list = [] - wl = get_wordlists() - if wl.attack_urls: - queried_paths = [item["path"] for item in accesses] + # wl = get_wordlists() + # if wl.attack_patterns: + # queried_paths = [item["path"] for item in accesses] - for queried_path in queried_paths: - for name, pattern in wl.attack_urls.items(): - if re.search(pattern, queried_path, re.IGNORECASE): - attack_urls_found_list.append(pattern) + # for queried_path in queried_paths: + # # URL decode the path to catch encoded attacks + # try: + # decoded_path = urllib.parse.unquote(queried_path) + # # Double decode to catch double-encoded attacks + # decoded_path_twice = urllib.parse.unquote(decoded_path) + # except Exception: + # decoded_path = queried_path + # decoded_path_twice = queried_path + + # for name, pattern in wl.attack_patterns.items(): + # # Check original, decoded, and double-decoded paths + # if (re.search(pattern, queried_path, re.IGNORECASE) or + # re.search(pattern, decoded_path, re.IGNORECASE) or + # re.search(pattern, decoded_path_twice, re.IGNORECASE)): + # attack_urls_found_list.append(f"{name}: {pattern}") - #remove duplicates - attack_urls_found_list = set(attack_urls_found_list) - attack_urls_found_list = list(attack_urls_found_list) + # #remove duplicates + # attack_urls_found_list = set(attack_urls_found_list) + # attack_urls_found_list = list(attack_urls_found_list) - if len(attack_urls_found_list) > attack_urls_threshold: - score["attacker"]["attack_url"] = True - score["good_crawler"]["attack_url"] = False - score["bad_crawler"]["attack_url"] = False - score["regular_user"]["attack_url"] = False - else: - score["attacker"]["attack_url"] = False - score["good_crawler"]["attack_url"] = False - score["bad_crawler"]["attack_url"] = False - 
score["regular_user"]["attack_url"] = False + # if len(attack_urls_found_list) > attack_urls_threshold: + # score["attacker"]["attack_url"] = True + # score["good_crawler"]["attack_url"] = False + # score["bad_crawler"]["attack_url"] = False + # score["regular_user"]["attack_url"] = False + # else: + # score["attacker"]["attack_url"] = False + # score["good_crawler"]["attack_url"] = False + # score["bad_crawler"]["attack_url"] = False + # score["regular_user"]["attack_url"] = False - #--------------------- Calculate score --------------------- + # #--------------------- Calculate score --------------------- - attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0 + # attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0 - attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"] - attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"] - attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"] - attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"] - attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"] + # attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"] + # attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"] + # attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"] + # attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"] + # attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"] - good_crawler_score = 
score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"] - good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"] - good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"] - good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"] - good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"] + # good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"] - bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"] - bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"] - bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"] - bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"] - bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"] + # bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * 
weights["bad_crawler"]["risky_http_methods"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"] - regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"] - regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"] - regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"] - regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"] - regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"] + # regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"] + # regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"] + # regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"] + # regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"] + # regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"] - print(f"Attacker score: {attacker_score}") - print(f"Good Crawler score: {good_crawler_score}") - 
print(f"Bad Crawler score: {bad_crawler_score}") - print(f"Regular User score: {regular_user_score}") + # score_details = f""" + # Attacker score: {attacker_score} + # Good Crawler score: {good_crawler_score} + # Bad Crawler score: {bad_crawler_score} + # Regular User score: {regular_user_score} + # """ + # app_logger.debug(score_details) - analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} - category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} - category = max(category_scores, key=category_scores.get) - last_analysis = datetime.utcnow() + # analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} + # category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} + # category = max(category_scores, key=category_scores.get) + # last_analysis = datetime.now(tz=ZoneInfo('UTC')) - self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + # self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) - return 0 + # return 0 - def update_ip_rep_infos(self, ip: str) -> list[str]: - api_url = "https://iprep.lcrawl.com/api/iprep/" - params = { - "cidr": ip - } - headers = { - "Content-Type": "application/json" - } - response = requests.get(api_url, headers=headers, params=params) - payload = response.json() - if payload["results"]: - data = payload["results"][0] + # def update_ip_rep_infos(self, ip: str) -> list[str]: + # api_url = 
"https://iprep.lcrawl.com/api/iprep/" + # params = { + # "cidr": ip + # } + # headers = { + # "Content-Type": "application/json" + # } - country_iso_code = data["geoip_data"]["country_iso_code"] - asn = data["geoip_data"]["asn_autonomous_system_number"] - asn_org = data["geoip_data"]["asn_autonomous_system_organization"] - list_on = data["list_on"] + # response = requests.get(api_url, headers=headers, params=params) + # payload = response.json() - sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) - sanitized_asn = sanitize_for_storage(asn, 100) - sanitized_asn_org = sanitize_for_storage(asn_org, 100) - sanitized_list_on = sanitize_dict(list_on, 100000) + # if payload["results"]: + # data = payload["results"][0] + + # country_iso_code = data["geoip_data"]["country_iso_code"] + # asn = data["geoip_data"]["asn_autonomous_system_number"] + # asn_org = data["geoip_data"]["asn_autonomous_system_organization"] + # list_on = data["list_on"] + + # sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) + # sanitized_asn = sanitize_for_storage(asn, 100) + # sanitized_asn_org = sanitize_for_storage(asn_org, 100) + # sanitized_list_on = sanitize_dict(list_on, 100000) - self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on) + # self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on) - return \ No newline at end of file + # return \ No newline at end of file diff --git a/src/database.py b/src/database.py index b5622db..5c96828 100644 --- a/src/database.py +++ b/src/database.py @@ -9,11 +9,12 @@ import os import stat from datetime import datetime from typing import Optional, List, Dict, Any +from zoneinfo import ZoneInfo from sqlalchemy import create_engine, func, distinct, case from sqlalchemy.orm import sessionmaker, scoped_session, Session -from models import Base, AccessLog, CredentialAttempt, AttackDetection, 
IpStats +from models import Base, AccessLog, CredentialAttempt, AttackDetection, IpStats, CategoryHistory from sanitizer import ( sanitize_ip, sanitize_path, @@ -22,6 +23,9 @@ from sanitizer import ( sanitize_attack_pattern, ) +from logger import get_app_logger + +applogger = get_app_logger() class DatabaseManager: """ @@ -127,7 +131,7 @@ class DatabaseManager: method=method[:10], is_suspicious=is_suspicious, is_honeypot_trigger=is_honeypot_trigger, - timestamp=datetime.utcnow() + timestamp=datetime.now(tz=ZoneInfo('UTC')) ) session.add(access_log) session.flush() # Get the ID before committing @@ -154,7 +158,7 @@ class DatabaseManager: except Exception as e: session.rollback() # Log error but don't crash - database persistence is secondary to honeypot function - print(f"Database error persisting access: {e}") + applogger.critical(f"Database error persisting access: {e}") return None finally: self.close_session() @@ -185,7 +189,7 @@ class DatabaseManager: path=sanitize_path(path), username=sanitize_credential(username), password=sanitize_credential(password), - timestamp=datetime.utcnow() + timestamp=datetime.now(tz=ZoneInfo('UTC')) ) session.add(credential) session.commit() @@ -193,7 +197,7 @@ class DatabaseManager: except Exception as e: session.rollback() - print(f"Database error persisting credential: {e}") + applogger.critical(f"Database error persisting credential: {e}") return None finally: self.close_session() @@ -207,7 +211,7 @@ class DatabaseManager: ip: IP address to update """ sanitized_ip = sanitize_ip(ip) - now = datetime.utcnow() + now = datetime.now(tz=ZoneInfo('UTC')) ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first() @@ -226,6 +230,7 @@ class DatabaseManager: def update_ip_stats_analysis(self, ip: str, analyzed_metrics: Dict[str, object], category: str, category_scores: Dict[str, int], last_analysis: datetime) -> None: """ Update IP statistics (ip is already persisted). 
+ Records category change in history if category has changed. Args: ip: IP address to update @@ -235,16 +240,28 @@ class DatabaseManager: last_analysis: timestamp of last analysis """ - print(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}") + applogger.debug(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}") + applogger.info(f"IP: {ip} category has been updated to {category}") session = self.session sanitized_ip = sanitize_ip(ip) ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first() + # Check if category has changed and record it + old_category = ip_stats.category + if old_category != category: + self._record_category_change(sanitized_ip, old_category, category, last_analysis) + ip_stats.analyzed_metrics = analyzed_metrics ip_stats.category = category ip_stats.category_scores = category_scores ip_stats.last_analysis = last_analysis + + try: + session.commit() + except Exception as e: + session.rollback() + applogger.error(f"Error updating IP stats analysis: {e}") def manual_update_category(self, ip: str, category: str) -> None: """ @@ -256,13 +273,81 @@ class DatabaseManager: """ session = self.session - sanitized_ip = sanitize_ip(ip) ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first() + # Record the manual category change + old_category = ip_stats.category + if old_category != category: + self._record_category_change(sanitized_ip, old_category, category, datetime.now(tz=ZoneInfo('UTC'))) + ip_stats.category = category ip_stats.manual_category = True + + try: + session.commit() + except Exception as e: + session.rollback() + applogger.error(f"Error updating manual category: {e}") + + def _record_category_change(self, ip: str, old_category: Optional[str], new_category: str, timestamp: datetime) -> None: + """ + Internal method to record category changes in history. 
+ Only records if there's an actual change from a previous category. + + Args: + ip: IP address + old_category: Previous category (None if first categorization) + new_category: New category + timestamp: When the change occurred + """ + # Don't record initial categorization (when old_category is None) + # Only record actual category changes + if old_category is None: + return + + session = self.session + try: + history_entry = CategoryHistory( + ip=ip, + old_category=old_category, + new_category=new_category, + timestamp=timestamp + ) + session.add(history_entry) + session.commit() + except Exception as e: + session.rollback() + applogger.error(f"Error recording category change: {e}") + + def get_category_history(self, ip: str) -> List[Dict[str, Any]]: + """ + Retrieve category change history for a specific IP. + + Args: + ip: IP address to get history for + + Returns: + List of category change records ordered by timestamp + """ + session = self.session + try: + sanitized_ip = sanitize_ip(ip) + history = session.query(CategoryHistory).filter( + CategoryHistory.ip == sanitized_ip + ).order_by(CategoryHistory.timestamp.asc()).all() + + return [ + { + 'old_category': h.old_category, + 'new_category': h.new_category, + 'timestamp': h.timestamp.isoformat() + '+00:00' + } + for h in history + ] + finally: + self.close_session() def update_ip_rep_infos(self, ip: str, country_code: str, asn: str, asn_org: str, list_on: Dict[str,str]) -> None: """ @@ -326,7 +411,7 @@ class DatabaseManager: 'method': log.method, 'is_suspicious': log.is_suspicious, 'is_honeypot_trigger': log.is_honeypot_trigger, - 'timestamp': log.timestamp.isoformat(), + 'timestamp': log.timestamp.isoformat() + '+00:00', 'attack_types': [d.attack_type for d in log.attack_detections] } for log in logs @@ -419,7 +504,7 @@ class DatabaseManager: 'path': attempt.path, 'username': attempt.username, 'password': attempt.password, - 'timestamp': attempt.timestamp.isoformat() + 'timestamp': 
attempt.timestamp.isoformat() + '+00:00' } for attempt in attempts ] @@ -446,8 +531,8 @@ class DatabaseManager: { 'ip': s.ip, 'total_requests': s.total_requests, - 'first_seen': s.first_seen.isoformat(), - 'last_seen': s.last_seen.isoformat(), + 'first_seen': s.first_seen.isoformat() + '+00:00', + 'last_seen': s.last_seen.isoformat() + '+00:00', 'country_code': s.country_code, 'city': s.city, 'asn': s.asn, @@ -464,6 +549,47 @@ class DatabaseManager: finally: self.close_session() + def get_ip_stats_by_ip(self, ip: str) -> Optional[Dict[str, Any]]: + """ + Retrieve IP statistics for a specific IP address. + + Args: + ip: The IP address to look up + + Returns: + Dictionary with IP stats or None if not found + """ + session = self.session + try: + stat = session.query(IpStats).filter(IpStats.ip == ip).first() + + if not stat: + return None + + # Get category history for this IP + category_history = self.get_category_history(ip) + + return { + 'ip': stat.ip, + 'total_requests': stat.total_requests, + 'first_seen': stat.first_seen.isoformat() + '+00:00' if stat.first_seen else None, + 'last_seen': stat.last_seen.isoformat() + '+00:00' if stat.last_seen else None, + 'country_code': stat.country_code, + 'city': stat.city, + 'asn': stat.asn, + 'asn_org': stat.asn_org, + 'reputation_score': stat.reputation_score, + 'reputation_source': stat.reputation_source, + 'analyzed_metrics': stat.analyzed_metrics or {}, + 'category': stat.category, + 'category_scores': stat.category_scores or {}, + 'manual_category': stat.manual_category, + 'last_analysis': stat.last_analysis.isoformat() + '+00:00' if stat.last_analysis else None, + 'category_history': category_history + } + finally: + self.close_session() + def get_dashboard_counts(self) -> Dict[str, int]: """ Get aggregate statistics for the dashboard. 
@@ -592,7 +718,7 @@ class DatabaseManager: 'ip': log.ip, 'path': log.path, 'user_agent': log.user_agent, - 'timestamp': log.timestamp.isoformat() + 'timestamp': log.timestamp.isoformat() + '+00:00' } for log in logs ] @@ -650,7 +776,7 @@ class DatabaseManager: 'ip': log.ip, 'path': log.path, 'user_agent': log.user_agent, - 'timestamp': log.timestamp.isoformat(), + 'timestamp': log.timestamp.isoformat() + '+00:00', 'attack_types': [d.attack_type for d in log.attack_detections] } for log in logs diff --git a/src/exports/malicious_ips.txt b/src/exports/malicious_ips.txt new file mode 100644 index 0000000..7b9ad53 --- /dev/null +++ b/src/exports/malicious_ips.txt @@ -0,0 +1 @@ +127.0.0.1 diff --git a/src/handler.py b/src/handler.py index 00238e7..9f2a77f 100644 --- a/src/handler.py +++ b/src/handler.py @@ -407,17 +407,75 @@ class Handler(BaseHTTPRequestHandler): self.end_headers() try: stats = self.tracker.get_stats() - self.wfile.write(generate_dashboard(stats).encode()) + timezone = str(self.config.timezone) if self.config.timezone else 'UTC' + dashboard_path = self.config.dashboard_secret_path + self.wfile.write(generate_dashboard(stats, timezone, dashboard_path).encode()) except BrokenPipeError: pass except Exception as e: self.app_logger.error(f"Error generating dashboard: {e}") return + + # API endpoint for fetching IP stats + if self.config.dashboard_secret_path and self.path.startswith(f"{self.config.dashboard_secret_path}/api/ip-stats/"): + ip_address = self.path.replace(f"{self.config.dashboard_secret_path}/api/ip-stats/", "") + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + # Prevent browser caching - force fresh data from database every time + self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0') + self.send_header('Pragma', 'no-cache') + self.send_header('Expires', '0') + self.end_headers() + try: + from database import get_database + 
import json + db = get_database() + ip_stats = db.get_ip_stats_by_ip(ip_address) + if ip_stats: + self.wfile.write(json.dumps(ip_stats).encode()) + else: + self.wfile.write(json.dumps({'error': 'IP not found'}).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching IP stats: {e}") + self.wfile.write(json.dumps({'error': str(e)}).encode()) + return + + # API endpoint for downloading malicious IPs file + if self.config.dashboard_secret_path and self.path == f"{self.config.dashboard_secret_path}/api/download/malicious_ips.txt": + import os + file_path = os.path.join(os.path.dirname(__file__), 'exports', 'malicious_ips.txt') + try: + if os.path.exists(file_path): + with open(file_path, 'rb') as f: + content = f.read() + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('Content-Disposition', 'attachment; filename="malicious_ips.txt"') + self.send_header('Content-Length', str(len(content))) + self.end_headers() + self.wfile.write(content) + else: + self.send_response(404) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(b'File not found') + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error serving malicious IPs file: {e}") + self.send_response(500) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(b'Internal server error') + return self.tracker.record_access(client_ip, self.path, user_agent, method='GET') - self.analyzer.infer_user_category(client_ip) - self.analyzer.update_ip_rep_infos(client_ip) + # self.analyzer.infer_user_category(client_ip) + # self.analyzer.update_ip_rep_infos(client_ip) if self.tracker.is_suspicious_user_agent(user_agent): self.access_logger.warning(f"[SUSPICIOUS] {client_ip} - {user_agent[:50]} - {self.path}") diff --git a/src/migrations/add_category_history.py b/src/migrations/add_category_history.py new file mode 100644 index 
0000000..654204e --- /dev/null +++ b/src/migrations/add_category_history.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +""" +Migration script to add CategoryHistory table to existing databases. +Run this once to upgrade your database schema. +""" + +import sys +from pathlib import Path + +# Add parent directory to path to import modules +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from database import get_database, DatabaseManager +from models import Base, CategoryHistory + + +def migrate(): + """Create CategoryHistory table if it doesn't exist.""" + print("Starting migration: Adding CategoryHistory table...") + + try: + db = get_database() + + # Initialize database if not already done + if not db._initialized: + db.initialize() + + # Create only the CategoryHistory table + CategoryHistory.__table__.create(db._engine, checkfirst=True) + + print("✓ Migration completed successfully!") + print(" - CategoryHistory table created") + + except Exception as e: + print(f"✗ Migration failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + migrate() diff --git a/src/models.py b/src/models.py index 5e5cd2c..4a13278 100644 --- a/src/models.py +++ b/src/models.py @@ -150,4 +150,59 @@ class IpStats(Base): def __repr__(self) -> str: - return f"" \ No newline at end of file + return f"" + + +class CategoryHistory(Base): + """ + Records category changes for IP addresses over time. + + Tracks when an IP's category changes, storing both the previous + and new category along with timestamp for timeline visualization. 
+ """ + __tablename__ = 'category_history' + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True) + old_category: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + new_category: Mapped[str] = mapped_column(String(50), nullable=False) + timestamp: Mapped[datetime] = mapped_column(DateTime, nullable=False, default=datetime.utcnow, index=True) + + # Composite index for efficient IP-based timeline queries + __table_args__ = ( + Index('ix_category_history_ip_timestamp', 'ip', 'timestamp'), + ) + + def __repr__(self) -> str: + return f" {self.new_category})>" + + +# class IpLog(Base): +# """ +# Records all IPs that have accessed the honeypot, along with aggregated stats and inferred user category. +# """ +# __tablename__ = 'ip_logs' + +# id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) +# ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True) +# stats: Mapped[List[str]] = mapped_column(String(MAX_PATH_LENGTH)) +# category: Mapped[str] = mapped_column(String(15)) +# manual_category: Mapped[bool] = mapped_column(Boolean, default=False) +# last_analysis: Mapped[datetime] = mapped_column(DateTime, index=True), + +# # Relationship to attack detections +# access_logs: Mapped[List["AccessLog"]] = relationship( +# "AccessLog", +# back_populates="ip", +# cascade="all, delete-orphan" +# ) + +# # Indexes for common queries +# __table_args__ = ( +# Index('ix_access_logs_ip_timestamp', 'ip', 'timestamp'), +# Index('ix_access_logs_is_suspicious', 'is_suspicious'), +# Index('ix_access_logs_is_honeypot_trigger', 'is_honeypot_trigger'), +# ) + +# def __repr__(self) -> str: +# return f"" \ No newline at end of file diff --git a/src/server.py b/src/server.py index 59244c5..e690142 100644 --- a/src/server.py +++ b/src/server.py @@ -14,6 +14,7 @@ from analyzer import Analyzer from handler import Handler 
from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger from database import initialize_database +from tasks_master import get_tasksmaster def print_usage(): @@ -92,6 +93,10 @@ def main(): except IOError: app_logger.warning("Can't read input file. Using randomly generated links.") + # tasks master init + tasks_master = get_tasksmaster() + tasks_master.run_scheduled_tasks() + try: app_logger.info(f'Starting deception server on port {config.port}...') app_logger.info(f'Timezone configured: {tz.key}') diff --git a/src/tasks/analyze_ips.py b/src/tasks/analyze_ips.py new file mode 100644 index 0000000..e4fda84 --- /dev/null +++ b/src/tasks/analyze_ips.py @@ -0,0 +1,265 @@ +from sqlalchemy import select +from typing import Optional +from database import get_database, DatabaseManager +from zoneinfo import ZoneInfo +from pathlib import Path +from datetime import datetime, timedelta +import re +import urllib.parse +from wordlists import get_wordlists +from config import get_config +from logger import get_app_logger +import requests +from sanitizer import sanitize_for_storage, sanitize_dict + +# ---------------------- +# TASK CONFIG +# ---------------------- + +TASK_CONFIG = { + "name": "analyze-ips", + "cron": "*/1 * * * *", + "enabled": True, + "run_when_loaded": True +} + + +def main(): + config = get_config() + db_manager = get_database() + app_logger = get_app_logger() + + http_risky_methods_threshold = config.http_risky_methods_threshold + violated_robots_threshold = config.violated_robots_threshold + uneven_request_timing_threshold = config.uneven_request_timing_threshold + user_agents_used_threshold = config.user_agents_used_threshold + attack_urls_threshold = config.attack_urls_threshold + uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds + app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}") + score = {} + score["attacker"] = {"risky_http_methods": False, 
"robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + + #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme + weights = { + "attacker": { + "risky_http_methods": 6, + "robots_violations": 4, + "uneven_request_timing": 3, + "different_user_agents": 8, + "attack_url": 15 + }, + "good_crawler": { + "risky_http_methods": 1, + "robots_violations": 0, + "uneven_request_timing": 0, + "different_user_agents": 0, + "attack_url": 0 + }, + "bad_crawler": { + "risky_http_methods": 2, + "robots_violations": 7, + "uneven_request_timing": 0, + "different_user_agents": 5, + "attack_url": 5 + }, + "regular_user": { + "risky_http_methods": 0, + "robots_violations": 0, + "uneven_request_timing": 8, + "different_user_agents": 3, + "attack_url": 0 + } + } + accesses = db_manager.get_access_logs(limit=999999999) + ips = {item['ip'] for item in accesses} + + for ip in ips: + ip_accesses = [item for item in accesses if item["ip"] == ip] + total_accesses_count = len(ip_accesses) # requests made by this IP only, not the size of the whole access log + if total_accesses_count <= 0: + continue # nothing to score for this IP; keep analysing the remaining ones + + # Set category as "unknown" for the first 3 requests + if total_accesses_count < 3: + category = "unknown" + analyzed_metrics = {} + category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0} + last_analysis = datetime.now(tz=ZoneInfo('UTC')) + db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + continue # a low-traffic IP must not abort the analysis of every other IP + #--------------------- HTTP
Methods --------------------- + get_accesses_count = len([item for item in ip_accesses if item["method"] == "GET"]) + post_accesses_count = len([item for item in ip_accesses if item["method"] == "POST"]) + put_accesses_count = len([item for item in ip_accesses if item["method"] == "PUT"]) + delete_accesses_count = len([item for item in ip_accesses if item["method"] == "DELETE"]) + head_accesses_count = len([item for item in ip_accesses if item["method"] == "HEAD"]) + options_accesses_count = len([item for item in ip_accesses if item["method"] == "OPTIONS"]) + patch_accesses_count = len([item for item in ip_accesses if item["method"] == "PATCH"]) + if total_accesses_count > http_risky_methods_threshold: + http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count + else: + http_method_attacker_score = 0 + #print(f"HTTP Method attacker score: {http_method_attacker_score}") + if http_method_attacker_score >= http_risky_methods_threshold: + score["attacker"]["risky_http_methods"] = True + score["good_crawler"]["risky_http_methods"] = False + score["bad_crawler"]["risky_http_methods"] = True + score["regular_user"]["risky_http_methods"] = False + else: + score["attacker"]["risky_http_methods"] = False + score["good_crawler"]["risky_http_methods"] = True + score["bad_crawler"]["risky_http_methods"] = False + score["regular_user"]["risky_http_methods"] = False + #--------------------- Robots Violations --------------------- + #respect robots.txt and login/config pages access frequency + robots_disallows = [] + robots_path = Path(__file__).parent.parent / "templates" / "html" / "robots.txt" + with open(robots_path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split(":") + + if parts[0] == "Disallow": + parts[1] = parts[1].rstrip("/") + #print(f"DISALLOW {parts[1]}") + robots_disallows.append(parts[1].strip()) + #if 0 100% 
sure is good crawler, if >10% of robots violated is bad crawler or attacker + violated_robots_count = len([item for item in ip_accesses if any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)]) + #print(f"Violated robots count: {violated_robots_count}") + if total_accesses_count > 0: + violated_robots_ratio = violated_robots_count / total_accesses_count + else: + violated_robots_ratio = 0 + if violated_robots_ratio >= violated_robots_threshold: + score["attacker"]["robots_violations"] = True + score["good_crawler"]["robots_violations"] = False + score["bad_crawler"]["robots_violations"] = True + score["regular_user"]["robots_violations"] = False + else: + score["attacker"]["robots_violations"] = False + score["good_crawler"]["robots_violations"] = False + score["bad_crawler"]["robots_violations"] = False + score["regular_user"]["robots_violations"] = False + + #--------------------- Requests Timing --------------------- + #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior + timestamps = [datetime.fromisoformat(item["timestamp"]) for item in ip_accesses] + now_utc = datetime.now(tz=ZoneInfo('UTC')) + timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)] + timestamps = sorted(timestamps, reverse=True) + time_diffs = [] + for i in range(0, len(timestamps)-1): + diff = (timestamps[i] - timestamps[i+1]).total_seconds() + time_diffs.append(diff) + + mean = 0 + variance = 0 + std = 0 + cv = 0 + if time_diffs: + mean = sum(time_diffs) / len(time_diffs) + variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs) + std = variance ** 0.5 + cv = std/mean if mean else 0 # identical timestamps give a mean gap of 0; avoid ZeroDivisionError + app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") + if cv >= uneven_request_timing_threshold: + score["attacker"]["uneven_request_timing"] = True + 
score["good_crawler"]["uneven_request_timing"] = False + score["bad_crawler"]["uneven_request_timing"] = False + score["regular_user"]["uneven_request_timing"] = True + else: + score["attacker"]["uneven_request_timing"] = False + score["good_crawler"]["uneven_request_timing"] = False + score["bad_crawler"]["uneven_request_timing"] = False + score["regular_user"]["uneven_request_timing"] = False + #--------------------- Different User Agents --------------------- + #Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers + user_agents_used = [item["user_agent"] for item in ip_accesses] + user_agents_used = list(dict.fromkeys(user_agents_used)) + #print(f"User agents used: {user_agents_used}") + if len(user_agents_used) >= user_agents_used_threshold: + score["attacker"]["different_user_agents"] = True + score["good_crawler"]["different_user_agents"] = False + score["bad_crawler"]["different_user_agents"] = True + score["regular_user"]["different_user_agents"] = False + else: + score["attacker"]["different_user_agents"] = False + score["good_crawler"]["different_user_agents"] = False + score["bad_crawler"]["different_user_agents"] = False + score["regular_user"]["different_user_agents"] = False + #--------------------- Attack URLs --------------------- + attack_urls_found_list = [] + wl = get_wordlists() + if wl.attack_patterns: + queried_paths = [item["path"] for item in ip_accesses] + for queried_path in queried_paths: + # URL decode the path to catch encoded attacks + try: + decoded_path = urllib.parse.unquote(queried_path) + # Double decode to catch double-encoded attacks + decoded_path_twice = urllib.parse.unquote(decoded_path) + except Exception: + decoded_path = queried_path + decoded_path_twice = queried_path + + for name, pattern in wl.attack_patterns.items(): + # Check original, decoded, and double-decoded paths + if (re.search(pattern, queried_path, re.IGNORECASE) or + 
re.search(pattern, decoded_path, re.IGNORECASE) or + re.search(pattern, decoded_path_twice, re.IGNORECASE)): + attack_urls_found_list.append(f"{name}: {pattern}") + + #remove duplicates + attack_urls_found_list = set(attack_urls_found_list) + attack_urls_found_list = list(attack_urls_found_list) + + if len(attack_urls_found_list) >= attack_urls_threshold: + score["attacker"]["attack_url"] = True + score["good_crawler"]["attack_url"] = False + score["bad_crawler"]["attack_url"] = False + score["regular_user"]["attack_url"] = False + else: + score["attacker"]["attack_url"] = False + score["good_crawler"]["attack_url"] = False + score["bad_crawler"]["attack_url"] = False + score["regular_user"]["attack_url"] = False + #--------------------- Calculate score --------------------- + attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0 + attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"] + attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"] + attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"] + attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"] + attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"] + good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"] + good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"] + good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"] + good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"] + good_crawler_score 
= good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"] + bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"] + bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"] + bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"] + bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"] + bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"] + regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"] + regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"] + regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"] + regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"] + regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"] + score_details = f""" + Attacker score: {attacker_score} + Good Crawler score: {good_crawler_score} + Bad Crawler score: {bad_crawler_score} + Regular User score: {regular_user_score} + """ + app_logger.debug(score_details) + analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} + category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} + 
category = max(category_scores, key=category_scores.get) + last_analysis = datetime.now(tz=ZoneInfo('UTC')) + db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + return \ No newline at end of file diff --git a/src/tasks/fetch_ip_rep.py b/src/tasks/fetch_ip_rep.py new file mode 100644 index 0000000..8171ae6 --- /dev/null +++ b/src/tasks/fetch_ip_rep.py @@ -0,0 +1,59 @@ +from sqlalchemy import select +from typing import Optional +from database import get_database, DatabaseManager +from zoneinfo import ZoneInfo +from pathlib import Path +from datetime import datetime, timedelta +import re +import urllib.parse +from wordlists import get_wordlists +from config import get_config +from logger import get_app_logger +import requests +from sanitizer import sanitize_for_storage, sanitize_dict + +# ---------------------- +# TASK CONFIG +# ---------------------- + +TASK_CONFIG = { + "name": "fetch-ip-rep", + "cron": "*/1 * * * *", + "enabled": True, + "run_when_loaded": True +} + + +def main(): + + config = get_config() + db_manager = get_database() + app_logger = get_app_logger() + + accesses = db_manager.get_access_logs(limit=999999999) + ips = {item['ip'] for item in accesses} + + for ip in ips: + api_url = "https://iprep.lcrawl.com/api/iprep/" + params = { + "cidr": ip + } + headers = { + "Content-Type": "application/json" + } + response = requests.get(api_url, headers=headers, params=params, timeout=10) # requests never times out by default; bound it so a stalled lookup cannot hang the scheduled job + payload = response.json() + if payload["results"]: + data = payload["results"][0] + country_iso_code = data["geoip_data"]["country_iso_code"] + asn = data["geoip_data"]["asn_autonomous_system_number"] + asn_org = data["geoip_data"]["asn_autonomous_system_organization"] + list_on = data["list_on"] + sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) + sanitized_asn = sanitize_for_storage(asn, 100) + sanitized_asn_org = sanitize_for_storage(asn_org, 100) + sanitized_list_on = sanitize_dict(list_on, 100000) + + 
db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on) + + return \ No newline at end of file diff --git a/src/tasks/top_attacking_ips.py b/src/tasks/top_attacking_ips.py new file mode 100644 index 0000000..d9e18d3 --- /dev/null +++ b/src/tasks/top_attacking_ips.py @@ -0,0 +1,57 @@ +# tasks/export_malicious_ips.py + +import os +from logger import get_app_logger +from database import get_database +from models import AccessLog +from sqlalchemy import distinct + +app_logger = get_app_logger() + +# ---------------------- +# TASK CONFIG +# ---------------------- +TASK_CONFIG = { + "name": "export-malicious-ips", + "cron": "*/5 * * * *", + "enabled": True, + "run_when_loaded": True +} + +EXPORTS_DIR = "exports" +OUTPUT_FILE = os.path.join(EXPORTS_DIR, "malicious_ips.txt") + +# ---------------------- +# TASK LOGIC +# ---------------------- +def main(): + """ + Export all IPs flagged as suspicious to a text file. + TasksMaster will call this function based on the cron schedule. 
+ """ + task_name = TASK_CONFIG.get("name") + app_logger.info(f"[Background Task] {task_name} starting...") + + db = get_database() # bound before the try so the finally clause can always close the session + try: + session = db.session + + # Query distinct suspicious IPs + results = session.query(distinct(AccessLog.ip)).filter( + AccessLog.is_suspicious == True + ).all() + + # Ensure exports directory exists + os.makedirs(EXPORTS_DIR, exist_ok=True) + + # Write IPs to file (one per line) + with open(OUTPUT_FILE, 'w') as f: + for (ip,) in results: + f.write(f"{ip}\n") + + app_logger.info(f"[Background Task] {task_name} exported {len(results)} IPs to {OUTPUT_FILE}") + + except Exception as e: + app_logger.error(f"[Background Task] {task_name} failed: {e}") + finally: + db.close_session() diff --git a/src/tasks_master.py b/src/tasks_master.py new file mode 100644 index 0000000..264471c --- /dev/null +++ b/src/tasks_master.py @@ -0,0 +1,288 @@ +import os +import sys +import datetime +import functools +import threading +import importlib +import importlib.util + +from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger + +app_logger = get_app_logger() + +try: + from apscheduler.schedulers.background import BackgroundScheduler + from apscheduler.triggers.cron import CronTrigger + from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR +except ModuleNotFoundError: + msg = ( + "Required modules are not installed. 
" + "Can not continue with module / application loading.\n" + "Install it with: pip install -r requirements.txt" + ) + print(msg, file=sys.stderr) + app_logger.error(msg) + sys.exit(1) # exit() is a site/REPL helper and returned status 0; report the failure to the caller + + +# ---------- TASKSMASTER CLASS ---------- +class TasksMaster: + + TASK_DEFAULT_CRON = '*/15 * * * *' + TASK_JITTER = 240 + TASKS_FOLDER = os.path.join(os.path.dirname(__file__), "tasks") + + def __init__(self, scheduler: BackgroundScheduler): + self.tasks = self._config_tasks() + self.scheduler = scheduler + self.last_run_times = {} + self.scheduler.add_listener(self.job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR) + + def _config_tasks(self): + """ + Loads tasks from the TASKS_FOLDER and logs how many were found. + """ + tasks_defined = self._load_tasks_from_folder(self.TASKS_FOLDER) + app_logger.info(f"Scheduled Tasks Loaded from folder: {self.TASKS_FOLDER}") + return tasks_defined + + def _load_tasks_from_folder(self, folder_path): + """ + Loads and registers task modules from a specified folder. + + This function scans the given folder for Python (.py) files, dynamically + imports each as a module, and looks for two attributes: + - TASK_CONFIG: A dictionary containing task metadata, specifically the + 'name' and 'cron' (cron schedule string). + - main: A callable function that represents the task's execution logic. + + Tasks with both attributes are added to a list with their configuration and + execution function. + + Args: + folder_path (str): Path to the folder containing task scripts. + + Returns: + list[dict]: A list of task definitions with keys: + - 'name' (str): The name of the task. + - 'filename' (str): The file the task was loaded from. + - 'cron' (str): The crontab string for scheduling. + - 'enabled' (bool): Whether the task is enabled. + - 'run_when_loaded' (bool): Whether to run the task immediately. + """ + tasks = [] + + if not os.path.exists(folder_path): + app_logger.error(f"{folder_path} does not exist! 
Unable to load tasks!") + return tasks + + # we sort the files so that we have a set order, which helps with debugging + for filename in sorted(os.listdir(folder_path)): + + # skip any non python files, as well as any __pycache__ or .pyc files that might creep in there + if not filename.endswith('.py') or filename.startswith("__"): + continue + + path = os.path.join(folder_path, filename) + module_name = filename[:-3] + spec = importlib.util.spec_from_file_location(f"tasks.{module_name}", path) + module = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(module) + sys.modules[f"tasks.{module_name}"] = module + except Exception as e: + app_logger.error(f"Failed to import {filename}: {e}") + continue + + # if we have a tasks config and a main function, we attempt to schedule it + if hasattr(module, 'TASK_CONFIG') and hasattr(module, 'main'): + + # ensure task_config is a dict + if not isinstance(module.TASK_CONFIG, dict): + app_logger.error(f"TASK_CONFIG is not a dict in {filename}. 
Skipping task.") + continue + + task_cron = module.TASK_CONFIG.get("cron") or self.TASK_DEFAULT_CRON + task_name = module.TASK_CONFIG.get("name", module_name) + + # ensure the task_cron is a valid cron value + try: + CronTrigger.from_crontab(task_cron) + except ValueError as ve: + app_logger.error(f"Invalid cron format for task {task_name}: {ve} - Skipping this task") + continue + + task = { + 'name': module.TASK_CONFIG.get('name', module_name), + 'filename': filename, + 'cron': task_cron, + "enabled": module.TASK_CONFIG.get("enabled", False), + "run_when_loaded": module.TASK_CONFIG.get("run_when_loaded", False) + } + + tasks.append(task) + + # we are missing things, and we log what's missing + else: + if not hasattr(module, 'TASK_CONFIG'): + app_logger.warning(f"Missing TASK_CONFIG in {filename}") + elif not hasattr(module, 'main'): + app_logger.warning(f"Missing main() in {filename}") + + return tasks + + def _add_jobs(self): + # for each task in the tasks config file... + for task_to_run in self.tasks: + + # remember, these tasks, are built from the "load_tasks_from_folder" function, + # if you want to pass data from the TASKS_CONFIG dict, you need to pass it there to get it here. + task_name = task_to_run.get("name") + run_when_loaded = task_to_run.get("run_when_loaded") + module_name = os.path.splitext(task_to_run.get("filename"))[0] + task_enabled = task_to_run.get("enabled", False) + + # if no crontab set for this task, we use 15 as the default. + task_cron = task_to_run.get("cron") or self.TASK_DEFAULT_CRON + + # if task is disabled, skip this one + if not task_enabled: + app_logger.info(f"{task_name} is disabled in client config. Skipping task") + continue + try: + if os.path.isfile(os.path.join(self.TASKS_FOLDER, task_to_run.get("filename"))): + # schedule the task now that everything has checked out above... 
+ self._schedule_task(task_name, module_name, task_cron, run_when_loaded) + app_logger.info(f"Scheduled {module_name} cron is set to {task_cron}.", extra={"task": task_to_run}) + else: + app_logger.info(f"Skipping invalid or unsafe file: {task_to_run.get('filename')}", extra={"task": task_to_run}) + + except Exception as e: + app_logger.error(f"Error scheduling task: {e}", extra={"tasks": task_to_run}) + + def _schedule_task(self, task_name, module_name, task_cron, run_when_loaded): + try: + # Dynamically import the module + module = importlib.import_module(f"tasks.{module_name}") + + # Check if the module has a 'main' function + if hasattr(module, 'main'): + app_logger.info(f"Scheduling {task_name} - {module_name} Main Function") + + # unique_job_id + job_identifier = f"{module_name}__{task_name}" + + # little insurance to make sure the cron is set to something and not none + if task_cron is None: + task_cron = self.TASK_DEFAULT_CRON + + trigger = CronTrigger.from_crontab(task_cron) + + # schedule the task / job + if run_when_loaded: + app_logger.info(f"Task: {task_name} is set to run instantly. 
Scheduling to run on scheduler start") + + self.scheduler.add_job( + module.main, + trigger, + id=job_identifier, + jitter=self.TASK_JITTER, + name=task_name, + next_run_time=datetime.datetime.now(), + max_instances=1 + ) + else: + self.scheduler.add_job( + module.main, + trigger, + id=job_identifier, + jitter=self.TASK_JITTER, + name=task_name, + max_instances=1 + ) + else: + app_logger.error(f"{module_name} does not define a 'main' function.") + + except Exception as e: + app_logger.error(f"Failed to load {module_name}: {e}") + + def job_listener(self, event): + job_id = event.job_id + self.last_run_times[job_id] = datetime.datetime.now() + + if event.exception: + app_logger.error(f"Job {event.job_id} failed: {event.exception}") + else: + app_logger.info(f"Job {event.job_id} completed successfully.") + + def list_jobs(self): + scheduled_jobs = self.scheduler.get_jobs() + jobs_list = [] + + for job in scheduled_jobs: + jobs_list.append({ + "id": job.id, + "name": job.name, + "next_run": job.next_run_time, + }) + return jobs_list + + def run_scheduled_tasks(self): + """ + Runs and schedules enabled tasks using the background scheduler. + + This method performs the following: + 1. Retrieves the current task configurations and updates internal state. + 2. Adds new jobs to the scheduler based on the latest configuration. + 3. Starts the scheduler to begin executing tasks at their defined intervals. + + This ensures the scheduler is always running with the most up-to-date + task definitions and enabled status. 
+ """ + + # Add enabled tasks to the scheduler + self._add_jobs() + + # Start the scheduler to begin executing the scheduled tasks (if not already running) + if not self.scheduler.running: + self.scheduler.start() + + +# ---------- SINGLETON WRAPPER ---------- +T = type + +def singleton_loader(func): + """Decorator to ensure only one instance exists.""" + cache: dict[str, T] = {} + lock = threading.Lock() + + @functools.wraps(func) + def wrapper(*args, **kwargs) -> T: + with lock: + if func.__name__ not in cache: + cache[func.__name__] = func(*args, **kwargs) + return cache[func.__name__] + return wrapper + + +@singleton_loader +def get_tasksmaster(scheduler: BackgroundScheduler | None = None) -> TasksMaster: + """ + Returns the singleton TasksMaster instance. + + - Automatically creates a BackgroundScheduler if none is provided. + - Automatically starts the scheduler when the singleton is created. + + :param scheduler: Optional APScheduler instance. If None, a new BackgroundScheduler will be created. + """ + if scheduler is None: + scheduler = BackgroundScheduler() + + tm_instance = TasksMaster(scheduler) + + # Auto-start scheduler if not already running + if not scheduler.running: + scheduler.start() + app_logger.info("TasksMaster scheduler started automatically with singleton creation.") + + return tm_instance diff --git a/src/templates/dashboard_template.py b/src/templates/dashboard_template.py index dfad3dd..4e7005c 100644 --- a/src/templates/dashboard_template.py +++ b/src/templates/dashboard_template.py @@ -7,6 +7,7 @@ Customize this template to change the dashboard appearance. 
import html from datetime import datetime +from zoneinfo import ZoneInfo def _escape(value) -> str: """Escape HTML special characters to prevent XSS attacks.""" @@ -14,22 +15,52 @@ def _escape(value) -> str: return "" return html.escape(str(value)) -def format_timestamp(iso_timestamp: str) -> str: - """Format ISO timestamp for display (YYYY-MM-DD HH:MM:SS)""" +def format_timestamp(iso_timestamp: str, timezone: str = 'UTC', time_only: bool = False) -> str: + """Format ISO timestamp for display with timezone conversion + + Args: + iso_timestamp: ISO format timestamp string (UTC) + timezone: IANA timezone string to convert to + time_only: If True, return only HH:MM:SS, otherwise full datetime + """ try: + # Parse UTC timestamp dt = datetime.fromisoformat(iso_timestamp) + # Convert to target timezone + if dt.tzinfo is not None: + dt = dt.astimezone(ZoneInfo(timezone)) + + if time_only: + return dt.strftime("%H:%M:%S") return dt.strftime("%Y-%m-%d %H:%M:%S") except Exception: # Fallback for old format return iso_timestamp.split("T")[1][:8] if "T" in iso_timestamp else iso_timestamp -def generate_dashboard(stats: dict) -> str: - """Generate dashboard HTML with access statistics""" +def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = '') -> str: + """Generate dashboard HTML with access statistics - # Generate IP rows (IPs are generally safe but escape for consistency) + Args: + stats: Statistics dictionary + timezone: IANA timezone string (e.g., 'Europe/Paris', 'America/New_York') + dashboard_path: The secret dashboard path for generating API URLs + """ + + # Generate IP rows with clickable functionality for dropdown stats top_ips_rows = '\n'.join([ - f'{i+1}{_escape(ip)}{count}' + f''' + {i+1} + {_escape(ip)} + {count} + + + +
+
Loading stats...
+
+ + ''' for i, (ip, count) in enumerate(stats['top_ips']) ]) or 'No data' @@ -45,27 +76,76 @@ def generate_dashboard(stats: dict) -> str: for i, (ua, count) in enumerate(stats['top_user_agents']) ]) or 'No data' - # Generate suspicious accesses rows (CRITICAL: multiple user-controlled fields) + # Generate suspicious accesses rows with clickable IPs suspicious_rows = '\n'.join([ - f'{_escape(log["ip"])}{_escape(log["path"])}{_escape(log["user_agent"][:60])}{_escape(log["timestamp"].split("T")[1][:8])}' + f''' + {_escape(log["ip"])} + {_escape(log["path"])} + {_escape(log["user_agent"][:60])} + {format_timestamp(log["timestamp"], timezone, time_only=True)} + + + +
+
Loading stats...
+
+ + ''' for log in stats['recent_suspicious'][-10:] ]) or 'No suspicious activity detected' - # Generate honeypot triggered IPs rows + # Generate honeypot triggered IPs rows with clickable IPs honeypot_rows = '\n'.join([ - f'{_escape(ip)}{_escape(", ".join(paths))}{len(paths)}' + f''' + {_escape(ip)} + {_escape(", ".join(paths))} + {len(paths)} + + + +
+
Loading stats...
+
+ + ''' for ip, paths in stats.get('honeypot_triggered_ips', []) ]) or 'No honeypot triggers yet' - # Generate attack types rows (CRITICAL: paths and user agents are user-controlled) + # Generate attack types rows with clickable IPs attack_type_rows = '\n'.join([ - f'{_escape(log["ip"])}{_escape(log["path"])}{_escape(", ".join(log["attack_types"]))}{_escape(log["user_agent"][:60])}{_escape(log["timestamp"].split("T")[1][:8])}' + f''' + {_escape(log["ip"])} + {_escape(log["path"])} + {_escape(", ".join(log["attack_types"]))} + {_escape(log["user_agent"][:60])} + {format_timestamp(log["timestamp"], timezone, time_only=True)} + + + +
+
Loading stats...
+
+ + ''' for log in stats.get('attack_types', [])[-10:] ]) or 'No attacks detected' - # Generate credential attempts rows (CRITICAL: usernames and passwords are user-controlled) + # Generate credential attempts rows with clickable IPs credential_rows = '\n'.join([ - f'{_escape(log["ip"])}{_escape(log["username"])}{_escape(log["password"])}{_escape(log["path"])}{_escape(log["timestamp"].split("T")[1][:8])}' + f''' + {_escape(log["ip"])} + {_escape(log["username"])} + {_escape(log["password"])} + {_escape(log["path"])} + {format_timestamp(log["timestamp"], timezone, time_only=True)} + + + +
+
Loading stats...
+
+ + ''' for log in stats.get('credential_attempts', [])[-20:] ]) or 'No credentials captured yet' @@ -85,12 +165,36 @@ def generate_dashboard(stats: dict) -> str: .container {{ max-width: 1400px; margin: 0 auto; + position: relative; }} h1 {{ color: #58a6ff; text-align: center; margin-bottom: 40px; }} + .download-section {{ + position: absolute; + top: 0; + right: 0; + }} + .download-btn {{ + display: inline-block; + padding: 8px 14px; + background: #238636; + color: #ffffff; + text-decoration: none; + border-radius: 6px; + font-weight: 500; + font-size: 13px; + transition: background 0.2s; + border: 1px solid #2ea043; + }} + .download-btn:hover {{ + background: #2ea043; + }} + .download-btn:active {{ + background: #1f7a2f; + }} .stats-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); @@ -180,10 +284,202 @@ def generate_dashboard(stats: dict) -> str: content: '▼'; opacity: 1; }} + .ip-row {{ + transition: background-color 0.2s; + }} + .ip-clickable {{ + cursor: pointer; + color: #58a6ff !important; + font-weight: 500; + text-decoration: underline; + text-decoration-style: dotted; + text-underline-offset: 3px; + }} + .ip-clickable:hover {{ + color: #79c0ff !important; + text-decoration-style: solid; + background: #1c2128; + }} + .ip-stats-row {{ + background: #0d1117; + }} + .ip-stats-cell {{ + padding: 0 !important; + }} + .ip-stats-dropdown {{ + margin-top: 10px; + padding: 15px; + background: #0d1117; + border: 1px solid #30363d; + border-radius: 6px; + font-size: 13px; + display: flex; + gap: 20px; + }} + .stats-left {{ + flex: 1; + }} + .stats-right {{ + flex: 0 0 200px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + }} + .radar-chart {{ + position: relative; + width: 220px; + height: 220px; + overflow: visible; + }} + .radar-legend {{ + margin-top: 10px; + font-size: 11px; + }} + .radar-legend-item {{ + display: flex; + align-items: center; + gap: 6px; + margin: 3px 0; + }} + 
.radar-legend-color {{ + width: 12px; + height: 12px; + border-radius: 2px; + }} + .ip-stats-dropdown .loading {{ + color: #8b949e; + font-style: italic; + }} + .stat-row {{ + display: flex; + justify-content: space-between; + padding: 5px 0; + border-bottom: 1px solid #21262d; + }} + .stat-row:last-child {{ + border-bottom: none; + }} + .stat-label-sm {{ + color: #8b949e; + font-weight: 500; + }} + .stat-value-sm {{ + color: #58a6ff; + font-weight: 600; + }} + .category-badge {{ + display: inline-block; + padding: 4px 8px; + border-radius: 4px; + font-size: 12px; + font-weight: 600; + text-transform: uppercase; + }} + .category-attacker {{ + background: #f851491a; + color: #f85149; + border: 1px solid #f85149; + }} + .category-good-crawler {{ + background: #3fb9501a; + color: #3fb950; + border: 1px solid #3fb950; + }} + .category-bad-crawler {{ + background: #f0883e1a; + color: #f0883e; + border: 1px solid #f0883e; + }} + .category-regular-user {{ + background: #58a6ff1a; + color: #58a6ff; + border: 1px solid #58a6ff; + }} + .category-unknown {{ + background: #8b949e1a; + color: #8b949e; + border: 1px solid #8b949e; + }} + .timeline-container {{ + margin-top: 15px; + padding-top: 15px; + border-top: 1px solid #30363d; + }} + .timeline-title {{ + color: #58a6ff; + font-size: 13px; + font-weight: 600; + margin-bottom: 10px; + }} + .timeline {{ + position: relative; + padding-left: 30px; + }} + .timeline::before {{ + content: ''; + position: absolute; + left: 12px; + top: 5px; + bottom: 5px; + width: 3px; + background: #30363d; + }} + .timeline-item {{ + position: relative; + padding-bottom: 15px; + }} + .timeline-item:last-child {{ + padding-bottom: 0; + }} + .timeline-marker {{ + position: absolute; + left: -26px; + width: 16px; + height: 16px; + border-radius: 50%; + border: 2px solid #0d1117; + }} + .timeline-marker.attacker {{ + background: #f85149; + }} + .timeline-marker.good-crawler {{ + background: #3fb950; + }} + .timeline-marker.bad-crawler {{ + 
background: #f0883e; + }} + .timeline-marker.regular-user {{ + background: #58a6ff; + }} + .timeline-marker.unknown {{ + background: #8b949e; + }} + .timeline-content {{ + font-size: 12px; + }} + .timeline-category {{ + font-weight: 600; + }} + .timeline-timestamp {{ + color: #8b949e; + font-size: 11px; + margin-top: 2px; + }} + .timeline-arrow {{ + color: #8b949e; + margin: 0 7px; + }} +
+

Krawl Dashboard

@@ -331,6 +627,31 @@ def generate_dashboard(stats: dict) -> str:
diff --git a/src/templates/html/main_page.html b/src/templates/html/main_page.html index d0b39de..ac154e8 100644 --- a/src/templates/html/main_page.html +++ b/src/templates/html/main_page.html @@ -46,21 +46,12 @@ gap: 10px; align-items: center; overflow-y: auto; + overflow-x: hidden; flex: 1; padding-top: 10px; }} .links-container::-webkit-scrollbar {{ - width: 8px; - }} - .links-container::-webkit-scrollbar-track {{ - background: #0d1117; - }} - .links-container::-webkit-scrollbar-thumb {{ - background: #30363d; - border-radius: 4px; - }} - .links-container::-webkit-scrollbar-thumb:hover {{ - background: #484f58; + width: 0px; }} .link-box {{ background: #161b22; diff --git a/src/wordlists.py b/src/wordlists.py index 0b0a0e9..81f2022 100644 --- a/src/wordlists.py +++ b/src/wordlists.py @@ -131,7 +131,8 @@ class Wordlists: @property def attack_urls(self): - return self._data.get("attack_urls", []) + """Deprecated: use attack_patterns instead. Returns attack_patterns for backward compatibility.""" + return self._data.get("attack_patterns", {}) _wordlists_instance = None diff --git a/wordlists.json b/wordlists.json index 833f1eb..3ea6f40 100644 --- a/wordlists.json +++ b/wordlists.json @@ -353,11 +353,14 @@ } }, "attack_patterns": { - "path_traversal": "\\.\\.", + "path_traversal": "(\\.\\.|%2e%2e|%252e%252e|\\.{2,}|%c0%ae|%c1%9c)", "sql_injection": "('|\"|`|--|#|/\\*|\\*/|\\bunion\\b|\\bunion\\s+select\\b|\\bor\\b.*=.*|\\band\\b.*=.*|'.*or.*'.*=.*'|\\bsleep\\b|\\bwaitfor\\b|\\bdelay\\b|\\bbenchmark\\b|;.*select|;.*drop|;.*insert|;.*update|;.*delete|\\bexec\\b|\\bexecute\\b|\\bxp_cmdshell\\b|information_schema|table_schema|table_name)", "xss_attempt": "(