made ip analysis and ip rep info fetch a scheduled task

This commit is contained in:
Leonardo Bambini
2026-01-10 14:53:31 +01:00
22 changed files with 1833 additions and 288 deletions

3
.gitignore vendored
View File

@@ -76,3 +76,6 @@ data/
# Personal canary tokens or sensitive configs
*canary*token*.yaml
personal-values.yaml
#exports dir (keeping .gitkeep so we have the dir)
/exports/*

View File

@@ -14,9 +14,10 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY src/ /app/src/
COPY wordlists.json /app/
COPY entrypoint.sh /app/
COPY config.yaml /app/
RUN useradd -m -u 1000 krawl && \
mkdir -p /app/logs /app/data && \
mkdir -p /app/logs /app/data /app/exports && \
chown -R krawl:krawl /app && \
chmod +x /app/entrypoint.sh

View File

@@ -3,7 +3,7 @@
server:
port: 5000
delay: 100 # Response delay in milliseconds
timezone: null # e.g., "America/New_York" or null for system default
timezone: null # e.g., "America/New_York", "Europe/Paris" or null for system default
# manually set the server header, if null a random one will be used.
server_header: null
@@ -11,8 +11,8 @@ server:
links:
min_length: 5
max_length: 15
min_per_page: 10
max_per_page: 15
min_per_page: 5
max_per_page: 10
char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
max_counter: 10
@@ -38,9 +38,9 @@ behavior:
probability_error_codes: 0 # 0-100 percentage
analyzer:
# http_risky_methods_threshold: 0.1
# violated_robots_threshold: 0.1
# uneven_request_timing_threshold: 5
# uneven_request_timing_time_window_seconds: 300
# user_agents_used_threshold: 2
# attack_urls_threshold: 1
http_risky_methods_threshold: 0.1
violated_robots_threshold: 0.1
uneven_request_timing_threshold: 2
uneven_request_timing_time_window_seconds: 300
user_agents_used_threshold: 2
attack_urls_threshold: 1

View File

@@ -12,6 +12,7 @@ services:
- ./wordlists.json:/app/wordlists.json:ro
- ./config.yaml:/app/config.yaml:ro
- ./logs:/app/logs
- ./exports:/app/exports
environment:
- CONFIG_LOCATION=config.yaml
restart: unless-stopped

View File

@@ -2,7 +2,7 @@
set -e
# Fix ownership of mounted directories
chown -R krawl:krawl /app/logs /app/data 2>/dev/null || true
chown -R krawl:krawl /app/logs /app/data /app/exports 2>/dev/null || true
# Drop to krawl user and run the application
exec gosu krawl "$@"

0
exports/.gitkeep Normal file
View File

View File

@@ -6,3 +6,6 @@ PyYAML>=6.0
# Database ORM
SQLAlchemy>=2.0.0,<3.0.0
# Scheduling
APScheduler>=3.11.2

View File

@@ -6,8 +6,10 @@ from zoneinfo import ZoneInfo
from pathlib import Path
from datetime import datetime, timedelta
import re
import urllib.parse
from wordlists import get_wordlists
from config import get_config
from logger import get_app_logger
import requests
from sanitizer import sanitize_for_storage, sanitize_dict
@@ -15,6 +17,8 @@ from sanitizer import sanitize_for_storage, sanitize_dict
Functions for user activity analysis
"""
app_logger = get_app_logger()
class Analyzer:
"""
Analyzes users activity and produces aggregated insights
@@ -48,272 +52,299 @@ class Analyzer:
pass
return self._db_manager
def infer_user_category(self, ip: str) -> str:
# def infer_user_category(self, ip: str) -> str:
config = get_config()
# config = get_config()
http_risky_methods_threshold = config.http_risky_methods_threshold
violated_robots_threshold = config.violated_robots_threshold
uneven_request_timing_threshold = config.uneven_request_timing_threshold
user_agents_used_threshold = config.user_agents_used_threshold
attack_urls_threshold = config.attack_urls_threshold
uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds
# http_risky_methods_threshold = config.http_risky_methods_threshold
# violated_robots_threshold = config.violated_robots_threshold
# uneven_request_timing_threshold = config.uneven_request_timing_threshold
# user_agents_used_threshold = config.user_agents_used_threshold
# attack_urls_threshold = config.attack_urls_threshold
# uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds
print(f"http_risky_methods_threshold: {http_risky_methods_threshold}")
# app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}")
score = {}
score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
# score = {}
# score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
# score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
# score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
# score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
#1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
weights = {
"attacker": {
"risky_http_methods": 6,
"robots_violations": 4,
"uneven_request_timing": 3,
"different_user_agents": 8,
"attack_url": 15
},
"good_crawler": {
"risky_http_methods": 1,
"robots_violations": 0,
"uneven_request_timing": 0,
"different_user_agents": 0,
"attack_url": 0
},
"bad_crawler": {
"risky_http_methods": 2,
"robots_violations": 7,
"uneven_request_timing": 0,
"different_user_agents": 5,
"attack_url": 5
},
"regular_user": {
"risky_http_methods": 0,
"robots_violations": 0,
"uneven_request_timing": 8,
"different_user_agents": 3,
"attack_url": 0
}
}
# #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
# weights = {
# "attacker": {
# "risky_http_methods": 6,
# "robots_violations": 4,
# "uneven_request_timing": 3,
# "different_user_agents": 8,
# "attack_url": 15
# },
# "good_crawler": {
# "risky_http_methods": 1,
# "robots_violations": 0,
# "uneven_request_timing": 0,
# "different_user_agents": 0,
# "attack_url": 0
# },
# "bad_crawler": {
# "risky_http_methods": 2,
# "robots_violations": 7,
# "uneven_request_timing": 0,
# "different_user_agents": 5,
# "attack_url": 5
# },
# "regular_user": {
# "risky_http_methods": 0,
# "robots_violations": 0,
# "uneven_request_timing": 8,
# "different_user_agents": 3,
# "attack_url": 0
# }
# }
accesses = self.db.get_access_logs(ip_filter = ip, limit=1000)
total_accesses_count = len(accesses)
if total_accesses_count <= 0:
return
# accesses = self.db.get_access_logs(ip_filter = ip, limit=1000)
# total_accesses_count = len(accesses)
# if total_accesses_count <= 0:
# return
# # Set category as "unknown" for the first 5 requests
# if total_accesses_count < 3:
# category = "unknown"
# analyzed_metrics = {}
# category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0}
# last_analysis = datetime.now(tz=ZoneInfo('UTC'))
# self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
# return 0
#--------------------- HTTP Methods ---------------------
# #--------------------- HTTP Methods ---------------------
get_accesses_count = len([item for item in accesses if item["method"] == "GET"])
post_accesses_count = len([item for item in accesses if item["method"] == "POST"])
put_accesses_count = len([item for item in accesses if item["method"] == "PUT"])
delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"])
head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"])
options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"])
patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"])
# get_accesses_count = len([item for item in accesses if item["method"] == "GET"])
# post_accesses_count = len([item for item in accesses if item["method"] == "POST"])
# put_accesses_count = len([item for item in accesses if item["method"] == "PUT"])
# delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"])
# head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"])
# options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"])
# patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"])
if total_accesses_count > http_risky_methods_threshold:
http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count
else:
http_method_attacker_score = 0
# if total_accesses_count > http_risky_methods_threshold:
# http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count
# else:
# http_method_attacker_score = 0
#print(f"HTTP Method attacker score: {http_method_attacker_score}")
if http_method_attacker_score >= http_risky_methods_threshold:
score["attacker"]["risky_http_methods"] = True
score["good_crawler"]["risky_http_methods"] = False
score["bad_crawler"]["risky_http_methods"] = True
score["regular_user"]["risky_http_methods"] = False
else:
score["attacker"]["risky_http_methods"] = False
score["good_crawler"]["risky_http_methods"] = True
score["bad_crawler"]["risky_http_methods"] = False
score["regular_user"]["risky_http_methods"] = False
# #print(f"HTTP Method attacker score: {http_method_attacker_score}")
# if http_method_attacker_score >= http_risky_methods_threshold:
# score["attacker"]["risky_http_methods"] = True
# score["good_crawler"]["risky_http_methods"] = False
# score["bad_crawler"]["risky_http_methods"] = True
# score["regular_user"]["risky_http_methods"] = False
# else:
# score["attacker"]["risky_http_methods"] = False
# score["good_crawler"]["risky_http_methods"] = True
# score["bad_crawler"]["risky_http_methods"] = False
# score["regular_user"]["risky_http_methods"] = False
#--------------------- Robots Violations ---------------------
#respect robots.txt and login/config pages access frequency
robots_disallows = []
robots_path = Path(__file__).parent / "templates" / "html" / "robots.txt"
with open(robots_path, "r") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split(":")
# #--------------------- Robots Violations ---------------------
# #respect robots.txt and login/config pages access frequency
# robots_disallows = []
# robots_path = Path(__file__).parent / "templates" / "html" / "robots.txt"
# with open(robots_path, "r") as f:
# for line in f:
# line = line.strip()
# if not line:
# continue
# parts = line.split(":")
if parts[0] == "Disallow":
parts[1] = parts[1].rstrip("/")
#print(f"DISALLOW {parts[1]}")
robots_disallows.append(parts[1].strip())
# if parts[0] == "Disallow":
# parts[1] = parts[1].rstrip("/")
# #print(f"DISALLOW {parts[1]}")
# robots_disallows.append(parts[1].strip())
#if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker
violated_robots_count = len([item for item in accesses if item["path"].rstrip("/") in tuple(robots_disallows)])
#print(f"Violated robots count: {violated_robots_count}")
if total_accesses_count > 0:
violated_robots_ratio = violated_robots_count / total_accesses_count
else:
violated_robots_ratio = 0
# #if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker
# violated_robots_count = len([item for item in accesses if any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)])
# #print(f"Violated robots count: {violated_robots_count}")
# if total_accesses_count > 0:
# violated_robots_ratio = violated_robots_count / total_accesses_count
# else:
# violated_robots_ratio = 0
if violated_robots_ratio >= violated_robots_threshold:
score["attacker"]["robots_violations"] = True
score["good_crawler"]["robots_violations"] = False
score["bad_crawler"]["robots_violations"] = True
score["regular_user"]["robots_violations"] = False
else:
score["attacker"]["robots_violations"] = False
score["good_crawler"]["robots_violations"] = False
score["bad_crawler"]["robots_violations"] = False
score["regular_user"]["robots_violations"] = False
# if violated_robots_ratio >= violated_robots_threshold:
# score["attacker"]["robots_violations"] = True
# score["good_crawler"]["robots_violations"] = False
# score["bad_crawler"]["robots_violations"] = True
# score["regular_user"]["robots_violations"] = False
# else:
# score["attacker"]["robots_violations"] = False
# score["good_crawler"]["robots_violations"] = False
# score["bad_crawler"]["robots_violations"] = False
# score["regular_user"]["robots_violations"] = False
#--------------------- Requests Timing ---------------------
#Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior
timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses]
timestamps = [ts for ts in timestamps if datetime.utcnow() - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)]
timestamps = sorted(timestamps, reverse=True)
# #--------------------- Requests Timing ---------------------
# #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior
# timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses]
# now_utc = datetime.now(tz=ZoneInfo('UTC'))
# timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)]
# timestamps = sorted(timestamps, reverse=True)
time_diffs = []
for i in range(0, len(timestamps)-1):
diff = (timestamps[i] - timestamps[i+1]).total_seconds()
time_diffs.append(diff)
# time_diffs = []
# for i in range(0, len(timestamps)-1):
# diff = (timestamps[i] - timestamps[i+1]).total_seconds()
# time_diffs.append(diff)
mean = 0
variance = 0
std = 0
cv = 0
if time_diffs:
mean = sum(time_diffs) / len(time_diffs)
variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
std = variance ** 0.5
cv = std/mean
print(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}")
# mean = 0
# variance = 0
# std = 0
# cv = 0
# if time_diffs:
# mean = sum(time_diffs) / len(time_diffs)
# variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
# std = variance ** 0.5
# cv = std/mean
# app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}")
if cv >= uneven_request_timing_threshold:
score["attacker"]["uneven_request_timing"] = True
score["good_crawler"]["uneven_request_timing"] = False
score["bad_crawler"]["uneven_request_timing"] = False
score["regular_user"]["uneven_request_timing"] = True
else:
score["attacker"]["uneven_request_timing"] = False
score["good_crawler"]["uneven_request_timing"] = False
score["bad_crawler"]["uneven_request_timing"] = False
score["regular_user"]["uneven_request_timing"] = False
# if cv >= uneven_request_timing_threshold:
# score["attacker"]["uneven_request_timing"] = True
# score["good_crawler"]["uneven_request_timing"] = False
# score["bad_crawler"]["uneven_request_timing"] = False
# score["regular_user"]["uneven_request_timing"] = True
# else:
# score["attacker"]["uneven_request_timing"] = False
# score["good_crawler"]["uneven_request_timing"] = False
# score["bad_crawler"]["uneven_request_timing"] = False
# score["regular_user"]["uneven_request_timing"] = False
#--------------------- Different User Agents ---------------------
#Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
user_agents_used = [item["user_agent"] for item in accesses]
user_agents_used = list(dict.fromkeys(user_agents_used))
#print(f"User agents used: {user_agents_used}")
# #--------------------- Different User Agents ---------------------
# #Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
# user_agents_used = [item["user_agent"] for item in accesses]
# user_agents_used = list(dict.fromkeys(user_agents_used))
# #print(f"User agents used: {user_agents_used}")
if len(user_agents_used) >= user_agents_used_threshold:
score["attacker"]["different_user_agents"] = True
score["good_crawler"]["different_user_agents"] = False
score["bad_crawler"]["different_user_agentss"] = True
score["regular_user"]["different_user_agents"] = False
else:
score["attacker"]["different_user_agents"] = False
score["good_crawler"]["different_user_agents"] = False
score["bad_crawler"]["different_user_agents"] = False
score["regular_user"]["different_user_agents"] = False
# if len(user_agents_used) >= user_agents_used_threshold:
# score["attacker"]["different_user_agents"] = True
# score["good_crawler"]["different_user_agents"] = False
# score["bad_crawler"]["different_user_agentss"] = True
# score["regular_user"]["different_user_agents"] = False
# else:
# score["attacker"]["different_user_agents"] = False
# score["good_crawler"]["different_user_agents"] = False
# score["bad_crawler"]["different_user_agents"] = False
# score["regular_user"]["different_user_agents"] = False
#--------------------- Attack URLs ---------------------
# #--------------------- Attack URLs ---------------------
attack_urls_found_list = []
# attack_urls_found_list = []
wl = get_wordlists()
if wl.attack_urls:
queried_paths = [item["path"] for item in accesses]
# wl = get_wordlists()
# if wl.attack_patterns:
# queried_paths = [item["path"] for item in accesses]
for queried_path in queried_paths:
for name, pattern in wl.attack_urls.items():
if re.search(pattern, queried_path, re.IGNORECASE):
attack_urls_found_list.append(pattern)
# for queried_path in queried_paths:
# # URL decode the path to catch encoded attacks
# try:
# decoded_path = urllib.parse.unquote(queried_path)
# # Double decode to catch double-encoded attacks
# decoded_path_twice = urllib.parse.unquote(decoded_path)
# except Exception:
# decoded_path = queried_path
# decoded_path_twice = queried_path
# for name, pattern in wl.attack_patterns.items():
# # Check original, decoded, and double-decoded paths
# if (re.search(pattern, queried_path, re.IGNORECASE) or
# re.search(pattern, decoded_path, re.IGNORECASE) or
# re.search(pattern, decoded_path_twice, re.IGNORECASE)):
# attack_urls_found_list.append(f"{name}: {pattern}")
#remove duplicates
attack_urls_found_list = set(attack_urls_found_list)
attack_urls_found_list = list(attack_urls_found_list)
# #remove duplicates
# attack_urls_found_list = set(attack_urls_found_list)
# attack_urls_found_list = list(attack_urls_found_list)
if len(attack_urls_found_list) > attack_urls_threshold:
score["attacker"]["attack_url"] = True
score["good_crawler"]["attack_url"] = False
score["bad_crawler"]["attack_url"] = False
score["regular_user"]["attack_url"] = False
else:
score["attacker"]["attack_url"] = False
score["good_crawler"]["attack_url"] = False
score["bad_crawler"]["attack_url"] = False
score["regular_user"]["attack_url"] = False
# if len(attack_urls_found_list) > attack_urls_threshold:
# score["attacker"]["attack_url"] = True
# score["good_crawler"]["attack_url"] = False
# score["bad_crawler"]["attack_url"] = False
# score["regular_user"]["attack_url"] = False
# else:
# score["attacker"]["attack_url"] = False
# score["good_crawler"]["attack_url"] = False
# score["bad_crawler"]["attack_url"] = False
# score["regular_user"]["attack_url"] = False
#--------------------- Calculate score ---------------------
# #--------------------- Calculate score ---------------------
attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0
# attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0
attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"]
attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"]
attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"]
attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"]
attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
# attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"]
# attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"]
# attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"]
# attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"]
# attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"]
good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"]
good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"]
good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"]
good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"]
# good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"]
# good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"]
# good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"]
# good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"]
# good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"]
bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
# bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"]
# bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"]
# bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"]
# bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"]
# bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"]
regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"]
regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"]
regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"]
regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"]
# regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"]
# regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"]
# regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"]
# regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"]
# regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"]
print(f"Attacker score: {attacker_score}")
print(f"Good Crawler score: {good_crawler_score}")
print(f"Bad Crawler score: {bad_crawler_score}")
print(f"Regular User score: {regular_user_score}")
# score_details = f"""
# Attacker score: {attacker_score}
# Good Crawler score: {good_crawler_score}
# Bad Crawler score: {bad_crawler_score}
# Regular User score: {regular_user_score}
# """
# app_logger.debug(score_details)
analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list}
category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score}
category = max(category_scores, key=category_scores.get)
last_analysis = datetime.utcnow()
# analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list}
# category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score}
# category = max(category_scores, key=category_scores.get)
# last_analysis = datetime.now(tz=ZoneInfo('UTC'))
self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
# self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
return 0
# return 0
def update_ip_rep_infos(self, ip: str) -> list[str]:
api_url = "https://iprep.lcrawl.com/api/iprep/"
params = {
"cidr": ip
}
headers = {
"Content-Type": "application/json"
}
response = requests.get(api_url, headers=headers, params=params)
payload = response.json()
if payload["results"]:
data = payload["results"][0]
# def update_ip_rep_infos(self, ip: str) -> list[str]:
# api_url = "https://iprep.lcrawl.com/api/iprep/"
# params = {
# "cidr": ip
# }
# headers = {
# "Content-Type": "application/json"
# }
country_iso_code = data["geoip_data"]["country_iso_code"]
asn = data["geoip_data"]["asn_autonomous_system_number"]
asn_org = data["geoip_data"]["asn_autonomous_system_organization"]
list_on = data["list_on"]
# response = requests.get(api_url, headers=headers, params=params)
# payload = response.json()
sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3)
sanitized_asn = sanitize_for_storage(asn, 100)
sanitized_asn_org = sanitize_for_storage(asn_org, 100)
sanitized_list_on = sanitize_dict(list_on, 100000)
# if payload["results"]:
# data = payload["results"][0]
# country_iso_code = data["geoip_data"]["country_iso_code"]
# asn = data["geoip_data"]["asn_autonomous_system_number"]
# asn_org = data["geoip_data"]["asn_autonomous_system_organization"]
# list_on = data["list_on"]
# sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3)
# sanitized_asn = sanitize_for_storage(asn, 100)
# sanitized_asn_org = sanitize_for_storage(asn_org, 100)
# sanitized_list_on = sanitize_dict(list_on, 100000)
self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on)
# self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on)
return
# return

View File

@@ -9,11 +9,12 @@ import os
import stat
from datetime import datetime
from typing import Optional, List, Dict, Any
from zoneinfo import ZoneInfo
from sqlalchemy import create_engine, func, distinct, case
from sqlalchemy.orm import sessionmaker, scoped_session, Session
from models import Base, AccessLog, CredentialAttempt, AttackDetection, IpStats
from models import Base, AccessLog, CredentialAttempt, AttackDetection, IpStats, CategoryHistory
from sanitizer import (
sanitize_ip,
sanitize_path,
@@ -22,6 +23,9 @@ from sanitizer import (
sanitize_attack_pattern,
)
from logger import get_app_logger
applogger = get_app_logger()
class DatabaseManager:
"""
@@ -127,7 +131,7 @@ class DatabaseManager:
method=method[:10],
is_suspicious=is_suspicious,
is_honeypot_trigger=is_honeypot_trigger,
timestamp=datetime.utcnow()
timestamp=datetime.now(tz=ZoneInfo('UTC'))
)
session.add(access_log)
session.flush() # Get the ID before committing
@@ -154,7 +158,7 @@ class DatabaseManager:
except Exception as e:
session.rollback()
# Log error but don't crash - database persistence is secondary to honeypot function
print(f"Database error persisting access: {e}")
applogger.critical(f"Database error persisting access: {e}")
return None
finally:
self.close_session()
@@ -185,7 +189,7 @@ class DatabaseManager:
path=sanitize_path(path),
username=sanitize_credential(username),
password=sanitize_credential(password),
timestamp=datetime.utcnow()
timestamp=datetime.now(tz=ZoneInfo('UTC'))
)
session.add(credential)
session.commit()
@@ -193,7 +197,7 @@ class DatabaseManager:
except Exception as e:
session.rollback()
print(f"Database error persisting credential: {e}")
applogger.critical(f"Database error persisting credential: {e}")
return None
finally:
self.close_session()
@@ -207,7 +211,7 @@ class DatabaseManager:
ip: IP address to update
"""
sanitized_ip = sanitize_ip(ip)
now = datetime.utcnow()
now = datetime.now(tz=ZoneInfo('UTC'))
ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first()
@@ -226,6 +230,7 @@ class DatabaseManager:
def update_ip_stats_analysis(self, ip: str, analyzed_metrics: Dict[str, object], category: str, category_scores: Dict[str, int], last_analysis: datetime) -> None:
"""
Update IP statistics (ip is already persisted).
Records category change in history if category has changed.
Args:
ip: IP address to update
@@ -235,16 +240,28 @@ class DatabaseManager:
last_analysis: timestamp of last analysis
"""
print(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}")
applogger.debug(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}")
applogger.info(f"IP: {ip} category has been updated to {category}")
session = self.session
sanitized_ip = sanitize_ip(ip)
ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first()
# Check if category has changed and record it
old_category = ip_stats.category
if old_category != category:
self._record_category_change(sanitized_ip, old_category, category, last_analysis)
ip_stats.analyzed_metrics = analyzed_metrics
ip_stats.category = category
ip_stats.category_scores = category_scores
ip_stats.last_analysis = last_analysis
try:
session.commit()
except Exception as e:
session.rollback()
print(f"Error updating IP stats analysis: {e}")
def manual_update_category(self, ip: str, category: str) -> None:
"""
@@ -256,13 +273,81 @@ class DatabaseManager:
"""
session = self.session
sanitized_ip = sanitize_ip(ip)
ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first()
# Record the manual category change
old_category = ip_stats.category
if old_category != category:
self._record_category_change(sanitized_ip, old_category, category, datetime.now(tz=ZoneInfo('UTC')))
ip_stats.category = category
ip_stats.manual_category = True
try:
session.commit()
except Exception as e:
session.rollback()
print(f"Error updating manual category: {e}")
def _record_category_change(self, ip: str, old_category: Optional[str], new_category: str, timestamp: datetime) -> None:
"""
Internal method to record category changes in history.
Only records if there's an actual change from a previous category.
Args:
ip: IP address
old_category: Previous category (None if first categorization)
new_category: New category
timestamp: When the change occurred
"""
# Don't record initial categorization (when old_category is None)
# Only record actual category changes
if old_category is None:
return
session = self.session
try:
history_entry = CategoryHistory(
ip=ip,
old_category=old_category,
new_category=new_category,
timestamp=timestamp
)
session.add(history_entry)
session.commit()
except Exception as e:
session.rollback()
applogger.error(f"Error recording category change: {e}")
def get_category_history(self, ip: str) -> List[Dict[str, Any]]:
"""
Retrieve category change history for a specific IP.
Args:
ip: IP address to get history for
Returns:
List of category change records ordered by timestamp
"""
session = self.session
try:
sanitized_ip = sanitize_ip(ip)
history = session.query(CategoryHistory).filter(
CategoryHistory.ip == sanitized_ip
).order_by(CategoryHistory.timestamp.asc()).all()
return [
{
'old_category': h.old_category,
'new_category': h.new_category,
'timestamp': h.timestamp.isoformat() + '+00:00'
}
for h in history
]
finally:
self.close_session()
def update_ip_rep_infos(self, ip: str, country_code: str, asn: str, asn_org: str, list_on: Dict[str,str]) -> None:
"""
@@ -326,7 +411,7 @@ class DatabaseManager:
'method': log.method,
'is_suspicious': log.is_suspicious,
'is_honeypot_trigger': log.is_honeypot_trigger,
'timestamp': log.timestamp.isoformat(),
'timestamp': log.timestamp.isoformat() + '+00:00',
'attack_types': [d.attack_type for d in log.attack_detections]
}
for log in logs
@@ -419,7 +504,7 @@ class DatabaseManager:
'path': attempt.path,
'username': attempt.username,
'password': attempt.password,
'timestamp': attempt.timestamp.isoformat()
'timestamp': attempt.timestamp.isoformat() + '+00:00'
}
for attempt in attempts
]
@@ -446,8 +531,8 @@ class DatabaseManager:
{
'ip': s.ip,
'total_requests': s.total_requests,
'first_seen': s.first_seen.isoformat(),
'last_seen': s.last_seen.isoformat(),
'first_seen': s.first_seen.isoformat() + '+00:00',
'last_seen': s.last_seen.isoformat() + '+00:00',
'country_code': s.country_code,
'city': s.city,
'asn': s.asn,
@@ -464,6 +549,47 @@ class DatabaseManager:
finally:
self.close_session()
def get_ip_stats_by_ip(self, ip: str) -> Optional[Dict[str, Any]]:
"""
Retrieve IP statistics for a specific IP address.
Args:
ip: The IP address to look up
Returns:
Dictionary with IP stats or None if not found
"""
session = self.session
try:
stat = session.query(IpStats).filter(IpStats.ip == ip).first()
if not stat:
return None
# Get category history for this IP
category_history = self.get_category_history(ip)
return {
'ip': stat.ip,
'total_requests': stat.total_requests,
'first_seen': stat.first_seen.isoformat() + '+00:00' if stat.first_seen else None,
'last_seen': stat.last_seen.isoformat() + '+00:00' if stat.last_seen else None,
'country_code': stat.country_code,
'city': stat.city,
'asn': stat.asn,
'asn_org': stat.asn_org,
'reputation_score': stat.reputation_score,
'reputation_source': stat.reputation_source,
'analyzed_metrics': stat.analyzed_metrics or {},
'category': stat.category,
'category_scores': stat.category_scores or {},
'manual_category': stat.manual_category,
'last_analysis': stat.last_analysis.isoformat() + '+00:00' if stat.last_analysis else None,
'category_history': category_history
}
finally:
self.close_session()
def get_dashboard_counts(self) -> Dict[str, int]:
"""
Get aggregate statistics for the dashboard.
@@ -592,7 +718,7 @@ class DatabaseManager:
'ip': log.ip,
'path': log.path,
'user_agent': log.user_agent,
'timestamp': log.timestamp.isoformat()
'timestamp': log.timestamp.isoformat() + '+00:00'
}
for log in logs
]
@@ -650,7 +776,7 @@ class DatabaseManager:
'ip': log.ip,
'path': log.path,
'user_agent': log.user_agent,
'timestamp': log.timestamp.isoformat(),
'timestamp': log.timestamp.isoformat() + '+00:00',
'attack_types': [d.attack_type for d in log.attack_detections]
}
for log in logs

View File

@@ -0,0 +1 @@
127.0.0.1

View File

@@ -407,17 +407,75 @@ class Handler(BaseHTTPRequestHandler):
self.end_headers()
try:
stats = self.tracker.get_stats()
self.wfile.write(generate_dashboard(stats).encode())
timezone = str(self.config.timezone) if self.config.timezone else 'UTC'
dashboard_path = self.config.dashboard_secret_path
self.wfile.write(generate_dashboard(stats, timezone, dashboard_path).encode())
except BrokenPipeError:
pass
except Exception as e:
self.app_logger.error(f"Error generating dashboard: {e}")
return
# API endpoint for fetching IP stats
if self.config.dashboard_secret_path and self.path.startswith(f"{self.config.dashboard_secret_path}/api/ip-stats/"):
ip_address = self.path.replace(f"{self.config.dashboard_secret_path}/api/ip-stats/", "")
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.send_header('Access-Control-Allow-Origin', '*')
# Prevent browser caching - force fresh data from database every time
self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0')
self.send_header('Pragma', 'no-cache')
self.send_header('Expires', '0')
self.end_headers()
try:
from database import get_database
import json
db = get_database()
ip_stats = db.get_ip_stats_by_ip(ip_address)
if ip_stats:
self.wfile.write(json.dumps(ip_stats).encode())
else:
self.wfile.write(json.dumps({'error': 'IP not found'}).encode())
except BrokenPipeError:
pass
except Exception as e:
self.app_logger.error(f"Error fetching IP stats: {e}")
self.wfile.write(json.dumps({'error': str(e)}).encode())
return
# API endpoint for downloading malicious IPs file
if self.config.dashboard_secret_path and self.path == f"{self.config.dashboard_secret_path}/api/download/malicious_ips.txt":
import os
file_path = os.path.join(os.path.dirname(__file__), 'exports', 'malicious_ips.txt')
try:
if os.path.exists(file_path):
with open(file_path, 'rb') as f:
content = f.read()
self.send_response(200)
self.send_header('Content-type', 'text/plain')
self.send_header('Content-Disposition', 'attachment; filename="malicious_ips.txt"')
self.send_header('Content-Length', str(len(content)))
self.end_headers()
self.wfile.write(content)
else:
self.send_response(404)
self.send_header('Content-type', 'text/plain')
self.end_headers()
self.wfile.write(b'File not found')
except BrokenPipeError:
pass
except Exception as e:
self.app_logger.error(f"Error serving malicious IPs file: {e}")
self.send_response(500)
self.send_header('Content-type', 'text/plain')
self.end_headers()
self.wfile.write(b'Internal server error')
return
self.tracker.record_access(client_ip, self.path, user_agent, method='GET')
self.analyzer.infer_user_category(client_ip)
self.analyzer.update_ip_rep_infos(client_ip)
# self.analyzer.infer_user_category(client_ip)
# self.analyzer.update_ip_rep_infos(client_ip)
if self.tracker.is_suspicious_user_agent(user_agent):
self.access_logger.warning(f"[SUSPICIOUS] {client_ip} - {user_agent[:50]} - {self.path}")

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""
Migration script to add CategoryHistory table to existing databases.
Run this once to upgrade your database schema.
"""
import sys
from pathlib import Path
# Add parent directory to path to import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
from database import get_database, DatabaseManager
from models import Base, CategoryHistory
def migrate():
"""Create CategoryHistory table if it doesn't exist."""
print("Starting migration: Adding CategoryHistory table...")
try:
db = get_database()
# Initialize database if not already done
if not db._initialized:
db.initialize()
# Create only the CategoryHistory table
CategoryHistory.__table__.create(db._engine, checkfirst=True)
print("✓ Migration completed successfully!")
print(" - CategoryHistory table created")
except Exception as e:
print(f"✗ Migration failed: {e}")
sys.exit(1)
if __name__ == "__main__":
migrate()

View File

@@ -150,4 +150,59 @@ class IpStats(Base):
def __repr__(self) -> str:
return f"<IpStats(ip='{self.ip}', total_requests={self.total_requests})>"
return f"<IpStats(ip='{self.ip}', total_requests={self.total_requests})>"
class CategoryHistory(Base):
"""
Records category changes for IP addresses over time.
Tracks when an IP's category changes, storing both the previous
and new category along with timestamp for timeline visualization.
"""
__tablename__ = 'category_history'
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True)
old_category: Mapped[Optional[str]] = mapped_column(String(50), nullable=True)
new_category: Mapped[str] = mapped_column(String(50), nullable=False)
timestamp: Mapped[datetime] = mapped_column(DateTime, nullable=False, default=datetime.utcnow, index=True)
# Composite index for efficient IP-based timeline queries
__table_args__ = (
Index('ix_category_history_ip_timestamp', 'ip', 'timestamp'),
)
def __repr__(self) -> str:
return f"<CategoryHistory(ip='{self.ip}', {self.old_category} -> {self.new_category})>"
# class IpLog(Base):
# """
# Records all IPs that have accessed the honeypot, along with aggregated stats and inferred user category.
# """
# __tablename__ = 'ip_logs'
# id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
# ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True)
# stats: Mapped[List[str]] = mapped_column(String(MAX_PATH_LENGTH))
# category: Mapped[str] = mapped_column(String(15))
# manual_category: Mapped[bool] = mapped_column(Boolean, default=False)
# last_analysis: Mapped[datetime] = mapped_column(DateTime, index=True),
# # Relationship to attack detections
# access_logs: Mapped[List["AccessLog"]] = relationship(
# "AccessLog",
# back_populates="ip",
# cascade="all, delete-orphan"
# )
# # Indexes for common queries
# __table_args__ = (
# Index('ix_access_logs_ip_timestamp', 'ip', 'timestamp'),
# Index('ix_access_logs_is_suspicious', 'is_suspicious'),
# Index('ix_access_logs_is_honeypot_trigger', 'is_honeypot_trigger'),
# )
# def __repr__(self) -> str:
# return f"<AccessLog(id={self.id}, ip='{self.ip}', path='{self.path[:50]}')>"

View File

@@ -14,6 +14,7 @@ from analyzer import Analyzer
from handler import Handler
from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger
from database import initialize_database
from tasks_master import get_tasksmaster
def print_usage():
@@ -92,6 +93,10 @@ def main():
except IOError:
app_logger.warning("Can't read input file. Using randomly generated links.")
# tasks master init
tasks_master = get_tasksmaster()
tasks_master.run_scheduled_tasks()
try:
app_logger.info(f'Starting deception server on port {config.port}...')
app_logger.info(f'Timezone configured: {tz.key}')

265
src/tasks/analyze_ips.py Normal file
View File

@@ -0,0 +1,265 @@
from sqlalchemy import select
from typing import Optional
from database import get_database, DatabaseManager
from zoneinfo import ZoneInfo
from pathlib import Path
from datetime import datetime, timedelta
import re
import urllib.parse
from wordlists import get_wordlists
from config import get_config
from logger import get_app_logger
import requests
from sanitizer import sanitize_for_storage, sanitize_dict
# ----------------------
# TASK CONFIG
# ----------------------
TASK_CONFIG = {
"name": "analyze-ips",
"cron": "*/1 * * * *",
"enabled": True,
"run_when_loaded": True
}
def main():
config = get_config()
db_manager = get_database()
app_logger = get_app_logger()
http_risky_methods_threshold = config.http_risky_methods_threshold
violated_robots_threshold = config.violated_robots_threshold
uneven_request_timing_threshold = config.uneven_request_timing_threshold
user_agents_used_threshold = config.user_agents_used_threshold
attack_urls_threshold = config.attack_urls_threshold
uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds
app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}")
score = {}
score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False}
#1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
weights = {
"attacker": {
"risky_http_methods": 6,
"robots_violations": 4,
"uneven_request_timing": 3,
"different_user_agents": 8,
"attack_url": 15
},
"good_crawler": {
"risky_http_methods": 1,
"robots_violations": 0,
"uneven_request_timing": 0,
"different_user_agents": 0,
"attack_url": 0
},
"bad_crawler": {
"risky_http_methods": 2,
"robots_violations": 7,
"uneven_request_timing": 0,
"different_user_agents": 5,
"attack_url": 5
},
"regular_user": {
"risky_http_methods": 0,
"robots_violations": 0,
"uneven_request_timing": 8,
"different_user_agents": 3,
"attack_url": 0
}
}
accesses = db_manager.get_access_logs(limit=999999999)
ips = {item['ip'] for item in accesses}
for ip in ips:
ip_accesses = [item for item in accesses if item["ip"] == ip]
total_accesses_count = len(accesses)
if total_accesses_count <= 0:
return
# Set category as "unknown" for the first 3 requests
if total_accesses_count < 3:
category = "unknown"
analyzed_metrics = {}
category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0}
last_analysis = datetime.now(tz=ZoneInfo('UTC'))
db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
return 0
#--------------------- HTTP Methods ---------------------
get_accesses_count = len([item for item in ip_accesses if item["method"] == "GET"])
post_accesses_count = len([item for item in ip_accesses if item["method"] == "POST"])
put_accesses_count = len([item for item in ip_accesses if item["method"] == "PUT"])
delete_accesses_count = len([item for item in ip_accesses if item["method"] == "DELETE"])
head_accesses_count = len([item for item in ip_accesses if item["method"] == "HEAD"])
options_accesses_count = len([item for item in ip_accesses if item["method"] == "OPTIONS"])
patch_accesses_count = len([item for item in ip_accesses if item["method"] == "PATCH"])
if total_accesses_count > http_risky_methods_threshold:
http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count
else:
http_method_attacker_score = 0
#print(f"HTTP Method attacker score: {http_method_attacker_score}")
if http_method_attacker_score >= http_risky_methods_threshold:
score["attacker"]["risky_http_methods"] = True
score["good_crawler"]["risky_http_methods"] = False
score["bad_crawler"]["risky_http_methods"] = True
score["regular_user"]["risky_http_methods"] = False
else:
score["attacker"]["risky_http_methods"] = False
score["good_crawler"]["risky_http_methods"] = True
score["bad_crawler"]["risky_http_methods"] = False
score["regular_user"]["risky_http_methods"] = False
#--------------------- Robots Violations ---------------------
#respect robots.txt and login/config pages access frequency
robots_disallows = []
robots_path = Path(__file__).parent.parent / "templates" / "html" / "robots.txt"
with open(robots_path, "r") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split(":")
if parts[0] == "Disallow":
parts[1] = parts[1].rstrip("/")
#print(f"DISALLOW {parts[1]}")
robots_disallows.append(parts[1].strip())
#if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker
violated_robots_count = len([item for item in ip_accesses if any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)])
#print(f"Violated robots count: {violated_robots_count}")
if total_accesses_count > 0:
violated_robots_ratio = violated_robots_count / total_accesses_count
else:
violated_robots_ratio = 0
if violated_robots_ratio >= violated_robots_threshold:
score["attacker"]["robots_violations"] = True
score["good_crawler"]["robots_violations"] = False
score["bad_crawler"]["robots_violations"] = True
score["regular_user"]["robots_violations"] = False
else:
score["attacker"]["robots_violations"] = False
score["good_crawler"]["robots_violations"] = False
score["bad_crawler"]["robots_violations"] = False
score["regular_user"]["robots_violations"] = False
#--------------------- Requests Timing ---------------------
#Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior
timestamps = [datetime.fromisoformat(item["timestamp"]) for item in ip_accesses]
now_utc = datetime.now(tz=ZoneInfo('UTC'))
timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)]
timestamps = sorted(timestamps, reverse=True)
time_diffs = []
for i in range(0, len(timestamps)-1):
diff = (timestamps[i] - timestamps[i+1]).total_seconds()
time_diffs.append(diff)
mean = 0
variance = 0
std = 0
cv = 0
if time_diffs:
mean = sum(time_diffs) / len(time_diffs)
variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
std = variance ** 0.5
cv = std/mean
app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}")
if cv >= uneven_request_timing_threshold:
score["attacker"]["uneven_request_timing"] = True
score["good_crawler"]["uneven_request_timing"] = False
score["bad_crawler"]["uneven_request_timing"] = False
score["regular_user"]["uneven_request_timing"] = True
else:
score["attacker"]["uneven_request_timing"] = False
score["good_crawler"]["uneven_request_timing"] = False
score["bad_crawler"]["uneven_request_timing"] = False
score["regular_user"]["uneven_request_timing"] = False
#--------------------- Different User Agents ---------------------
#Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers
user_agents_used = [item["user_agent"] for item in ip_accesses]
user_agents_used = list(dict.fromkeys(user_agents_used))
#print(f"User agents used: {user_agents_used}")
if len(user_agents_used) >= user_agents_used_threshold:
score["attacker"]["different_user_agents"] = True
score["good_crawler"]["different_user_agents"] = False
score["bad_crawler"]["different_user_agentss"] = True
score["regular_user"]["different_user_agents"] = False
else:
score["attacker"]["different_user_agents"] = False
score["good_crawler"]["different_user_agents"] = False
score["bad_crawler"]["different_user_agents"] = False
score["regular_user"]["different_user_agents"] = False
#--------------------- Attack URLs ---------------------
attack_urls_found_list = []
wl = get_wordlists()
if wl.attack_patterns:
queried_paths = [item["path"] for item in ip_accesses]
for queried_path in queried_paths:
# URL decode the path to catch encoded attacks
try:
decoded_path = urllib.parse.unquote(queried_path)
# Double decode to catch double-encoded attacks
decoded_path_twice = urllib.parse.unquote(decoded_path)
except Exception:
decoded_path = queried_path
decoded_path_twice = queried_path
for name, pattern in wl.attack_patterns.items():
# Check original, decoded, and double-decoded paths
if (re.search(pattern, queried_path, re.IGNORECASE) or
re.search(pattern, decoded_path, re.IGNORECASE) or
re.search(pattern, decoded_path_twice, re.IGNORECASE)):
attack_urls_found_list.append(f"{name}: {pattern}")
#remove duplicates
attack_urls_found_list = set(attack_urls_found_list)
attack_urls_found_list = list(attack_urls_found_list)
if len(attack_urls_found_list) >= attack_urls_threshold:
score["attacker"]["attack_url"] = True
score["good_crawler"]["attack_url"] = False
score["bad_crawler"]["attack_url"] = False
score["regular_user"]["attack_url"] = False
else:
score["attacker"]["attack_url"] = False
score["good_crawler"]["attack_url"] = False
score["bad_crawler"]["attack_url"] = False
score["regular_user"]["attack_url"] = False
#--------------------- Calculate score ---------------------
attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0
attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"]
attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"]
attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"]
attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"]
attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"]
good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"]
good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"]
good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"]
good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"]
good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"]
bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"]
bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"]
regular_user_score = score["regular_user"]["risky_http_methods"] * weights["regular_user"]["risky_http_methods"]
regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"]
regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"]
regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"]
regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"]
score_details = f"""
Attacker score: {attacker_score}
Good Crawler score: {good_crawler_score}
Bad Crawler score: {bad_crawler_score}
Regular User score: {regular_user_score}
"""
app_logger.debug(score_details)
analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list}
category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score}
category = max(category_scores, key=category_scores.get)
last_analysis = datetime.now(tz=ZoneInfo('UTC'))
db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis)
return

59
src/tasks/fetch_ip_rep.py Normal file
View File

@@ -0,0 +1,59 @@
from sqlalchemy import select
from typing import Optional
from database import get_database, DatabaseManager
from zoneinfo import ZoneInfo
from pathlib import Path
from datetime import datetime, timedelta
import re
import urllib.parse
from wordlists import get_wordlists
from config import get_config
from logger import get_app_logger
import requests
from sanitizer import sanitize_for_storage, sanitize_dict
# ----------------------
# TASK CONFIG
# ----------------------
TASK_CONFIG = {
"name": "fetch-ip-rep",
"cron": "*/1 * * * *",
"enabled": True,
"run_when_loaded": True
}
def main():
config = get_config()
db_manager = get_database()
app_logger = get_app_logger()
accesses = db_manager.get_access_logs(limit=999999999)
ips = {item['ip'] for item in accesses}
for ip in ips:
api_url = "https://iprep.lcrawl.com/api/iprep/"
params = {
"cidr": ip
}
headers = {
"Content-Type": "application/json"
}
response = requests.get(api_url, headers=headers, params=params)
payload = response.json()
if payload["results"]:
data = payload["results"][0]
country_iso_code = data["geoip_data"]["country_iso_code"]
asn = data["geoip_data"]["asn_autonomous_system_number"]
asn_org = data["geoip_data"]["asn_autonomous_system_organization"]
list_on = data["list_on"]
sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3)
sanitized_asn = sanitize_for_storage(asn, 100)
sanitized_asn_org = sanitize_for_storage(asn_org, 100)
sanitized_list_on = sanitize_dict(list_on, 100000)
db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on)
return

View File

@@ -0,0 +1,57 @@
# tasks/export_malicious_ips.py
import os
from logger import get_app_logger
from database import get_database
from models import AccessLog
from sqlalchemy import distinct
app_logger = get_app_logger()
# ----------------------
# TASK CONFIG
# ----------------------
TASK_CONFIG = {
"name": "export-malicious-ips",
"cron": "*/5 * * * *",
"enabled": True,
"run_when_loaded": True
}
EXPORTS_DIR = "exports"
OUTPUT_FILE = os.path.join(EXPORTS_DIR, "malicious_ips.txt")
# ----------------------
# TASK LOGIC
# ----------------------
def main():
"""
Export all IPs flagged as suspicious to a text file.
TasksMaster will call this function based on the cron schedule.
"""
task_name = TASK_CONFIG.get("name")
app_logger.info(f"[Background Task] {task_name} starting...")
try:
db = get_database()
session = db.session
# Query distinct suspicious IPs
results = session.query(distinct(AccessLog.ip)).filter(
AccessLog.is_suspicious == True
).all()
# Ensure exports directory exists
os.makedirs(EXPORTS_DIR, exist_ok=True)
# Write IPs to file (one per line)
with open(OUTPUT_FILE, 'w') as f:
for (ip,) in results:
f.write(f"{ip}\n")
app_logger.info(f"[Background Task] {task_name} exported {len(results)} IPs to {OUTPUT_FILE}")
except Exception as e:
app_logger.error(f"[Background Task] {task_name} failed: {e}")
finally:
db.close_session()

288
src/tasks_master.py Normal file
View File

@@ -0,0 +1,288 @@
import os
import sys
import datetime
import functools
import threading
import importlib
import importlib.util
from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger
app_logger = get_app_logger()
try:
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
except ModuleNotFoundError:
msg = (
"Required modules are not installed. "
"Can not continue with module / application loading.\n"
"Install it with: pip install -r requirements"
)
print(msg, file=sys.stderr)
app_logger.error(msg)
exit()
# ---------- TASKSMASTER CLASS ----------
class TasksMaster:
TASK_DEFAULT_CRON = '*/15 * * * *'
TASK_JITTER = 240
TASKS_FOLDER = os.path.join(os.path.dirname(__file__), "tasks")
def __init__(self, scheduler: BackgroundScheduler):
self.tasks = self._config_tasks()
self.scheduler = scheduler
self.last_run_times = {}
self.scheduler.add_listener(self.job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
def _config_tasks(self):
"""
Loads tasks from the TASKS_FOLDER and logs how many were found.
"""
tasks_defined = self._load_tasks_from_folder(self.TASKS_FOLDER)
app_logger.info(f"Scheduled Tasks Loaded from folder: {self.TASKS_FOLDER}")
return tasks_defined
def _load_tasks_from_folder(self, folder_path):
"""
Loads and registers task modules from a specified folder.
This function scans the given folder for Python (.py) files, dynamically
imports each as a module, and looks for two attributes:
- TASK_CONFIG: A dictionary containing task metadata, specifically the
'name' and 'cron' (cron schedule string).
- main: A callable function that represents the task's execution logic.
Tasks with both attributes are added to a list with their configuration and
execution function.
Args:
folder_path (str): Path to the folder containing task scripts.
Returns:
list[dict]: A list of task definitions with keys:
- 'name' (str): The name of the task.
- 'filename' (str): The file the task was loaded from.
- 'cron' (str): The crontab string for scheduling.
- 'enabled' (bool): Whether the task is enabled.
- 'run_when_loaded' (bool): Whether to run the task immediately.
"""
tasks = []
if not os.path.exists(folder_path):
app_logger.error(f"{folder_path} does not exist! Unable to load tasks!")
return tasks
# we sort the files so that we have a set order, which helps with debugging
for filename in sorted(os.listdir(folder_path)):
# skip any non python files, as well as any __pycache__ or .pyc files that might creep in there
if not filename.endswith('.py') or filename.startswith("__"):
continue
path = os.path.join(folder_path, filename)
module_name = filename[:-3]
spec = importlib.util.spec_from_file_location(f"tasks.{module_name}", path)
module = importlib.util.module_from_spec(spec)
try:
spec.loader.exec_module(module)
sys.modules[f"tasks.{module_name}"] = module
except Exception as e:
app_logger.error(f"Failed to import {filename}: {e}")
continue
# if we have a tasks config and a main function, we attempt to schedule it
if hasattr(module, 'TASK_CONFIG') and hasattr(module, 'main'):
# ensure task_config is a dict
if not isinstance(module.TASK_CONFIG, dict):
app_logger.error(f"TASK_CONFIG is not a dict in {filename}. Skipping task.")
continue
task_cron = module.TASK_CONFIG.get("cron") or self.TASK_DEFAULT_CRON
task_name = module.TASK_CONFIG.get("name", module_name)
# ensure the task_cron is a valid cron value
try:
CronTrigger.from_crontab(task_cron)
except ValueError as ve:
app_logger.error(f"Invalid cron format for task {task_name}: {ve} - Skipping this task")
continue
task = {
'name': module.TASK_CONFIG.get('name', module_name),
'filename': filename,
'cron': task_cron,
"enabled": module.TASK_CONFIG.get("enabled", False),
"run_when_loaded": module.TASK_CONFIG.get("run_when_loaded", False)
}
tasks.append(task)
# we are missing things, and we log what's missing
else:
if not hasattr(module, 'TASK_CONFIG'):
app_logger.warning(f"Missing TASK_CONFIG in {filename}")
elif not hasattr(module, 'main'):
app_logger.warning(f"Missing main() in {filename}")
return tasks
def _add_jobs(self):
# for each task in the tasks config file...
for task_to_run in self.tasks:
# remember, these tasks, are built from the "load_tasks_from_folder" function,
# if you want to pass data from the TASKS_CONFIG dict, you need to pass it there to get it here.
task_name = task_to_run.get("name")
run_when_loaded = task_to_run.get("run_when_loaded")
module_name = os.path.splitext(task_to_run.get("filename"))[0]
task_enabled = task_to_run.get("enabled", False)
# if no crontab set for this task, we use 15 as the default.
task_cron = task_to_run.get("cron") or self.TASK_DEFAULT_CRON
# if task is disabled, skip this one
if not task_enabled:
app_logger.info(f"{task_name} is disabled in client config. Skipping task")
continue
try:
if os.path.isfile(os.path.join(self.TASKS_FOLDER, task_to_run.get("filename"))):
# schedule the task now that everything has checked out above...
self._schedule_task(task_name, module_name, task_cron, run_when_loaded)
app_logger.info(f"Scheduled {module_name} cron is set to {task_cron}.", extra={"task": task_to_run})
else:
app_logger.info(f"Skipping invalid or unsafe file: {task_to_run.get('filename')}", extra={"task": task_to_run})
except Exception as e:
app_logger.error(f"Error scheduling task: {e}", extra={"tasks": task_to_run})
def _schedule_task(self, task_name, module_name, task_cron, run_when_loaded):
try:
# Dynamically import the module
module = importlib.import_module(f"tasks.{module_name}")
# Check if the module has a 'main' function
if hasattr(module, 'main'):
app_logger.info(f"Scheduling {task_name} - {module_name} Main Function")
# unique_job_id
job_identifier = f"{module_name}__{task_name}"
# little insurance to make sure the cron is set to something and not none
if task_cron is None:
task_cron = self.TASK_DEFAULT_CRON
trigger = CronTrigger.from_crontab(task_cron)
# schedule the task / job
if run_when_loaded:
app_logger.info(f"Task: {task_name} is set to run instantly. Scheduling to run on scheduler start")
self.scheduler.add_job(
module.main,
trigger,
id=job_identifier,
jitter=self.TASK_JITTER,
name=task_name,
next_run_time=datetime.datetime.now(),
max_instances=1
)
else:
self.scheduler.add_job(
module.main,
trigger,
id=job_identifier,
jitter=self.TASK_JITTER,
name=task_name,
max_instances=1
)
else:
app_logger.error(f"{module_name} does not define a 'main' function.")
except Exception as e:
app_logger.error(f"Failed to load {module_name}: {e}")
def job_listener(self, event):
job_id = event.job_id
self.last_run_times[job_id] = datetime.datetime.now()
if event.exception:
app_logger.error(f"Job {event.job_id} failed: {event.exception}")
else:
app_logger.info(f"Job {event.job_id} completed successfully.")
def list_jobs(self):
scheduled_jobs = self.scheduler.get_jobs()
jobs_list = []
for job in scheduled_jobs:
jobs_list.append({
"id": job.id,
"name": job.name,
"next_run": job.next_run_time,
})
return jobs_list
def run_scheduled_tasks(self):
"""
Runs and schedules enabled tasks using the background scheduler.
This method performs the following:
1. Retrieves the current task configurations and updates internal state.
2. Adds new jobs to the scheduler based on the latest configuration.
3. Starts the scheduler to begin executing tasks at their defined intervals.
This ensures the scheduler is always running with the most up-to-date
task definitions and enabled status.
"""
# Add enabled tasks to the scheduler
self._add_jobs()
# Start the scheduler to begin executing the scheduled tasks (if not already running)
if not self.scheduler.running:
self.scheduler.start()
# ---------- SINGLETON WRAPPER ----------
T = type
def singleton_loader(func):
"""Decorator to ensure only one instance exists."""
cache: dict[str, T] = {}
lock = threading.Lock()
@functools.wraps(func)
def wrapper(*args, **kwargs) -> T:
with lock:
if func.__name__ not in cache:
cache[func.__name__] = func(*args, **kwargs)
return cache[func.__name__]
return wrapper
@singleton_loader
def get_tasksmaster(scheduler: BackgroundScheduler | None = None) -> TasksMaster:
"""
Returns the singleton TasksMaster instance.
- Automatically creates a BackgroundScheduler if none is provided.
- Automatically starts the scheduler when the singleton is created.
:param scheduler: Optional APScheduler instance. If None, a new BackgroundScheduler will be created.
"""
if scheduler is None:
scheduler = BackgroundScheduler()
tm_instance = TasksMaster(scheduler)
# Auto-start scheduler if not already running
if not scheduler.running:
scheduler.start()
app_logger.info("TasksMaster scheduler started automatically with singleton creation.")
return tm_instance

View File

@@ -7,6 +7,7 @@ Customize this template to change the dashboard appearance.
import html
from datetime import datetime
from zoneinfo import ZoneInfo
def _escape(value) -> str:
"""Escape HTML special characters to prevent XSS attacks."""
@@ -14,22 +15,52 @@ def _escape(value) -> str:
return ""
return html.escape(str(value))
def format_timestamp(iso_timestamp: str) -> str:
"""Format ISO timestamp for display (YYYY-MM-DD HH:MM:SS)"""
def format_timestamp(iso_timestamp: str, timezone: str = 'UTC', time_only: bool = False) -> str:
"""Format ISO timestamp for display with timezone conversion
Args:
iso_timestamp: ISO format timestamp string (UTC)
timezone: IANA timezone string to convert to
time_only: If True, return only HH:MM:SS, otherwise full datetime
"""
try:
# Parse UTC timestamp
dt = datetime.fromisoformat(iso_timestamp)
# Convert to target timezone
if dt.tzinfo is not None:
dt = dt.astimezone(ZoneInfo(timezone))
if time_only:
return dt.strftime("%H:%M:%S")
return dt.strftime("%Y-%m-%d %H:%M:%S")
except Exception:
# Fallback for old format
return iso_timestamp.split("T")[1][:8] if "T" in iso_timestamp else iso_timestamp
def generate_dashboard(stats: dict) -> str:
"""Generate dashboard HTML with access statistics"""
def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = '') -> str:
"""Generate dashboard HTML with access statistics
# Generate IP rows (IPs are generally safe but escape for consistency)
Args:
stats: Statistics dictionary
timezone: IANA timezone string (e.g., 'Europe/Paris', 'America/New_York')
dashboard_path: The secret dashboard path for generating API URLs
"""
# Generate IP rows with clickable functionality for dropdown stats
top_ips_rows = '\n'.join([
f'<tr><td class="rank">{i+1}</td><td>{_escape(ip)}</td><td>{count}</td></tr>'
f'''<tr class="ip-row" data-ip="{_escape(ip)}">
<td class="rank">{i+1}</td>
<td class="ip-clickable">{_escape(ip)}</td>
<td>{count}</td>
</tr>
<tr class="ip-stats-row" id="stats-row-{_escape(ip).replace(".", "-")}" style="display: none;">
<td colspan="3" class="ip-stats-cell">
<div class="ip-stats-dropdown">
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>'''
for i, (ip, count) in enumerate(stats['top_ips'])
]) or '<tr><td colspan="3" style="text-align:center;">No data</td></tr>'
@@ -45,27 +76,76 @@ def generate_dashboard(stats: dict) -> str:
for i, (ua, count) in enumerate(stats['top_user_agents'])
]) or '<tr><td colspan="3" style="text-align:center;">No data</td></tr>'
# Generate suspicious accesses rows (CRITICAL: multiple user-controlled fields)
# Generate suspicious accesses rows with clickable IPs
suspicious_rows = '\n'.join([
f'<tr><td>{_escape(log["ip"])}</td><td>{_escape(log["path"])}</td><td style="word-break: break-all;">{_escape(log["user_agent"][:60])}</td><td>{_escape(log["timestamp"].split("T")[1][:8])}</td></tr>'
f'''<tr class="ip-row" data-ip="{_escape(log["ip"])}">
<td class="ip-clickable">{_escape(log["ip"])}</td>
<td>{_escape(log["path"])}</td>
<td style="word-break: break-all;">{_escape(log["user_agent"][:60])}</td>
<td>{format_timestamp(log["timestamp"], timezone, time_only=True)}</td>
</tr>
<tr class="ip-stats-row" id="stats-row-suspicious-{_escape(log["ip"]).replace(".", "-")}" style="display: none;">
<td colspan="4" class="ip-stats-cell">
<div class="ip-stats-dropdown">
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>'''
for log in stats['recent_suspicious'][-10:]
]) or '<tr><td colspan="4" style="text-align:center;">No suspicious activity detected</td></tr>'
# Generate honeypot triggered IPs rows
# Generate honeypot triggered IPs rows with clickable IPs
honeypot_rows = '\n'.join([
f'<tr><td>{_escape(ip)}</td><td style="word-break: break-all;">{_escape(", ".join(paths))}</td><td>{len(paths)}</td></tr>'
f'''<tr class="ip-row" data-ip="{_escape(ip)}">
<td class="ip-clickable">{_escape(ip)}</td>
<td style="word-break: break-all;">{_escape(", ".join(paths))}</td>
<td>{len(paths)}</td>
</tr>
<tr class="ip-stats-row" id="stats-row-honeypot-{_escape(ip).replace(".", "-")}" style="display: none;">
<td colspan="3" class="ip-stats-cell">
<div class="ip-stats-dropdown">
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>'''
for ip, paths in stats.get('honeypot_triggered_ips', [])
]) or '<tr><td colspan="3" style="text-align:center;">No honeypot triggers yet</td></tr>'
# Generate attack types rows (CRITICAL: paths and user agents are user-controlled)
# Generate attack types rows with clickable IPs
attack_type_rows = '\n'.join([
f'<tr><td>{_escape(log["ip"])}</td><td>{_escape(log["path"])}</td><td>{_escape(", ".join(log["attack_types"]))}</td><td style="word-break: break-all;">{_escape(log["user_agent"][:60])}</td><td>{_escape(log["timestamp"].split("T")[1][:8])}</td></tr>'
f'''<tr class="ip-row" data-ip="{_escape(log["ip"])}">
<td class="ip-clickable">{_escape(log["ip"])}</td>
<td>{_escape(log["path"])}</td>
<td>{_escape(", ".join(log["attack_types"]))}</td>
<td style="word-break: break-all;">{_escape(log["user_agent"][:60])}</td>
<td>{format_timestamp(log["timestamp"], timezone, time_only=True)}</td>
</tr>
<tr class="ip-stats-row" id="stats-row-attack-{_escape(log["ip"]).replace(".", "-")}" style="display: none;">
<td colspan="5" class="ip-stats-cell">
<div class="ip-stats-dropdown">
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>'''
for log in stats.get('attack_types', [])[-10:]
]) or '<tr><td colspan="4" style="text-align:center;">No attacks detected</td></tr>'
# Generate credential attempts rows (CRITICAL: usernames and passwords are user-controlled)
# Generate credential attempts rows with clickable IPs
credential_rows = '\n'.join([
f'<tr><td>{_escape(log["ip"])}</td><td>{_escape(log["username"])}</td><td>{_escape(log["password"])}</td><td>{_escape(log["path"])}</td><td>{_escape(log["timestamp"].split("T")[1][:8])}</td></tr>'
f'''<tr class="ip-row" data-ip="{_escape(log["ip"])}">
<td class="ip-clickable">{_escape(log["ip"])}</td>
<td>{_escape(log["username"])}</td>
<td>{_escape(log["password"])}</td>
<td>{_escape(log["path"])}</td>
<td>{format_timestamp(log["timestamp"], timezone, time_only=True)}</td>
</tr>
<tr class="ip-stats-row" id="stats-row-cred-{_escape(log["ip"]).replace(".", "-")}" style="display: none;">
<td colspan="5" class="ip-stats-cell">
<div class="ip-stats-dropdown">
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>'''
for log in stats.get('credential_attempts', [])[-20:]
]) or '<tr><td colspan="5" style="text-align:center;">No credentials captured yet</td></tr>'
@@ -85,12 +165,36 @@ def generate_dashboard(stats: dict) -> str:
.container {{
max-width: 1400px;
margin: 0 auto;
position: relative;
}}
h1 {{
color: #58a6ff;
text-align: center;
margin-bottom: 40px;
}}
.download-section {{
position: absolute;
top: 0;
right: 0;
}}
.download-btn {{
display: inline-block;
padding: 8px 14px;
background: #238636;
color: #ffffff;
text-decoration: none;
border-radius: 6px;
font-weight: 500;
font-size: 13px;
transition: background 0.2s;
border: 1px solid #2ea043;
}}
.download-btn:hover {{
background: #2ea043;
}}
.download-btn:active {{
background: #1f7a2f;
}}
.stats-grid {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
@@ -180,10 +284,202 @@ def generate_dashboard(stats: dict) -> str:
content: '';
opacity: 1;
}}
.ip-row {{
transition: background-color 0.2s;
}}
.ip-clickable {{
cursor: pointer;
color: #58a6ff !important;
font-weight: 500;
text-decoration: underline;
text-decoration-style: dotted;
text-underline-offset: 3px;
}}
.ip-clickable:hover {{
color: #79c0ff !important;
text-decoration-style: solid;
background: #1c2128;
}}
.ip-stats-row {{
background: #0d1117;
}}
.ip-stats-cell {{
padding: 0 !important;
}}
.ip-stats-dropdown {{
margin-top: 10px;
padding: 15px;
background: #0d1117;
border: 1px solid #30363d;
border-radius: 6px;
font-size: 13px;
display: flex;
gap: 20px;
}}
.stats-left {{
flex: 1;
}}
.stats-right {{
flex: 0 0 200px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
}}
.radar-chart {{
position: relative;
width: 220px;
height: 220px;
overflow: visible;
}}
.radar-legend {{
margin-top: 10px;
font-size: 11px;
}}
.radar-legend-item {{
display: flex;
align-items: center;
gap: 6px;
margin: 3px 0;
}}
.radar-legend-color {{
width: 12px;
height: 12px;
border-radius: 2px;
}}
.ip-stats-dropdown .loading {{
color: #8b949e;
font-style: italic;
}}
.stat-row {{
display: flex;
justify-content: space-between;
padding: 5px 0;
border-bottom: 1px solid #21262d;
}}
.stat-row:last-child {{
border-bottom: none;
}}
.stat-label-sm {{
color: #8b949e;
font-weight: 500;
}}
.stat-value-sm {{
color: #58a6ff;
font-weight: 600;
}}
.category-badge {{
display: inline-block;
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: 600;
text-transform: uppercase;
}}
.category-attacker {{
background: #f851491a;
color: #f85149;
border: 1px solid #f85149;
}}
.category-good-crawler {{
background: #3fb9501a;
color: #3fb950;
border: 1px solid #3fb950;
}}
.category-bad-crawler {{
background: #f0883e1a;
color: #f0883e;
border: 1px solid #f0883e;
}}
.category-regular-user {{
background: #58a6ff1a;
color: #58a6ff;
border: 1px solid #58a6ff;
}}
.category-unknown {{
background: #8b949e1a;
color: #8b949e;
border: 1px solid #8b949e;
}}
.timeline-container {{
margin-top: 15px;
padding-top: 15px;
border-top: 1px solid #30363d;
}}
.timeline-title {{
color: #58a6ff;
font-size: 13px;
font-weight: 600;
margin-bottom: 10px;
}}
.timeline {{
position: relative;
padding-left: 30px;
}}
.timeline::before {{
content: '';
position: absolute;
left: 12px;
top: 5px;
bottom: 5px;
width: 3px;
background: #30363d;
}}
.timeline-item {{
position: relative;
padding-bottom: 15px;
}}
.timeline-item:last-child {{
padding-bottom: 0;
}}
.timeline-marker {{
position: absolute;
left: -26px;
width: 16px;
height: 16px;
border-radius: 50%;
border: 2px solid #0d1117;
}}
.timeline-marker.attacker {{
background: #f85149;
}}
.timeline-marker.good-crawler {{
background: #3fb950;
}}
.timeline-marker.bad-crawler {{
background: #f0883e;
}}
.timeline-marker.regular-user {{
background: #58a6ff;
}}
.timeline-marker.unknown {{
background: #8b949e;
}}
.timeline-content {{
font-size: 12px;
}}
.timeline-category {{
font-weight: 600;
}}
.timeline-timestamp {{
color: #8b949e;
font-size: 11px;
margin-top: 2px;
}}
.timeline-arrow {{
color: #8b949e;
margin: 0 7px;
}}
</style>
</head>
<body>
<div class="container">
<div class="download-section">
<a href="{dashboard_path}/api/download/malicious_ips.txt" class="download-btn" download>
Export Malicious IPs
</a>
</div>
<h1>Krawl Dashboard</h1>
<div class="stats-grid">
@@ -331,6 +627,31 @@ def generate_dashboard(stats: dict) -> str:
</div>
</div>
<script>
// Server timezone configuration
const SERVER_TIMEZONE = '{timezone}';
const DASHBOARD_PATH = '{dashboard_path}';
// Convert UTC timestamp to configured timezone
function formatTimestamp(isoTimestamp) {{
if (!isoTimestamp) return 'N/A';
try {{
const date = new Date(isoTimestamp);
return date.toLocaleString('en-US', {{
timeZone: SERVER_TIMEZONE,
year: 'numeric',
month: '2-digit',
day: '2-digit',
hour: '2-digit',
minute: '2-digit',
second: '2-digit',
hour12: false
}});
}} catch (err) {{
console.error('Error formatting timestamp:', err);
return new Date(isoTimestamp).toLocaleString();
}}
}}
// Add sorting functionality to tables
document.querySelectorAll('th.sortable').forEach(header => {{
header.addEventListener('click', function() {{
@@ -387,6 +708,248 @@ def generate_dashboard(stats: dict) -> str:
rows.forEach(row => tbody.appendChild(row));
}});
}});
// IP stats dropdown functionality
document.querySelectorAll('.ip-clickable').forEach(cell => {{
cell.addEventListener('click', async function(e) {{
const row = e.currentTarget.closest('.ip-row');
if (!row) return;
const ip = row.getAttribute('data-ip');
const statsRow = row.nextElementSibling;
if (!statsRow || !statsRow.classList.contains('ip-stats-row')) return;
const isVisible = getComputedStyle(statsRow).display !== 'none';
document.querySelectorAll('.ip-stats-row').forEach(r => {{
r.style.display = 'none';
}});
if (isVisible) return;
statsRow.style.display = 'table-row';
const dropdown = statsRow.querySelector('.ip-stats-dropdown');
// Always fetch fresh data from database
if (dropdown) {{
dropdown.innerHTML = '<div class="loading">Loading stats...</div>';
try {{
const response = await fetch(`${{DASHBOARD_PATH}}/api/ip-stats/${{ip}}`, {{
cache: 'no-store',
headers: {{
'Cache-Control': 'no-cache',
'Pragma': 'no-cache'
}}
}});
if (!response.ok) throw new Error(`HTTP ${{response.status}}`);
const data = await response.json();
dropdown.innerHTML = data.error
? `<div style="color:#f85149;">Error: ${{data.error}}</div>`
: formatIpStats(data);
}} catch (err) {{
dropdown.innerHTML = `<div style="color:#f85149;">Failed to load stats: ${{err.message}}</div>`;
}}
}}
}});
}});
function formatIpStats(stats) {{
let html = '<div class="stats-left">';
// Basic info
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">Total Requests:</span>';
html += `<span class="stat-value-sm">${{stats.total_requests || 0}}</span>`;
html += '</div>';
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">First Seen:</span>';
html += `<span class="stat-value-sm">${{formatTimestamp(stats.first_seen)}}</span>`;
html += '</div>';
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">Last Seen:</span>';
html += `<span class="stat-value-sm">${{formatTimestamp(stats.last_seen)}}</span>`;
html += '</div>';
// Category
if (stats.category) {{
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">Category:</span>';
const categoryClass = 'category-' + stats.category.toLowerCase().replace('_', '-');
html += `<span class="category-badge ${{categoryClass}}">${{stats.category}}</span>`;
html += '</div>';
}}
// GeoIP info if available
if (stats.country_code || stats.city) {{
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">Location:</span>';
html += `<span class="stat-value-sm">${{stats.city || ''}}${{stats.city && stats.country_code ? ', ' : ''}}${{stats.country_code || 'Unknown'}}</span>`;
html += '</div>';
}}
if (stats.asn_org) {{
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">ASN Org:</span>';
html += `<span class="stat-value-sm">${{stats.asn_org}}</span>`;
html += '</div>';
}}
// Reputation score if available
if (stats.reputation_score !== null && stats.reputation_score !== undefined) {{
html += '<div class="stat-row">';
html += '<span class="stat-label-sm">Reputation Score:</span>';
html += `<span class="stat-value-sm">${{stats.reputation_score}} ${{stats.reputation_source ? '(' + stats.reputation_source + ')' : ''}}</span>`;
html += '</div>';
}}
// Category History Timeline
if (stats.category_history && stats.category_history.length > 0) {{
html += '<div class="timeline-container">';
html += '<div class="timeline-title">Behavior Timeline</div>';
html += '<div class="timeline">';
stats.category_history.forEach((change, index) => {{
const categoryClass = change.new_category.toLowerCase().replace('_', '-');
const timestamp = formatTimestamp(change.timestamp);
html += '<div class="timeline-item">';
html += `<div class="timeline-marker ${{categoryClass}}"></div>`;
html += '<div class="timeline-content">';
if (change.old_category) {{
const oldCategoryBadge = 'category-' + change.old_category.toLowerCase().replace('_', '-');
html += `<span class="category-badge ${{oldCategoryBadge}}">${{change.old_category}}</span>`;
html += '<span class="timeline-arrow">→</span>';
}} else {{
html += '<span style="color: #8b949e;">Initial:</span> ';
}}
const newCategoryBadge = 'category-' + change.new_category.toLowerCase().replace('_', '-');
html += `<span class="category-badge ${{newCategoryBadge}}">${{change.new_category}}</span>`;
html += `<div class="timeline-timestamp">${{timestamp}}</div>`;
html += '</div>';
html += '</div>';
}});
html += '</div>';
html += '</div>';
}}
html += '</div>';
// Radar chart on the right
if (stats.category_scores && Object.keys(stats.category_scores).length > 0) {{
html += '<div class="stats-right">';
html += '<div style="font-size: 13px; font-weight: 600; color: #58a6ff; margin-bottom: 10px;">Category Score</div>';
html += '<svg class="radar-chart" viewBox="-30 -30 260 260" preserveAspectRatio="xMidYMid meet">';
const scores = {{
attacker: stats.category_scores.attacker || 0,
good_crawler: stats.category_scores.good_crawler || 0,
bad_crawler: stats.category_scores.bad_crawler || 0,
regular_user: stats.category_scores.regular_user || 0,
unknown: stats.category_scores.unknown || 0
}};
// Normalize scores for better visualization
const maxScore = Math.max(...Object.values(scores), 1);
const minVisibleRadius = 0.15; // Minimum 15% visibility even for 0 values
const normalizedScores = {{}};
Object.keys(scores).forEach(key => {{
// Scale values: ensure minimum visibility + proportional to max
normalizedScores[key] = minVisibleRadius + (scores[key] / maxScore) * (1 - minVisibleRadius);
}});
const colors = {{
attacker: '#f85149',
good_crawler: '#3fb950',
bad_crawler: '#f0883e',
regular_user: '#58a6ff',
unknown: '#8b949e'
}};
const labels = {{
attacker: 'Attacker',
good_crawler: 'Good Bot',
bad_crawler: 'Bad Bot',
regular_user: 'User',
unknown: 'Unknown'
}};
// Draw radar background grid
const cx = 100, cy = 100, maxRadius = 75;
for (let i = 1; i <= 5; i++) {{
const r = (maxRadius / 5) * i;
html += `<circle cx="${{cx}}" cy="${{cy}}" r="${{r}}" fill="none" stroke="#30363d" stroke-width="0.5"/>`;
}}
// Draw axes (now with 5 points for pentagon)
const angles = [0, 72, 144, 216, 288];
const keys = ['good_crawler', 'regular_user', 'unknown', 'bad_crawler', 'attacker'];
angles.forEach((angle, i) => {{
const rad = (angle - 90) * Math.PI / 180;
const x2 = cx + maxRadius * Math.cos(rad);
const y2 = cy + maxRadius * Math.sin(rad);
html += `<line x1="${{cx}}" y1="${{cy}}" x2="${{x2}}" y2="${{y2}}" stroke="#30363d" stroke-width="0.5"/>`;
// Add labels at consistent distance
const labelDist = maxRadius + 35;
const lx = cx + labelDist * Math.cos(rad);
const ly = cy + labelDist * Math.sin(rad);
html += `<text x="${{lx}}" y="${{ly}}" fill="#8b949e" font-size="12" text-anchor="middle" dominant-baseline="middle">${{labels[keys[i]]}}</text>`;
}});
// Draw filled polygon for scores
let points = [];
angles.forEach((angle, i) => {{
const normalizedScore = normalizedScores[keys[i]];
const rad = (angle - 90) * Math.PI / 180;
const r = normalizedScore * maxRadius;
const x = cx + r * Math.cos(rad);
const y = cy + r * Math.sin(rad);
points.push(`${{x}},${{y}}`);
}});
// Determine dominant category color
const dominantKey = Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b);
const dominantColor = colors[dominantKey];
// Draw single colored area
html += `<polygon points="${{points.join(' ')}}" fill="${{dominantColor}}" fill-opacity="0.4" stroke="${{dominantColor}}" stroke-width="2.5"/>`;
// Draw points
angles.forEach((angle, i) => {{
const normalizedScore = normalizedScores[keys[i]];
const rad = (angle - 90) * Math.PI / 180;
const r = normalizedScore * maxRadius;
const x = cx + r * Math.cos(rad);
const y = cy + r * Math.sin(rad);
html += `<circle cx="${{x}}" cy="${{y}}" r="4.5" fill="${{colors[keys[i]]}}" stroke="#0d1117" stroke-width="2"/>`;
}});
html += '</svg>';
// Legend
html += '<div class="radar-legend">';
keys.forEach(key => {{
html += '<div class="radar-legend-item">';
html += `<div class="radar-legend-color" style="background: ${{colors[key]}};"></div>`;
html += `<span style="color: #8b949e;">${{labels[key]}}: ${{scores[key]}} pt</span>`;
html += '</div>';
}});
html += '</div>';
html += '</div>';
}}
return html;
}}
</script>
</body>
</html>

View File

@@ -46,21 +46,12 @@
gap: 10px;
align-items: center;
overflow-y: auto;
overflow-x: hidden;
flex: 1;
padding-top: 10px;
}}
.links-container::-webkit-scrollbar {{
width: 8px;
}}
.links-container::-webkit-scrollbar-track {{
background: #0d1117;
}}
.links-container::-webkit-scrollbar-thumb {{
background: #30363d;
border-radius: 4px;
}}
.links-container::-webkit-scrollbar-thumb:hover {{
background: #484f58;
width: 0px;
}}
.link-box {{
background: #161b22;

View File

@@ -131,7 +131,8 @@ class Wordlists:
@property
def attack_urls(self):
return self._data.get("attack_urls", [])
"""Deprecated: use attack_patterns instead. Returns attack_patterns for backward compatibility."""
return self._data.get("attack_patterns", {})
_wordlists_instance = None

View File

@@ -353,11 +353,14 @@
}
},
"attack_patterns": {
"path_traversal": "\\.\\.",
"path_traversal": "(\\.\\.|%2e%2e|%252e%252e|\\.{2,}|%c0%ae|%c1%9c)",
"sql_injection": "('|\"|`|--|#|/\\*|\\*/|\\bunion\\b|\\bunion\\s+select\\b|\\bor\\b.*=.*|\\band\\b.*=.*|'.*or.*'.*=.*'|\\bsleep\\b|\\bwaitfor\\b|\\bdelay\\b|\\bbenchmark\\b|;.*select|;.*drop|;.*insert|;.*update|;.*delete|\\bexec\\b|\\bexecute\\b|\\bxp_cmdshell\\b|information_schema|table_schema|table_name)",
"xss_attempt": "(<script|</script|javascript:|onerror=|onload=|onclick=|onmouseover=|onfocus=|onblur=|<iframe|<img|<svg|<embed|<object|<body|<input|eval\\(|alert\\(|prompt\\(|confirm\\(|document\\.|window\\.|<style|expression\\(|vbscript:|data:text/html)",
"common_probes": "(wp-admin|phpmyadmin|\\.env|\\.git|/admin|/config)",
"shell_injection": "(\\||;|`|\\$\\(|&&)"
"shell_injection": "(\\||;|`|\\$\\(|&&|\\bnc\\b|\\bnetcat\\b|\\bwget\\b|\\bcurl\\b|/bin/bash|/bin/sh|cmd\\.exe)",
"lfi_rfi": "(file://|php://|expect://|data://|zip://|phar://|/etc/passwd|/etc/shadow|/proc/self|c:\\\\windows)",
"xxe_injection": "(<!ENTITY|<!DOCTYPE|SYSTEM|PUBLIC)",
"ldap_injection": "(\\*\\)|\\(\\||\\(&)",
"command_injection": "(&&|\\|\\||;|\\$\\{|\\$\\(|`)"
},
"server_headers": [
"Apache/2.4.41 (Ubuntu)",
@@ -366,11 +369,5 @@
"cloudflare",
"AmazonS3",
"gunicorn/20.1.0"
],
"attack_urls": {
"path_traversal": "\\.\\.",
"sql_injection": "('|--|;|\bOR\b|\bUNION\b|\bSELECT\b|\bDROP\b)",
"xss_attempt": "(<script|javascript:|onerror=|onload=)",
"shell_injection": "(\\||;|`|\\$\\(|&&)"
}
]
}