From 7d9f0616b77a924c42b8c6efbdede6bf376c4ffd Mon Sep 17 00:00:00 2001 From: Phillip Tarrant Date: Mon, 5 Jan 2026 11:54:02 -0600 Subject: [PATCH 1/4] Add background task to export suspicious IPs to text file - Implement export-malicious-ips task that queries distinct IPs flagged as is_suspicious from database and writes to exports/malicious_ips.txt - Add exports volume mount to docker-compose.yaml for host persistence - Update entrypoint.sh to fix ownership of exports directory for krawl user - Update Dockerfile to create /app/exports directory during build Other tasks can be added by creating them in the tasks dir using the same setup as this task. All tasks *MUST* include a TASK_CONFIG dict and a main method in the file to work correctly. --- Dockerfile | 2 +- docker-compose.yaml | 1 + entrypoint.sh | 2 +- exports/.gitkeep | 0 requirements.txt | 3 + src/server.py | 5 + src/tasks/top_attacking_ips.py | 57 +++++++ src/tasks_master.py | 288 +++++++++++++++++++++++++++++++++ 8 files changed, 356 insertions(+), 2 deletions(-) create mode 100644 exports/.gitkeep create mode 100644 src/tasks/top_attacking_ips.py create mode 100644 src/tasks_master.py diff --git a/Dockerfile b/Dockerfile index 2c7b954..92c2d9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ COPY wordlists.json /app/ COPY entrypoint.sh /app/ RUN useradd -m -u 1000 krawl && \ - mkdir -p /app/logs /app/data && \ + mkdir -p /app/logs /app/data /app/exports && \ chown -R krawl:krawl /app && \ chmod +x /app/entrypoint.sh diff --git a/docker-compose.yaml b/docker-compose.yaml index 02b6ae7..08bcec9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,6 +12,7 @@ services: - ./wordlists.json:/app/wordlists.json:ro - ./config.yaml:/app/config.yaml:ro - ./logs:/app/logs + - ./exports:/app/exports environment: - CONFIG_LOCATION=config.yaml restart: unless-stopped diff --git a/entrypoint.sh b/entrypoint.sh index 28b5fc0..fe3ef45 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -2,7 +2,7 @@ set -e # Fix ownership of mounted directories -chown -R krawl:krawl /app/logs /app/data 2>/dev/null || true +chown -R krawl:krawl /app/logs /app/data /app/exports 2>/dev/null || true # Drop to krawl user and run the application exec gosu krawl "$@" diff --git a/exports/.gitkeep b/exports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 8cb6dc5..cafbb7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ PyYAML>=6.0 # Database ORM SQLAlchemy>=2.0.0,<3.0.0 + +# Scheduling +APScheduler>=3.11.2 \ No newline at end of file diff --git a/src/server.py b/src/server.py index 7a59c73..135284c 100644 --- a/src/server.py +++ b/src/server.py @@ -13,6 +13,7 @@ from tracker import AccessTracker from handler import Handler from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger from database import initialize_database +from tasks_master import get_tasksmaster def print_usage(): @@ -89,6 +90,10 @@ def main(): except IOError: app_logger.warning("Can't read input file. Using randomly generated links.") + # tasks master init + tasks_master = get_tasksmaster() + tasks_master.run_scheduled_tasks() + try: app_logger.info(f'Starting deception server on port {config.port}...') app_logger.info(f'Timezone configured: {tz.key}') diff --git a/src/tasks/top_attacking_ips.py b/src/tasks/top_attacking_ips.py new file mode 100644 index 0000000..d9e18d3 --- /dev/null +++ b/src/tasks/top_attacking_ips.py @@ -0,0 +1,57 @@ +# tasks/export_malicious_ips.py + +import os +from logger import get_app_logger +from database import get_database +from models import AccessLog +from sqlalchemy import distinct + +app_logger = get_app_logger() + +# ---------------------- +# TASK CONFIG +# ---------------------- +TASK_CONFIG = { + "name": "export-malicious-ips", + "cron": "*/5 * * * *", + "enabled": True, + "run_when_loaded": True +} + +EXPORTS_DIR = "exports" +OUTPUT_FILE = os.path.join(EXPORTS_DIR, "malicious_ips.txt") + +# ---------------------- +# TASK LOGIC +# ---------------------- +def main(): + """ + Export all IPs flagged as suspicious to a text file. + TasksMaster will call this function based on the cron schedule. + """ + task_name = TASK_CONFIG.get("name") + app_logger.info(f"[Background Task] {task_name} starting...") + + try: + db = get_database() + session = db.session + + # Query distinct suspicious IPs + results = session.query(distinct(AccessLog.ip)).filter( + AccessLog.is_suspicious == True + ).all() + + # Ensure exports directory exists + os.makedirs(EXPORTS_DIR, exist_ok=True) + + # Write IPs to file (one per line) + with open(OUTPUT_FILE, 'w') as f: + for (ip,) in results: + f.write(f"{ip}\n") + + app_logger.info(f"[Background Task] {task_name} exported {len(results)} IPs to {OUTPUT_FILE}") + + except Exception as e: + app_logger.error(f"[Background Task] {task_name} failed: {e}") + finally: + db.close_session() diff --git a/src/tasks_master.py b/src/tasks_master.py new file mode 100644 index 0000000..264471c --- /dev/null +++ b/src/tasks_master.py @@ -0,0 +1,288 @@ +import os +import sys +import datetime +import functools +import threading +import importlib +import importlib.util + +from logger import initialize_logging, get_app_logger, get_access_logger, get_credential_logger + +app_logger = get_app_logger() + +try: + from apscheduler.schedulers.background import BackgroundScheduler + from apscheduler.triggers.cron import CronTrigger + from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR +except ModuleNotFoundError: + msg = ( + "Required modules are not installed. " + "Can not continue with module / application loading.\n" + "Install it with: pip install -r requirements" + ) + print(msg, file=sys.stderr) + app_logger.error(msg) + exit() + + +# ---------- TASKSMASTER CLASS ---------- +class TasksMaster: + + TASK_DEFAULT_CRON = '*/15 * * * *' + TASK_JITTER = 240 + TASKS_FOLDER = os.path.join(os.path.dirname(__file__), "tasks") + + def __init__(self, scheduler: BackgroundScheduler): + self.tasks = self._config_tasks() + self.scheduler = scheduler + self.last_run_times = {} + self.scheduler.add_listener(self.job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR) + + def _config_tasks(self): + """ + Loads tasks from the TASKS_FOLDER and logs how many were found. + """ + tasks_defined = self._load_tasks_from_folder(self.TASKS_FOLDER) + app_logger.info(f"Scheduled Tasks Loaded from folder: {self.TASKS_FOLDER}") + return tasks_defined + + def _load_tasks_from_folder(self, folder_path): + """ + Loads and registers task modules from a specified folder. + + This function scans the given folder for Python (.py) files, dynamically + imports each as a module, and looks for two attributes: + - TASK_CONFIG: A dictionary containing task metadata, specifically the + 'name' and 'cron' (cron schedule string). + - main: A callable function that represents the task's execution logic. + + Tasks with both attributes are added to a list with their configuration and + execution function. + + Args: + folder_path (str): Path to the folder containing task scripts. + + Returns: + list[dict]: A list of task definitions with keys: + - 'name' (str): The name of the task. + - 'filename' (str): The file the task was loaded from. + - 'cron' (str): The crontab string for scheduling. + - 'enabled' (bool): Whether the task is enabled. + - 'run_when_loaded' (bool): Whether to run the task immediately. + """ + tasks = [] + + if not os.path.exists(folder_path): + app_logger.error(f"{folder_path} does not exist! Unable to load tasks!") + return tasks + + # we sort the files so that we have a set order, which helps with debugging + for filename in sorted(os.listdir(folder_path)): + + # skip any non python files, as well as any __pycache__ or .pyc files that might creep in there + if not filename.endswith('.py') or filename.startswith("__"): + continue + + path = os.path.join(folder_path, filename) + module_name = filename[:-3] + spec = importlib.util.spec_from_file_location(f"tasks.{module_name}", path) + module = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(module) + sys.modules[f"tasks.{module_name}"] = module + except Exception as e: + app_logger.error(f"Failed to import {filename}: {e}") + continue + + # if we have a tasks config and a main function, we attempt to schedule it + if hasattr(module, 'TASK_CONFIG') and hasattr(module, 'main'): + + # ensure task_config is a dict + if not isinstance(module.TASK_CONFIG, dict): + app_logger.error(f"TASK_CONFIG is not a dict in {filename}. Skipping task.") + continue + + task_cron = module.TASK_CONFIG.get("cron") or self.TASK_DEFAULT_CRON + task_name = module.TASK_CONFIG.get("name", module_name) + + # ensure the task_cron is a valid cron value + try: + CronTrigger.from_crontab(task_cron) + except ValueError as ve: + app_logger.error(f"Invalid cron format for task {task_name}: {ve} - Skipping this task") + continue + + task = { + 'name': module.TASK_CONFIG.get('name', module_name), + 'filename': filename, + 'cron': task_cron, + "enabled": module.TASK_CONFIG.get("enabled", False), + "run_when_loaded": module.TASK_CONFIG.get("run_when_loaded", False) + } + + tasks.append(task) + + # we are missing things, and we log what's missing + else: + if not hasattr(module, 'TASK_CONFIG'): + app_logger.warning(f"Missing TASK_CONFIG in {filename}") + elif not hasattr(module, 'main'): + app_logger.warning(f"Missing main() in {filename}") + + return tasks + + def _add_jobs(self): + # for each task in the tasks config file... + for task_to_run in self.tasks: + + # remember, these tasks, are built from the "load_tasks_from_folder" function, + # if you want to pass data from the TASKS_CONFIG dict, you need to pass it there to get it here. + task_name = task_to_run.get("name") + run_when_loaded = task_to_run.get("run_when_loaded") + module_name = os.path.splitext(task_to_run.get("filename"))[0] + task_enabled = task_to_run.get("enabled", False) + + # if no crontab set for this task, we use 15 as the default. + task_cron = task_to_run.get("cron") or self.TASK_DEFAULT_CRON + + # if task is disabled, skip this one + if not task_enabled: + app_logger.info(f"{task_name} is disabled in client config. Skipping task") + continue + try: + if os.path.isfile(os.path.join(self.TASKS_FOLDER, task_to_run.get("filename"))): + # schedule the task now that everything has checked out above... + self._schedule_task(task_name, module_name, task_cron, run_when_loaded) + app_logger.info(f"Scheduled {module_name} cron is set to {task_cron}.", extra={"task": task_to_run}) + else: + app_logger.info(f"Skipping invalid or unsafe file: {task_to_run.get('filename')}", extra={"task": task_to_run}) + + except Exception as e: + app_logger.error(f"Error scheduling task: {e}", extra={"tasks": task_to_run}) + + def _schedule_task(self, task_name, module_name, task_cron, run_when_loaded): + try: + # Dynamically import the module + module = importlib.import_module(f"tasks.{module_name}") + + # Check if the module has a 'main' function + if hasattr(module, 'main'): + app_logger.info(f"Scheduling {task_name} - {module_name} Main Function") + + # unique_job_id + job_identifier = f"{module_name}__{task_name}" + + # little insurance to make sure the cron is set to something and not none + if task_cron is None: + task_cron = self.TASK_DEFAULT_CRON + + trigger = CronTrigger.from_crontab(task_cron) + + # schedule the task / job + if run_when_loaded: + app_logger.info(f"Task: {task_name} is set to run instantly. Scheduling to run on scheduler start") + + self.scheduler.add_job( + module.main, + trigger, + id=job_identifier, + jitter=self.TASK_JITTER, + name=task_name, + next_run_time=datetime.datetime.now(), + max_instances=1 + ) + else: + self.scheduler.add_job( + module.main, + trigger, + id=job_identifier, + jitter=self.TASK_JITTER, + name=task_name, + max_instances=1 + ) + else: + app_logger.error(f"{module_name} does not define a 'main' function.") + + except Exception as e: + app_logger.error(f"Failed to load {module_name}: {e}") + + def job_listener(self, event): + job_id = event.job_id + self.last_run_times[job_id] = datetime.datetime.now() + + if event.exception: + app_logger.error(f"Job {event.job_id} failed: {event.exception}") + else: + app_logger.info(f"Job {event.job_id} completed successfully.") + + def list_jobs(self): + scheduled_jobs = self.scheduler.get_jobs() + jobs_list = [] + + for job in scheduled_jobs: + jobs_list.append({ + "id": job.id, + "name": job.name, + "next_run": job.next_run_time, + }) + return jobs_list + + def run_scheduled_tasks(self): + """ + Runs and schedules enabled tasks using the background scheduler. + + This method performs the following: + 1. Retrieves the current task configurations and updates internal state. + 2. Adds new jobs to the scheduler based on the latest configuration. + 3. Starts the scheduler to begin executing tasks at their defined intervals. + + This ensures the scheduler is always running with the most up-to-date + task definitions and enabled status. + """ + + # Add enabled tasks to the scheduler + self._add_jobs() + + # Start the scheduler to begin executing the scheduled tasks (if not already running) + if not self.scheduler.running: + self.scheduler.start() + + +# ---------- SINGLETON WRAPPER ---------- +T = type + +def singleton_loader(func): + """Decorator to ensure only one instance exists.""" + cache: dict[str, T] = {} + lock = threading.Lock() + + @functools.wraps(func) + def wrapper(*args, **kwargs) -> T: + with lock: + if func.__name__ not in cache: + cache[func.__name__] = func(*args, **kwargs) + return cache[func.__name__] + return wrapper + + +@singleton_loader +def get_tasksmaster(scheduler: BackgroundScheduler | None = None) -> TasksMaster: + """ + Returns the singleton TasksMaster instance. + + - Automatically creates a BackgroundScheduler if none is provided. + - Automatically starts the scheduler when the singleton is created. + + :param scheduler: Optional APScheduler instance. If None, a new BackgroundScheduler will be created. + """ + if scheduler is None: + scheduler = BackgroundScheduler() + + tm_instance = TasksMaster(scheduler) + + # Auto-start scheduler if not already running + if not scheduler.running: + scheduler.start() + app_logger.info("TasksMaster scheduler started automatically with singleton creation.") + + return tm_instance From edb288a27157cf85993dad9940f90c053caa3ae1 Mon Sep 17 00:00:00 2001 From: Phillip Tarrant Date: Wed, 7 Jan 2026 12:33:43 -0600 Subject: [PATCH 2/4] Fixed some print statements to leverage logging, pulled in most recent dev edits, added exports to gitignore --- .gitignore | 3 +++ src/analyzer.py | 18 ++++++++++++------ src/database.py | 12 ++++++++---- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 70b93e4..63ae0e9 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,6 @@ data/ # Personal canary tokens or sensitive configs *canary*token*.yaml personal-values.yaml + +#exports dir (keeping .gitkeep so we have the dir) +/exports/* \ No newline at end of file diff --git a/src/analyzer.py b/src/analyzer.py index a745813..b10e4e7 100644 --- a/src/analyzer.py +++ b/src/analyzer.py @@ -8,10 +8,13 @@ from datetime import datetime, timedelta import re from wordlists import get_wordlists from config import get_config +from logger import get_app_logger """ Functions for user activity analysis """ +app_logger = get_app_logger() + class Analyzer: """ Analyzes users activity and produces aggregated insights @@ -56,7 +59,7 @@ class Analyzer: attack_urls_threshold = config.attack_urls_threshold uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds - print(f"http_risky_methods_threshold: {http_risky_methods_threshold}") + app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}") score = {} score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} @@ -185,7 +188,7 @@ class Analyzer: variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs) std = variance ** 0.5 cv = std/mean - print(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") + app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") if cv >= uneven_request_timing_threshold: score["attacker"]["uneven_request_timing"] = True @@ -268,10 +271,13 @@ class Analyzer: regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"] regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"] - print(f"Attacker score: {attacker_score}") - print(f"Good Crawler score: {good_crawler_score}") - print(f"Bad Crawler score: {bad_crawler_score}") - print(f"Regular User score: {regular_user_score}") + score_details = f""" + Attacker score: {attacker_score} + Good Crawler score: {good_crawler_score} + Bad Crawler score: {bad_crawler_score} + Regular User score: {regular_user_score} + """ + app_logger.debug(score_details) analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} diff --git a/src/database.py b/src/database.py index 0245105..c184e9e 100644 --- a/src/database.py +++ b/src/database.py @@ -22,6 +22,9 @@ from sanitizer import ( sanitize_attack_pattern, ) +from logger import get_app_logger + +applogger = get_app_logger() class DatabaseManager: """ @@ -154,7 +157,7 @@ class DatabaseManager: except Exception as e: session.rollback() # Log error but don't crash - database persistence is secondary to honeypot function - print(f"Database error persisting access: {e}") + applogger.critical(f"Database error persisting access: {e}") return None finally: self.close_session() @@ -193,7 +196,7 @@ class DatabaseManager: except Exception as e: session.rollback() - print(f"Database error persisting credential: {e}") + applogger.critical(f"Database error persisting credential: {e}") return None finally: self.close_session() @@ -236,7 +239,8 @@ class DatabaseManager: last_analysis: timestamp of last analysis """ - print(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}") + applogger.debug(f"Analyzed metrics {analyzed_metrics}, category {category}, category scores {category_scores}, last analysis {last_analysis}") + applogger.info(f"IP: {ip} category has been updated to {category}") session = self.session sanitized_ip = sanitize_ip(ip) @@ -295,7 +299,7 @@ class DatabaseManager: session.commit() except Exception as e: session.rollback() - print(f"Error recording category change: {e}") + applogger.error(f"Error recording category change: {e}") def get_category_history(self, ip: str) -> List[Dict[str, Any]]: """ From be7ba1f820258342f728b9b9deb73428f7f0c5c7 Mon Sep 17 00:00:00 2001 From: Patrick Di Fazio Date: Fri, 9 Jan 2026 20:37:20 +0100 Subject: [PATCH 3/4] added download button --- config.yaml | 2 +- src/exports/malicious_ips.txt | 1 + src/handler.py | 32 +++++++++++++++++++++++++- src/templates/dashboard_template.py | 35 +++++++++++++++++++++++++++-- 4 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 src/exports/malicious_ips.txt diff --git a/config.yaml b/config.yaml index 52daa09..a2f6b58 100644 --- a/config.yaml +++ b/config.yaml @@ -1,7 +1,7 @@ # Krawl Honeypot Configuration server: - port: 5000 + port: 1234 delay: 100 # Response delay in milliseconds timezone: null # e.g., "America/New_York", "Europe/Paris" or null for system default diff --git a/src/exports/malicious_ips.txt b/src/exports/malicious_ips.txt new file mode 100644 index 0000000..7b9ad53 --- /dev/null +++ b/src/exports/malicious_ips.txt @@ -0,0 +1 @@ +127.0.0.1 diff --git a/src/handler.py b/src/handler.py index ebc0b66..1f96d6c 100644 --- a/src/handler.py +++ b/src/handler.py @@ -408,7 +408,8 @@ class Handler(BaseHTTPRequestHandler): try: stats = self.tracker.get_stats() timezone = str(self.config.timezone) if self.config.timezone else 'UTC' - self.wfile.write(generate_dashboard(stats, timezone).encode()) + dashboard_path = self.config.dashboard_secret_path + self.wfile.write(generate_dashboard(stats, timezone, dashboard_path).encode()) except BrokenPipeError: pass except Exception as e: @@ -442,6 +443,35 @@ class Handler(BaseHTTPRequestHandler): self.wfile.write(json.dumps({'error': str(e)}).encode()) return + # API endpoint for downloading malicious IPs file + if self.config.dashboard_secret_path and self.path == f"{self.config.dashboard_secret_path}/api/download/malicious_ips.txt": + import os + file_path = os.path.join(os.path.dirname(__file__), 'exports', 'malicious_ips.txt') + try: + if os.path.exists(file_path): + with open(file_path, 'rb') as f: + content = f.read() + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('Content-Disposition', 'attachment; filename="malicious_ips.txt"') + self.send_header('Content-Length', str(len(content))) + self.end_headers() + self.wfile.write(content) + else: + self.send_response(404) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(b'File not found') + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error serving malicious IPs file: {e}") + self.send_response(500) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(b'Internal server error') + return + self.tracker.record_access(client_ip, self.path, user_agent, method='GET') self.analyzer.infer_user_category(client_ip) diff --git a/src/templates/dashboard_template.py b/src/templates/dashboard_template.py index bbb6ad9..4e7005c 100644 --- a/src/templates/dashboard_template.py +++ b/src/templates/dashboard_template.py @@ -38,12 +38,13 @@ def format_timestamp(iso_timestamp: str, timezone: str = 'UTC', time_only: bool return iso_timestamp.split("T")[1][:8] if "T" in iso_timestamp else iso_timestamp -def generate_dashboard(stats: dict, timezone: str = 'UTC') -> str: +def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = '') -> str: """Generate dashboard HTML with access statistics Args: stats: Statistics dictionary timezone: IANA timezone string (e.g., 'Europe/Paris', 'America/New_York') + dashboard_path: The secret dashboard path for generating API URLs """ # Generate IP rows with clickable functionality for dropdown stats @@ -164,12 +165,36 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC') -> str: .container {{ max-width: 1400px; margin: 0 auto; + position: relative; }} h1 {{ color: #58a6ff; text-align: center; margin-bottom: 40px; }} + .download-section {{ + position: absolute; + top: 0; + right: 0; + }} + .download-btn {{ + display: inline-block; + padding: 8px 14px; + background: #238636; + color: #ffffff; + text-decoration: none; + border-radius: 6px; + font-weight: 500; + font-size: 13px; + transition: background 0.2s; + border: 1px solid #2ea043; + }} + .download-btn:hover {{ + background: #2ea043; + }} + .download-btn:active {{ + background: #1f7a2f; + }} .stats-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); @@ -450,6 +475,11 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC') -> str:
+

Krawl Dashboard

@@ -599,6 +629,7 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC') -> str: