diff --git a/.gitignore b/.gitignore index ecc3154..90cc56f 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ secrets/ .env .env.local .env.*.local +.envrc # Logs *.log diff --git a/config.yaml b/config.yaml index 622093c..388b694 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,6 @@ server: port: 5000 delay: 100 # Response delay in milliseconds - timezone: null # e.g., "America/New_York", "Europe/Paris" or null for system default # manually set the server header, if null a random one will be used. server_header: null diff --git a/docker-compose.yaml b/docker-compose.yaml index d8ea198..233692b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,5 +1,4 @@ -version: '3.8' - +--- services: krawl: build: @@ -8,11 +7,26 @@ services: container_name: krawl-server ports: - "5000:5000" + environment: + - CONFIG_LOCATION=config.yaml + # Set TZ to change the timezone; alternatively mount /etc/timezone or /etc/localtime, depending on how the host manages its time settings + # - TZ=${TZ} volumes: - ./wordlists.json:/app/wordlists.json:ro - ./config.yaml:/app/config.yaml:ro - ./logs:/app/logs - ./exports:/app/exports - environment: - - CONFIG_LOCATION=config.yaml + - data:/app/data restart: unless-stopped + develop: + watch: + - path: ./Dockerfile + action: rebuild + - path: ./src/ + action: sync+restart + target: /app/src + - path: ./docker-compose.yaml + action: rebuild + +volumes: + data: diff --git a/src/analyzer.py b/src/analyzer.py index 907529f..c0ff515 100644 --- a/src/analyzer.py +++ b/src/analyzer.py @@ -23,7 +23,7 @@ class Analyzer: """ Analyzes users activity and produces aggregated insights """ - def __init__(self, db_manager: Optional[DatabaseManager] = None, timezone: Optional[ZoneInfo] = None): + def __init__(self, db_manager: Optional[DatabaseManager] = None): """ Initialize the access tracker. @@ -31,11 +31,10 @@ class Analyzer: db_manager: Optional DatabaseManager for persistence. If None, will use the global singleton.
""" - self.timezone = timezone or ZoneInfo('UTC') # Database manager for persistence (lazily initialized) self._db_manager = db_manager - + @property def db(self) -> Optional[DatabaseManager]: """ @@ -51,11 +50,11 @@ class Analyzer: # Database not initialized, persistence disabled pass return self._db_manager - + # def infer_user_category(self, ip: str) -> str: # config = get_config() - + # http_risky_methods_threshold = config.http_risky_methods_threshold # violated_robots_threshold = config.violated_robots_threshold # uneven_request_timing_threshold = config.uneven_request_timing_threshold @@ -70,7 +69,7 @@ class Analyzer: # score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} # score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} # score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - + # #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme # weights = { # "attacker": { @@ -108,7 +107,7 @@ class Analyzer: # total_accesses_count = len(accesses) # if total_accesses_count <= 0: # return - + # # Set category as "unknown" for the first 5 requests # if total_accesses_count < 3: # category = "unknown" @@ -127,7 +126,7 @@ class Analyzer: # delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"]) # head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"]) # options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"]) - # patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"]) + # patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"]) # if total_accesses_count > http_risky_methods_threshold: # 
http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count @@ -156,7 +155,7 @@ class Analyzer: # if not line: # continue # parts = line.split(":") - + # if parts[0] == "Disallow": # parts[1] = parts[1].rstrip("/") # #print(f"DISALLOW {parts[1]}") @@ -180,7 +179,7 @@ class Analyzer: # score["good_crawler"]["robots_violations"] = False # score["bad_crawler"]["robots_violations"] = False # score["regular_user"]["robots_violations"] = False - + # #--------------------- Requests Timing --------------------- # #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior # timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses] @@ -192,7 +191,7 @@ class Analyzer: # for i in range(0, len(timestamps)-1): # diff = (timestamps[i] - timestamps[i+1]).total_seconds() # time_diffs.append(diff) - + # mean = 0 # variance = 0 # std = 0 @@ -250,10 +249,10 @@ class Analyzer: # except Exception: # decoded_path = queried_path # decoded_path_twice = queried_path - + # for name, pattern in wl.attack_patterns.items(): # # Check original, decoded, and double-decoded paths - # if (re.search(pattern, queried_path, re.IGNORECASE) or + # if (re.search(pattern, queried_path, re.IGNORECASE) or # re.search(pattern, decoded_path, re.IGNORECASE) or # re.search(pattern, decoded_path_twice, re.IGNORECASE)): # attack_urls_found_list.append(f"{name}: {pattern}") @@ -261,7 +260,7 @@ class Analyzer: # #remove duplicates # attack_urls_found_list = set(attack_urls_found_list) # attack_urls_found_list = list(attack_urls_found_list) - + # if len(attack_urls_found_list) > attack_urls_threshold: # score["attacker"]["attack_url"] = True # score["good_crawler"]["attack_url"] = False @@ -344,7 +343,7 @@ class Analyzer: # sanitized_asn = sanitize_for_storage(asn, 100) # sanitized_asn_org = sanitize_for_storage(asn_org, 100) # 
sanitized_list_on = sanitize_dict(list_on, 100000) - + # self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on) - - # return \ No newline at end of file + + # return diff --git a/src/config.py b/src/config.py index 66938b1..1a9dbc2 100644 --- a/src/config.py +++ b/src/config.py @@ -32,7 +32,6 @@ class Config: # Database settings database_path: str = "data/krawl.db" database_retention_days: int = 30 - timezone: str = None # IANA timezone (e.g., 'America/New_York', 'Europe/Rome') # Analyzer settings http_risky_methods_threshold: float = None @@ -42,39 +41,6 @@ class Config: user_agents_used_threshold: float = None attack_urls_threshold: float = None - @staticmethod - # Try to fetch timezone before if not set - def get_system_timezone() -> str: - """Get the system's default timezone""" - try: - if os.path.islink('/etc/localtime'): - tz_path = os.readlink('/etc/localtime') - if 'zoneinfo/' in tz_path: - return tz_path.split('zoneinfo/')[-1] - - local_tz = time.tzname[time.daylight] - if local_tz and local_tz != 'UTC': - return local_tz - except Exception: - pass - - # Default fallback to UTC - return 'UTC' - - def get_timezone(self) -> ZoneInfo: - """Get configured timezone as ZoneInfo object""" - if self.timezone: - try: - return ZoneInfo(self.timezone) - except Exception: - pass - - system_tz = self.get_system_timezone() - try: - return ZoneInfo(system_tz) - except Exception: - return ZoneInfo('UTC') - @classmethod def from_yaml(cls) -> 'Config': """Create configuration from YAML file""" @@ -113,12 +79,11 @@ class Config: # ensure the dashboard path starts with a / if dashboard_path[:1] != "/": dashboard_path = f"/{dashboard_path}" - + return cls( port=server.get('port', 5000), delay=server.get('delay', 100), server_header=server.get('server_header',""), - timezone=server.get('timezone'), links_length_range=( links.get('min_length', 5), links.get('max_length', 15) @@ -140,7 +105,7 @@ class Config: 
database_retention_days=database.get('retention_days', 30), http_risky_methods_threshold=analyzer.get('http_risky_methods_threshold', 0.1), violated_robots_threshold=analyzer.get('violated_robots_threshold', 0.1), - uneven_request_timing_threshold=analyzer.get('uneven_request_timing_threshold', 0.5), # coefficient of variation + uneven_request_timing_threshold=analyzer.get('uneven_request_timing_threshold', 0.5), # coefficient of variation uneven_request_timing_time_window_seconds=analyzer.get('uneven_request_timing_time_window_seconds', 300), user_agents_used_threshold=analyzer.get('user_agents_used_threshold', 2), attack_urls_threshold=analyzer.get('attack_urls_threshold', 1) diff --git a/src/database.py b/src/database.py index 5d41e2c..bfe2725 100644 --- a/src/database.py +++ b/src/database.py @@ -7,7 +7,7 @@ Provides SQLAlchemy session management and database initialization. import os import stat -from datetime import datetime +from datetime import datetime, timedelta from typing import Optional, List, Dict, Any from zoneinfo import ZoneInfo @@ -141,7 +141,7 @@ class DatabaseManager: method=method[:10], is_suspicious=is_suspicious, is_honeypot_trigger=is_honeypot_trigger, - timestamp=datetime.now(tz=ZoneInfo('UTC')) + timestamp=datetime.now() ) session.add(access_log) session.flush() # Get the ID before committing @@ -199,7 +199,7 @@ class DatabaseManager: path=sanitize_path(path), username=sanitize_credential(username), password=sanitize_credential(password), - timestamp=datetime.now(tz=ZoneInfo('UTC')) + timestamp=datetime.now() ) session.add(credential) session.commit() @@ -221,7 +221,7 @@ class DatabaseManager: ip: IP address to update """ sanitized_ip = sanitize_ip(ip) - now = datetime.now(tz=ZoneInfo('UTC')) + now = datetime.now() ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first() @@ -266,7 +266,7 @@ class DatabaseManager: ip_stats.category = category ip_stats.category_scores = category_scores ip_stats.last_analysis = 
last_analysis - + try: session.commit() except Exception as e: @@ -280,21 +280,21 @@ class DatabaseManager: Args: ip: IP address to update category: selected category - + """ session = self.session sanitized_ip = sanitize_ip(ip) ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first() - + # Record the manual category change old_category = ip_stats.category if old_category != category: - self._record_category_change(sanitized_ip, old_category, category, datetime.now(tz=ZoneInfo('UTC'))) + self._record_category_change(sanitized_ip, old_category, category, datetime.now()) ip_stats.category = category ip_stats.manual_category = True - + try: session.commit() except Exception as e: @@ -316,7 +316,7 @@ class DatabaseManager: # Only record actual category changes if old_category is None: return - + session = self.session try: history_entry = CategoryHistory( @@ -352,7 +352,7 @@ class DatabaseManager: { 'old_category': h.old_category, 'new_category': h.new_category, - 'timestamp': h.timestamp.isoformat() + '+00:00' + 'timestamp': h.timestamp.isoformat() } for h in history ] @@ -390,6 +390,7 @@ class DatabaseManager: def get_unenriched_ips(self, limit: int = 100) -> List[str]: """ Get IPs that don't have reputation data yet. + Excludes RFC1918 private addresses and other non-routable IPs. 
Args: limit: Maximum number of IPs to return @@ -400,7 +401,18 @@ class DatabaseManager: session = self.session try: ips = session.query(IpStats.ip).filter( - IpStats.country_code.is_(None) + IpStats.country_code.is_(None), + ~IpStats.ip.like('10.%'), + ~IpStats.ip.like('172.16.%'), + ~IpStats.ip.like('172.17.%'), + ~IpStats.ip.like('172.18.%'), + ~IpStats.ip.like('172.19.%'), + ~IpStats.ip.like('172.2_.%'), + ~IpStats.ip.like('172.30.%'), + ~IpStats.ip.like('172.31.%'), + ~IpStats.ip.like('192.168.%'), + ~IpStats.ip.like('127.%'), + ~IpStats.ip.like('169.254.%') ).limit(limit).all() return [ip[0] for ip in ips] finally: @@ -411,7 +423,8 @@ class DatabaseManager: limit: int = 100, offset: int = 0, ip_filter: Optional[str] = None, - suspicious_only: bool = False + suspicious_only: bool = False, + since_minutes: Optional[int] = None ) -> List[Dict[str, Any]]: """ Retrieve access logs with optional filtering. @@ -421,6 +434,7 @@ class DatabaseManager: offset: Number of records to skip ip_filter: Filter by IP address suspicious_only: Only return suspicious requests + since_minutes: Only return logs from the last N minutes Returns: List of access log dictionaries @@ -433,6 +447,9 @@ class DatabaseManager: query = query.filter(AccessLog.ip == sanitize_ip(ip_filter)) if suspicious_only: query = query.filter(AccessLog.is_suspicious == True) + if since_minutes is not None: + cutoff_time = datetime.now() - timedelta(minutes=since_minutes) + query = query.filter(AccessLog.timestamp >= cutoff_time) logs = query.offset(offset).limit(limit).all() @@ -445,7 +462,7 @@ class DatabaseManager: 'method': log.method, 'is_suspicious': log.is_suspicious, 'is_honeypot_trigger': log.is_honeypot_trigger, - 'timestamp': log.timestamp.isoformat() + '+00:00', + 'timestamp': log.timestamp.isoformat(), 'attack_types': [d.attack_type for d in log.attack_detections] } for log in logs @@ -501,7 +518,7 @@ class DatabaseManager: # print(f"Database error persisting access: {e}") # return None # 
finally: - # self.close_session() + # self.close_session() def get_credential_attempts( self, @@ -538,7 +555,7 @@ class DatabaseManager: 'path': attempt.path, 'username': attempt.username, 'password': attempt.password, - 'timestamp': attempt.timestamp.isoformat() + '+00:00' + 'timestamp': attempt.timestamp.isoformat() } for attempt in attempts ] @@ -565,8 +582,8 @@ class DatabaseManager: { 'ip': s.ip, 'total_requests': s.total_requests, - 'first_seen': s.first_seen.isoformat() + '+00:00', - 'last_seen': s.last_seen.isoformat() + '+00:00', + 'first_seen': s.first_seen.isoformat(), + 'last_seen': s.last_seen.isoformat(), 'country_code': s.country_code, 'city': s.city, 'asn': s.asn, @@ -596,18 +613,18 @@ class DatabaseManager: session = self.session try: stat = session.query(IpStats).filter(IpStats.ip == ip).first() - + if not stat: return None - + # Get category history for this IP category_history = self.get_category_history(ip) - + return { 'ip': stat.ip, 'total_requests': stat.total_requests, - 'first_seen': stat.first_seen.isoformat() + '+00:00' if stat.first_seen else None, - 'last_seen': stat.last_seen.isoformat() + '+00:00' if stat.last_seen else None, + 'first_seen': stat.first_seen.isoformat() if stat.first_seen else None, + 'last_seen': stat.last_seen.isoformat() if stat.last_seen else None, 'country_code': stat.country_code, 'city': stat.city, 'asn': stat.asn, @@ -619,7 +636,7 @@ class DatabaseManager: 'category': stat.category, 'category_scores': stat.category_scores or {}, 'manual_category': stat.manual_category, - 'last_analysis': stat.last_analysis.isoformat() + '+00:00' if stat.last_analysis else None, + 'last_analysis': stat.last_analysis.isoformat() if stat.last_analysis else None, 'category_history': category_history } finally: @@ -690,7 +707,7 @@ class DatabaseManager: Args: limit: Maximum number of results - Returns: + Returns: List of (path, count) tuples ordered by count descending """ session = self.session @@ -753,7 +770,7 @@ class
DatabaseManager: 'ip': log.ip, 'path': log.path, 'user_agent': log.user_agent, - 'timestamp': log.timestamp.isoformat() + '+00:00' + 'timestamp': log.timestamp.isoformat() } for log in logs ] @@ -811,7 +828,7 @@ class DatabaseManager: 'ip': log.ip, 'path': log.path, 'user_agent': log.user_agent, - 'timestamp': log.timestamp.isoformat() + '+00:00', + 'timestamp': log.timestamp.isoformat(), 'attack_types': [d.attack_type for d in log.attack_detections] } for log in logs diff --git a/src/handler.py b/src/handler.py index 9f2a77f..ef26fb5 100644 --- a/src/handler.py +++ b/src/handler.py @@ -43,12 +43,12 @@ class Handler(BaseHTTPRequestHandler): if forwarded_for: # X-Forwarded-For can contain multiple IPs, get the first (original client) return forwarded_for.split(',')[0].strip() - + # Check X-Real-IP header (set by nginx and other proxies) real_ip = self.headers.get('X-Real-IP') if real_ip: return real_ip.strip() - + # Fallback to direct connection IP return self.client_address[0] @@ -73,12 +73,12 @@ class Handler(BaseHTTPRequestHandler): if not error_codes: error_codes = [400, 401, 403, 404, 500, 502, 503] return random.choice(error_codes) - + def _parse_query_string(self) -> str: """Extract query string from the request path""" parsed = urlparse(self.path) return parsed.query - + def _handle_sql_endpoint(self, path: str) -> bool: """ Handle SQL injection honeypot endpoints. 
@@ -86,22 +86,22 @@ class Handler(BaseHTTPRequestHandler): """ # SQL-vulnerable endpoints sql_endpoints = ['/api/search', '/api/sql', '/api/database'] - + base_path = urlparse(path).path if base_path not in sql_endpoints: return False - + try: # Get query parameters query_string = self._parse_query_string() - + # Log SQL injection attempt client_ip = self._get_client_ip() user_agent = self._get_user_agent() - + # Always check for SQL injection patterns error_msg, content_type, status_code = generate_sql_error_response(query_string or "") - + if error_msg: # SQL injection detected - log and return error self.access_logger.warning(f"[SQL INJECTION DETECTED] {client_ip} - {base_path} - Query: {query_string[:100] if query_string else 'empty'}") @@ -117,9 +117,9 @@ class Handler(BaseHTTPRequestHandler): self.end_headers() response_data = get_sql_response_with_data(base_path, query_string or "") self.wfile.write(response_data.encode()) - + return True - + except BrokenPipeError: # Client disconnected return True @@ -142,7 +142,7 @@ class Handler(BaseHTTPRequestHandler): # Build the content HTML content = "" - + # Add canary token if needed if Handler.counter <= 0 and self.config.canary_token_url: content += f""" @@ -189,16 +189,16 @@ class Handler(BaseHTTPRequestHandler): from urllib.parse import urlparse base_path = urlparse(self.path).path - + if base_path in ['/api/search', '/api/sql', '/api/database']: content_length = int(self.headers.get('Content-Length', 0)) if content_length > 0: post_data = self.rfile.read(content_length).decode('utf-8', errors="replace") - + self.access_logger.info(f"[SQL ENDPOINT POST] {client_ip} - {base_path} - Data: {post_data[:100] if post_data else 'empty'}") - + error_msg, content_type, status_code = generate_sql_error_response(post_data) - + try: if error_msg: self.access_logger.warning(f"[SQL INJECTION DETECTED POST] {client_ip} - {base_path}") @@ -217,26 +217,26 @@ class Handler(BaseHTTPRequestHandler): except Exception as e: 
self.app_logger.error(f"Error in SQL POST handler: {str(e)}") return - + if base_path == '/api/contact': content_length = int(self.headers.get('Content-Length', 0)) if content_length > 0: post_data = self.rfile.read(content_length).decode('utf-8', errors="replace") - + parsed_data = {} for pair in post_data.split('&'): if '=' in pair: key, value = pair.split('=', 1) from urllib.parse import unquote_plus parsed_data[unquote_plus(key)] = unquote_plus(value) - + xss_detected = any(detect_xss_pattern(v) for v in parsed_data.values()) - + if xss_detected: self.access_logger.warning(f"[XSS ATTEMPT DETECTED] {client_ip} - {base_path} - Data: {post_data[:200]}") else: self.access_logger.info(f"[XSS ENDPOINT POST] {client_ip} - {base_path}") - + try: self.send_response(200) self.send_header('Content-type', 'text/html') @@ -264,17 +264,17 @@ class Handler(BaseHTTPRequestHandler): timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") credential_line = f"{timestamp}|{client_ip}|{username or 'N/A'}|{password or 'N/A'}|{self.path}" self.credential_logger.info(credential_line) - + # Also record in tracker for dashboard self.tracker.record_credential_attempt(client_ip, self.path, username or 'N/A', password or 'N/A') - + self.access_logger.warning(f"[CREDENTIALS CAPTURED] {client_ip} - Username: {username or 'N/A'} - Path: {self.path}") # send the post data (body) to the record_access function so the post data can be used to detect suspicious things. 
self.tracker.record_access(client_ip, self.path, user_agent, post_data, method='POST') - + time.sleep(1) - + try: self.send_response(200) self.send_header('Content-type', 'text/html') @@ -289,11 +289,11 @@ class Handler(BaseHTTPRequestHandler): def serve_special_path(self, path: str) -> bool: """Serve special paths like robots.txt, API endpoints, etc.""" - + # Check SQL injection honeypot endpoints first if self._handle_sql_endpoint(path): return True - + try: if path == '/robots.txt': self.send_response(200) @@ -301,7 +301,7 @@ class Handler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(html_templates.robots_txt().encode()) return True - + if path in ['/credentials.txt', '/passwords.txt', '/admin_notes.txt']: self.send_response(200) self.send_header('Content-type', 'text/plain') @@ -311,7 +311,7 @@ class Handler(BaseHTTPRequestHandler): else: self.wfile.write(passwords_txt().encode()) return True - + if path in ['/users.json', '/api_keys.json', '/config.json']: self.send_response(200) self.send_header('Content-type', 'application/json') @@ -323,28 +323,28 @@ class Handler(BaseHTTPRequestHandler): else: self.wfile.write(api_response('/api/config').encode()) return True - + if path in ['/admin', '/admin/', '/admin/login', '/login']: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.login_form().encode()) return True - + if path in ['/users', '/user', '/database', '/db', '/search']: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.product_search().encode()) return True - + if path in ['/info', '/input', '/contact', '/feedback', '/comment']: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.input_form().encode()) return True - + if path == '/server': error_html, content_type = generate_server_error() self.send_response(500) @@ -352,35 +352,35 
@@ class Handler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(error_html.encode()) return True - + if path in ['/wp-login.php', '/wp-login', '/wp-admin', '/wp-admin/']: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.wp_login().encode()) return True - + if path in ['/wp-content/', '/wp-includes/'] or 'wordpress' in path.lower(): self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.wordpress().encode()) return True - + if 'phpmyadmin' in path.lower() or path in ['/pma/', '/phpMyAdmin/']: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() self.wfile.write(html_templates.phpmyadmin().encode()) return True - + if path.startswith('/api/') or path.startswith('/api') or path in ['/.env']: self.send_response(200) self.send_header('Content-type', 'application/json') self.end_headers() self.wfile.write(api_response(path).encode()) return True - + if path in ['/backup/', '/uploads/', '/private/', '/admin/', '/config/', '/database/']: self.send_response(200) self.send_header('Content-type', 'text/html') @@ -400,22 +400,21 @@ class Handler(BaseHTTPRequestHandler): """Responds to webpage requests""" client_ip = self._get_client_ip() user_agent = self._get_user_agent() - + if self.config.dashboard_secret_path and self.path == self.config.dashboard_secret_path: self.send_response(200) self.send_header('Content-type', 'text/html') self.end_headers() try: stats = self.tracker.get_stats() - timezone = str(self.config.timezone) if self.config.timezone else 'UTC' dashboard_path = self.config.dashboard_secret_path - self.wfile.write(generate_dashboard(stats, timezone, dashboard_path).encode()) + self.wfile.write(generate_dashboard(stats, dashboard_path).encode()) except BrokenPipeError: pass except Exception as e: self.app_logger.error(f"Error generating dashboard: {e}") return - + # API 
endpoint for fetching IP stats if self.config.dashboard_secret_path and self.path.startswith(f"{self.config.dashboard_secret_path}/api/ip-stats/"): ip_address = self.path.replace(f"{self.config.dashboard_secret_path}/api/ip-stats/", "") @@ -473,7 +472,7 @@ class Handler(BaseHTTPRequestHandler): return self.tracker.record_access(client_ip, self.path, user_agent, method='GET') - + # self.analyzer.infer_user_category(client_ip) # self.analyzer.update_ip_rep_infos(client_ip) @@ -497,9 +496,9 @@ class Handler(BaseHTTPRequestHandler): try: self.wfile.write(self.generate_page(self.path).encode()) - + Handler.counter -= 1 - + if Handler.counter < 0: Handler.counter = self.config.canary_token_tries except BrokenPipeError: diff --git a/src/logger.py b/src/logger.py index 992cad8..bf16c77 100644 --- a/src/logger.py +++ b/src/logger.py @@ -8,20 +8,17 @@ Provides two loggers: app (application) and access (HTTP access logs). import logging import os from logging.handlers import RotatingFileHandler -from typing import Optional -from zoneinfo import ZoneInfo from datetime import datetime class TimezoneFormatter(logging.Formatter): """Custom formatter that respects configured timezone""" - def __init__(self, fmt=None, datefmt=None, timezone: Optional[ZoneInfo] = None): + def __init__(self, fmt=None, datefmt=None): super().__init__(fmt, datefmt) - self.timezone = timezone or ZoneInfo('UTC') - + def formatTime(self, record, datefmt=None): """Override formatTime to use configured timezone""" - dt = datetime.fromtimestamp(record.created, tz=self.timezone) + dt = datetime.fromtimestamp(record.created) if datefmt: return dt.strftime(datefmt) return dt.isoformat() @@ -37,19 +34,16 @@ class LoggerManager: cls._instance._initialized = False return cls._instance - def initialize(self, log_dir: str = "logs", timezone: Optional[ZoneInfo] = None) -> None: + def initialize(self, log_dir: str = "logs") -> None: """ - Initialize the logging system with rotating file handlers. 
+ Initialize the logging system with rotating file handlers. Args: log_dir: Directory for log files (created if not exists) - timezone: ZoneInfo timezone for log timestamps (defaults to UTC) """ if self._initialized: return - self.timezone = timezone or ZoneInfo('UTC') - # Create log directory if it doesn't exist os.makedirs(log_dir, exist_ok=True) @@ -57,7 +51,6 @@ class LoggerManager: log_format = TimezoneFormatter( "[%(asctime)s] %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", - timezone=self.timezone ) # Rotation settings: 1MB max, 5 backups @@ -104,8 +97,8 @@ class LoggerManager: self._credential_logger.handlers.clear() # Credential logger uses a simple format: timestamp|ip|username|password|path - credential_format = TimezoneFormatter("%(message)s", timezone=self.timezone) - + credential_format = TimezoneFormatter("%(message)s") + credential_file_handler = RotatingFileHandler( os.path.join(log_dir, "credentials.log"), maxBytes=max_bytes, @@ -157,6 +150,6 @@ def get_credential_logger() -> logging.Logger: return _logger_manager.credentials -def initialize_logging(log_dir: str = "logs", timezone: Optional[ZoneInfo] = None) -> None: +def initialize_logging(log_dir: str = "logs") -> None: """Initialize the logging system.""" - _logger_manager.initialize(log_dir, timezone) + _logger_manager.initialize(log_dir) diff --git a/src/server.py b/src/server.py index e690142..a61a372 100644 --- a/src/server.py +++ b/src/server.py @@ -29,7 +29,6 @@ def print_usage(): print(' server:') print(' port: 5000') print(' delay: 100') - print(' timezone: null # or "America/New_York"') print(' links:') print(' min_length: 5') print(' max_length: 15') @@ -55,11 +54,8 @@ def main(): config = get_config() - # Get timezone configuration - tz = config.get_timezone() - # Initialize logging with timezone - initialize_logging(timezone=tz) + initialize_logging() app_logger = get_app_logger() access_logger = get_access_logger() credential_logger = get_credential_logger() @@
-71,8 +67,8 @@ def main(): except Exception as e: app_logger.warning(f'Database initialization failed: {e}. Continuing with in-memory only.') - tracker = AccessTracker(timezone=tz) - analyzer = Analyzer(timezone=tz) + tracker = AccessTracker() + analyzer = Analyzer() Handler.config = config Handler.tracker = tracker @@ -99,7 +95,6 @@ def main(): try: app_logger.info(f'Starting deception server on port {config.port}...') - app_logger.info(f'Timezone configured: {tz.key}') app_logger.info(f'Dashboard available at: {config.dashboard_secret_path}') if config.canary_token_url: app_logger.info(f'Canary token will appear after {config.canary_token_tries} tries') diff --git a/src/tasks/analyze_ips.py b/src/tasks/analyze_ips.py index e4fda84..788d9a3 100644 --- a/src/tasks/analyze_ips.py +++ b/src/tasks/analyze_ips.py @@ -28,7 +28,7 @@ def main(): config = get_config() db_manager = get_database() app_logger = get_app_logger() - + http_risky_methods_threshold = config.http_risky_methods_threshold violated_robots_threshold = config.violated_robots_threshold uneven_request_timing_threshold = config.uneven_request_timing_threshold @@ -41,7 +41,7 @@ def main(): score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} - + #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme weights = { "attacker": { @@ -73,21 +73,27 @@ def main(): "attack_url": 0 } } - accesses = db_manager.get_access_logs(limit=999999999) - ips = {item['ip'] for item in accesses} + # Get IPs with recent activity (last minute to match cron schedule) + recent_accesses = 
db_manager.get_access_logs(limit=999999999, since_minutes=1) + ips_to_analyze = {item['ip'] for item in recent_accesses} - for ip in ips: - ip_accesses = [item for item in accesses if item["ip"] == ip] - total_accesses_count = len(accesses) + if not ips_to_analyze: + app_logger.debug("[Background Task] analyze-ips: No recent activity, skipping") + return + + for ip in ips_to_analyze: + # Get full history for this IP to perform accurate analysis + ip_accesses = db_manager.get_access_logs(limit=999999999, ip_filter=ip) + total_accesses_count = len(ip_accesses) if total_accesses_count <= 0: return - + # Set category as "unknown" for the first 3 requests if total_accesses_count < 3: category = "unknown" analyzed_metrics = {} category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0} - last_analysis = datetime.now(tz=ZoneInfo('UTC')) + last_analysis = datetime.now() db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) return 0 #--------------------- HTTP Methods --------------------- @@ -97,7 +103,7 @@ def main(): delete_accesses_count = len([item for item in ip_accesses if item["method"] == "DELETE"]) head_accesses_count = len([item for item in ip_accesses if item["method"] == "HEAD"]) options_accesses_count = len([item for item in ip_accesses if item["method"] == "OPTIONS"]) - patch_accesses_count = len([item for item in ip_accesses if item["method"] == "PATCH"]) + patch_accesses_count = len([item for item in ip_accesses if item["method"] == "PATCH"]) if total_accesses_count > http_risky_methods_threshold: http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count else: @@ -123,7 +129,7 @@ def main(): if not line: continue parts = line.split(":") - + if parts[0] == "Disallow": parts[1] = parts[1].rstrip("/") #print(f"DISALLOW {parts[1]}") @@ -145,18 +151,18 @@ def main(): 
score["good_crawler"]["robots_violations"] = False score["bad_crawler"]["robots_violations"] = False score["regular_user"]["robots_violations"] = False - + #--------------------- Requests Timing --------------------- - #Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior + # Request rate and timing: steady, throttled, polite vs attackers' bursty, aggressive, or oddly rhythmic behavior timestamps = [datetime.fromisoformat(item["timestamp"]) for item in ip_accesses] - now_utc = datetime.now(tz=ZoneInfo('UTC')) + now_utc = datetime.now() timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)] timestamps = sorted(timestamps, reverse=True) time_diffs = [] for i in range(0, len(timestamps)-1): diff = (timestamps[i] - timestamps[i+1]).total_seconds() time_diffs.append(diff) - + mean = 0 variance = 0 std = 0 @@ -206,14 +212,14 @@ def main(): except Exception: decoded_path = queried_path decoded_path_twice = queried_path - + for name, pattern in wl.attack_patterns.items(): # Check original, decoded, and double-decoded paths - if (re.search(pattern, queried_path, re.IGNORECASE) or + if (re.search(pattern, queried_path, re.IGNORECASE) or re.search(pattern, decoded_path, re.IGNORECASE) or re.search(pattern, decoded_path_twice, re.IGNORECASE)): attack_urls_found_list.append(f"{name}: {pattern}") - + #remove duplicates attack_urls_found_list = set(attack_urls_found_list) attack_urls_found_list = list(attack_urls_found_list) @@ -260,6 +266,6 @@ def main(): analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} category = max(category_scores, 
key=category_scores.get) - last_analysis = datetime.now(tz=ZoneInfo('UTC')) + last_analysis = datetime.now() db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) - return \ No newline at end of file + return diff --git a/src/tasks/fetch_ip_rep.py b/src/tasks/fetch_ip_rep.py index 9a78ee6..efddaea 100644 --- a/src/tasks/fetch_ip_rep.py +++ b/src/tasks/fetch_ip_rep.py @@ -21,7 +21,7 @@ def main(): # Only get IPs that haven't been enriched yet unenriched_ips = db_manager.get_unenriched_ips(limit=50) - + app_logger.info(f"{len(unenriched_ips)} IP's need to be have reputation enrichment.") for ip in unenriched_ips: try: api_url = "https://iprep.lcrawl.com/api/iprep/" diff --git a/src/tasks/top_attacking_ips.py b/src/tasks/top_attacking_ips.py index d9e18d3..cb95d57 100644 --- a/src/tasks/top_attacking_ips.py +++ b/src/tasks/top_attacking_ips.py @@ -1,6 +1,8 @@ # tasks/export_malicious_ips.py import os +from datetime import datetime, timedelta +from zoneinfo import ZoneInfo from logger import get_app_logger from database import get_database from models import AccessLog @@ -24,6 +26,15 @@ OUTPUT_FILE = os.path.join(EXPORTS_DIR, "malicious_ips.txt") # ---------------------- # TASK LOGIC # ---------------------- +def has_recent_honeypot_access(session, minutes: int = 5) -> bool: + """Check if honeypot was accessed in the last N minutes.""" + cutoff_time = datetime.now() - timedelta(minutes=minutes) + count = session.query(AccessLog).filter( + AccessLog.is_honeypot_trigger == True, + AccessLog.timestamp >= cutoff_time + ).count() + return count > 0 + def main(): """ Export all IPs flagged as suspicious to a text file. 
@@ -36,6 +47,11 @@ def main(): db = get_database() session = db.session + # Check for recent honeypot activity + if not has_recent_honeypot_access(session): + app_logger.info(f"[Background Task] {task_name} skipped - no honeypot access in last 5 minutes") + return + # Query distinct suspicious IPs results = session.query(distinct(AccessLog.ip)).filter( AccessLog.is_suspicious == True diff --git a/src/templates/dashboard_template.py b/src/templates/dashboard_template.py index 4c5a77a..f6ef2eb 100644 --- a/src/templates/dashboard_template.py +++ b/src/templates/dashboard_template.py @@ -15,21 +15,16 @@ def _escape(value) -> str: return "" return html.escape(str(value)) -def format_timestamp(iso_timestamp: str, timezone: str = 'UTC', time_only: bool = False) -> str: +def format_timestamp(iso_timestamp: str, time_only: bool = False) -> str: """Format ISO timestamp for display with timezone conversion - + Args: iso_timestamp: ISO format timestamp string (UTC) - timezone: IANA timezone string to convert to time_only: If True, return only HH:MM:SS, otherwise full datetime """ try: # Parse UTC timestamp dt = datetime.fromisoformat(iso_timestamp) - # Convert to target timezone - if dt.tzinfo is not None: - dt = dt.astimezone(ZoneInfo(timezone)) - if time_only: return dt.strftime("%H:%M:%S") return dt.strftime("%Y-%m-%d %H:%M:%S") @@ -38,15 +33,14 @@ def format_timestamp(iso_timestamp: str, timezone: str = 'UTC', time_only: bool return iso_timestamp.split("T")[1][:8] if "T" in iso_timestamp else iso_timestamp -def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = '') -> str: +def generate_dashboard(stats: dict, dashboard_path: str = '') -> str: """Generate dashboard HTML with access statistics - + Args: stats: Statistics dictionary - timezone: IANA timezone string (e.g., 'Europe/Paris', 'America/New_York') dashboard_path: The secret dashboard path for generating API URLs """ - + # Generate IP rows with clickable functionality for dropdown stats 
top_ips_rows = '\n'.join([ f''' @@ -82,7 +76,7 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = {_escape(log["ip"])} {_escape(log["path"])} {_escape(log["user_agent"][:60])} - {format_timestamp(log["timestamp"], timezone, time_only=True)} + {format_timestamp(log["timestamp"], time_only=True)} @@ -118,7 +112,7 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = {_escape(log["path"])} {_escape(", ".join(log["attack_types"]))} {_escape(log["user_agent"][:60])} - {format_timestamp(log["timestamp"], timezone, time_only=True)} + {format_timestamp(log["timestamp"],time_only=True)} @@ -137,7 +131,7 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str = {_escape(log["username"])} {_escape(log["password"])} {_escape(log["path"])} - {format_timestamp(log["timestamp"], timezone, time_only=True)} + {format_timestamp(log["timestamp"], time_only=True)} @@ -537,7 +531,7 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str =

Krawl Dashboard

- +
{stats['total_accesses']}
@@ -683,15 +677,13 @@ def generate_dashboard(stats: dict, timezone: str = 'UTC', dashboard_path: str =
diff --git a/src/tracker.py b/src/tracker.py index cd8a187..8bec7ce 100644 --- a/src/tracker.py +++ b/src/tracker.py @@ -17,7 +17,7 @@ class AccessTracker: Maintains in-memory structures for fast dashboard access and persists data to SQLite for long-term storage and analysis. """ - def __init__(self, db_manager: Optional[DatabaseManager] = None, timezone: Optional[ZoneInfo] = None): + def __init__(self, db_manager: Optional[DatabaseManager] = None): """ Initialize the access tracker. @@ -30,7 +30,6 @@ class AccessTracker: self.user_agent_counts: Dict[str, int] = defaultdict(int) self.access_log: List[Dict] = [] self.credential_attempts: List[Dict] = [] - self.timezone = timezone or ZoneInfo('UTC') self.suspicious_patterns = [ 'bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python-requests', 'scanner', 'nikto', 'sqlmap', 'nmap', 'masscan', 'nessus', 'acunetix', @@ -40,7 +39,7 @@ class AccessTracker: # Load attack patterns from wordlists wl = get_wordlists() self.attack_types = wl.attack_patterns - + # Fallback if wordlists not loaded if not self.attack_types: self.attack_types = { @@ -80,38 +79,38 @@ class AccessTracker: """ if not post_data: return None, None - + username = None password = None - + try: # Parse URL-encoded form data parsed = urllib.parse.parse_qs(post_data) - + # Common username field names username_fields = ['username', 'user', 'login', 'email', 'log', 'userid', 'account'] for field in username_fields: if field in parsed and parsed[field]: username = parsed[field][0] break - + # Common password field names password_fields = ['password', 'pass', 'passwd', 'pwd', 'passphrase'] for field in password_fields: if field in parsed and parsed[field]: password = parsed[field][0] break - + except Exception: # If parsing fails, try simple regex patterns username_match = re.search(r'(?:username|user|login|email|log)=([^&\s]+)', post_data, re.IGNORECASE) password_match = re.search(r'(?:password|pass|passwd|pwd)=([^&\s]+)', post_data, re.IGNORECASE) - + 
if username_match: username = urllib.parse.unquote_plus(username_match.group(1)) if password_match: password = urllib.parse.unquote_plus(password_match.group(1)) - + return username, password def record_credential_attempt(self, ip: str, path: str, username: str, password: str): @@ -126,7 +125,7 @@ class AccessTracker: 'path': path, 'username': username, 'password': password, - 'timestamp': datetime.now(self.timezone).isoformat() + 'timestamp': datetime.now().isoformat() }) # Persist to database @@ -193,7 +192,7 @@ class AccessTracker: 'suspicious': is_suspicious, 'honeypot_triggered': self.is_honeypot_path(path), 'attack_types':attack_findings, - 'timestamp': datetime.now(self.timezone).isoformat() + 'timestamp': datetime.now().isoformat() }) # Persist to database