From 6dc9cfe502ab6a3ff667997c35b1d38e760801de Mon Sep 17 00:00:00 2001 From: Lorenzo Venerandi Date: Tue, 3 Mar 2026 19:16:27 +0100 Subject: [PATCH] feat: enhance database retention logic to preserve suspicious access logs and linked IPs --- src/tasks/db_retention.py | 64 ++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/src/tasks/db_retention.py b/src/tasks/db_retention.py index eb76637..ab4af86 100644 --- a/src/tasks/db_retention.py +++ b/src/tasks/db_retention.py @@ -7,6 +7,8 @@ Periodically deletes old records based on configured retention_days. from datetime import datetime, timedelta +from sqlalchemy import or_ + from database import get_database from logger import get_app_logger @@ -26,14 +28,14 @@ app_logger = get_app_logger() def main(): """ - Delete all records older than the configured retention period. - Covers: AccessLog, AttackDetection, CredentialAttempt, IpStats, CategoryHistory. + Delete old records based on the configured retention period. + Keeps suspicious access logs, their attack detections, linked IPs, + category history, and all credential attempts. """ try: from config import get_config from models import ( AccessLog, - CredentialAttempt, AttackDetection, IpStats, CategoryHistory, @@ -47,56 +49,68 @@ def main(): cutoff = datetime.now() - timedelta(days=retention_days) - # Delete attack detections linked to old access logs first (FK constraint) - old_log_ids = session.query(AccessLog.id).filter(AccessLog.timestamp < cutoff) + # Delete attack detections linked to old NON-suspicious access logs (FK constraint) + old_nonsuspicious_log_ids = session.query(AccessLog.id).filter( + AccessLog.timestamp < cutoff, + AccessLog.is_suspicious == False, + AccessLog.is_honeypot_trigger == False, + ) detections_deleted = ( session.query(AttackDetection) - .filter(AttackDetection.access_log_id.in_(old_log_ids)) + .filter(AttackDetection.access_log_id.in_(old_nonsuspicious_log_ids)) .delete(synchronize_session=False) ) - # Delete old access logs + # Delete old non-suspicious access logs (keep suspicious ones) logs_deleted = ( session.query(AccessLog) - .filter(AccessLog.timestamp < cutoff) + .filter( + AccessLog.timestamp < cutoff, + AccessLog.is_suspicious == False, + AccessLog.is_honeypot_trigger == False, + ) .delete(synchronize_session=False) ) - # Delete old credential attempts - creds_deleted = ( - session.query(CredentialAttempt) - .filter(CredentialAttempt.timestamp < cutoff) - .delete(synchronize_session=False) + # IPs to preserve: those with any suspicious access logs + preserved_ips = ( + session.query(AccessLog.ip) + .filter( + or_( + AccessLog.is_suspicious == True, + AccessLog.is_honeypot_trigger == True, + ) + ) + .distinct() ) - # Delete IPs not seen within the retention period + # Delete stale IPs, but keep those linked to suspicious logs ips_deleted = ( session.query(IpStats) - .filter(IpStats.last_seen < cutoff) + .filter( + IpStats.last_seen < cutoff, + ~IpStats.ip.in_(preserved_ips), + ) .delete(synchronize_session=False) ) - # Delete old category history records + # Delete old category history, but keep records for preserved IPs history_deleted = ( session.query(CategoryHistory) - .filter(CategoryHistory.timestamp < cutoff) + .filter( + CategoryHistory.timestamp < cutoff, + ~CategoryHistory.ip.in_(preserved_ips), + ) .delete(synchronize_session=False) ) session.commit() - total = ( - logs_deleted - + detections_deleted - + creds_deleted - + ips_deleted - + history_deleted - ) + total = logs_deleted + detections_deleted + ips_deleted + history_deleted if total: app_logger.info( f"DB retention: Deleted {logs_deleted} access logs, " f"{detections_deleted} attack detections, " - f"{creds_deleted} credential attempts, " f"{ips_deleted} stale IPs, " f"{history_deleted} category history records " f"older than {retention_days} days"