Merge pull request #76 from BlessedRebuS/feat/database-dump-task

Add periodic database dump feature
This commit is contained in:
Patrick Di Fazio
2026-02-05 22:42:21 +01:00
committed by GitHub
9 changed files with 128 additions and 8 deletions

3
.gitignore vendored
View File

@@ -68,6 +68,7 @@ data/
*.db
*.sqlite
*.sqlite3
backups/
# Temporary files
*.tmp
@@ -83,4 +84,4 @@ personal-values.yaml
/src/exports/*
# tmux config
.tmux.conf
.tmux.conf

View File

@@ -211,6 +211,8 @@ Krawl uses a **configuration hierarchy** in which **environment variables take p
| `KRAWL_PROBABILITY_ERROR_CODES` | Error response probability (0-100%) | `0` |
| `KRAWL_DATABASE_PATH` | Database file location | `data/krawl.db` |
| `KRAWL_EXPORTS_PATH` | Path where firewall rule sets are exported | `exports` |
| `KRAWL_BACKUPS_PATH` | Path where database dumps are saved | `backups` |
| `KRAWL_BACKUPS_CRON` | Cron expression controlling the backup job schedule | `*/30 * * * *` |
| `KRAWL_DATABASE_RETENTION_DAYS` | Days to retain data in database | `30` |
| `KRAWL_HTTP_RISKY_METHODS_THRESHOLD` | Threshold for risky HTTP methods detection | `0.1` |
| `KRAWL_VIOLATED_ROBOTS_THRESHOLD` | Threshold for robots.txt violations | `0.1` |

View File

@@ -25,6 +25,10 @@ dashboard:
# secret_path: super-secret-dashboard-path
secret_path: test
backups:
path: "backups"
cron: "*/30 * * * *"
exports:
path: "exports"

View File

@@ -18,6 +18,7 @@ services:
- ./logs:/app/logs
- ./exports:/app/exports
- ./data:/app/data
- ./backups:/app/backups
restart: unless-stopped
develop:
watch:

View File

@@ -22,6 +22,9 @@ data:
token_tries: {{ .Values.config.canary.token_tries }}
dashboard:
secret_path: {{ .Values.config.dashboard.secret_path | toYaml }}
backups:
path: {{ .Values.config.backups.path | quote }}
cron: {{ .Values.config.backups.cron | quote }}
exports:
path: {{ .Values.config.exports.path | quote }}
database:

View File

@@ -84,6 +84,9 @@ config:
token_tries: 10
dashboard:
secret_path: null # Auto-generated if not set, or set to "/my-secret-dashboard"
backups:
path: "backups"
cron: "*/30 * * * *"
exports:
path: "exports"
database:

View File

@@ -39,6 +39,10 @@ class Config:
# exporter settings
exports_path: str = "exports"
# backup job settings
backups_path: str = "backups"
backups_cron: str = "*/30 * * * *"
# Database settings
database_path: str = "data/krawl.db"
database_retention_days: int = 30
@@ -153,6 +157,7 @@ class Config:
dashboard = data.get("dashboard", {})
api = data.get("api", {})
exports = data.get("exports", {})
backups = data.get("backups", {})
database = data.get("database", {})
behavior = data.get("behavior", {})
analyzer = data.get("analyzer") or {}
@@ -189,6 +194,8 @@ class Config:
dashboard_secret_path=dashboard_path,
probability_error_codes=behavior.get("probability_error_codes", 0),
exports_path=exports.get("path"),
backups_path=backups.get("path"),
backups_cron=backups.get("cron"),
database_path=database.get("path", "data/krawl.db"),
database_retention_days=database.get("retention_days", 30),
http_risky_methods_threshold=analyzer.get(

104
src/tasks/db_dump.py Normal file
View File

@@ -0,0 +1,104 @@
# tasks/db_dump.py
from logger import get_app_logger
from database import get_database
from config import get_config
from sqlalchemy import MetaData, inspect
from sqlalchemy.schema import CreateTable
import os
# Shared application configuration and logger, resolved once at import time.
config = get_config()
app_logger = get_app_logger()

# ----------------------
# TASK CONFIG
# ----------------------
# Settings describing this background task: its name, cron schedule, and
# whether it is enabled / run as soon as the module is loaded.
TASK_CONFIG = {
    "name": "dump-krawl-data",
    # The config loader may leave `backups_cron` unset (None) when the YAML has
    # no `backups.cron` key; formatting None would yield the string "None",
    # which is not a cron expression, so fall back to the default schedule.
    "cron": str(config.backups_cron or "*/30 * * * *"),
    "enabled": True,
    "run_when_loaded": True,
}
# ----------------------
# TASK LOGIC
# ----------------------
def _sql_literal(value):
    """Render a value fetched from the database as a SQL literal.

    `repr()` produces Python syntax (``None``, ``datetime.datetime(...)``,
    ``b'..'``, backslash-escaped quotes), none of which is valid SQL, so each
    kind of value is converted explicitly:

    * ``None``            -> ``NULL``
    * ``bool``            -> ``1`` / ``0``
    * ``int`` / ``float`` -> the bare number
    * bytes-like          -> hexadecimal blob literal ``X'..'``
    * ``dict`` / ``list`` -> JSON text, quoted as a string
    * anything else       -> ``str(value)`` quoted, with ``'`` doubled
    """
    if value is None:
        return "NULL"
    # bool is a subclass of int, so it has to be tested before the numbers.
    if isinstance(value, bool):
        return "1" if value else "0"
    if isinstance(value, (int, float)):
        return repr(value)
    if isinstance(value, (bytes, bytearray, memoryview)):
        return "X'" + bytes(value).hex() + "'"
    if isinstance(value, (dict, list)):
        # str() of a dict/list is Python syntax, not JSON.
        import json

        text = json.dumps(value)
    else:
        text = str(value)
    # Standard SQL escapes a single quote inside a string by doubling it.
    return "'" + text.replace("'", "''") + "'"


def main():
    """
    Dump the krawl database to a SQL file for backups.

    Reflects the schema of the configured database, then writes the
    CREATE TABLE statements followed by one INSERT statement per row to
    ``<backups_path>/db_dump.sql``. The dump is first written to a temporary
    file next to the target and only moved into place once complete, so a
    failed run does not clobber the previous backup.

    Any exception is logged and swallowed; nothing is returned.
    """
    task_name = TASK_CONFIG.get("name")
    app_logger.info(f"[Background Task] {task_name} starting...")

    # Bound before the try so the finally clause is safe even when
    # get_database() itself raises.
    db = None
    try:
        db = get_database()
        engine = db._engine

        # Reflect the database structure
        metadata = MetaData()
        metadata.reflect(bind=engine)

        # `backups_path` may be None when the YAML has no `backups.path` key.
        backups_dir = config.backups_path or "backups"
        os.makedirs(backups_dir, exist_ok=True)
        output_file = os.path.join(backups_dir, "db_dump.sql")
        tmp_file = f"{output_file}.tmp"

        with open(tmp_file, "w", encoding="utf-8") as f:
            app_logger.info(f"[Background Task] {task_name} started database dump")

            # Dump schema (CREATE TABLE statements)
            f.write("-- Schema\n")
            f.write("-- " + "=" * 70 + "\n\n")
            for table in metadata.tables.values():
                app_logger.info(
                    f"[Background Task] {task_name} dumping {table} table schema"
                )
                create_stmt = str(CreateTable(table).compile(engine))
                f.write(f"{create_stmt};\n\n")

            # Dump data (INSERT statements)
            f.write("\n-- Data\n")
            f.write("-- " + "=" * 70 + "\n\n")
            with engine.connect() as conn:
                for table_name, table in metadata.tables.items():
                    f.write(f"-- Table: {table_name}\n")

                    # The column list is the same for every row of a table.
                    columns = ", ".join(col.name for col in table.columns)

                    # Iterate the result instead of building a full list of
                    # rows with fetchall().
                    row_count = 0
                    for row in conn.execute(table.select()):
                        if row_count == 0:
                            app_logger.info(
                                f"[Background Task] {task_name} dumping {table} content"
                            )
                        values = ", ".join(_sql_literal(value) for value in row)
                        f.write(
                            f"INSERT INTO {table_name} ({columns}) VALUES ({values});\n"
                        )
                        row_count += 1

                    if row_count:
                        f.write("\n")
                    else:
                        f.write(f"-- No data in {table_name}\n\n")
                        app_logger.info(
                            f"[Background Task] {task_name} no data in {table}"
                        )

        # Publish the finished dump in one step; on failure the previous
        # db_dump.sql is left untouched (a stale .tmp is overwritten next run).
        os.replace(tmp_file, output_file)
        app_logger.info(
            f"[Background Task] {task_name} Database dump completed: {output_file}"
        )
    except Exception as e:
        app_logger.error(f"[Background Task] {task_name} failed: {e}")
    finally:
        if db is not None:
            db.close_session()

View File

@@ -50,9 +50,7 @@ def generate_dashboard(stats: dict, dashboard_path: str = "") -> str:
# Generate suspicious accesses rows with clickable IPs
suspicious_rows = (
"\n".join(
[
f"""<tr class="ip-row" data-ip="{_escape(log["ip"])}">
"\n".join([f"""<tr class="ip-row" data-ip="{_escape(log["ip"])}">
<td class="ip-clickable">{_escape(log["ip"])}</td>
<td>{_escape(log["path"])}</td>
<td style="word-break: break-all;">{_escape(log["user_agent"][:60])}</td>
@@ -64,10 +62,7 @@ def generate_dashboard(stats: dict, dashboard_path: str = "") -> str:
<div class="loading">Loading stats...</div>
</div>
</td>
</tr>"""
for log in stats["recent_suspicious"][-10:]
]
)
</tr>""" for log in stats["recent_suspicious"][-10:]])
or '<tr><td colspan="4" style="text-align:center;">No suspicious activity detected</td></tr>'
)