diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
new file mode 100644
index 0000000..70fbb36
--- /dev/null
+++ b/.github/workflows/docker-build-push.yml
@@ -0,0 +1,100 @@
+# Builds the image from ./Dockerfile and pushes it to the configured registry.
+# "main" publishes <appVersion> and "latest"; other refs get <appVersion>-<ref>.
+name: Build and Push Docker Image
+
+on:
+  push:
+    branches:
+      - main
+      - beta
+      - dev
+      - github-actions-ci
+    paths:
+      - 'src/**'
+      - 'helm/Chart.yaml'
+      - 'config.yaml'
+      - 'Dockerfile'
+      - 'requirements.txt'
+      - 'entrypoint.sh'
+      - '.github/workflows/docker-build-push.yml'
+    tags:
+      - 'v*.*.*'
+  release:
+    types: [published]
+  workflow_dispatch:
+
+env:
+  REGISTRY: ${{ vars.DOCKER_REGISTRY }}
+  IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME }}
+
+jobs:
+  build-and-push:
+    runs-on: self-hosted
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract appVersion from Chart.yaml and determine tags
+        id: tags
+        env:
+          # Handed over as an environment variable so the ref name is data,
+          # never text substituted into the script below.
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          APP_VERSION=$(grep '^appVersion:' helm/Chart.yaml | awk '{print $2}' | tr -d '"' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+
+          if [ -z "$APP_VERSION" ]; then
+            echo "Error: Could not extract appVersion from Chart.yaml"
+            exit 1
+          fi
+
+          IMAGE="${REGISTRY}/${IMAGE_NAME}"
+
+          if [[ "$REF_NAME" == "main" ]]; then
+            TAGS="${IMAGE}:${APP_VERSION},${IMAGE}:latest"
+          else
+            # Image tags may only contain [A-Za-z0-9_.-]; anything else in the
+            # ref name (e.g. "/") becomes "-".
+            SAFE_REF=$(printf '%s' "$REF_NAME" | tr -c 'A-Za-z0-9_.-' '-')
+            TAGS="${IMAGE}:${APP_VERSION}-${SAFE_REF}"
+          fi
+
+          echo "tags=$TAGS" >> "$GITHUB_OUTPUT"
+
+      - name: Extract metadata (tags, labels)
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ steps.tags.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max
+
+      - name: Image digest
+        env:
+          TAGS: ${{ steps.tags.outputs.tags }}
+        run: |
+          echo "Image built and pushed with tags:"
+          echo "$TAGS"
diff --git a/.github/workflows/helm-package-push.yml b/.github/workflows/helm-package-push.yml
new file mode 100644
index 0000000..9ba9150
--- /dev/null
+++ b/.github/workflows/helm-package-push.yml
@@ -0,0 +1,84 @@
+# Packages ./helm and pushes the chart to the OCI registry named by REGISTRY.
+# "main" keeps the Chart.yaml version; other refs publish <version>-<ref>.
+name: Package and Push Helm Chart
+
+on:
+  push:
+    branches:
+      - main
+      - beta
+      - dev
+      - github-actions-ci
+    paths:
+      - 'helm/**'
+      - '.github/workflows/helm-package-push.yml'
+    tags:
+      - 'v*'
+  release:
+    types:
+      - published
+      - created
+  workflow_dispatch:
+
+env:
+  REGISTRY: ${{ vars.DOCKER_REGISTRY }}
+
+jobs:
+  package-and-push:
+    runs-on: self-hosted
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Helm
+        uses: azure/setup-helm@v4
+        with:
+          version: 'latest'
+
+      - name: Log in to Container Registry
+        env:
+          REGISTRY_USER: ${{ github.actor }}
+          REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # NOTE(review): the login host is fixed to ghcr.io while the push below
+          # targets REGISTRY - confirm both name the same registry.
+          echo "$REGISTRY_TOKEN" | helm registry login ghcr.io --username "$REGISTRY_USER" --password-stdin
+
+      - name: Set Helm chart version and package
+        env:
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          CHART_NAME=$(grep '^name:' ./helm/Chart.yaml | awk '{print $2}')
+          BASE_VERSION=$(grep '^version:' ./helm/Chart.yaml | awk '{print $2}')
+
+          if [[ "$REF_NAME" == "main" ]]; then
+            CHART_VERSION="${BASE_VERSION}"
+          else
+            # Keep the suffix a valid SemVer pre-release: only [0-9A-Za-z.-] is
+            # allowed, so every other character becomes "-".
+            SAFE_REF=$(printf '%s' "$REF_NAME" | tr -c 'A-Za-z0-9.-' '-')
+            CHART_VERSION="${BASE_VERSION}-${SAFE_REF}"
+          fi
+
+          # Update Chart.yaml temporarily with the versioned name
+          sed -i "s/^version:.*/version: ${CHART_VERSION}/" ./helm/Chart.yaml
+
+          # Package the helm chart
+          helm package ./helm
+
+          echo "CHART_NAME=${CHART_NAME}" >> "$GITHUB_ENV"
+          echo "CHART_VERSION=${CHART_VERSION}" >> "$GITHUB_ENV"
+
+      - name: Push Helm chart to registry
+        run: |
+          helm push "${CHART_NAME}-${CHART_VERSION}.tgz" "oci://${REGISTRY}"
+
+      - name: Chart pushed
+        run: |
+          # CHART_NAME/CHART_VERSION were exported through GITHUB_ENV by the packaging
+          # step; re-reading Chart.yaml here would append the ref suffix twice.
+          echo "Chart pushed: ${CHART_NAME}:${CHART_VERSION}"
diff --git a/.github/workflows/kubernetes-validation.yml b/.github/workflows/kubernetes-validation.yml
new file mode 100644
index 0000000..de5e1cd
--- /dev/null
+++ b/.github/workflows/kubernetes-validation.yml
@@ -0,0 +1,65 @@
+# Validates the raw manifests under kubernetes/ and the Helm chart on pull requests.
+name: Kubernetes Validation
+
+on:
+  pull_request:
+    branches:
+      - main
+      - beta
+      - dev
+    paths:
+      - 'kubernetes/**'
+      - 'helm/**'
+      - '.github/workflows/kubernetes-validation.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  validate-manifests:
+    name: Validate Kubernetes Manifests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Validate YAML syntax
+        run: |
+          # Without globstar, bash expands "**" like "*", so only files exactly one
+          # directory below kubernetes/ would be checked.
+          shopt -s globstar
+          for manifest in kubernetes/**/*.yaml; do
+            if [ -f "$manifest" ]; then
+              echo "Validating YAML syntax: $manifest"
+              # safe_load_all also accepts multi-document manifests; the path is
+              # passed as an argument rather than pasted into the Python source.
+              python3 -c 'import sys, yaml; list(yaml.safe_load_all(open(sys.argv[1])))' "$manifest" || exit 1
+            fi
+          done
+
+      - name: Validate manifest structure
+        run: |
+          # Each step runs in its own shell, so globstar must be enabled again here.
+          shopt -s globstar
+          for manifest in kubernetes/**/*.yaml; do
+            if [ -f "$manifest" ]; then
+              echo "Checking $manifest"
+              if ! grep -q "kind:" "$manifest"; then
+                echo "Error: $manifest does not contain a Kubernetes kind"
+                exit 1
+              fi
+            fi
+          done
+
+  validate-helm:
+    name: Validate Helm Chart
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: azure/setup-helm@v4
+
+      - name: Helm lint
+        run: helm lint ./helm
+
+      - name: Helm template validation
+        run: helm template krawl ./helm > /tmp/helm-output.yaml
diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml
new file mode 100644
index 0000000..9feb01c
--- /dev/null
+++ b/.github/workflows/pr-checks.yml
@@ -0,0 +1,48 @@
+# Runs the Black format check and a Docker build smoke test on pull requests.
+name: PR Checks
+
+on:
+  pull_request:
+    branches:
+      - main
+      - beta
+      - dev
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  lint-and-test:
+    name: Lint & Test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install black flake8 pylint pytest
+
+      - name: Black format check
+        run: |
+          if ! black --check src/; then
+            echo "Run 'black src/' to format code"
+            black --diff src/
+            exit 1
+          fi
+
+  build-docker:
+    name: Build Docker
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build Docker image
+        run: docker build -t krawl:test .
diff --git a/.github/workflows/security-scan.yml b/.github/workflows/security-scan.yml
new file mode 100644
index 0000000..732b1b7
--- /dev/null
+++ b/.github/workflows/security-scan.yml
@@ -0,0 +1,72 @@
+# Bandit static analysis, Safety dependency audit and Trivy scan on pull requests.
+name: Security Scan
+
+on:
+  pull_request:
+    branches:
+      - main
+      - beta
+      - dev
+
+permissions:
+  contents: read
+
+jobs:
+  security-checks:
+    name: Security & Dependencies
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install bandit safety
+
+      - name: Bandit security check
+        run: |
+          bandit -r src/ -f txt | tee bandit-report.txt
+
+          # Extract HIGH severity (not confidence) - look for the severity section
+          SEVERITY_SECTION=$(sed -n '/Total issues (by severity):/,/Total issues (by confidence):/p' bandit-report.txt)
+
+          # The step only sees tee's exit status, so a missing summary is the one
+          # signal that Bandit itself did not complete; do not count that as "0 HIGH".
+          if [ -z "$SEVERITY_SECTION" ]; then
+            echo "Error: Bandit report has no severity summary; the scan did not complete"
+            exit 1
+          fi
+
+          HIGH_COUNT=$(echo "$SEVERITY_SECTION" | grep "High:" | grep -o "[0-9]*" | head -1)
+
+          if [ -z "$HIGH_COUNT" ]; then
+            HIGH_COUNT=0
+          fi
+
+          if [ "$HIGH_COUNT" -gt 0 ]; then
+            echo "Found $HIGH_COUNT HIGH severity security issues"
+            exit 1
+          fi
+          echo "✓ No HIGH severity security issues found"
+
+      - name: Safety check for dependencies
+        # Advisory only: a failure is shown on the step but does not fail the job.
+        continue-on-error: true
+        run: safety check --json
+
+      - name: Trivy vulnerability scan
+        # Pinned to a release instead of the moving "master" branch.
+        # NOTE(review): consider pinning to the full commit SHA of this release.
+        uses: aquasecurity/trivy-action@0.28.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          format: 'table'
+          severity: 'CRITICAL,HIGH'
+          exit-code: '1'
diff --git a/.gitignore b/.gitignore
index 70b93e4..6249e18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,7 @@ secrets/
 .env
 .env.local
 .env.*.local
+.envrc
 
 # Logs
 *.log
@@ -76,3 +77,7 @@ data/
 # Personal canary tokens or sensitive configs
 *canary*token*.yaml
 personal-values.yaml
+
+#exports dir (keeping .gitkeep so we have the dir)
+/exports/*
+/src/exports/*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index adac20f..4015c74 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,16 +4,26 @@ LABEL org.opencontainers.image.source=https://github.com/BlessedRebuS/Krawl
 
 WORKDIR /app
 
+# Install gosu for dropping privileges
+RUN apt-get update && apt-get install -y --no-install-recommends gosu && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
 COPY src/ /app/src/
 COPY wordlists.json /app/
+COPY entrypoint.sh /app/
+COPY config.yaml /app/
 
 RUN useradd -m -u 1000 krawl && \
-    chown -R krawl:krawl /app
-
-USER krawl
+    mkdir -p /app/logs /app/data /app/exports && \
+    chown -R krawl:krawl /app && \
+    chmod +x /app/entrypoint.sh
 
 EXPOSE 5000
 
 ENV PYTHONUNBUFFERED=1
 
+ENTRYPOINT ["/app/entrypoint.sh"]
 CMD ["python3", "src/server.py"]
diff --git a/README.md b/README.md
index 58e0b06..aa1aee6 100644
--- a/README.md
+++ b/README.md
@@ -1,322 +1,360 @@
-

🕷️ Krawl

- -

- - -

-
- -

- A modern, customizable zero-dependencies honeypot server designed to detect and track malicious activity through deceptive web pages, fake credentials, and canary tokens. -

- -
- - License - - - Release - -
- -
- - GitHub Container Registry - - - Kubernetes - - - Helm Chart - -
- -
- -

- What is Krawl? • - Quick Start • - Honeypot Pages • - Dashboard • - Todo • - Contributing -

- -
-
- -## Demo -Tip: crawl the `robots.txt` paths for additional fun -### Krawl URL: [http://demo.krawlme.com](http://demo.krawlme.com) -### View the dashboard [http://demo.krawlme.com/das_dashboard](http://demo.krawlme.com/das_dashboard) - -## What is Krawl? - -**Krawl** is a cloud‑native deception server designed to detect, delay, and analyze malicious web crawlers and automated scanners. - -It creates realistic fake web applications filled with low‑hanging fruit such as admin panels, configuration files, and exposed fake credentials to attract and identify suspicious activity. - -By wasting attacker resources, Krawl helps clearly distinguish malicious behavior from legitimate crawlers. - -It features: - -- **Spider Trap Pages**: Infinite random links to waste crawler resources based on the [spidertrap project](https://github.com/adhdproject/spidertrap) -- **Fake Login Pages**: WordPress, phpMyAdmin, admin panels -- **Honeypot Paths**: Advertised in robots.txt to catch scanners -- **Fake Credentials**: Realistic-looking usernames, passwords, API keys -- **[Canary Token](#customizing-the-canary-token) Integration**: External alert triggering -- **Real-time Dashboard**: Monitor suspicious activity -- **Customizable Wordlists**: Easy JSON-based configuration -- **Random Error Injection**: Mimic real server behavior - -![asd](img/deception-page.png) - -## 🚀 Quick Start -## Helm Chart - -Install with default values - -```bash -helm install krawl oci://ghcr.io/blessedrebus/krawl-chart \ - --namespace krawl-system \ - --create-namespace -``` - -Install with custom [canary token](#customizing-the-canary-token) - -```bash -helm install krawl oci://ghcr.io/blessedrebus/krawl-chart \ - --namespace krawl-system \ - --create-namespace \ - --set config.canaryTokenUrl="http://your-canary-token-url" -``` - -To access the deception server - -```bash -kubectl get svc krawl -n krawl-system -``` - -Once the EXTERNAL-IP is assigned, access your deception server at: - -``` -http://:5000 
-``` - -## Kubernetes / Kustomize -Apply all manifests with - -```bash -kubectl apply -f https://raw.githubusercontent.com/BlessedRebuS/Krawl/refs/heads/main/manifests/krawl-all-in-one-deploy.yaml -``` - -Retrieve dashboard path with -```bash -kubectl get secret krawl-server -n krawl-system -o jsonpath='{.data.dashboard-path}' | base64 -d -``` - -Or clone the repo and apply the `manifest` folder with - -```bash -kubectl apply -k manifests -``` - -## Docker -Run Krawl as a docker container with - -```bash -docker run -d \ - -p 5000:5000 \ - -e CANARY_TOKEN_URL="http://your-canary-token-url" \ - --name krawl \ - ghcr.io/blessedrebus/krawl:latest -``` - -## Docker Compose -Run Krawl with docker-compose in the project folder with - -```bash -docker-compose up -d -``` - -Stop it with - -```bash -docker-compose down -``` - -## Python 3.11+ - -Clone the repository - -```bash -git clone https://github.com/blessedrebus/krawl.git -cd krawl/src -``` -Run the server -```bash -python3 server.py -``` - -Visit - -`http://localhost:5000` - -To access the dashboard - -`http://localhost:5000/` - -## Configuration via Environment Variables - -To customize the deception server installation several **environment variables** can be specified. 
- -| Variable | Description | Default | -|----------|-------------|---------| -| `PORT` | Server listening port | `5000` | -| `DELAY` | Response delay in milliseconds | `100` | -| `LINKS_MIN_LENGTH` | Minimum random link length | `5` | -| `LINKS_MAX_LENGTH` | Maximum random link length | `15` | -| `LINKS_MIN_PER_PAGE` | Minimum links per page | `10` | -| `LINKS_MAX_PER_PAGE` | Maximum links per page | `15` | -| `MAX_COUNTER` | Initial counter value | `10` | -| `CANARY_TOKEN_TRIES` | Requests before showing canary token | `10` | -| `CANARY_TOKEN_URL` | External canary token URL | None | -| `DASHBOARD_SECRET_PATH` | Custom dashboard path | Auto-generated | -| `PROBABILITY_ERROR_CODES` | Error response probability (0-100%) | `0` | -| `SERVER_HEADER` | HTTP Server header for deception | `Apache/2.2.22 (Ubuntu)` | - -## robots.txt -The actual (juicy) robots.txt configuration is the following - -```txt -Disallow: /admin/ -Disallow: /api/ -Disallow: /backup/ -Disallow: /config/ -Disallow: /database/ -Disallow: /private/ -Disallow: /uploads/ -Disallow: /wp-admin/ -Disallow: /phpMyAdmin/ -Disallow: /admin/login.php -Disallow: /api/v1/users -Disallow: /api/v2/secrets -Disallow: /.env -Disallow: /credentials.txt -Disallow: /passwords.txt -Disallow: /.git/ -Disallow: /backup.sql -Disallow: /db_backup.sql -``` - -## Honeypot pages -Requests to common admin endpoints (`/admin/`, `/wp-admin/`, `/phpMyAdmin/`) return a fake login page. Any login attempt triggers a 1-second delay to simulate real processing and is fully logged in the dashboard (credentials, IP, headers, timing). - -
- -
- -Requests to paths like `/backup/`, `/config/`, `/database/`, `/private/`, or `/uploads/` return a fake directory listing populated with “interesting” files, each assigned a random file size to look realistic. - -![directory-page](img/directory-page.png) - -The `.env` endpoint exposes fake database connection strings, **AWS API keys**, and **Stripe secrets**. It intentionally returns an error due to the `Content-Type` being `application/json` instead of plain text, mimicking a “juicy” misconfiguration that crawlers and scanners often flag as information leakage. - -![env-page](img/env-page.png) - -The pages `/api/v1/users` and `/api/v2/secrets` show fake users and random secrets in JSON format - -
- - -
- -The pages `/credentials.txt` and `/passwords.txt` show fake users and random secrets - -
- - -
- -## Customizing the Canary Token -To create a custom canary token, visit https://canarytokens.org - -and generate a “Web bug” canary token. - -This optional token is triggered when a crawler fully traverses the webpage until it reaches 0. At that point, a URL is returned. When this URL is requested, it sends an alert to the user via email, including the visitor’s IP address and user agent. - - -To enable this feature, set the canary token URL [using the environment variable](#configuration-via-environment-variables) `CANARY_TOKEN_URL`. - -## Customizing the wordlist - -Edit `wordlists.json` to customize fake data for your use case - -```json -{ - "usernames": { - "prefixes": ["admin", "root", "user"], - "suffixes": ["_prod", "_dev", "123"] - }, - "passwords": { - "prefixes": ["P@ssw0rd", "Admin"], - "simple": ["test", "password"] - }, - "directory_listing": { - "files": ["credentials.txt", "backup.sql"], - "directories": ["admin/", "backup/"] - } -} -``` - -or **values.yaml** in the case of helm chart installation - -## Dashboard - -Access the dashboard at `http://:/` - -The dashboard shows: -- Total and unique accesses -- Suspicious activity detection -- Top IPs, paths, and user-agents -- Real-time monitoring - -The attackers' triggered honeypot path and the suspicious activity (such as failed login attempts) are logged - -![dashboard-1](img/dashboard-1.png) - -The top IP Addresses is shown along with top paths and User Agents - -![dashboard-2](img/dashboard-2.png) - -### Retrieving Dashboard Path - -Check server startup logs or get the secret with - -```bash -kubectl get secret krawl-server -n krawl-system \ - -o jsonpath='{.data.dashboard-path}' | base64 -d && echo -``` - -## 🤝 Contributing - -Contributions welcome! Please: -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Submit a pull request (explain the changes!) - - -
- -## ⚠️ Disclaimer - -**This is a deception/honeypot system.** -Deploy in isolated environments and monitor carefully for security events. -Use responsibly and in compliance with applicable laws and regulations. - -## Star History -Star History Chart +

Krawl

+ +

+ + +

+
+ +

+ A modern, customizable web honeypot server designed to detect and track malicious activity from attackers and web crawlers through deceptive web pages, fake credentials, and canary tokens. +

+ + + + + +
+ +

+ What is Krawl? • + Installation • + Honeypot Pages • + Dashboard • + Todo • + Contributing +

+ +
+
+ +## Demo +Tip: crawl the `robots.txt` paths for additional fun +### Krawl URL: [http://demo.krawlme.com](http://demo.krawlme.com) +### View the dashboard [http://demo.krawlme.com/das_dashboard](http://demo.krawlme.com/das_dashboard) + +## What is Krawl? + +**Krawl** is a cloud‑native deception server designed to detect, delay, and analyze malicious attackers, web crawlers and automated scanners. + +It creates realistic fake web applications filled with low‑hanging fruit such as admin panels, configuration files, and exposed fake credentials to attract and identify suspicious activity. + +By wasting attacker resources, Krawl helps clearly distinguish malicious behavior from legitimate crawlers. + +It features: + +- **Spider Trap Pages**: Infinite random links to waste crawler resources based on the [spidertrap project](https://github.com/adhdproject/spidertrap) +- **Fake Login Pages**: WordPress, phpMyAdmin, admin panels +- **Honeypot Paths**: Advertised in robots.txt to catch scanners +- **Fake Credentials**: Realistic-looking usernames, passwords, API keys +- **[Canary Token](#customizing-the-canary-token) Integration**: External alert triggering +- **Random server headers**: Confuse attacks based on server header and version +- **Real-time Dashboard**: Monitor suspicious activity +- **Customizable Wordlists**: Easy JSON-based configuration +- **Random Error Injection**: Mimic real server behavior + +![dashboard](img/deception-page.png) + +![geoip](img/geoip_dashboard.png) + +## 🚀 Installation + +### Docker Run + +Run Krawl with the latest image: + +```bash +docker run -d \ + -p 5000:5000 \ + -e KRAWL_PORT=5000 \ + -e KRAWL_DELAY=100 \ + -e KRAWL_DASHBOARD_SECRET_PATH="/my-secret-dashboard" \ + -e KRAWL_DATABASE_RETENTION_DAYS=30 \ + --name krawl \ + ghcr.io/blessedrebus/krawl:latest +``` + +Access the server at `http://localhost:5000` + +### Docker Compose + +Create a `docker-compose.yaml` file: + +```yaml +services: + krawl: + image: 
ghcr.io/blessedrebus/krawl:latest + container_name: krawl-server + ports: + - "5000:5000" + environment: + - CONFIG_LOCATION=config.yaml + - TZ="Europe/Rome" + volumes: + - ./config.yaml:/app/config.yaml:ro + - krawl-data:/app/data + restart: unless-stopped + +volumes: + krawl-data: +``` + +Run with: + +```bash +docker-compose up -d +``` + +Stop with: + +```bash +docker-compose down +``` + +### Kubernetes +**Krawl is also available natively on Kubernetes**. Installation can be done either [via manifest](kubernetes/README.md) or [using the helm chart](helm/README.md). + +## Use Krawl to Ban Malicious IPs +Krawl uses a reputation-based system to classify attacker IP addresses. Every five minutes, Krawl exports the identified malicious IPs to a `malicious_ips.txt` file. + +This file can either be mounted from the Docker container into another system or downloaded directly via `curl`: + +```bash +curl https://your-krawl-instance//api/download/malicious_ips.txt +``` + +This file can be used to [update a set of firewall rules](https://www.allthingstech.ch/using-opnsense-and-ip-blocklists-to-block-malicious-traffic), for example on OPNsense and pfSense, enabling automatic blocking of malicious IPs or using IPtables + +## IP Reputation +Krawl [uses tasks that analyze recent traffic to build and continuously update an IP reputation](src/tasks/analyze_ips.py) score. It runs periodically and evaluates each active IP address based on multiple behavioral indicators to classify it as an attacker, crawler, or regular user. Thresholds are fully customizable. + +![ip reputation](img/ip-reputation.png) + +The analysis includes: +- **Risky HTTP methods usage** (e.g. POST, PUT, DELETE ratios) +- **Robots.txt violations** +- **Request timing anomalies** (bursty or irregular patterns) +- **User-Agent consistency** +- **Attack URL detection** (e.g. 
SQL injection, XSS patterns) + +Each signal contributes to a weighted scoring model that assigns a reputation category: +- `attacker` +- `bad_crawler` +- `good_crawler` +- `regular_user` +- `unknown` (for insufficient data) + +The resulting scores and metrics are stored in the database and used by Krawl to drive dashboards, reputation tracking, and automated mitigation actions such as IP banning or firewall integration. + +## Forward server header +If Krawl is deployed behind a proxy such as NGINX the **server header** should be forwarded using the following configuration in your proxy: + +```bash +location / { + proxy_pass https://your-krawl-instance; + proxy_pass_header Server; +} +``` + +## API +Krawl uses the following APIs +- https://iprep.lcrawl.com (IP Reputation) +- https://nominatim.openstreetmap.org/reverse (Reverse IP Lookup) +- https://api.ipify.org (Public IP discovery) +- http://ident.me (Public IP discovery) +- https://ifconfig.me (Public IP discovery) + +## Configuration +Krawl uses a **configuration hierarchy** in which **environment variables take precedence over the configuration file**. This approach is recommended for Docker deployments and quick out-of-the-box customization. 
+ +### Configuration via Environment Variables + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| `CONFIG_LOCATION` | Path to yaml config file | `config.yaml` | +| `KRAWL_PORT` | Server listening port | `5000` | +| `KRAWL_DELAY` | Response delay in milliseconds | `100` | +| `KRAWL_SERVER_HEADER` | HTTP Server header for deception | `""` | +| `KRAWL_LINKS_LENGTH_RANGE` | Link length range as `min,max` | `5,15` | +| `KRAWL_LINKS_PER_PAGE_RANGE` | Links per page as `min,max` | `10,15` | +| `KRAWL_CHAR_SPACE` | Characters used for link generation | `abcdefgh...` | +| `KRAWL_MAX_COUNTER` | Initial counter value | `10` | +| `KRAWL_CANARY_TOKEN_URL` | External canary token URL | None | +| `KRAWL_CANARY_TOKEN_TRIES` | Requests before showing canary token | `10` | +| `KRAWL_DASHBOARD_SECRET_PATH` | Custom dashboard path | Auto-generated | +| `KRAWL_PROBABILITY_ERROR_CODES` | Error response probability (0-100%) | `0` | +| `KRAWL_DATABASE_PATH` | Database file location | `data/krawl.db` | +| `KRAWL_DATABASE_RETENTION_DAYS` | Days to retain data in database | `30` | +| `KRAWL_HTTP_RISKY_METHODS_THRESHOLD` | Threshold for risky HTTP methods detection | `0.1` | +| `KRAWL_VIOLATED_ROBOTS_THRESHOLD` | Threshold for robots.txt violations | `0.1` | +| `KRAWL_UNEVEN_REQUEST_TIMING_THRESHOLD` | Coefficient of variation threshold for timing | `0.5` | +| `KRAWL_UNEVEN_REQUEST_TIMING_TIME_WINDOW_SECONDS` | Time window for request timing analysis in seconds | `300` | +| `KRAWL_USER_AGENTS_USED_THRESHOLD` | Threshold for detecting multiple user agents | `2` | +| `KRAWL_ATTACK_URLS_THRESHOLD` | Threshold for attack URL detection | `1` | +| `KRAWL_INFINITE_PAGES_FOR_MALICIOUS` | Serve infinite pages to malicious IPs | `true` | +| `KRAWL_MAX_PAGES_LIMIT` | Maximum page limit for crawlers | `250` | +| `KRAWL_BAN_DURATION_SECONDS` | Ban duration in seconds for rate-limited IPs | `600` | + +For example + +```bash +# Set canary token +export 
CONFIG_LOCATION="config.yaml" +export KRAWL_CANARY_TOKEN_URL="http://your-canary-token-url" + +# Set number of pages range (min,max format) +export KRAWL_LINKS_PER_PAGE_RANGE="5,25" + +# Set analyzer thresholds +export KRAWL_HTTP_RISKY_METHODS_THRESHOLD="0.2" +export KRAWL_VIOLATED_ROBOTS_THRESHOLD="0.15" + +# Set custom dashboard path +export KRAWL_DASHBOARD_SECRET_PATH="/my-secret-dashboard" +``` + +Example of a Docker run with env variables: + +```bash +docker run -d \ + -p 5000:5000 \ + -e KRAWL_PORT=5000 \ + -e KRAWL_DELAY=100 \ + -e KRAWL_CANARY_TOKEN_URL="http://your-canary-token-url" \ + --name krawl \ + ghcr.io/blessedrebus/krawl:latest +``` + +### Configuration via config.yaml +You can use the [config.yaml](config.yaml) file for more advanced configurations, such as Docker Compose or Helm chart deployments. + +# Honeypot +Below is a complete overview of the Krawl honeypot’s capabilities + +## robots.txt +The actual (juicy) robots.txt configuration [is the following](src/templates/html/robots.txt). + +## Honeypot pages +Requests to common admin endpoints (`/admin/`, `/wp-admin/`, `/phpMyAdmin/`) return a fake login page. Any login attempt triggers a 1-second delay to simulate real processing and is fully logged in the dashboard (credentials, IP, headers, timing). + +![admin page](img/admin-page.png) + + +Requests to paths like `/backup/`, `/config/`, `/database/`, `/private/`, or `/uploads/` return a fake directory listing populated with “interesting” files, each assigned a random file size to look realistic. + +![directory-page](img/directory-page.png) + +The `.env` endpoint exposes fake database connection strings, **AWS API keys**, and **Stripe secrets**. It intentionally returns an error due to the `Content-Type` being `application/json` instead of plain text, mimicking a “juicy” misconfiguration that crawlers and scanners often flag as information leakage. + +The `/server` page displays randomly generated fake error information for each known server. 
+ +![server and env page](img/server-and-env-page.png) + +The pages `/api/v1/users` and `/api/v2/secrets` show fake users and random secrets in JSON format + +![users and secrets](img/users-and-secrets.png) + +The pages `/credentials.txt` and `/passwords.txt` show fake users and random secrets + +![credentials and passwords](img/credentials-and-passwords.png) + +Pages such as `/users`, `/search`, `/contact`, `/info`, `/input`, and `/feedback`, along with APIs like `/api/sql` and `/api/database`, are designed to lure attackers into performing attacks such as **SQL injection** or **XSS**. + +![sql injection](img/sql_injection.png) + +Automated tools like **SQLMap** will receive a different randomized database error on each request, increasing scan noise and confusing the attacker. All detected attacks are logged and displayed in the dashboard. + +## Customizing the Canary Token +To create a custom canary token, visit https://canarytokens.org + +and generate a “Web bug” canary token. + +This optional token is triggered when a crawler fully traverses the webpage until it reaches 0. At that point, a URL is returned. When this URL is requested, it sends an alert to the user via email, including the visitor’s IP address and user agent. + + +To enable this feature, set the canary token URL [using the environment variable](#configuration-via-environment-variables) `KRAWL_CANARY_TOKEN_URL`. 
+ +## Customizing the wordlist + +Edit `wordlists.json` to customize fake data for your use case + +```json +{ + "usernames": { + "prefixes": ["admin", "root", "user"], + "suffixes": ["_prod", "_dev", "123"] + }, + "passwords": { + "prefixes": ["P@ssw0rd", "Admin"], + "simple": ["test", "password"] + }, + "directory_listing": { + "files": ["credentials.txt", "backup.sql"], + "directories": ["admin/", "backup/"] + } +} +``` + +or **values.yaml** in the case of helm chart installation + +## Dashboard + +Access the dashboard at `http://:/` + +The dashboard shows: +- Total and unique accesses +- Suspicious activity and attack detection +- Top IPs, paths, user-agents and GeoIP localization +- Real-time monitoring + +The attackers’ access to the honeypot endpoint and related suspicious activities (such as failed login attempts) are logged. + +Krawl also implements a scoring system designed to distinguish between malicious and legitimate behavior on the website. + +![dashboard-1](img/dashboard-1.png) + +The top IP Addresses is shown along with top paths and User Agents + +![dashboard-2](img/dashboard-2.png) + +![dashboard-3](img/dashboard-3.png) + +## 🤝 Contributing + +Contributions welcome! Please: +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Submit a pull request (explain the changes!) + + +
+ +## ⚠️ Disclaimer + +**This is a deception/honeypot system.** +Deploy in isolated environments and monitor carefully for security events. +Use responsibly and in compliance with applicable laws and regulations. + +## Star History +Star History Chart diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..c29ebe4 --- /dev/null +++ b/config.yaml @@ -0,0 +1,46 @@ +# Krawl Honeypot Configuration + +server: + port: 5000 + delay: 100 # Response delay in milliseconds + + # manually set the server header, if null a random one will be used. + server_header: null + +links: + min_length: 5 + max_length: 15 + min_per_page: 5 + max_per_page: 10 + char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + max_counter: 10 + +canary: + token_url: null # Optional canary token URL + token_tries: 10 + +dashboard: + # if set to null, a random path is auto-generated + # can be set to "/dashboard" or similar <-- note this MUST include a forward slash + # secret_path: /super-secret-dashboard-path + secret_path: null + +database: + path: "data/krawl.db" + retention_days: 30 + +behavior: + probability_error_codes: 0 # 0-100 percentage + +analyzer: + http_risky_methods_threshold: 0.1 + violated_robots_threshold: 0.1 + uneven_request_timing_threshold: 0.5 + uneven_request_timing_time_window_seconds: 300 + user_agents_used_threshold: 2 + attack_urls_threshold: 1 + +crawl: + infinite_pages_for_malicious: true + max_pages_limit: 250 + ban_duration_seconds: 600 \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 1612864..233692b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,5 +1,4 @@ -version: '3.8' - +--- services: krawl: build: @@ -8,27 +7,26 @@ services: container_name: krawl-server ports: - "5000:5000" + environment: + - CONFIG_LOCATION=config.yaml + # set this to change timezone, alternatively mount /etc/timezone or /etc/localtime based on the time system management of the host 
environment + # - TZ=${TZ} volumes: - ./wordlists.json:/app/wordlists.json:ro - environment: - - PORT=5000 - - DELAY=100 - - LINKS_MIN_LENGTH=5 - - LINKS_MAX_LENGTH=15 - - LINKS_MIN_PER_PAGE=10 - - LINKS_MAX_PER_PAGE=15 - - MAX_COUNTER=10 - - CANARY_TOKEN_TRIES=10 - - PROBABILITY_ERROR_CODES=0 - - SERVER_HEADER=Apache/2.2.22 (Ubuntu) - # Optional: Set your canary token URL - # - CANARY_TOKEN_URL=http://canarytokens.com/api/users/YOUR_TOKEN/passwords.txt - # Optional: Set custom dashboard path (auto-generated if not set) - # - DASHBOARD_SECRET_PATH=/my-secret-dashboard + - ./config.yaml:/app/config.yaml:ro + - ./logs:/app/logs + - ./exports:/app/exports + - data:/app/data restart: unless-stopped - healthcheck: - test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:5000')"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 10s + develop: + watch: + - path: ./Dockerfile + action: rebuild + - path: ./src/ + action: sync+restart + target: /app/src + - path: ./docker-compose.yaml + action: rebuild + +volumes: + data: diff --git a/docs/coding-guidelines.md b/docs/coding-guidelines.md new file mode 100644 index 0000000..1e13575 --- /dev/null +++ b/docs/coding-guidelines.md @@ -0,0 +1,90 @@ +### Coding Standards + +**Style & Structure** +- Prefer longer, explicit code over compact one-liners +- Always include docstrings for functions/classes + inline comments +- Strongly prefer OOP-style code (classes over functional/nested functions) +- Strong typing throughout (dataclasses, TypedDict, Enums, type hints) +- Value future-proofing and expanded usage insights + +**Data Design** +- Use dataclasses for internal data modeling +- Typed JSON structures +- Functions return fully typed objects (no loose dicts) +- Snapshot files in JSON or YAML +- Human-readable fields (e.g., `sql_injection`, `xss_attempt`) + +**Templates & UI** +- Don't mix large HTML/CSS blocks in Python code +- Prefer Jinja templates for HTML rendering +- Clean CSS, minimal 
inline clutter, readable template logic + +**Writing & Documentation** +- Markdown documentation +- Clear section headers +- Roadmap/Phase/Feature-Session style documents + +**Logging** +- Use the logging singleton found in `src/logger.py` +- Set up logging at app start: + ``` + initialize_logging() + app_logger = get_app_logger() + access_logger = get_access_logger() + credential_logger = get_credential_logger() + ``` + +**Preferred Pip Packages** +- API/Web Server: Simple Python +- HTTP: Requests +- SQLite: SQLAlchemy +- Database Migrations: Alembic + +### Error Handling +- Custom exception classes for domain-specific errors +- Consistent error response formats (JSON structure) +- Logging severity levels (ERROR vs WARNING) + +### Configuration +- `.env` for secrets (never committed) +- Maintain `.env.example` in each component for documentation +- Typed config loaders using dataclasses +- Validation on startup + +### Containerization & Deployment +- Explicit Dockerfiles +- Production-friendly hardening (distroless/slim when meaningful) +- Use git branch as tag + +### Dependency Management +- Use `requirements.txt` and virtual environments (`python3 -m venv venv`) +- Use path `venv` for all virtual environments +- Pin versions to version ranges (or exact versions if pinning a particular version) +- Activate venv before running code (unless in Docker) + +### Testing Standards +- Manual testing preferred for applications +- **tests:** Use shell scripts with curl/httpie for simulation and attack scripts.
+- Tests should be located in the `tests` directory + +### Git Standards + +**Branch Strategy:** +- `main` - Production-ready code only +- `beta` - Public pre-release testing +- `dev` - Main development branch, integration point + +**Workflow:** +- Feature work branches off `dev` (e.g., `feature/add-scheduler`) +- Merge features back to `dev` for testing +- Promote `dev` → `beta` for public testing (when applicable) +- Promote `beta` (or `dev`) → `main` for production + +**Commit Messages:** +- Use conventional commit format: `feat:`, `fix:`, `docs:`, `refactor:`, etc. +- Keep commits atomic and focused +- Write clear, descriptive messages + +**Tagging:** +- Tag releases on `main` with semantic versioning (e.g., `v1.2.3`) +- Optionally tag beta releases (e.g., `v1.2.3-beta.1`) \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..fe3ef45 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/sh +set -e + +# Fix ownership of mounted directories +chown -R krawl:krawl /app/logs /app/data /app/exports 2>/dev/null || true + +# Drop to krawl user and run the application +exec gosu krawl "$@" diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 3fe5d8a..2e3ae94 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: krawl-chart description: A Helm chart for Krawl honeypot server type: application -version: 0.1.2 -appVersion: "1.0.0" +version: 1.0.0 +appVersion: 1.0.0 keywords: - honeypot - security @@ -13,3 +13,4 @@ maintainers: home: https://github.com/blessedrebus/krawl sources: - https://github.com/blessedrebus/krawl +icon: https://raw.githubusercontent.com/blessedrebus/krawl/main/img/krawl-svg.svg \ No newline at end of file diff --git a/helm/README.md b/helm/README.md new file mode 100644 index 0000000..ae57261 --- /dev/null +++ b/helm/README.md @@ -0,0 +1,356 @@ +# Krawl Helm Chart + +A Helm chart for deploying the Krawl honeypot application on Kubernetes.
+ +## Prerequisites + +- Kubernetes 1.19+ +- Helm 3.0+ +- Persistent Volume provisioner (optional, for database persistence) + +## Installation + + +### Helm Chart + +Install with default values: + +```bash +helm install krawl oci://ghcr.io/blessedrebus/krawl-chart \ + --version 1.0.0 \ + --namespace krawl-system \ + --create-namespace +``` + +Or create a minimal `values.yaml` file: + +```yaml +service: + type: LoadBalancer + port: 5000 + +timezone: "Europe/Rome" + +ingress: + enabled: true + className: "traefik" + hosts: + - host: krawl.example.com + paths: + - path: / + pathType: Prefix + +config: + server: + port: 5000 + delay: 100 + dashboard: + secret_path: null # Auto-generated if not set + +database: + persistence: + enabled: true + size: 1Gi +``` + +Install with custom values: + +```bash +helm install krawl oci://ghcr.io/blessedrebus/krawl-chart \ + --version 1.0.0 \ + --namespace krawl-system \ + --create-namespace \ + -f values.yaml +``` + +To access the deception server: + +```bash +kubectl get svc krawl -n krawl-system +``` + +Once the EXTERNAL-IP is assigned, access your deception server at `http://<EXTERNAL-IP>:5000` + +### Add the repository (if applicable) + +```bash +helm repo add krawl https://github.com/BlessedRebuS/Krawl +helm repo update +``` + +### Install from OCI Registry + +```bash +helm install krawl oci://ghcr.io/blessedrebus/krawl-chart --version 1.0.0 +``` + +Or with a specific namespace: + +```bash +helm install krawl oci://ghcr.io/blessedrebus/krawl-chart --version 1.0.0 -n krawl --create-namespace +``` + +### Install the chart locally + +```bash +helm install krawl ./helm +``` + +### Install with custom values + +```bash +helm install krawl ./helm -f values.yaml +``` + +### Install in a specific namespace + +```bash +helm install krawl ./helm -n krawl --create-namespace +``` + +## Configuration + +The following table lists the main configuration parameters of the Krawl chart and their default values.
+ +### Global Settings + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `replicaCount` | Number of pod replicas | `1` | +| `image.repository` | Image repository | `ghcr.io/blessedrebus/krawl` | +| `image.tag` | Image tag | `1.0.0` | +| `image.pullPolicy` | Image pull policy | `Always` | + +### Service Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `service.type` | Service type | `LoadBalancer` | +| `service.port` | Service port | `5000` | +| `service.externalTrafficPolicy` | External traffic policy | `Local` | + +### Ingress Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `ingress.enabled` | Enable ingress | `true` | +| `ingress.className` | Ingress class name | `traefik` | +| `ingress.hosts[0].host` | Ingress hostname | `krawl.example.com` | + +### Server Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.server.port` | Server port | `5000` | +| `config.server.delay` | Response delay in milliseconds | `100` | +| `timezone` | Container IANA timezone, exported as `TZ` (e.g., "America/New_York") | `""` | + +### Links Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.links.min_length` | Minimum link length | `5` | +| `config.links.max_length` | Maximum link length | `15` | +| `config.links.min_per_page` | Minimum links per page | `10` | +| `config.links.max_per_page` | Maximum links per page | `15` | +| `config.links.char_space` | Character space for link generation | `abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789` | +| `config.links.max_counter` | Maximum counter value | `10` | + +### Canary Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.canary.token_url` | Canary token URL | `null` | +| `config.canary.token_tries` | Number of canary token tries | `10` | + +###
Dashboard Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.dashboard.secret_path` | Secret dashboard path (auto-generated if null) | `null` | + +### API Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.api.server_url` | API server URL | `null` | +| `config.api.server_port` | API server port | `8080` | +| `config.api.server_path` | API server path | `/api/v2/users` | + +### Database Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.database.path` | Database file path | `data/krawl.db` | +| `config.database.retention_days` | Data retention in days | `30` | +| `database.persistence.enabled` | Enable persistent volume | `true` | +| `database.persistence.size` | Persistent volume size | `1Gi` | +| `database.persistence.accessMode` | Access mode | `ReadWriteOnce` | + +### Behavior Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.behavior.probability_error_codes` | Error code probability (0-100) | `0` | + +### Analyzer Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.analyzer.http_risky_methods_threshold` | HTTP risky methods threshold | `0.1` | +| `config.analyzer.violated_robots_threshold` | Violated robots.txt threshold | `0.1` | +| `config.analyzer.uneven_request_timing_threshold` | Uneven request timing threshold | `0.5` | +| `config.analyzer.uneven_request_timing_time_window_seconds` | Time window for request timing analysis | `300` | +| `config.analyzer.user_agents_used_threshold` | User agents threshold | `2` | +| `config.analyzer.attack_urls_threshold` | Attack URLs threshold | `1` | + +### Crawl Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `config.crawl.infinite_pages_for_malicious` | Infinite pages for malicious crawlers | `true` | +| 
`config.crawl.max_pages_limit` | Maximum pages limit for legitimate crawlers | `250` | +| `config.crawl.ban_duration_seconds` | IP ban duration in seconds | `600` | + +### Resource Limits + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `resources.limits.cpu` | CPU limit | `500m` | +| `resources.limits.memory` | Memory limit | `256Mi` | +| `resources.requests.cpu` | CPU request | `100m` | +| `resources.requests.memory` | Memory request | `64Mi` | + +### Autoscaling + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `autoscaling.enabled` | Enable horizontal pod autoscaling | `false` | +| `autoscaling.minReplicas` | Minimum replicas | `1` | +| `autoscaling.maxReplicas` | Maximum replicas | `1` | +| `autoscaling.targetCPUUtilizationPercentage` | Target CPU utilization | `70` | +| `autoscaling.targetMemoryUtilizationPercentage` | Target memory utilization | `80` | + +### Network Policy + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `networkPolicy.enabled` | Enable network policy | `true` | + +### Retrieving Dashboard Path + +Check server startup logs or get the secret with + +```bash +kubectl get secret krawl-server -n krawl-system \ + -o jsonpath='{.data.dashboard-path}' | base64 -d && echo +``` + +## Usage Examples + +### Basic Installation + +```bash +helm install krawl ./helm +``` + +### Installation with Custom Domain + +```bash +helm install krawl ./helm \ + --set ingress.hosts[0].host=honeypot.example.com +``` + +### Enable Canary Tokens + +```bash +helm install krawl ./helm \ + --set config.canary.token_url=https://canarytokens.com/your-token +``` + +### Configure Custom API Endpoint + +```bash +helm install krawl ./helm \ + --set config.api.server_url=https://api.example.com \ + --set config.api.server_port=443 +``` + +### Create Values Override File + +Create `custom-values.yaml`: + +```yaml +config: + server: + port: 8080 + delay: 500 + canary: + token_url: 
https://your-canary-token-url + dashboard: + secret_path: /super-secret-path + crawl: + max_pages_limit: 500 + ban_duration_seconds: 3600 +``` + +Then install: + +```bash +helm install krawl ./helm -f custom-values.yaml +``` + +## Upgrading + +```bash +helm upgrade krawl ./helm +``` + +## Uninstalling + +```bash +helm uninstall krawl +``` + +## Troubleshooting + +### Check chart syntax + +```bash +helm lint ./helm +``` + +### Dry run to verify values + +```bash +helm install krawl ./helm --dry-run --debug +``` + +### Check deployed configuration + +```bash +kubectl get configmap krawl-config -o yaml +``` + +### View pod logs + +```bash +kubectl logs -l app.kubernetes.io/name=krawl +``` + +## Chart Files + +- `Chart.yaml` - Chart metadata +- `values.yaml` - Default configuration values +- `templates/` - Kubernetes resource templates + - `deployment.yaml` - Krawl deployment + - `service.yaml` - Service configuration + - `configmap.yaml` - Application configuration + - `pvc.yaml` - Persistent volume claim + - `ingress.yaml` - Ingress configuration + - `hpa.yaml` - Horizontal pod autoscaler + - `network-policy.yaml` - Network policies + +## Support + +For issues and questions, please visit the [Krawl GitHub repository](https://github.com/BlessedRebuS/Krawl). diff --git a/helm/templates/configmap.yaml b/helm/templates/configmap.yaml index c50ab75..f81d319 100644 --- a/helm/templates/configmap.yaml +++ b/helm/templates/configmap.yaml @@ -5,14 +5,36 @@ metadata: labels: {{- include "krawl.labels" . 
| nindent 4 }} data: - PORT: {{ .Values.config.port | quote }} - DELAY: {{ .Values.config.delay | quote }} - LINKS_MIN_LENGTH: {{ .Values.config.linksMinLength | quote }} - LINKS_MAX_LENGTH: {{ .Values.config.linksMaxLength | quote }} - LINKS_MIN_PER_PAGE: {{ .Values.config.linksMinPerPage | quote }} - LINKS_MAX_PER_PAGE: {{ .Values.config.linksMaxPerPage | quote }} - MAX_COUNTER: {{ .Values.config.maxCounter | quote }} - CANARY_TOKEN_TRIES: {{ .Values.config.canaryTokenTries | quote }} - PROBABILITY_ERROR_CODES: {{ .Values.config.probabilityErrorCodes | quote }} - SERVER_HEADER: {{ .Values.config.serverHeader | quote }} - CANARY_TOKEN_URL: {{ .Values.config.canaryTokenUrl | quote }} + config.yaml: | + # Krawl Honeypot Configuration + server: + port: {{ .Values.config.server.port }} + delay: {{ .Values.config.server.delay }} + links: + min_length: {{ .Values.config.links.min_length }} + max_length: {{ .Values.config.links.max_length }} + min_per_page: {{ .Values.config.links.min_per_page }} + max_per_page: {{ .Values.config.links.max_per_page }} + char_space: {{ .Values.config.links.char_space | quote }} + max_counter: {{ .Values.config.links.max_counter }} + canary: + token_url: {{ .Values.config.canary.token_url | toYaml }} + token_tries: {{ .Values.config.canary.token_tries }} + dashboard: + secret_path: {{ .Values.config.dashboard.secret_path | toYaml }} + database: + path: {{ .Values.config.database.path | quote }} + retention_days: {{ .Values.config.database.retention_days }} + behavior: + probability_error_codes: {{ .Values.config.behavior.probability_error_codes }} + analyzer: + http_risky_methods_threshold: {{ .Values.config.analyzer.http_risky_methods_threshold }} + violated_robots_threshold: {{ .Values.config.analyzer.violated_robots_threshold }} + uneven_request_timing_threshold: {{ .Values.config.analyzer.uneven_request_timing_threshold }} + uneven_request_timing_time_window_seconds: {{ .Values.config.analyzer.uneven_request_timing_time_window_seconds 
}} + user_agents_used_threshold: {{ .Values.config.analyzer.user_agents_used_threshold }} + attack_urls_threshold: {{ .Values.config.analyzer.attack_urls_threshold }} + crawl: + infinite_pages_for_malicious: {{ .Values.config.crawl.infinite_pages_for_malicious }} + max_pages_limit: {{ .Values.config.crawl.max_pages_limit }} + ban_duration_seconds: {{ .Values.config.crawl.ban_duration_seconds }} diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index b0aeb6d..f24261c 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -38,30 +38,49 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} ports: - name: http - containerPort: {{ .Values.config.port }} + containerPort: {{ .Values.config.server.port }} protocol: TCP - envFrom: - - configMapRef: - name: {{ include "krawl.fullname" . }}-config env: - - name: DASHBOARD_SECRET_PATH - valueFrom: - secretKeyRef: - name: {{ include "krawl.fullname" . }} - key: dashboard-path + - name: CONFIG_LOCATION + value: "config.yaml" + {{- if .Values.timezone }} + - name: TZ + value: {{ .Values.timezone | quote }} + {{- end }} volumeMounts: + - name: config + mountPath: /app/config.yaml + subPath: config.yaml + readOnly: true - name: wordlists mountPath: /app/wordlists.json subPath: wordlists.json readOnly: true + {{- if .Values.database.persistence.enabled }} + - name: database + mountPath: /app/data + {{- end }} {{- with .Values.resources }} resources: {{- toYaml . | nindent 12 }} {{- end }} volumes: + - name: config + configMap: + name: {{ include "krawl.fullname" . }}-config - name: wordlists configMap: name: {{ include "krawl.fullname" . }}-wordlists + {{- if .Values.database.persistence.enabled }} + - name: database + {{- if .Values.database.persistence.existingClaim }} + persistentVolumeClaim: + claimName: {{ .Values.database.persistence.existingClaim }} + {{- else }} + persistentVolumeClaim: + claimName: {{ include "krawl.fullname" . 
}}-db + {{- end }} + {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/helm/templates/pvc.yaml b/helm/templates/pvc.yaml new file mode 100644 index 0000000..ec73af2 --- /dev/null +++ b/helm/templates/pvc.yaml @@ -0,0 +1,17 @@ +{{- if and .Values.database.persistence.enabled (not .Values.database.persistence.existingClaim) }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "krawl.fullname" . }}-db + labels: + {{- include "krawl.labels" . | nindent 4 }} +spec: + accessModes: + - {{ .Values.database.persistence.accessMode }} + {{- if .Values.database.persistence.storageClassName }} + storageClassName: {{ .Values.database.persistence.storageClassName }} + {{- end }} + resources: + requests: + storage: {{ .Values.database.persistence.size }} +{{- end }} diff --git a/helm/templates/secret.yaml b/helm/templates/secret.yaml deleted file mode 100644 index 798289c..0000000 --- a/helm/templates/secret.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- $secret := (lookup "v1" "Secret" .Release.Namespace (include "krawl.fullname" .)) -}} -{{- $dashboardPath := "" -}} -{{- if and $secret $secret.data -}} - {{- $dashboardPath = index $secret.data "dashboard-path" | b64dec -}} -{{- else -}} - {{- $dashboardPath = printf "/%s" (randAlphaNum 32) -}} -{{- end -}} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "krawl.fullname" . }} - labels: - {{- include "krawl.labels" . 
| nindent 4 }} -type: Opaque -stringData: - dashboard-path: {{ $dashboardPath | quote }} diff --git a/helm/values.yaml b/helm/values.yaml index a095632..fb9be82 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -3,7 +3,7 @@ replicaCount: 1 image: repository: ghcr.io/blessedrebus/krawl pullPolicy: Always - tag: "latest" + tag: "1.0.0" imagePullSecrets: [] nameOverride: "krawl" @@ -49,6 +49,11 @@ resources: cpu: 100m memory: 64Mi +# Container timezone configuration +# Set this to change timezone (e.g., "America/New_York", "Europe/Rome") +# If not set, container will use its default timezone +timezone: "" + autoscaling: enabled: false minReplicas: 1 @@ -62,19 +67,53 @@ tolerations: [] affinity: {} -# Application configuration +# Application configuration (config.yaml structure) config: - port: 5000 - delay: 100 - linksMinLength: 5 - linksMaxLength: 15 - linksMinPerPage: 10 - linksMaxPerPage: 15 - maxCounter: 10 - canaryTokenTries: 10 - probabilityErrorCodes: 0 - serverHeader: "Apache/2.2.22 (Ubuntu)" -# canaryTokenUrl: set-your-canary-token-url-here + server: + port: 5000 + delay: 100 + links: + min_length: 5 + max_length: 15 + min_per_page: 10 + max_per_page: 15 + char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + max_counter: 10 + canary: + token_url: null # Set your canary token URL here + token_tries: 10 + dashboard: + secret_path: null # Auto-generated if not set, or set to "/my-secret-dashboard" + database: + path: "data/krawl.db" + retention_days: 30 + behavior: + probability_error_codes: 0 + analyzer: + http_risky_methods_threshold: 0.1 + violated_robots_threshold: 0.1 + uneven_request_timing_threshold: 0.5 + uneven_request_timing_time_window_seconds: 300 + user_agents_used_threshold: 2 + attack_urls_threshold: 1 + crawl: + infinite_pages_for_malicious: true + max_pages_limit: 250 + ban_duration_seconds: 600 + +# Database persistence configuration +database: + # Persistence configuration + persistence: + enabled: true + # 
Storage class name (use default if not specified) + # storageClassName: "" + # Access mode for the persistent volume + accessMode: ReadWriteOnce + # Size of the persistent volume + size: 1Gi + # Optional: Use existing PVC + # existingClaim: "" networkPolicy: enabled: true @@ -268,6 +307,17 @@ wordlists: - .git/ - keys/ - credentials/ + server_headers: + - Apache/2.2.22 (Ubuntu) + - nginx/1.18.0 + - Microsoft-IIS/10.0 + - LiteSpeed + - Caddy + - Gunicorn/20.0.4 + - uvicorn/0.13.4 + - Express + - Flask/1.1.2 + - Django/3.1 error_codes: - 400 - 401 diff --git a/img/admin-page.png b/img/admin-page.png index ba82843..790e3c3 100644 Binary files a/img/admin-page.png and b/img/admin-page.png differ diff --git a/img/api-secrets-page.png b/img/api-secrets-page.png deleted file mode 100644 index 77b47c8..0000000 Binary files a/img/api-secrets-page.png and /dev/null differ diff --git a/img/api-users-page.png b/img/api-users-page.png deleted file mode 100644 index 6746594..0000000 Binary files a/img/api-users-page.png and /dev/null differ diff --git a/img/credentials-and-passwords.png b/img/credentials-and-passwords.png new file mode 100644 index 0000000..acb134a Binary files /dev/null and b/img/credentials-and-passwords.png differ diff --git a/img/credentials-page.png b/img/credentials-page.png deleted file mode 100644 index bc3fffa..0000000 Binary files a/img/credentials-page.png and /dev/null differ diff --git a/img/dashboard-1.png b/img/dashboard-1.png index ad11dd8..4479914 100644 Binary files a/img/dashboard-1.png and b/img/dashboard-1.png differ diff --git a/img/dashboard-2.png b/img/dashboard-2.png index 65c0766..e6a208d 100644 Binary files a/img/dashboard-2.png and b/img/dashboard-2.png differ diff --git a/img/dashboard-3.png b/img/dashboard-3.png new file mode 100644 index 0000000..e7b24df Binary files /dev/null and b/img/dashboard-3.png differ diff --git a/img/env-page.png b/img/env-page.png deleted file mode 100644 index a738732..0000000 Binary files 
a/img/env-page.png and /dev/null differ diff --git a/img/geoip_dashboard.png b/img/geoip_dashboard.png new file mode 100644 index 0000000..6825be7 Binary files /dev/null and b/img/geoip_dashboard.png differ diff --git a/img/ip-reputation.png b/img/ip-reputation.png new file mode 100644 index 0000000..9119e63 Binary files /dev/null and b/img/ip-reputation.png differ diff --git a/img/krawl-svg.svg b/img/krawl-svg.svg new file mode 100644 index 0000000..2d15e51 --- /dev/null +++ b/img/krawl-svg.svg @@ -0,0 +1,95 @@ + + + + diff --git a/img/passwords-page.png b/img/passwords-page.png deleted file mode 100644 index c9ca2f0..0000000 Binary files a/img/passwords-page.png and /dev/null differ diff --git a/img/server-and-env-page.png b/img/server-and-env-page.png new file mode 100644 index 0000000..700c39d Binary files /dev/null and b/img/server-and-env-page.png differ diff --git a/img/sql_injection.png b/img/sql_injection.png new file mode 100644 index 0000000..8eb8ad3 Binary files /dev/null and b/img/sql_injection.png differ diff --git a/img/users-and-secrets.png b/img/users-and-secrets.png new file mode 100644 index 0000000..f99297e Binary files /dev/null and b/img/users-and-secrets.png differ diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 0000000..d803496 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,47 @@ +### Kubernetes + +Apply all manifests with: + +```bash +kubectl apply -f https://raw.githubusercontent.com/BlessedRebuS/Krawl/refs/heads/main/kubernetes/krawl-all-in-one-deploy.yaml +``` + +Or clone the repo and apply the manifest: + +```bash +kubectl apply -f kubernetes/krawl-all-in-one-deploy.yaml +``` + +Access the deception server: + +```bash +kubectl get svc krawl-server -n krawl-system +``` + +Once the EXTERNAL-IP is assigned, access your deception server at `http://:5000` + +### Retrieving Dashboard Path + +Check server startup logs or get the secret with + +```bash +kubectl get secret krawl-server -n krawl-system \ + 
-o jsonpath='{.data.dashboard-path}' | base64 -d && echo +``` + +### From Source (Python 3.11+) + +Clone the repository: + +```bash +git clone https://github.com/blessedrebus/krawl.git +cd krawl/src +``` + +Run the server: + +```bash +python3 server.py +``` + +Visit `http://localhost:5000` and access the dashboard at `http://localhost:5000/` diff --git a/kubernetes/krawl-all-in-one-deploy.yaml b/kubernetes/krawl-all-in-one-deploy.yaml index 0362220..767c080 100644 --- a/kubernetes/krawl-all-in-one-deploy.yaml +++ b/kubernetes/krawl-all-in-one-deploy.yaml @@ -4,369 +4,226 @@ kind: Namespace metadata: name: krawl-system --- +# Source: krawl-chart/templates/network-policy.yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: krawl + namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + policyTypes: + - Ingress + - Egress + ingress: + - from: + - podSelector: {} + - namespaceSelector: {} + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - port: 5000 + protocol: TCP + egress: + - ports: + - protocol: TCP + - protocol: UDP + to: + - namespaceSelector: {} + - ipBlock: + cidr: 0.0.0.0/0 +--- +# Source: krawl-chart/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: krawl-config namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" data: - PORT: "5000" - DELAY: "100" - LINKS_MIN_LENGTH: "5" - LINKS_MAX_LENGTH: "15" - LINKS_MIN_PER_PAGE: "10" - LINKS_MAX_PER_PAGE: "15" - MAX_COUNTER: "10" - CANARY_TOKEN_TRIES: "10" - PROBABILITY_ERROR_CODES: "0" -# CANARY_TOKEN_URL: set-your-canary-token-url-here + config.yaml: | + # Krawl Honeypot Configuration + server: + port: 5000 + delay: 100 + links: + min_length: 5 + max_length: 15 + min_per_page: 10 + max_per_page: 15 + 
char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + max_counter: 10 + canary: + token_url: null + token_tries: 10 + dashboard: + secret_path: null + database: + path: "data/krawl.db" + retention_days: 30 + behavior: + probability_error_codes: 0 + analyzer: + http_risky_methods_threshold: 0.1 + violated_robots_threshold: 0.1 + uneven_request_timing_threshold: 0.5 + uneven_request_timing_time_window_seconds: 300 + user_agents_used_threshold: 2 + attack_urls_threshold: 1 + crawl: + infinite_pages_for_malicious: true + max_pages_limit: 250 + ban_duration_seconds: 600 --- +# Source: krawl-chart/templates/wordlists-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: krawl-wordlists namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" data: wordlists.json: | - { - "usernames": { - "prefixes": [ - "admin", - "user", - "developer", - "root", - "system", - "db", - "api", - "service", - "deploy", - "test", - "prod", - "backup", - "monitor", - "jenkins", - "webapp" - ], - "suffixes": [ - "", - "_prod", - "_dev", - "_test", - "123", - "2024", - "_backup", - "_admin", - "01", - "02", - "_user", - "_service", - "_api" - ] - }, - "passwords": { - "prefixes": [ - "P@ssw0rd", - "Passw0rd", - "Admin", - "Secret", - "Welcome", - "System", - "Database", - "Secure", - "Master", - "Root" - ], - "simple": [ - "test", - "demo", - "temp", - "change", - "password", - "admin", - "letmein", - "welcome", - "default", - "sample" - ] - }, - "emails": { - "domains": [ - "example.com", - "company.com", - "localhost.com", - "test.com", - "domain.com", - "corporate.com", - "internal.net", - "enterprise.com", - "business.org" - ] - }, - "api_keys": { - "prefixes": [ - "sk_live_", - "sk_test_", - "api_", - "key_", - "token_", - "access_", - "secret_", - "prod_", - "" - ] - }, - "databases": { - "names": [ - "production", - "prod_db", - "main_db", - "app_database", - "users_db", - 
"customer_data", - "analytics", - "staging_db", - "dev_database", - "wordpress", - "ecommerce", - "crm_db", - "inventory" - ], - "hosts": [ - "localhost", - "db.internal", - "mysql.local", - "postgres.internal", - "127.0.0.1", - "db-server-01", - "database.prod", - "sql.company.com" - ] - }, - "applications": { - "names": [ - "WebApp", - "API Gateway", - "Dashboard", - "Admin Panel", - "CMS", - "Portal", - "Manager", - "Console", - "Control Panel", - "Backend" - ] - }, - "users": { - "roles": [ - "Administrator", - "Developer", - "Manager", - "User", - "Guest", - "Moderator", - "Editor", - "Viewer", - "Analyst", - "Support" - ] - }, - "directory_listing": { - "files": [ - "admin.txt", - "test.exe", - "backup.sql", - "database.sql", - "db_backup.sql", - "dump.sql", - "config.php", - "credentials.txt", - "passwords.txt", - "users.csv", - ".env", - "id_rsa", - "id_rsa.pub", - "private_key.pem", - "api_keys.json", - "secrets.yaml", - "admin_notes.txt", - "settings.ini", - "database.yml", - "wp-config.php", - ".htaccess", - "server.key", - "cert.pem", - "shadow.bak", - "passwd.old" - ], - "directories": [ - "uploads/", - "backups/", - "logs/", - "temp/", - "cache/", - "private/", - "config/", - "admin/", - "database/", - "backup/", - "old/", - "archive/", - ".git/", - "keys/", - "credentials/" - ] - }, - "error_codes": [ - 400, - 401, - 403, - 404, - 500, - 502, - 503 - ] - } + {"api_keys":{"prefixes":["sk_live_","sk_test_","api_","key_","token_","access_","secret_","prod_",""]},"applications":{"names":["WebApp","API Gateway","Dashboard","Admin Panel","CMS","Portal","Manager","Console","Control 
Panel","Backend"]},"databases":{"hosts":["localhost","db.internal","mysql.local","postgres.internal","127.0.0.1","db-server-01","database.prod","sql.company.com"],"names":["production","prod_db","main_db","app_database","users_db","customer_data","analytics","staging_db","dev_database","wordpress","ecommerce","crm_db","inventory"]},"directory_listing":{"directories":["uploads/","backups/","logs/","temp/","cache/","private/","config/","admin/","database/","backup/","old/","archive/",".git/","keys/","credentials/"],"files":["admin.txt","test.exe","backup.sql","database.sql","db_backup.sql","dump.sql","config.php","credentials.txt","passwords.txt","users.csv",".env","id_rsa","id_rsa.pub","private_key.pem","api_keys.json","secrets.yaml","admin_notes.txt","settings.ini","database.yml","wp-config.php",".htaccess","server.key","cert.pem","shadow.bak","passwd.old"]},"emails":{"domains":["example.com","company.com","localhost.com","test.com","domain.com","corporate.com","internal.net","enterprise.com","business.org"]},"error_codes":[400,401,403,404,500,502,503],"passwords":{"prefixes":["P@ssw0rd","Passw0rd","Admin","Secret","Welcome","System","Database","Secure","Master","Root"],"simple":["test","demo","temp","change","password","admin","letmein","welcome","default","sample"]},"server_headers":["Apache/2.2.22 (Ubuntu)","nginx/1.18.0","Microsoft-IIS/10.0","LiteSpeed","Caddy","Gunicorn/20.0.4","uvicorn/0.13.4","Express","Flask/1.1.2","Django/3.1"],"usernames":{"prefixes":["admin","user","developer","root","system","db","api","service","deploy","test","prod","backup","monitor","jenkins","webapp"],"suffixes":["","_prod","_dev","_test","123","2024","_backup","_admin","01","02","_user","_service","_api"]},"users":{"roles":["Administrator","Developer","Manager","User","Guest","Moderator","Editor","Viewer","Analyst","Support"]}} --- +# Source: krawl-chart/templates/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: krawl-db + namespace: krawl-system + labels: 
+ app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi +--- +# Source: krawl-chart/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: krawl + namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" +spec: + type: LoadBalancer + externalTrafficPolicy: Local + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 10800 + ports: + - port: 5000 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl +--- +# Source: krawl-chart/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: krawl-server + name: krawl namespace: krawl-system labels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: replicas: 1 selector: matchLabels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl template: metadata: labels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl spec: containers: - - name: krawl - image: ghcr.io/blessedrebus/krawl:latest + - name: krawl-chart + image: "ghcr.io/blessedrebus/krawl:1.0.0" imagePullPolicy: Always ports: - - containerPort: 5000 - name: http + - name: http + containerPort: 5000 protocol: TCP - envFrom: - - configMapRef: - name: krawl-config + env: + - name: CONFIG_LOCATION + value: "config.yaml" volumeMounts: + - name: config + mountPath: /app/config.yaml + subPath: config.yaml + readOnly: true - name: wordlists mountPath: /app/wordlists.json subPath: wordlists.json readOnly: true + - name: database + mountPath: /app/data resources: - requests: - memory: "64Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "500m" + limits: + cpu: 500m + memory: 
256Mi + requests: + cpu: 100m + memory: 64Mi volumes: + - name: config + configMap: + name: krawl-config - name: wordlists configMap: name: krawl-wordlists + - name: database + persistentVolumeClaim: + claimName: krawl-db --- -apiVersion: v1 -kind: Service -metadata: - name: krawl-server - namespace: krawl-system - labels: - app: krawl-server -spec: - type: LoadBalancer - ports: - - port: 5000 - targetPort: 5000 - protocol: TCP - name: http - selector: - app: krawl-server ---- +# Source: krawl-chart/templates/ingress.yaml apiVersion: networking.k8s.io/v1 kind: Ingress metadata: - name: krawl-ingress + name: krawl namespace: krawl-system - annotations: - nginx.ingress.kubernetes.io/rewrite-target: / + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: - ingressClassName: nginx + ingressClassName: traefik rules: - - host: krawl.example.com # Change to your domain - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: krawl-server - port: - number: 5000 - # tls: - # - hosts: - # - krawl.example.com - # secretName: krawl-tls ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: krawl-network-policy - namespace: krawl-system -spec: - podSelector: - matchLabels: - app: krawl-server - policyTypes: - - Ingress - - Egress - ingress: - - from: - - podSelector: {} - - namespaceSelector: {} - - ipBlock: - cidr: 0.0.0.0/0 - ports: - - protocol: TCP - port: 5000 - egress: - - to: - - namespaceSelector: {} - - ipBlock: - cidr: 0.0.0.0/0 - ports: - - protocol: TCP - - protocol: UDP ---- -# Optional: HorizontalPodAutoscaler for auto-scaling -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: krawl-hpa - namespace: krawl-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: krawl-server - minReplicas: 1 - maxReplicas: 5 - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - 
averageUtilization: 70 - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 80 + - host: "krawl.example.com" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: krawl + port: + number: 5000 diff --git a/kubernetes/manifests/configmap.yaml b/kubernetes/manifests/configmap.yaml index 431b9a3..cdf6f1b 100644 --- a/kubernetes/manifests/configmap.yaml +++ b/kubernetes/manifests/configmap.yaml @@ -1,17 +1,44 @@ +# Source: krawl-chart/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: krawl-config namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" data: - PORT: "5000" - DELAY: "100" - LINKS_MIN_LENGTH: "5" - LINKS_MAX_LENGTH: "15" - LINKS_MIN_PER_PAGE: "10" - LINKS_MAX_PER_PAGE: "15" - MAX_COUNTER: "10" - CANARY_TOKEN_TRIES: "10" - PROBABILITY_ERROR_CODES: "0" - SERVER_HEADER: "Apache/2.2.22 (Ubuntu)" -# CANARY_TOKEN_URL: set-your-canary-token-url-here \ No newline at end of file + config.yaml: | + # Krawl Honeypot Configuration + server: + port: 5000 + delay: 100 + links: + min_length: 5 + max_length: 15 + min_per_page: 10 + max_per_page: 15 + char_space: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + max_counter: 10 + canary: + token_url: null + token_tries: 10 + dashboard: + secret_path: null + database: + path: "data/krawl.db" + retention_days: 30 + behavior: + probability_error_codes: 0 + analyzer: + http_risky_methods_threshold: 0.1 + violated_robots_threshold: 0.1 + uneven_request_timing_threshold: 0.5 + uneven_request_timing_time_window_seconds: 300 + user_agents_used_threshold: 2 + attack_urls_threshold: 1 + crawl: + infinite_pages_for_malicious: true + max_pages_limit: 250 + ban_duration_seconds: 600 diff --git a/kubernetes/manifests/deployment.yaml b/kubernetes/manifests/deployment.yaml index 0552eba..4c87a73 100644 --- a/kubernetes/manifests/deployment.yaml +++ 
b/kubernetes/manifests/deployment.yaml @@ -1,44 +1,61 @@ +# Source: krawl-chart/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: krawl-server + name: krawl namespace: krawl-system labels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: replicas: 1 selector: matchLabels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl template: metadata: labels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl spec: containers: - - name: krawl - image: ghcr.io/blessedrebus/krawl:latest + - name: krawl-chart + image: "ghcr.io/blessedrebus/krawl:1.0.0" imagePullPolicy: Always ports: - - containerPort: 5000 - name: http + - name: http + containerPort: 5000 protocol: TCP - envFrom: - - configMapRef: - name: krawl-config + env: + - name: CONFIG_LOCATION + value: "config.yaml" volumeMounts: + - name: config + mountPath: /app/config.yaml + subPath: config.yaml + readOnly: true - name: wordlists mountPath: /app/wordlists.json subPath: wordlists.json readOnly: true + - name: database + mountPath: /app/data resources: - requests: - memory: "64Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "500m" + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 64Mi volumes: + - name: config + configMap: + name: krawl-config - name: wordlists configMap: name: krawl-wordlists + - name: database + persistentVolumeClaim: + claimName: krawl-db diff --git a/kubernetes/manifests/hpa.yaml b/kubernetes/manifests/hpa.yaml deleted file mode 100644 index 10bab0c..0000000 --- a/kubernetes/manifests/hpa.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Optional: HorizontalPodAutoscaler for auto-scaling -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: krawl-hpa - namespace: krawl-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: krawl-server - minReplicas: 
1 - maxReplicas: 5 - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 80 diff --git a/kubernetes/manifests/ingress.yaml b/kubernetes/manifests/ingress.yaml index f5a6efc..5134798 100644 --- a/kubernetes/manifests/ingress.yaml +++ b/kubernetes/manifests/ingress.yaml @@ -1,24 +1,23 @@ +# Source: krawl-chart/templates/ingress.yaml apiVersion: networking.k8s.io/v1 kind: Ingress metadata: - name: krawl-ingress + name: krawl namespace: krawl-system - annotations: - nginx.ingress.kubernetes.io/rewrite-target: / + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: - ingressClassName: nginx + ingressClassName: traefik rules: - - host: krawl.example.com # Change to your domain - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: krawl-server - port: - number: 5000 - # tls: - # - hosts: - # - krawl.example.com - # secretName: krawl-tls + - host: "krawl.example.com" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: krawl + port: + number: 5000 diff --git a/kubernetes/manifests/kustomization.yaml b/kubernetes/manifests/kustomization.yaml index 8f41776..4a5fcd9 100644 --- a/kubernetes/manifests/kustomization.yaml +++ b/kubernetes/manifests/kustomization.yaml @@ -5,6 +5,7 @@ resources: - namespace.yaml - configmap.yaml - wordlists-configmap.yaml + - pvc.yaml - deployment.yaml - service.yaml - network-policy.yaml diff --git a/kubernetes/manifests/network-policy.yaml b/kubernetes/manifests/network-policy.yaml index e765b36..7068531 100644 --- a/kubernetes/manifests/network-policy.yaml +++ b/kubernetes/manifests/network-policy.yaml @@ -1,29 +1,35 @@ +# Source: krawl-chart/templates/network-policy.yaml apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: - name: krawl-network-policy + name: krawl 
namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: podSelector: matchLabels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl policyTypes: - - Ingress - - Egress + - Ingress + - Egress ingress: - - from: - - podSelector: {} - - namespaceSelector: {} - - ipBlock: - cidr: 0.0.0.0/0 - ports: - - protocol: TCP - port: 5000 + - from: + - podSelector: {} + - namespaceSelector: {} + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - port: 5000 + protocol: TCP egress: - - to: - - namespaceSelector: {} - - ipBlock: - cidr: 0.0.0.0/0 - ports: - - protocol: TCP - - protocol: UDP + - ports: + - protocol: TCP + - protocol: UDP + to: + - namespaceSelector: {} + - ipBlock: + cidr: 0.0.0.0/0 diff --git a/kubernetes/manifests/pvc.yaml b/kubernetes/manifests/pvc.yaml new file mode 100644 index 0000000..526093d --- /dev/null +++ b/kubernetes/manifests/pvc.yaml @@ -0,0 +1,16 @@ +# Source: krawl-chart/templates/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: krawl-db + namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/kubernetes/manifests/service.yaml b/kubernetes/manifests/service.yaml index 8db65b4..1b73cc0 100644 --- a/kubernetes/manifests/service.yaml +++ b/kubernetes/manifests/service.yaml @@ -1,16 +1,25 @@ +# Source: krawl-chart/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: krawl-server + name: krawl namespace: krawl-system labels: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" spec: type: LoadBalancer + externalTrafficPolicy: Local + sessionAffinity: ClientIP + sessionAffinityConfig: + clientIP: + timeoutSeconds: 10800 ports: - port: 5000 - targetPort: 
5000 + targetPort: http protocol: TCP name: http selector: - app: krawl-server + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl diff --git a/kubernetes/manifests/wordlists-configmap.yaml b/kubernetes/manifests/wordlists-configmap.yaml index 4ff0b5d..279410e 100644 --- a/kubernetes/manifests/wordlists-configmap.yaml +++ b/kubernetes/manifests/wordlists-configmap.yaml @@ -1,205 +1,13 @@ +# Source: krawl-chart/templates/wordlists-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: krawl-wordlists namespace: krawl-system + labels: + app.kubernetes.io/name: krawl + app.kubernetes.io/instance: krawl + app.kubernetes.io/version: "1.0.0" data: wordlists.json: | - { - "usernames": { - "prefixes": [ - "admin", - "user", - "developer", - "root", - "system", - "db", - "api", - "service", - "deploy", - "test", - "prod", - "backup", - "monitor", - "jenkins", - "webapp" - ], - "suffixes": [ - "", - "_prod", - "_dev", - "_test", - "123", - "2024", - "_backup", - "_admin", - "01", - "02", - "_user", - "_service", - "_api" - ] - }, - "passwords": { - "prefixes": [ - "P@ssw0rd", - "Passw0rd", - "Admin", - "Secret", - "Welcome", - "System", - "Database", - "Secure", - "Master", - "Root" - ], - "simple": [ - "test", - "demo", - "temp", - "change", - "password", - "admin", - "letmein", - "welcome", - "default", - "sample" - ] - }, - "emails": { - "domains": [ - "example.com", - "company.com", - "localhost.com", - "test.com", - "domain.com", - "corporate.com", - "internal.net", - "enterprise.com", - "business.org" - ] - }, - "api_keys": { - "prefixes": [ - "sk_live_", - "sk_test_", - "api_", - "key_", - "token_", - "access_", - "secret_", - "prod_", - "" - ] - }, - "databases": { - "names": [ - "production", - "prod_db", - "main_db", - "app_database", - "users_db", - "customer_data", - "analytics", - "staging_db", - "dev_database", - "wordpress", - "ecommerce", - "crm_db", - "inventory" - ], - "hosts": [ - "localhost", - "db.internal", - "mysql.local", - 
"postgres.internal", - "127.0.0.1", - "db-server-01", - "database.prod", - "sql.company.com" - ] - }, - "applications": { - "names": [ - "WebApp", - "API Gateway", - "Dashboard", - "Admin Panel", - "CMS", - "Portal", - "Manager", - "Console", - "Control Panel", - "Backend" - ] - }, - "users": { - "roles": [ - "Administrator", - "Developer", - "Manager", - "User", - "Guest", - "Moderator", - "Editor", - "Viewer", - "Analyst", - "Support" - ] - }, - "directory_listing": { - "files": [ - "admin.txt", - "test.exe", - "backup.sql", - "database.sql", - "db_backup.sql", - "dump.sql", - "config.php", - "credentials.txt", - "passwords.txt", - "users.csv", - ".env", - "id_rsa", - "id_rsa.pub", - "private_key.pem", - "api_keys.json", - "secrets.yaml", - "admin_notes.txt", - "settings.ini", - "database.yml", - "wp-config.php", - ".htaccess", - "server.key", - "cert.pem", - "shadow.bak", - "passwd.old" - ], - "directories": [ - "uploads/", - "backups/", - "logs/", - "temp/", - "cache/", - "private/", - "config/", - "admin/", - "database/", - "backup/", - "old/", - "archive/", - ".git/", - "keys/", - "credentials/" - ] - }, - "error_codes": [ - 400, - 401, - 403, - 404, - 500, - 502, - 503 - ] - } + {"api_keys":{"prefixes":["sk_live_","sk_test_","api_","key_","token_","access_","secret_","prod_",""]},"applications":{"names":["WebApp","API Gateway","Dashboard","Admin Panel","CMS","Portal","Manager","Console","Control 
Panel","Backend"]},"databases":{"hosts":["localhost","db.internal","mysql.local","postgres.internal","127.0.0.1","db-server-01","database.prod","sql.company.com"],"names":["production","prod_db","main_db","app_database","users_db","customer_data","analytics","staging_db","dev_database","wordpress","ecommerce","crm_db","inventory"]},"directory_listing":{"directories":["uploads/","backups/","logs/","temp/","cache/","private/","config/","admin/","database/","backup/","old/","archive/",".git/","keys/","credentials/"],"files":["admin.txt","test.exe","backup.sql","database.sql","db_backup.sql","dump.sql","config.php","credentials.txt","passwords.txt","users.csv",".env","id_rsa","id_rsa.pub","private_key.pem","api_keys.json","secrets.yaml","admin_notes.txt","settings.ini","database.yml","wp-config.php",".htaccess","server.key","cert.pem","shadow.bak","passwd.old"]},"emails":{"domains":["example.com","company.com","localhost.com","test.com","domain.com","corporate.com","internal.net","enterprise.com","business.org"]},"error_codes":[400,401,403,404,500,502,503],"passwords":{"prefixes":["P@ssw0rd","Passw0rd","Admin","Secret","Welcome","System","Database","Secure","Master","Root"],"simple":["test","demo","temp","change","password","admin","letmein","welcome","default","sample"]},"server_headers":["Apache/2.2.22 (Ubuntu)","nginx/1.18.0","Microsoft-IIS/10.0","LiteSpeed","Caddy","Gunicorn/20.0.4","uvicorn/0.13.4","Express","Flask/1.1.2","Django/3.1"],"usernames":{"prefixes":["admin","user","developer","root","system","db","api","service","deploy","test","prod","backup","monitor","jenkins","webapp"],"suffixes":["","_prod","_dev","_test","123","2024","_backup","_admin","01","02","_user","_service","_api"]},"users":{"roles":["Administrator","Developer","Manager","User","Guest","Moderator","Editor","Viewer","Analyst","Support"]}} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b3f9b03 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +# Krawl 
Honeypot Dependencies +# Install with: pip install -r requirements.txt + +# Configuration +PyYAML>=6.0 + +# Database ORM +SQLAlchemy>=2.0.0,<3.0.0 + +# Scheduling +APScheduler>=3.11.2 + +requests>=2.32.5 diff --git a/src/analyzer.py b/src/analyzer.py new file mode 100644 index 0000000..7f29662 --- /dev/null +++ b/src/analyzer.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +from sqlalchemy import select +from typing import Optional +from database import get_database, DatabaseManager +from zoneinfo import ZoneInfo +from pathlib import Path +from datetime import datetime, timedelta +import re +import urllib.parse +from wordlists import get_wordlists +from config import get_config +from logger import get_app_logger +import requests + +""" +Functions for user activity analysis +""" + +app_logger = get_app_logger() + + +class Analyzer: + """ + Analyzes users activity and produces aggregated insights + """ + + def __init__(self, db_manager: Optional[DatabaseManager] = None): + """ + Initialize the analyzer. + + Args: + db_manager: Optional DatabaseManager for persistence. + If None, will use the global singleton. + """ + self._db_manager = db_manager + + @property + def db(self) -> Optional[DatabaseManager]: + """ + Get the database manager, lazily initializing if needed. 
+ + Returns: + DatabaseManager instance or None if not available + """ + if self._db_manager is None: + try: + self._db_manager = get_database() + except Exception: + pass + return self._db_manager + + # def infer_user_category(self, ip: str) -> str: + + # config = get_config() + + # http_risky_methods_threshold = config.http_risky_methods_threshold + # violated_robots_threshold = config.violated_robots_threshold + # uneven_request_timing_threshold = config.uneven_request_timing_threshold + # user_agents_used_threshold = config.user_agents_used_threshold + # attack_urls_threshold = config.attack_urls_threshold + # uneven_request_timing_time_window_seconds = config.uneven_request_timing_time_window_seconds + + # app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}") + + # score = {} + # score["attacker"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["good_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["bad_crawler"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + # score["regular_user"] = {"risky_http_methods": False, "robots_violations": False, "uneven_request_timing": False, "different_user_agents": False, "attack_url": False} + + # #1-3 low, 4-6 mid, 7-9 high, 10-20 extreme + # weights = { + # "attacker": { + # "risky_http_methods": 6, + # "robots_violations": 4, + # "uneven_request_timing": 3, + # "different_user_agents": 8, + # "attack_url": 15 + # }, + # "good_crawler": { + # "risky_http_methods": 1, + # "robots_violations": 0, + # "uneven_request_timing": 0, + # "different_user_agents": 0, + # "attack_url": 0 + # }, + # "bad_crawler": { + # "risky_http_methods": 2, + # "robots_violations": 7, + # 
"uneven_request_timing": 0, + # "different_user_agents": 5, + # "attack_url": 5 + # }, + # "regular_user": { + # "risky_http_methods": 0, + # "robots_violations": 0, + # "uneven_request_timing": 8, + # "different_user_agents": 3, + # "attack_url": 0 + # } + # } + + # accesses = self.db.get_access_logs(ip_filter = ip, limit=1000) + # total_accesses_count = len(accesses) + # if total_accesses_count <= 0: + # return + + # # Set category as "unknown" for the first 5 requests + # if total_accesses_count < 3: + # category = "unknown" + # analyzed_metrics = {} + # category_scores = {"attacker": 0, "good_crawler": 0, "bad_crawler": 0, "regular_user": 0, "unknown": 0} + # last_analysis = datetime.now(tz=ZoneInfo('UTC')) + # self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + # return 0 + + # #--------------------- HTTP Methods --------------------- + + # get_accesses_count = len([item for item in accesses if item["method"] == "GET"]) + # post_accesses_count = len([item for item in accesses if item["method"] == "POST"]) + # put_accesses_count = len([item for item in accesses if item["method"] == "PUT"]) + # delete_accesses_count = len([item for item in accesses if item["method"] == "DELETE"]) + # head_accesses_count = len([item for item in accesses if item["method"] == "HEAD"]) + # options_accesses_count = len([item for item in accesses if item["method"] == "OPTIONS"]) + # patch_accesses_count = len([item for item in accesses if item["method"] == "PATCH"]) + + # if total_accesses_count > http_risky_methods_threshold: + # http_method_attacker_score = (post_accesses_count + put_accesses_count + delete_accesses_count + options_accesses_count + patch_accesses_count) / total_accesses_count + # else: + # http_method_attacker_score = 0 + + # #print(f"HTTP Method attacker score: {http_method_attacker_score}") + # if http_method_attacker_score >= http_risky_methods_threshold: + # score["attacker"]["risky_http_methods"] = True + 
# score["good_crawler"]["risky_http_methods"] = False + # score["bad_crawler"]["risky_http_methods"] = True + # score["regular_user"]["risky_http_methods"] = False + # else: + # score["attacker"]["risky_http_methods"] = False + # score["good_crawler"]["risky_http_methods"] = True + # score["bad_crawler"]["risky_http_methods"] = False + # score["regular_user"]["risky_http_methods"] = False + + # #--------------------- Robots Violations --------------------- + # #respect robots.txt and login/config pages access frequency + # robots_disallows = [] + # robots_path = Path(__file__).parent / "templates" / "html" / "robots.txt" + # with open(robots_path, "r") as f: + # for line in f: + # line = line.strip() + # if not line: + # continue + # parts = line.split(":") + + # if parts[0] == "Disallow": + # parts[1] = parts[1].rstrip("/") + # #print(f"DISALLOW {parts[1]}") + # robots_disallows.append(parts[1].strip()) + + # #if 0 100% sure is good crawler, if >10% of robots violated is bad crawler or attacker + # violated_robots_count = len([item for item in accesses if any(item["path"].rstrip("/").startswith(disallow) for disallow in robots_disallows)]) + # #print(f"Violated robots count: {violated_robots_count}") + # if total_accesses_count > 0: + # violated_robots_ratio = violated_robots_count / total_accesses_count + # else: + # violated_robots_ratio = 0 + + # if violated_robots_ratio >= violated_robots_threshold: + # score["attacker"]["robots_violations"] = True + # score["good_crawler"]["robots_violations"] = False + # score["bad_crawler"]["robots_violations"] = True + # score["regular_user"]["robots_violations"] = False + # else: + # score["attacker"]["robots_violations"] = False + # score["good_crawler"]["robots_violations"] = False + # score["bad_crawler"]["robots_violations"] = False + # score["regular_user"]["robots_violations"] = False + + # #--------------------- Requests Timing --------------------- + # #Request rate and timing: steady, throttled, polite vs 
attackers' bursty, aggressive, or oddly rhythmic behavior + # timestamps = [datetime.fromisoformat(item["timestamp"]) for item in accesses] + # now_utc = datetime.now(tz=ZoneInfo('UTC')) + # timestamps = [ts for ts in timestamps if now_utc - ts <= timedelta(seconds=uneven_request_timing_time_window_seconds)] + # timestamps = sorted(timestamps, reverse=True) + + # time_diffs = [] + # for i in range(0, len(timestamps)-1): + # diff = (timestamps[i] - timestamps[i+1]).total_seconds() + # time_diffs.append(diff) + + # mean = 0 + # variance = 0 + # std = 0 + # cv = 0 + # if time_diffs: + # mean = sum(time_diffs) / len(time_diffs) + # variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs) + # std = variance ** 0.5 + # cv = std/mean + # app_logger.debug(f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}") + + # if cv >= uneven_request_timing_threshold: + # score["attacker"]["uneven_request_timing"] = True + # score["good_crawler"]["uneven_request_timing"] = False + # score["bad_crawler"]["uneven_request_timing"] = False + # score["regular_user"]["uneven_request_timing"] = True + # else: + # score["attacker"]["uneven_request_timing"] = False + # score["good_crawler"]["uneven_request_timing"] = False + # score["bad_crawler"]["uneven_request_timing"] = False + # score["regular_user"]["uneven_request_timing"] = False + + # #--------------------- Different User Agents --------------------- + # #Header Quality and Consistency: Crawlers tend to use complete and consistent headers, attackers might miss, fake, or change headers + # user_agents_used = [item["user_agent"] for item in accesses] + # user_agents_used = list(dict.fromkeys(user_agents_used)) + # #print(f"User agents used: {user_agents_used}") + + # if len(user_agents_used) >= user_agents_used_threshold: + # score["attacker"]["different_user_agents"] = True + # score["good_crawler"]["different_user_agents"] = False + # 
score["bad_crawler"]["different_user_agentss"] = True + # score["regular_user"]["different_user_agents"] = False + # else: + # score["attacker"]["different_user_agents"] = False + # score["good_crawler"]["different_user_agents"] = False + # score["bad_crawler"]["different_user_agents"] = False + # score["regular_user"]["different_user_agents"] = False + + # #--------------------- Attack URLs --------------------- + + # attack_urls_found_list = [] + + # wl = get_wordlists() + # if wl.attack_patterns: + # queried_paths = [item["path"] for item in accesses] + + # for queried_path in queried_paths: + # # URL decode the path to catch encoded attacks + # try: + # decoded_path = urllib.parse.unquote(queried_path) + # # Double decode to catch double-encoded attacks + # decoded_path_twice = urllib.parse.unquote(decoded_path) + # except Exception: + # decoded_path = queried_path + # decoded_path_twice = queried_path + + # for name, pattern in wl.attack_patterns.items(): + # # Check original, decoded, and double-decoded paths + # if (re.search(pattern, queried_path, re.IGNORECASE) or + # re.search(pattern, decoded_path, re.IGNORECASE) or + # re.search(pattern, decoded_path_twice, re.IGNORECASE)): + # attack_urls_found_list.append(f"{name}: {pattern}") + + # #remove duplicates + # attack_urls_found_list = set(attack_urls_found_list) + # attack_urls_found_list = list(attack_urls_found_list) + + # if len(attack_urls_found_list) > attack_urls_threshold: + # score["attacker"]["attack_url"] = True + # score["good_crawler"]["attack_url"] = False + # score["bad_crawler"]["attack_url"] = False + # score["regular_user"]["attack_url"] = False + # else: + # score["attacker"]["attack_url"] = False + # score["good_crawler"]["attack_url"] = False + # score["bad_crawler"]["attack_url"] = False + # score["regular_user"]["attack_url"] = False + + # #--------------------- Calculate score --------------------- + + # attacker_score = good_crawler_score = bad_crawler_score = regular_user_score = 0 
+ + # attacker_score = score["attacker"]["risky_http_methods"] * weights["attacker"]["risky_http_methods"] + # attacker_score = attacker_score + score["attacker"]["robots_violations"] * weights["attacker"]["robots_violations"] + # attacker_score = attacker_score + score["attacker"]["uneven_request_timing"] * weights["attacker"]["uneven_request_timing"] + # attacker_score = attacker_score + score["attacker"]["different_user_agents"] * weights["attacker"]["different_user_agents"] + # attacker_score = attacker_score + score["attacker"]["attack_url"] * weights["attacker"]["attack_url"] + + # good_crawler_score = score["good_crawler"]["risky_http_methods"] * weights["good_crawler"]["risky_http_methods"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["robots_violations"] * weights["good_crawler"]["robots_violations"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["uneven_request_timing"] * weights["good_crawler"]["uneven_request_timing"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["different_user_agents"] * weights["good_crawler"]["different_user_agents"] + # good_crawler_score = good_crawler_score + score["good_crawler"]["attack_url"] * weights["good_crawler"]["attack_url"] + + # bad_crawler_score = score["bad_crawler"]["risky_http_methods"] * weights["bad_crawler"]["risky_http_methods"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["robots_violations"] * weights["bad_crawler"]["robots_violations"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["uneven_request_timing"] * weights["bad_crawler"]["uneven_request_timing"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["different_user_agents"] * weights["bad_crawler"]["different_user_agents"] + # bad_crawler_score = bad_crawler_score + score["bad_crawler"]["attack_url"] * weights["bad_crawler"]["attack_url"] + + # regular_user_score = score["regular_user"]["risky_http_methods"] * 
weights["regular_user"]["risky_http_methods"] + # regular_user_score = regular_user_score + score["regular_user"]["robots_violations"] * weights["regular_user"]["robots_violations"] + # regular_user_score = regular_user_score + score["regular_user"]["uneven_request_timing"] * weights["regular_user"]["uneven_request_timing"] + # regular_user_score = regular_user_score + score["regular_user"]["different_user_agents"] * weights["regular_user"]["different_user_agents"] + # regular_user_score = regular_user_score + score["regular_user"]["attack_url"] * weights["regular_user"]["attack_url"] + + # score_details = f""" + # Attacker score: {attacker_score} + # Good Crawler score: {good_crawler_score} + # Bad Crawler score: {bad_crawler_score} + # Regular User score: {regular_user_score} + # """ + # app_logger.debug(score_details) + + # analyzed_metrics = {"risky_http_methods": http_method_attacker_score, "robots_violations": violated_robots_ratio, "uneven_request_timing": mean, "different_user_agents": user_agents_used, "attack_url": attack_urls_found_list} + # category_scores = {"attacker": attacker_score, "good_crawler": good_crawler_score, "bad_crawler": bad_crawler_score, "regular_user": regular_user_score} + # category = max(category_scores, key=category_scores.get) + # last_analysis = datetime.now(tz=ZoneInfo('UTC')) + + # self._db_manager.update_ip_stats_analysis(ip, analyzed_metrics, category, category_scores, last_analysis) + + # return 0 + + # def update_ip_rep_infos(self, ip: str) -> list[str]: + # api_url = "https://iprep.lcrawl.com/api/iprep/" + # params = { + # "cidr": ip + # } + # headers = { + # "Content-Type": "application/json" + # } + + # response = requests.get(api_url, headers=headers, params=params) + # payload = response.json() + + # if payload["results"]: + # data = payload["results"][0] + + # country_iso_code = data["geoip_data"]["country_iso_code"] + # asn = data["geoip_data"]["asn_autonomous_system_number"] + # asn_org = 
data["geoip_data"]["asn_autonomous_system_organization"] + # list_on = data["list_on"] + + # sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) + # sanitized_asn = sanitize_for_storage(asn, 100) + # sanitized_asn_org = sanitize_for_storage(asn_org, 100) + # sanitized_list_on = sanitize_dict(list_on, 100000) + + # self._db_manager.update_ip_rep_infos(ip, sanitized_country_iso_code, sanitized_asn, sanitized_asn_org, sanitized_list_on) + + # return diff --git a/src/config.py b/src/config.py index 7c6714c..3e5983f 100644 --- a/src/config.py +++ b/src/config.py @@ -1,50 +1,261 @@ #!/usr/bin/env python3 import os +import sys from dataclasses import dataclass +from pathlib import Path from typing import Optional, Tuple +from zoneinfo import ZoneInfo +import time +from logger import get_app_logger +import socket +import time +import requests +import yaml @dataclass class Config: """Configuration class for the deception server""" + port: int = 5000 delay: int = 100 # milliseconds + server_header: str = "" links_length_range: Tuple[int, int] = (5, 15) links_per_page_range: Tuple[int, int] = (10, 15) - char_space: str = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' + char_space: str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" max_counter: int = 10 canary_token_url: Optional[str] = None canary_token_tries: int = 10 dashboard_secret_path: str = None - api_server_url: Optional[str] = None - api_server_port: int = 8080 - api_server_path: str = "/api/v2/users" probability_error_codes: int = 0 # Percentage (0-100) - server_header: str = "Apache/2.2.22 (Ubuntu)" + + # Crawl limiting settings - for legitimate vs malicious crawlers + max_pages_limit: int = ( + 100 # Max pages limit for good crawlers and regular users (and bad crawlers/attackers if infinite_pages_for_malicious is False) + ) + infinite_pages_for_malicious: bool = True # Infinite pages for malicious crawlers + ban_duration_seconds: int = 600 # Ban duration in 
def get_server_ip(self, refresh: bool = False) -> Optional[str]:
    """
    Return the server's own public IP address, as reported by an external
    lookup service, caching the answer on this instance.

    A cached value is reused while it is younger than ``_ip_cache_ttl``
    seconds, unless ``refresh`` is true. Otherwise the lookup services are
    tried in order and the first response that is HTTP 200 *and* parses as an
    IPv4/IPv6 address wins.

    Args:
        refresh: When True, ignore the cached value and query the services.

    Returns:
        The IP address as a string, or None when no service produced a
        valid address (a warning is logged in that case).
    """
    # Local import keeps this block self-contained; ipaddress is stdlib.
    import ipaddress

    current_time = time.time()

    # Serve from cache when it is populated, still fresh and not overridden.
    cache_age = current_time - self._server_ip_cache_time
    if self._server_ip is not None and not refresh and cache_age < self._ip_cache_ttl:
        return self._server_ip

    # Fallback chain of plain-text "what is my IP" services.
    ip_detection_services = (
        "https://api.ipify.org",
        "http://ident.me",
        "https://ifconfig.me",
    )

    for service_url in ip_detection_services:
        try:
            response = requests.get(service_url, timeout=5)
            if response.status_code != 200:
                continue
            candidate = response.text.strip()
            # Reject error pages / captchas / empty bodies served with a 200:
            # only a syntactically valid address may enter the cache.
            ipaddress.ip_address(candidate)
        except Exception:
            # Best effort: any network or parsing failure just moves on to
            # the next service in the chain.
            continue

        self._server_ip = candidate
        self._server_ip_cache_time = current_time
        return candidate

    get_app_logger().warning(
        "Could not determine server IP from external services. "
        "All IPs will be tracked (including potential server IP)."
    )
    return None
@classmethod
def from_yaml(cls) -> "Config":
    """
    Create configuration from a YAML file.

    The file name comes from the CONFIG_LOCATION environment variable
    (default ``config.yaml``) and is resolved relative to the parent of the
    directory containing this module. A missing file, invalid YAML, or a
    document whose top level is not a mapping prints an error to stderr and
    exits with status 1. Missing keys fall back to the defaults spelled out
    below.

    Returns:
        A Config populated from the file.
    """
    config_location = os.getenv("CONFIG_LOCATION", "config.yaml")
    config_path = Path(__file__).parent.parent / config_location

    try:
        with open(config_path, "r") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        print(
            f"Error: Configuration file '{config_path}' not found.", file=sys.stderr
        )
        print(
            "Please create a config.yaml file or set CONFIG_LOCATION environment variable.",
            file=sys.stderr,
        )
        sys.exit(1)
    except yaml.YAMLError as e:
        print(
            f"Error: Invalid YAML in configuration file '{config_path}': {e}",
            file=sys.stderr,
        )
        sys.exit(1)

    # An empty file loads as None.
    if data is None:
        data = {}
    if not isinstance(data, dict):
        print(
            f"Error: Configuration file '{config_path}' must contain a mapping at the top level.",
            file=sys.stderr,
        )
        sys.exit(1)

    def section(name):
        # A key that is present but empty ("server:") loads as None, so a
        # plain data.get(name, {}) would hand back None and break .get().
        return data.get(name) or {}

    server = section("server")
    links = section("links")
    canary = section("canary")
    dashboard = section("dashboard")
    database = section("database")
    behavior = section("behavior")
    analyzer = section("analyzer")
    crawl = section("crawl")

    # Handle dashboard_secret_path - auto-generate if null/not set
    dashboard_path = dashboard.get("secret_path")
    if dashboard_path is None:
        dashboard_path = f"/{os.urandom(16).hex()}"
    elif dashboard_path[:1] != "/":
        # ensure the dashboard path starts with a /
        dashboard_path = f"/{dashboard_path}"

    return cls(
        port=server.get("port", 5000),
        delay=server.get("delay", 100),
        server_header=server.get("server_header", ""),
        links_length_range=(
            links.get("min_length", 5),
            links.get("max_length", 15),
        ),
        links_per_page_range=(
            links.get("min_per_page", 10),
            links.get("max_per_page", 15),
        ),
        char_space=links.get(
            "char_space",
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
        ),
        max_counter=links.get("max_counter", 10),
        canary_token_url=canary.get("token_url"),
        canary_token_tries=canary.get("token_tries", 10),
        dashboard_secret_path=dashboard_path,
        probability_error_codes=behavior.get("probability_error_codes", 0),
        database_path=database.get("path", "data/krawl.db"),
        database_retention_days=database.get("retention_days", 30),
        http_risky_methods_threshold=analyzer.get(
            "http_risky_methods_threshold", 0.1
        ),
        violated_robots_threshold=analyzer.get("violated_robots_threshold", 0.1),
        uneven_request_timing_threshold=analyzer.get(
            "uneven_request_timing_threshold", 0.5
        ),  # coefficient of variation
        uneven_request_timing_time_window_seconds=analyzer.get(
            "uneven_request_timing_time_window_seconds", 300
        ),
        user_agents_used_threshold=analyzer.get("user_agents_used_threshold", 2),
        attack_urls_threshold=analyzer.get("attack_urls_threshold", 1),
        infinite_pages_for_malicious=crawl.get(
            "infinite_pages_for_malicious", True
        ),
        max_pages_limit=crawl.get("max_pages_limit", 250),
        ban_duration_seconds=crawl.get("ban_duration_seconds", 600),
    )
+""" + +import os +import stat +from datetime import datetime, timedelta +from typing import Optional, List, Dict, Any +from zoneinfo import ZoneInfo + +from sqlalchemy import create_engine, func, distinct, case, event, or_ +from sqlalchemy.orm import sessionmaker, scoped_session, Session +from sqlalchemy.engine import Engine + +from ip_utils import is_local_or_private_ip, is_valid_public_ip + + +@event.listens_for(Engine, "connect") +def set_sqlite_pragma(dbapi_connection, connection_record): + """Enable WAL mode and set busy timeout for SQLite connections.""" + cursor = dbapi_connection.cursor() + cursor.execute("PRAGMA journal_mode=WAL") + cursor.execute("PRAGMA busy_timeout=30000") + cursor.close() + + +from models import ( + Base, + AccessLog, + CredentialAttempt, + AttackDetection, + IpStats, + CategoryHistory, +) +from sanitizer import ( + sanitize_ip, + sanitize_path, + sanitize_user_agent, + sanitize_credential, + sanitize_attack_pattern, +) + +from logger import get_app_logger + +applogger = get_app_logger() + + +class DatabaseManager: + """ + Singleton database manager for the Krawl honeypot. + + Handles database initialization, session management, and provides + methods for persisting access logs, credentials, and attack detections. + """ + + _instance: Optional["DatabaseManager"] = None + + def __new__(cls) -> "DatabaseManager": + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def initialize(self, database_path: str = "data/krawl.db") -> None: + """ + Initialize the database connection and create tables. 
def _run_migrations(self, database_path: str) -> None:
    """
    Run automatic migrations for backward compatibility.

    Inspects the ``ip_stats`` table through a direct sqlite3 connection and
    adds every column from ``required_columns`` that is not there yet.
    Failures are logged and swallowed so start-up continues even when the
    migration cannot be applied.

    Args:
        database_path: Path to the SQLite database file
    """
    import sqlite3
    from contextlib import closing

    # (column name, SQLite type) pairs introduced after the first release.
    required_columns = (
        ("latitude", "REAL"),
        ("longitude", "REAL"),
    )

    try:
        # closing() guarantees the connection is released even when one of
        # the statements below raises.
        with closing(sqlite3.connect(database_path)) as conn:
            cursor = conn.cursor()

            # Row index 1 of PRAGMA table_info is the column name.
            cursor.execute("PRAGMA table_info(ip_stats)")
            existing_columns = {row[1] for row in cursor.fetchall()}

            migrations_run = []
            for column_name, column_type in required_columns:
                if column_name in existing_columns:
                    continue
                cursor.execute(
                    f"ALTER TABLE ip_stats ADD COLUMN {column_name} {column_type}"
                )
                migrations_run.append(column_name)

            if migrations_run:
                conn.commit()
                applogger.info(
                    f"Auto-migration: Added columns {', '.join(migrations_run)} to ip_stats table"
                )
    except Exception as e:
        applogger.error(f"Auto-migration failed: {e}")
        # Don't raise - allow app to continue even if migration fails
def persist_credential(
    self,
    ip: str,
    path: str,
    username: Optional[str] = None,
    password: Optional[str] = None,
) -> Optional[int]:
    """
    Store one submitted credential pair.

    Every field is passed through its sanitizer before the row is built;
    the row is timestamped with the current local time, added and
    committed. Any failure rolls the session back and is logged at
    critical level. The thread-local session is released in all cases.

    Args:
        ip: Client IP address
        path: Login form path
        username: Submitted username
        password: Submitted password

    Returns:
        The ID of the created CredentialAttempt record, or None on error
    """
    db = self.session
    try:
        clean_fields = {
            "ip": sanitize_ip(ip),
            "path": sanitize_path(path),
            "username": sanitize_credential(username),
            "password": sanitize_credential(password),
            "timestamp": datetime.now(),
        }
        attempt = CredentialAttempt(**clean_fields)
        db.add(attempt)
        db.commit()
        record_id = attempt.id
    except Exception as exc:
        db.rollback()
        applogger.critical(f"Database error persisting credential: {exc}")
        record_id = None
    finally:
        self.close_session()
    return record_id
def manual_update_category(self, ip: str, category: str) -> None:
    """
    Update IP category as a result of a manual intervention by an admin.

    Looks up the IpStats row for the sanitized IP, records the change via
    ``_record_category_change`` when the category actually differs, then
    stores the new category and flags it as manually set. An IP with no
    IpStats row is logged and left alone instead of raising. The
    thread-local session is always released afterwards.

    Args:
        ip: IP address to update
        category: selected category
    """
    session = self.session
    try:
        sanitized_ip = sanitize_ip(ip)
        ip_stats = session.query(IpStats).filter(IpStats.ip == sanitized_ip).first()

        # Nothing to update for an IP that was never persisted.
        if ip_stats is None:
            applogger.warning(
                f"Cannot update category: no stats found for IP {sanitized_ip}"
            )
            return

        # Record the manual category change
        old_category = ip_stats.category
        if old_category != category:
            self._record_category_change(
                sanitized_ip, old_category, category, datetime.now()
            )

        ip_stats.category = category
        ip_stats.manual_category = True

        try:
            session.commit()
        except Exception as e:
            session.rollback()
            applogger.error(f"Error updating manual category: {e}")
    finally:
        # Match the other DatabaseManager methods: hand the scoped session
        # back once the unit of work is finished.
        self.close_session()
def get_category_history(self, ip: str) -> List[Dict[str, Any]]:
    """
    Return the recorded category changes of one IP, oldest first.

    Args:
        ip: IP address to get history for (sanitized before the lookup)

    Returns:
        One dict per change with ``old_category``, ``new_category`` and an
        ISO-8601 ``timestamp``, ordered by ascending timestamp
    """
    db = self.session
    try:
        lookup_ip = sanitize_ip(ip)
        changes_query = db.query(CategoryHistory).filter(
            CategoryHistory.ip == lookup_ip
        )
        ordered_changes = changes_query.order_by(
            CategoryHistory.timestamp.asc()
        ).all()

        timeline = []
        for change in ordered_changes:
            entry = {
                "old_category": change.old_category,
                "new_category": change.new_category,
                "timestamp": change.timestamp.isoformat(),
            }
            timeline.append(entry)
        return timeline
    finally:
        self.close_session()
def get_unenriched_ips(self, limit: int = 100) -> List[str]:
    """
    Get IPs that don't have complete reputation data yet.

    Returns IPs whose country_code, city, latitude or longitude is NULL,
    skipping addresses that match the private / loopback / link-local
    prefixes listed below. If the database predates the latitude/longitude
    columns (SQLite reports "no such column"), the query is retried using
    only country_code and city.

    Args:
        limit: Maximum number of IPs to return

    Returns:
        List of IP addresses without complete reputation data
    """
    from sqlalchemy.exc import OperationalError

    # SQL LIKE patterns; "_" matches exactly one character, so "172.2_.%"
    # covers 172.20.x.x through 172.29.x.x.
    non_routable_patterns = (
        "10.%",
        "172.16.%",
        "172.17.%",
        "172.18.%",
        "172.19.%",
        "172.2_.%",
        "172.30.%",
        "172.31.%",
        "192.168.%",
        "127.%",
        "169.254.%",
    )

    session = self.session
    try:
        exclusions = [~IpStats.ip.like(pattern) for pattern in non_routable_patterns]

        def fetch(*missing_field_checks):
            # One row per IP having at least one of the given fields unset.
            return (
                session.query(IpStats.ip)
                .filter(or_(*missing_field_checks), *exclusions)
                .limit(limit)
                .all()
            )

        try:
            ips = fetch(
                IpStats.country_code.is_(None),
                IpStats.city.is_(None),
                IpStats.latitude.is_(None),
                IpStats.longitude.is_(None),
            )
        except OperationalError as e:
            # Only the "columns not migrated yet" case has a fallback.
            if "no such column" not in str(e).lower():
                raise
            # Clear the failed statement before reusing the session.
            session.rollback()
            ips = fetch(
                IpStats.country_code.is_(None),
                IpStats.city.is_(None),
            )

        return [ip[0] for ip in ips]
    finally:
        self.close_session()
def get_ip_stats(self, limit: int = 100) -> List[Dict[str, Any]]:
    """
    List per-IP statistics, busiest IPs first.

    Args:
        limit: Maximum number of records to return

    Returns:
        One dict per IpStats row, ordered by descending total_requests;
        datetime fields are ISO-8601 strings, or None when unset
    """

    def iso_or_none(moment):
        # Falsy (unset) datetimes are reported as None.
        return moment.isoformat() if moment else None

    db = self.session
    try:
        busiest_first = db.query(IpStats).order_by(IpStats.total_requests.desc())
        rows = busiest_first.limit(limit).all()

        summaries = []
        for row in rows:
            summaries.append(
                {
                    "ip": row.ip,
                    "total_requests": row.total_requests,
                    "first_seen": iso_or_none(row.first_seen),
                    "last_seen": iso_or_none(row.last_seen),
                    "country_code": row.country_code,
                    "city": row.city,
                    "asn": row.asn,
                    "asn_org": row.asn_org,
                    "reputation_score": row.reputation_score,
                    "reputation_source": row.reputation_source,
                    "analyzed_metrics": row.analyzed_metrics,
                    "category": row.category,
                    "manual_category": row.manual_category,
                    "last_analysis": iso_or_none(row.last_analysis),
                }
            )
        return summaries
    finally:
        self.close_session()
def get_attackers_paginated(
    self,
    page: int = 1,
    page_size: int = 25,
    sort_by: str = "total_requests",
    sort_order: str = "desc",
) -> Dict[str, Any]:
    """
    Retrieve paginated list of attacker IPs ordered by specified field.

    Only IpStats rows whose category is "attacker" are considered.
    Out-of-range paging values are clamped rather than rejected: ``page``
    and ``page_size`` are raised to at least 1, an unknown ``sort_by``
    falls back to "total_requests" and an unknown ``sort_order`` to "desc".
    The clamped values are the ones echoed back under "pagination".

    Args:
        page: Page number (1-indexed)
        page_size: Number of results per page
        sort_by: Field to sort by (total_requests, first_seen, last_seen)
        sort_order: Sort order (asc or desc, case-insensitive)

    Returns:
        Dictionary with attackers list and pagination info
    """
    session = self.session
    try:
        # page_size <= 0 would divide by zero below; page < 1 would give a
        # negative OFFSET.
        page = max(1, page)
        page_size = max(1, page_size)
        offset = (page - 1) * page_size

        # Validate sort parameters
        valid_sort_fields = {"total_requests", "first_seen", "last_seen"}
        if sort_by not in valid_sort_fields:
            sort_by = "total_requests"
        sort_order = sort_order.lower()
        if sort_order not in {"asc", "desc"}:
            sort_order = "desc"

        # One base query serves both the count and the page fetch.
        attackers_query = session.query(IpStats).filter(
            IpStats.category == "attacker"
        )
        total_attackers = attackers_query.count()

        # sort_by is whitelisted above, so getattr cannot reach other attributes.
        sort_column = getattr(IpStats, sort_by)
        ordering = sort_column.desc() if sort_order == "desc" else sort_column.asc()

        attackers = (
            attackers_query.order_by(ordering).offset(offset).limit(page_size).all()
        )

        # Ceiling division without floats.
        total_pages = (total_attackers + page_size - 1) // page_size

        return {
            "attackers": [
                {
                    "ip": a.ip,
                    "total_requests": a.total_requests,
                    "first_seen": a.first_seen.isoformat() if a.first_seen else None,
                    "last_seen": a.last_seen.isoformat() if a.last_seen else None,
                    "country_code": a.country_code,
                    "city": a.city,
                    "latitude": a.latitude,
                    "longitude": a.longitude,
                    "asn": a.asn,
                    "asn_org": a.asn_org,
                    "reputation_score": a.reputation_score,
                    "reputation_source": a.reputation_source,
                    "category": a.category,
                    "category_scores": a.category_scores or {},
                }
                for a in attackers
            ],
            "pagination": {
                "page": page,
                "page_size": page_size,
                "total_attackers": total_attackers,
                "total_pages": total_pages,
            },
        }
    finally:
        self.close_session()
ip.total_requests, + "first_seen": ( + ip.first_seen.isoformat() if ip.first_seen else None + ), + "last_seen": ip.last_seen.isoformat() if ip.last_seen else None, + "country_code": ip.country_code, + "city": ip.city, + "latitude": ip.latitude, + "longitude": ip.longitude, + "asn": ip.asn, + "asn_org": ip.asn_org, + "reputation_score": ip.reputation_score, + "reputation_source": ip.reputation_source, + "category": ip.category, + "category_scores": ip.category_scores or {}, + } + for ip in ips + ], + "pagination": { + "page": page, + "page_size": page_size, + "total": total_ips, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_dashboard_counts(self) -> Dict[str, int]: + """ + Get aggregate statistics for the dashboard (excludes local/private IPs and server IP). + + Returns: + Dictionary with total_accesses, unique_ips, unique_paths, + suspicious_accesses, honeypot_triggered, honeypot_ips + """ + session = self.session + try: + # Get server IP to filter it out + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + # Get all accesses first, then filter out local IPs and server IP + all_accesses = session.query(AccessLog).all() + + # Filter out local/private IPs and server IP + public_accesses = [ + log for log in all_accesses if is_valid_public_ip(log.ip, server_ip) + ] + + # Calculate counts from filtered data + total_accesses = len(public_accesses) + unique_ips = len(set(log.ip for log in public_accesses)) + unique_paths = len(set(log.path for log in public_accesses)) + suspicious_accesses = sum(1 for log in public_accesses if log.is_suspicious) + honeypot_triggered = sum( + 1 for log in public_accesses if log.is_honeypot_trigger + ) + honeypot_ips = len( + set(log.ip for log in public_accesses if log.is_honeypot_trigger) + ) + + # Count unique attackers from IpStats (matching the "Attackers by Total Requests" table) + unique_attackers = ( + session.query(IpStats).filter(IpStats.category 
== "attacker").count() + ) + + return { + "total_accesses": total_accesses, + "unique_ips": unique_ips, + "unique_paths": unique_paths, + "suspicious_accesses": suspicious_accesses, + "honeypot_triggered": honeypot_triggered, + "honeypot_ips": honeypot_ips, + "unique_attackers": unique_attackers, + } + finally: + self.close_session() + + def get_top_ips(self, limit: int = 10) -> List[tuple]: + """ + Get top IP addresses by access count (excludes local/private IPs and server IP). + + Args: + limit: Maximum number of results + + Returns: + List of (ip, count) tuples ordered by count descending + """ + session = self.session + try: + # Get server IP to filter it out + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + results = ( + session.query(AccessLog.ip, func.count(AccessLog.id).label("count")) + .group_by(AccessLog.ip) + .order_by(func.count(AccessLog.id).desc()) + .all() + ) + + # Filter out local/private IPs and server IP, then limit results + filtered = [ + (row.ip, row.count) + for row in results + if is_valid_public_ip(row.ip, server_ip) + ] + return filtered[:limit] + finally: + self.close_session() + + def get_top_paths(self, limit: int = 10) -> List[tuple]: + """ + Get top paths by access count. + + Args: + limit: Maximum number of results + + Returns: + List of (path, count) tuples ordered by count descending + """ + session = self.session + try: + results = ( + session.query(AccessLog.path, func.count(AccessLog.id).label("count")) + .group_by(AccessLog.path) + .order_by(func.count(AccessLog.id).desc()) + .limit(limit) + .all() + ) + + return [(row.path, row.count) for row in results] + finally: + self.close_session() + + def get_top_user_agents(self, limit: int = 10) -> List[tuple]: + """ + Get top user agents by access count. 
+ + Args: + limit: Maximum number of results + + Returns: + List of (user_agent, count) tuples ordered by count descending + """ + session = self.session + try: + results = ( + session.query( + AccessLog.user_agent, func.count(AccessLog.id).label("count") + ) + .filter(AccessLog.user_agent.isnot(None), AccessLog.user_agent != "") + .group_by(AccessLog.user_agent) + .order_by(func.count(AccessLog.id).desc()) + .limit(limit) + .all() + ) + + return [(row.user_agent, row.count) for row in results] + finally: + self.close_session() + + def get_recent_suspicious(self, limit: int = 20) -> List[Dict[str, Any]]: + """ + Get recent suspicious access attempts (excludes local/private IPs and server IP). + + Args: + limit: Maximum number of results + + Returns: + List of access log dictionaries with is_suspicious=True + """ + session = self.session + try: + # Get server IP to filter it out + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + logs = ( + session.query(AccessLog) + .filter(AccessLog.is_suspicious == True) + .order_by(AccessLog.timestamp.desc()) + .all() + ) + + # Filter out local/private IPs and server IP + filtered_logs = [ + log for log in logs if is_valid_public_ip(log.ip, server_ip) + ] + + return [ + { + "ip": log.ip, + "path": log.path, + "user_agent": log.user_agent, + "timestamp": log.timestamp.isoformat(), + } + for log in filtered_logs[:limit] + ] + finally: + self.close_session() + + def get_honeypot_triggered_ips(self) -> List[tuple]: + """ + Get IPs that triggered honeypot paths with the paths they accessed + (excludes local/private IPs and server IP). 
+ + Returns: + List of (ip, [paths]) tuples + """ + session = self.session + try: + # Get server IP to filter it out + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + # Get all honeypot triggers grouped by IP + results = ( + session.query(AccessLog.ip, AccessLog.path) + .filter(AccessLog.is_honeypot_trigger == True) + .all() + ) + + # Group paths by IP, filtering out local/private IPs and server IP + ip_paths: Dict[str, List[str]] = {} + for row in results: + # Skip invalid IPs + if not is_valid_public_ip(row.ip, server_ip): + continue + if row.ip not in ip_paths: + ip_paths[row.ip] = [] + if row.path not in ip_paths[row.ip]: + ip_paths[row.ip].append(row.path) + + return [(ip, paths) for ip, paths in ip_paths.items()] + finally: + self.close_session() + + def get_recent_attacks(self, limit: int = 20) -> List[Dict[str, Any]]: + """ + Get recent access logs that have attack detections. + + Args: + limit: Maximum number of results + + Returns: + List of access log dicts with attack_types included + """ + session = self.session + try: + # Get access logs that have attack detections + logs = ( + session.query(AccessLog) + .join(AttackDetection) + .order_by(AccessLog.timestamp.desc()) + .limit(limit) + .all() + ) + + return [ + { + "ip": log.ip, + "path": log.path, + "user_agent": log.user_agent, + "timestamp": log.timestamp.isoformat(), + "attack_types": [d.attack_type for d in log.attack_detections], + } + for log in logs + ] + finally: + self.close_session() + + def get_honeypot_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "count", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of honeypot-triggered IPs with their paths. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (count or ip) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with honeypots list and pagination info + """ + session = self.session + try: + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + offset = (page - 1) * page_size + + # Get honeypot triggers grouped by IP + results = ( + session.query(AccessLog.ip, AccessLog.path) + .filter(AccessLog.is_honeypot_trigger == True) + .all() + ) + + # Group paths by IP, filtering out invalid IPs + ip_paths: Dict[str, List[str]] = {} + for row in results: + if not is_valid_public_ip(row.ip, server_ip): + continue + if row.ip not in ip_paths: + ip_paths[row.ip] = [] + if row.path not in ip_paths[row.ip]: + ip_paths[row.ip].append(row.path) + + # Create list and sort + honeypot_list = [ + {"ip": ip, "paths": paths, "count": len(paths)} + for ip, paths in ip_paths.items() + ] + + if sort_by == "count": + honeypot_list.sort( + key=lambda x: x["count"], reverse=(sort_order == "desc") + ) + else: # sort by ip + honeypot_list.sort( + key=lambda x: x["ip"], reverse=(sort_order == "desc") + ) + + total_honeypots = len(honeypot_list) + paginated = honeypot_list[offset : offset + page_size] + total_pages = (total_honeypots + page_size - 1) // page_size + + return { + "honeypots": paginated, + "pagination": { + "page": page, + "page_size": page_size, + "total": total_honeypots, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_credentials_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "timestamp", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of credential attempts. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (timestamp, ip, username) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with credentials list and pagination info + """ + session = self.session + try: + offset = (page - 1) * page_size + + # Validate sort parameters + valid_sort_fields = {"timestamp", "ip", "username"} + sort_by = sort_by if sort_by in valid_sort_fields else "timestamp" + sort_order = ( + sort_order.lower() if sort_order.lower() in {"asc", "desc"} else "desc" + ) + + total_credentials = session.query(CredentialAttempt).count() + + # Build query with sorting + query = session.query(CredentialAttempt) + + if sort_by == "timestamp": + query = query.order_by( + CredentialAttempt.timestamp.desc() + if sort_order == "desc" + else CredentialAttempt.timestamp.asc() + ) + elif sort_by == "ip": + query = query.order_by( + CredentialAttempt.ip.desc() + if sort_order == "desc" + else CredentialAttempt.ip.asc() + ) + elif sort_by == "username": + query = query.order_by( + CredentialAttempt.username.desc() + if sort_order == "desc" + else CredentialAttempt.username.asc() + ) + + credentials = query.offset(offset).limit(page_size).all() + total_pages = (total_credentials + page_size - 1) // page_size + + return { + "credentials": [ + { + "ip": c.ip, + "username": c.username, + "password": c.password, + "path": c.path, + "timestamp": c.timestamp.isoformat() if c.timestamp else None, + } + for c in credentials + ], + "pagination": { + "page": page, + "page_size": page_size, + "total": total_credentials, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_top_ips_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "count", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of top IP addresses by access count. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (count or ip) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with IPs list and pagination info + """ + session = self.session + try: + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + + offset = (page - 1) * page_size + + results = ( + session.query(AccessLog.ip, func.count(AccessLog.id).label("count")) + .group_by(AccessLog.ip) + .all() + ) + + # Filter out local/private IPs and server IP, then sort + filtered = [ + {"ip": row.ip, "count": row.count} + for row in results + if is_valid_public_ip(row.ip, server_ip) + ] + + if sort_by == "count": + filtered.sort(key=lambda x: x["count"], reverse=(sort_order == "desc")) + else: # sort by ip + filtered.sort(key=lambda x: x["ip"], reverse=(sort_order == "desc")) + + total_ips = len(filtered) + paginated = filtered[offset : offset + page_size] + total_pages = (total_ips + page_size - 1) // page_size + + return { + "ips": paginated, + "pagination": { + "page": page, + "page_size": page_size, + "total": total_ips, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_top_paths_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "count", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of top paths by access count. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (count or path) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with paths list and pagination info + """ + session = self.session + try: + offset = (page - 1) * page_size + + results = ( + session.query(AccessLog.path, func.count(AccessLog.id).label("count")) + .group_by(AccessLog.path) + .all() + ) + + # Create list and sort + paths_list = [{"path": row.path, "count": row.count} for row in results] + + if sort_by == "count": + paths_list.sort( + key=lambda x: x["count"], reverse=(sort_order == "desc") + ) + else: # sort by path + paths_list.sort(key=lambda x: x["path"], reverse=(sort_order == "desc")) + + total_paths = len(paths_list) + paginated = paths_list[offset : offset + page_size] + total_pages = (total_paths + page_size - 1) // page_size + + return { + "paths": paginated, + "pagination": { + "page": page, + "page_size": page_size, + "total": total_paths, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_top_user_agents_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "count", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of top user agents by access count. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (count or user_agent) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with user agents list and pagination info + """ + session = self.session + try: + offset = (page - 1) * page_size + + results = ( + session.query( + AccessLog.user_agent, func.count(AccessLog.id).label("count") + ) + .filter(AccessLog.user_agent.isnot(None), AccessLog.user_agent != "") + .group_by(AccessLog.user_agent) + .all() + ) + + # Create list and sort + ua_list = [ + {"user_agent": row.user_agent, "count": row.count} for row in results + ] + + if sort_by == "count": + ua_list.sort(key=lambda x: x["count"], reverse=(sort_order == "desc")) + else: # sort by user_agent + ua_list.sort( + key=lambda x: x["user_agent"], reverse=(sort_order == "desc") + ) + + total_uas = len(ua_list) + paginated = ua_list[offset : offset + page_size] + total_pages = (total_uas + page_size - 1) // page_size + + return { + "user_agents": paginated, + "pagination": { + "page": page, + "page_size": page_size, + "total": total_uas, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + def get_attack_types_paginated( + self, + page: int = 1, + page_size: int = 5, + sort_by: str = "timestamp", + sort_order: str = "desc", + ) -> Dict[str, Any]: + """ + Retrieve paginated list of detected attack types with access logs. 
+ + Args: + page: Page number (1-indexed) + page_size: Number of results per page + sort_by: Field to sort by (timestamp, ip, attack_type) + sort_order: Sort order (asc or desc) + + Returns: + Dictionary with attacks list and pagination info + """ + session = self.session + try: + offset = (page - 1) * page_size + + # Validate sort parameters + valid_sort_fields = {"timestamp", "ip", "attack_type"} + sort_by = sort_by if sort_by in valid_sort_fields else "timestamp" + sort_order = ( + sort_order.lower() if sort_order.lower() in {"asc", "desc"} else "desc" + ) + + # Get all access logs with attack detections + query = session.query(AccessLog).join(AttackDetection) + + if sort_by == "timestamp": + query = query.order_by( + AccessLog.timestamp.desc() + if sort_order == "desc" + else AccessLog.timestamp.asc() + ) + elif sort_by == "ip": + query = query.order_by( + AccessLog.ip.desc() if sort_order == "desc" else AccessLog.ip.asc() + ) + + logs = query.all() + + # Convert to attack list + attack_list = [ + { + "ip": log.ip, + "path": log.path, + "user_agent": log.user_agent, + "timestamp": log.timestamp.isoformat() if log.timestamp else None, + "attack_types": [d.attack_type for d in log.attack_detections], + } + for log in logs + ] + + # Sort by attack_type if needed (this must be done post-fetch since it's in a related table) + if sort_by == "attack_type": + attack_list.sort( + key=lambda x: x["attack_types"][0] if x["attack_types"] else "", + reverse=(sort_order == "desc"), + ) + + total_attacks = len(attack_list) + paginated = attack_list[offset : offset + page_size] + total_pages = (total_attacks + page_size - 1) // page_size + + return { + "attacks": paginated, + "pagination": { + "page": page, + "page_size": page_size, + "total": total_attacks, + "total_pages": total_pages, + }, + } + finally: + self.close_session() + + +# Module-level singleton instance +_db_manager = DatabaseManager() + + +def get_database() -> DatabaseManager: + """Get the database manager 
singleton instance.""" + return _db_manager + + +def initialize_database(database_path: str = "data/krawl.db") -> None: + """Initialize the database system.""" + _db_manager.initialize(database_path) diff --git a/src/generators.py b/src/generators.py index 16c0c32..fd29f38 100644 --- a/src/generators.py +++ b/src/generators.py @@ -9,6 +9,7 @@ import string import json from templates import html_templates from wordlists import get_wordlists +from config import get_config def random_username() -> str: @@ -21,10 +22,10 @@ def random_password() -> str: """Generate random password""" wl = get_wordlists() templates = [ - lambda: ''.join(random.choices(string.ascii_letters + string.digits, k=12)), + lambda: "".join(random.choices(string.ascii_letters + string.digits, k=12)), lambda: f"{random.choice(wl.password_prefixes)}{random.randint(100, 999)}!", lambda: f"{random.choice(wl.simple_passwords)}{random.randint(1000, 9999)}", - lambda: ''.join(random.choices(string.ascii_lowercase, k=8)), + lambda: "".join(random.choices(string.ascii_lowercase, k=8)), ] return random.choice(templates)() @@ -37,10 +38,19 @@ def random_email(username: str = None) -> str: return f"{username}@{random.choice(wl.email_domains)}" +def random_server_header() -> str: + """Generate random server header from wordlists""" + config = get_config() + if config.server_header: + return config.server_header + wl = get_wordlists() + return random.choice(wl.server_headers) + + def random_api_key() -> str: """Generate random API key""" wl = get_wordlists() - key = ''.join(random.choices(string.ascii_letters + string.digits, k=32)) + key = "".join(random.choices(string.ascii_letters + string.digits, k=32)) return random.choice(wl.api_key_prefixes) + key @@ -80,14 +90,16 @@ def users_json() -> str: users = [] for i in range(random.randint(3, 8)): username = random_username() - users.append({ - "id": i + 1, - "username": username, - "email": random_email(username), - "password": random_password(), - "role": 
random.choice(wl.user_roles), - "api_token": random_api_key() - }) + users.append( + { + "id": i + 1, + "username": username, + "email": random_email(username), + "password": random_password(), + "role": random.choice(wl.user_roles), + "api_token": random_api_key(), + } + ) return json.dumps({"users": users}, indent=2) @@ -95,20 +107,28 @@ def api_keys_json() -> str: """Generate fake api_keys.json with random data""" keys = { "stripe": { - "public_key": "pk_live_" + ''.join(random.choices(string.ascii_letters + string.digits, k=24)), - "secret_key": random_api_key() + "public_key": "pk_live_" + + "".join(random.choices(string.ascii_letters + string.digits, k=24)), + "secret_key": random_api_key(), }, "aws": { - "access_key_id": "AKIA" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)), - "secret_access_key": ''.join(random.choices(string.ascii_letters + string.digits + '+/', k=40)) + "access_key_id": "AKIA" + + "".join(random.choices(string.ascii_uppercase + string.digits, k=16)), + "secret_access_key": "".join( + random.choices(string.ascii_letters + string.digits + "+/", k=40) + ), }, "sendgrid": { - "api_key": "SG." + ''.join(random.choices(string.ascii_letters + string.digits, k=48)) + "api_key": "SG." 
+ + "".join(random.choices(string.ascii_letters + string.digits, k=48)) }, "twilio": { - "account_sid": "AC" + ''.join(random.choices(string.ascii_lowercase + string.digits, k=32)), - "auth_token": ''.join(random.choices(string.ascii_lowercase + string.digits, k=32)) - } + "account_sid": "AC" + + "".join(random.choices(string.ascii_lowercase + string.digits, k=32)), + "auth_token": "".join( + random.choices(string.ascii_lowercase + string.digits, k=32) + ), + }, } return json.dumps(keys, indent=2) @@ -116,51 +136,70 @@ def api_keys_json() -> str: def api_response(path: str) -> str: """Generate fake API JSON responses with random data""" wl = get_wordlists() - + def random_users(count: int = 3): users = [] for i in range(count): username = random_username() - users.append({ - "id": i + 1, - "username": username, - "email": random_email(username), - "role": random.choice(wl.user_roles) - }) + users.append( + { + "id": i + 1, + "username": username, + "email": random_email(username), + "role": random.choice(wl.user_roles), + } + ) return users - + responses = { - '/api/users': json.dumps({ - "users": random_users(random.randint(2, 5)), - "total": random.randint(50, 500) - }, indent=2), - '/api/v1/users': json.dumps({ - "status": "success", - "data": [{ - "id": random.randint(1, 100), - "name": random_username(), - "api_key": random_api_key() - }] - }, indent=2), - '/api/v2/secrets': json.dumps({ - "database": { - "host": random.choice(wl.database_hosts), - "username": random_username(), - "password": random_password(), - "database": random_database_name() + "/api/users": json.dumps( + { + "users": random_users(random.randint(2, 5)), + "total": random.randint(50, 500), }, - "api_keys": { - "stripe": random_api_key(), - "aws": 'AKIA' + ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)) - } - }, indent=2), - '/api/config': json.dumps({ - "app_name": random.choice(wl.application_names), - "debug": random.choice([True, False]), - "secret_key": 
random_api_key(), - "database_url": f"postgresql://{random_username()}:{random_password()}@localhost/{random_database_name()}" - }, indent=2), - '/.env': f"""APP_NAME={random.choice(wl.application_names)} + indent=2, + ), + "/api/v1/users": json.dumps( + { + "status": "success", + "data": [ + { + "id": random.randint(1, 100), + "name": random_username(), + "api_key": random_api_key(), + } + ], + }, + indent=2, + ), + "/api/v2/secrets": json.dumps( + { + "database": { + "host": random.choice(wl.database_hosts), + "username": random_username(), + "password": random_password(), + "database": random_database_name(), + }, + "api_keys": { + "stripe": random_api_key(), + "aws": "AKIA" + + "".join( + random.choices(string.ascii_uppercase + string.digits, k=16) + ), + }, + }, + indent=2, + ), + "/api/config": json.dumps( + { + "app_name": random.choice(wl.application_names), + "debug": random.choice([True, False]), + "secret_key": random_api_key(), + "database_url": f"postgresql://{random_username()}:{random_password()}@localhost/{random_database_name()}", + }, + indent=2, + ), + "/.env": f"""APP_NAME={random.choice(wl.application_names)} DEBUG={random.choice(['true', 'false'])} APP_KEY=base64:{''.join(random.choices(string.ascii_letters + string.digits, k=32))}= DB_CONNECTION=mysql @@ -172,7 +211,7 @@ DB_PASSWORD={random_password()} AWS_ACCESS_KEY_ID=AKIA{''.join(random.choices(string.ascii_uppercase + string.digits, k=16))} AWS_SECRET_ACCESS_KEY={''.join(random.choices(string.ascii_letters + string.digits + '+/', k=40))} STRIPE_SECRET={random_api_key()} -""" +""", } return responses.get(path, json.dumps({"error": "Not found"}, indent=2)) @@ -180,11 +219,13 @@ STRIPE_SECRET={random_api_key()} def directory_listing(path: str) -> str: """Generate fake directory listing using wordlists""" wl = get_wordlists() - + files = wl.directory_files dirs = wl.directory_dirs - - selected_files = [(f, random.randint(1024, 1024*1024)) - for f in random.sample(files, min(6, len(files)))] - 
+ + selected_files = [ + (f, random.randint(1024, 1024 * 1024)) + for f in random.sample(files, min(6, len(files))) + ] + return html_templates.directory_listing(path, dirs, selected_files) diff --git a/src/geo_utils.py b/src/geo_utils.py new file mode 100644 index 0000000..d11f01c --- /dev/null +++ b/src/geo_utils.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Geolocation utilities for reverse geocoding and city lookups. +""" + +import requests +from typing import Optional, Tuple +from logger import get_app_logger + +app_logger = get_app_logger() + +# Simple city name cache to avoid repeated API calls +_city_cache = {} + + +def reverse_geocode_city(latitude: float, longitude: float) -> Optional[str]: + """ + Reverse geocode coordinates to get city name using Nominatim (OpenStreetMap). + + Args: + latitude: Latitude coordinate + longitude: Longitude coordinate + + Returns: + City name or None if not found + """ + # Check cache first + cache_key = f"{latitude},{longitude}" + if cache_key in _city_cache: + return _city_cache[cache_key] + + try: + # Use Nominatim reverse geocoding API (free, no API key required) + url = "https://nominatim.openstreetmap.org/reverse" + params = { + "lat": latitude, + "lon": longitude, + "format": "json", + "zoom": 10, # City level + "addressdetails": 1, + } + headers = {"User-Agent": "Krawl-Honeypot/1.0"} # Required by Nominatim ToS + + response = requests.get(url, params=params, headers=headers, timeout=5) + response.raise_for_status() + + data = response.json() + address = data.get("address", {}) + + # Try to get city from various possible fields + city = ( + address.get("city") + or address.get("town") + or address.get("village") + or address.get("municipality") + or address.get("county") + ) + + # Cache the result + _city_cache[cache_key] = city + + if city: + app_logger.debug(f"Reverse geocoded {latitude},{longitude} to {city}") + + return city + + except requests.RequestException as e: + app_logger.warning(f"Reverse geocoding 
failed for {latitude},{longitude}: {e}") + return None + except Exception as e: + app_logger.error(f"Error in reverse geocoding: {e}") + return None + + +def get_most_recent_geoip_data(results: list) -> Optional[dict]: + """ + Extract the most recent geoip_data from API results. + Results are assumed to be sorted by record_added (most recent first). + + Args: + results: List of result dictionaries from IP reputation API + + Returns: + Most recent geoip_data dict or None + """ + if not results: + return None + + # The first result is the most recent (sorted by record_added) + most_recent = results[0] + return most_recent.get("geoip_data") + + +def extract_city_from_coordinates(geoip_data: dict) -> Optional[str]: + """ + Extract city name from geoip_data using reverse geocoding. + + Args: + geoip_data: Dictionary containing location_latitude and location_longitude + + Returns: + City name or None + """ + if not geoip_data: + return None + + latitude = geoip_data.get("location_latitude") + longitude = geoip_data.get("location_longitude") + + if latitude is None or longitude is None: + return None + + return reverse_geocode_city(latitude, longitude) diff --git a/src/handler.py b/src/handler.py index c93b78b..0a6abb2 100644 --- a/src/handler.py +++ b/src/handler.py @@ -10,11 +10,17 @@ from urllib.parse import urlparse, parse_qs from config import Config from tracker import AccessTracker +from analyzer import Analyzer from templates import html_templates from templates.dashboard_template import generate_dashboard from generators import ( - credentials_txt, passwords_txt, users_json, api_keys_json, - api_response, directory_listing + credentials_txt, + passwords_txt, + users_json, + api_keys_json, + api_response, + directory_listing, + random_server_header, ) from wordlists import get_wordlists from sql_errors import generate_sql_error_response, get_sql_response_with_data @@ -24,9 +30,11 @@ from server_errors import generate_server_error class 
Handler(BaseHTTPRequestHandler): """HTTP request handler for the deception server""" + webpages: Optional[List[str]] = None config: Config = None tracker: AccessTracker = None + analyzer: Analyzer = None counter: int = 0 app_logger: logging.Logger = None access_logger: logging.Logger = None @@ -35,28 +43,40 @@ class Handler(BaseHTTPRequestHandler): def _get_client_ip(self) -> str: """Extract client IP address from request, checking proxy headers first""" # Headers might not be available during early error logging - if hasattr(self, 'headers') and self.headers: + if hasattr(self, "headers") and self.headers: # Check X-Forwarded-For header (set by load balancers/proxies) - forwarded_for = self.headers.get('X-Forwarded-For') + forwarded_for = self.headers.get("X-Forwarded-For") if forwarded_for: # X-Forwarded-For can contain multiple IPs, get the first (original client) - return forwarded_for.split(',')[0].strip() - + return forwarded_for.split(",")[0].strip() + # Check X-Real-IP header (set by nginx and other proxies) - real_ip = self.headers.get('X-Real-IP') + real_ip = self.headers.get("X-Real-IP") if real_ip: return real_ip.strip() - + # Fallback to direct connection IP return self.client_address[0] def _get_user_agent(self) -> str: """Extract user agent from request""" - return self.headers.get('User-Agent', '') + return self.headers.get("User-Agent", "") + + def _get_category_by_ip(self, client_ip: str) -> str: + """Get the category of an IP from the database""" + return self.tracker.get_category_by_ip(client_ip) + + def _get_page_visit_count(self, client_ip: str) -> int: + """Get current page visit count for an IP""" + return self.tracker.get_page_visit_count(client_ip) + + def _increment_page_visit(self, client_ip: str) -> int: + """Increment page visit counter for an IP and return new count""" + return self.tracker.increment_page_visit(client_ip) def version_string(self) -> str: """Return custom server version for deception.""" - return 
self.config.server_header + return random_server_header() def _should_return_error(self) -> bool: """Check if we should return an error based on probability""" @@ -71,53 +91,61 @@ class Handler(BaseHTTPRequestHandler): if not error_codes: error_codes = [400, 401, 403, 404, 500, 502, 503] return random.choice(error_codes) - + def _parse_query_string(self) -> str: """Extract query string from the request path""" parsed = urlparse(self.path) return parsed.query - + def _handle_sql_endpoint(self, path: str) -> bool: """ Handle SQL injection honeypot endpoints. Returns True if the path was handled, False otherwise. """ # SQL-vulnerable endpoints - sql_endpoints = ['/api/search', '/api/sql', '/api/database'] - + sql_endpoints = ["/api/search", "/api/sql", "/api/database"] + base_path = urlparse(path).path if base_path not in sql_endpoints: return False - + try: # Get query parameters query_string = self._parse_query_string() - + # Log SQL injection attempt client_ip = self._get_client_ip() user_agent = self._get_user_agent() - + # Always check for SQL injection patterns - error_msg, content_type, status_code = generate_sql_error_response(query_string or "") - + error_msg, content_type, status_code = generate_sql_error_response( + query_string or "" + ) + if error_msg: # SQL injection detected - log and return error - self.access_logger.warning(f"[SQL INJECTION DETECTED] {client_ip} - {base_path} - Query: {query_string[:100] if query_string else 'empty'}") + self.access_logger.warning( + f"[SQL INJECTION DETECTED] {client_ip} - {base_path} - Query: {query_string[:100] if query_string else 'empty'}" + ) self.send_response(status_code) - self.send_header('Content-type', content_type) + self.send_header("Content-type", content_type) self.end_headers() self.wfile.write(error_msg.encode()) else: # No injection detected - return fake data - self.access_logger.info(f"[SQL ENDPOINT] {client_ip} - {base_path} - Query: {query_string[:100] if query_string else 'empty'}") + 
self.access_logger.info( + f"[SQL ENDPOINT] {client_ip} - {base_path} - Query: {query_string[:100] if query_string else 'empty'}" + ) self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() - response_data = get_sql_response_with_data(base_path, query_string or "") + response_data = get_sql_response_with_data( + base_path, query_string or "" + ) self.wfile.write(response_data.encode()) - + return True - + except BrokenPipeError: # Client disconnected return True @@ -126,120 +154,66 @@ class Handler(BaseHTTPRequestHandler): # Still send a response even on error try: self.send_response(500) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() self.wfile.write(b'{"error": "Internal server error"}') except: pass return True - def generate_page(self, seed: str) -> str: + def generate_page(self, seed: str, page_visit_count: int) -> str: """Generate a webpage containing random links or canary token""" + random.seed(seed) num_pages = random.randint(*self.config.links_per_page_range) - html = f""" - - - - Krawl - - - -
-

Krawl me! 🕸

-
{Handler.counter}
- - -
- -""" - return html + # Return the complete page using the template + return html_templates.main_page(Handler.counter, content) def do_HEAD(self): """Sends header information""" @@ -273,27 +243,36 @@ class Handler(BaseHTTPRequestHandler): post_data = "" from urllib.parse import urlparse + base_path = urlparse(self.path).path - - if base_path in ['/api/search', '/api/sql', '/api/database']: - content_length = int(self.headers.get('Content-Length', 0)) + + if base_path in ["/api/search", "/api/sql", "/api/database"]: + content_length = int(self.headers.get("Content-Length", 0)) if content_length > 0: - post_data = self.rfile.read(content_length).decode('utf-8', errors="replace") - - self.access_logger.info(f"[SQL ENDPOINT POST] {client_ip} - {base_path} - Data: {post_data[:100] if post_data else 'empty'}") - - error_msg, content_type, status_code = generate_sql_error_response(post_data) - + post_data = self.rfile.read(content_length).decode( + "utf-8", errors="replace" + ) + + self.access_logger.info( + f"[SQL ENDPOINT POST] {client_ip} - {base_path} - Data: {post_data[:100] if post_data else 'empty'}" + ) + + error_msg, content_type, status_code = generate_sql_error_response( + post_data + ) + try: if error_msg: - self.access_logger.warning(f"[SQL INJECTION DETECTED POST] {client_ip} - {base_path}") + self.access_logger.warning( + f"[SQL INJECTION DETECTED POST] {client_ip} - {base_path}" + ) self.send_response(status_code) - self.send_header('Content-type', content_type) + self.send_header("Content-type", content_type) self.end_headers() self.wfile.write(error_msg.encode()) else: self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() response_data = get_sql_response_with_data(base_path, post_data) self.wfile.write(response_data.encode()) @@ -302,29 +281,36 @@ class Handler(BaseHTTPRequestHandler): except Exception as e: self.app_logger.error(f"Error in SQL POST 
handler: {str(e)}") return - - if base_path == '/api/contact': - content_length = int(self.headers.get('Content-Length', 0)) + + if base_path == "/api/contact": + content_length = int(self.headers.get("Content-Length", 0)) if content_length > 0: - post_data = self.rfile.read(content_length).decode('utf-8', errors="replace") - + post_data = self.rfile.read(content_length).decode( + "utf-8", errors="replace" + ) + parsed_data = {} - for pair in post_data.split('&'): - if '=' in pair: - key, value = pair.split('=', 1) + for pair in post_data.split("&"): + if "=" in pair: + key, value = pair.split("=", 1) from urllib.parse import unquote_plus + parsed_data[unquote_plus(key)] = unquote_plus(value) - + xss_detected = any(detect_xss_pattern(v) for v in parsed_data.values()) - + if xss_detected: - self.access_logger.warning(f"[XSS ATTEMPT DETECTED] {client_ip} - {base_path} - Data: {post_data[:200]}") + self.access_logger.warning( + f"[XSS ATTEMPT DETECTED] {client_ip} - {base_path} - Data: {post_data[:200]}" + ) else: - self.access_logger.info(f"[XSS ENDPOINT POST] {client_ip} - {base_path}") - + self.access_logger.info( + f"[XSS ENDPOINT POST] {client_ip} - {base_path}" + ) + try: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() response_html = generate_xss_response(parsed_data) self.wfile.write(response_html.encode()) @@ -334,31 +320,45 @@ class Handler(BaseHTTPRequestHandler): self.app_logger.error(f"Error in XSS POST handler: {str(e)}") return - self.access_logger.warning(f"[LOGIN ATTEMPT] {client_ip} - {self.path} - {user_agent[:50]}") + self.access_logger.warning( + f"[LOGIN ATTEMPT] {client_ip} - {self.path} - {user_agent[:50]}" + ) - content_length = int(self.headers.get('Content-Length', 0)) + content_length = int(self.headers.get("Content-Length", 0)) if content_length > 0: - post_data = self.rfile.read(content_length).decode('utf-8', errors="replace") + post_data = 
self.rfile.read(content_length).decode( + "utf-8", errors="replace" + ) self.access_logger.warning(f"[POST DATA] {post_data[:200]}") + # Parse and log credentials username, password = self.tracker.parse_credentials(post_data) if username or password: + # Log to dedicated credentials.log file timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") credential_line = f"{timestamp}|{client_ip}|{username or 'N/A'}|{password or 'N/A'}|{self.path}" self.credential_logger.info(credential_line) - - self.tracker.record_credential_attempt(client_ip, self.path, username or 'N/A', password or 'N/A') - - self.access_logger.warning(f"[CREDENTIALS CAPTURED] {client_ip} - Username: {username or 'N/A'} - Path: {self.path}") - self.tracker.record_access(client_ip, self.path, user_agent, post_data) - + # Also record in tracker for dashboard + self.tracker.record_credential_attempt( + client_ip, self.path, username or "N/A", password or "N/A" + ) + + self.access_logger.warning( + f"[CREDENTIALS CAPTURED] {client_ip} - Username: {username or 'N/A'} - Path: {self.path}" + ) + + # send the post data (body) to the record_access function so the post data can be used to detect suspicious things. 
+ self.tracker.record_access( + client_ip, self.path, user_agent, post_data, method="POST" + ) + time.sleep(1) - + try: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.login_error().encode()) except BrokenPipeError: @@ -370,101 +370,108 @@ class Handler(BaseHTTPRequestHandler): def serve_special_path(self, path: str) -> bool: """Serve special paths like robots.txt, API endpoints, etc.""" - + # Check SQL injection honeypot endpoints first if self._handle_sql_endpoint(path): return True - + try: - if path == '/robots.txt': + if path == "/robots.txt": self.send_response(200) - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(html_templates.robots_txt().encode()) return True - - if path in ['/credentials.txt', '/passwords.txt', '/admin_notes.txt']: + + if path in ["/credentials.txt", "/passwords.txt", "/admin_notes.txt"]: self.send_response(200) - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() - if 'credentials' in path: + if "credentials" in path: self.wfile.write(credentials_txt().encode()) else: self.wfile.write(passwords_txt().encode()) return True - - if path in ['/users.json', '/api_keys.json', '/config.json']: + + if path in ["/users.json", "/api_keys.json", "/config.json"]: self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() - if 'users' in path: + if "users" in path: self.wfile.write(users_json().encode()) - elif 'api_keys' in path: + elif "api_keys" in path: self.wfile.write(api_keys_json().encode()) else: - self.wfile.write(api_response('/api/config').encode()) + self.wfile.write(api_response("/api/config").encode()) return True - - if path in ['/admin', '/admin/', '/admin/login', 
'/login']: + + if path in ["/admin", "/admin/", "/admin/login", "/login"]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.login_form().encode()) return True - - if path in ['/users', '/user', '/database', '/db', '/search']: + + if path in ["/users", "/user", "/database", "/db", "/search"]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.product_search().encode()) return True - - if path in ['/info', '/input', '/contact', '/feedback', '/comment']: + + if path in ["/info", "/input", "/contact", "/feedback", "/comment"]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.input_form().encode()) return True - - if path == '/server': + + if path == "/server": error_html, content_type = generate_server_error() self.send_response(500) - self.send_header('Content-type', content_type) + self.send_header("Content-type", content_type) self.end_headers() self.wfile.write(error_html.encode()) return True - - if path in ['/wp-login.php', '/wp-login', '/wp-admin', '/wp-admin/']: + + if path in ["/wp-login.php", "/wp-login", "/wp-admin", "/wp-admin/"]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.wp_login().encode()) return True - - if path in ['/wp-content/', '/wp-includes/'] or 'wordpress' in path.lower(): + + if path in ["/wp-content/", "/wp-includes/"] or "wordpress" in path.lower(): self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.wordpress().encode()) return True - - if 
'phpmyadmin' in path.lower() or path in ['/pma/', '/phpMyAdmin/']: + + if "phpmyadmin" in path.lower() or path in ["/pma/", "/phpMyAdmin/"]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(html_templates.phpmyadmin().encode()) return True - - if path.startswith('/api/') or path.startswith('/api') or path in ['/.env']: + + if path.startswith("/api/") or path.startswith("/api") or path in ["/.env"]: self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() self.wfile.write(api_response(path).encode()) return True - - if path in ['/backup/', '/uploads/', '/private/', '/admin/', '/config/', '/database/']: + + if path in [ + "/backup/", + "/uploads/", + "/private/", + "/admin/", + "/config/", + "/database/", + ]: self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() self.wfile.write(directory_listing(path).encode()) return True @@ -480,29 +487,536 @@ class Handler(BaseHTTPRequestHandler): def do_GET(self): """Responds to webpage requests""" client_ip = self._get_client_ip() + if self.tracker.is_banned_ip(client_ip): + self.send_response(500) + self.end_headers() + return user_agent = self._get_user_agent() - - if self.config.dashboard_secret_path and self.path == self.config.dashboard_secret_path: + + # Handle static files for dashboard + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/static/" + ): + import os + + file_path = self.path.replace( + f"{self.config.dashboard_secret_path}/static/", "" + ) + static_dir = os.path.join(os.path.dirname(__file__), "templates", "static") + full_path = os.path.join(static_dir, file_path) + + # Security check: ensure the path is within static directory + if os.path.commonpath( + [full_path, 
static_dir] + ) == static_dir and os.path.exists(full_path): + try: + with open(full_path, "rb") as f: + content = f.read() + self.send_response(200) + if file_path.endswith(".svg"): + self.send_header("Content-type", "image/svg+xml") + elif file_path.endswith(".css"): + self.send_header("Content-type", "text/css") + elif file_path.endswith(".js"): + self.send_header("Content-type", "application/javascript") + else: + self.send_header("Content-type", "application/octet-stream") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + return + except Exception as e: + self.app_logger.error(f"Error serving static file: {e}") + + self.send_response(404) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write(b"Not found") + return + + if ( + self.config.dashboard_secret_path + and self.path == self.config.dashboard_secret_path + ): self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() try: stats = self.tracker.get_stats() - self.wfile.write(generate_dashboard(stats).encode()) + dashboard_path = self.config.dashboard_secret_path + self.wfile.write(generate_dashboard(stats, dashboard_path).encode()) except BrokenPipeError: pass except Exception as e: self.app_logger.error(f"Error generating dashboard: {e}") return - self.tracker.record_access(client_ip, self.path, user_agent) + # API endpoint for fetching all IP statistics + if ( + self.config.dashboard_secret_path + and self.path == f"{self.config.dashboard_secret_path}/api/all-ip-stats" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database 
+ import json + + db = get_database() + ip_stats_list = db.get_ip_stats(limit=500) + self.wfile.write(json.dumps({"ips": ip_stats_list}).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching all IP stats: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for fetching paginated attackers + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/attackers" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + + # Parse query parameters + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["25"])[0]) + sort_by = query_params.get("sort_by", ["total_requests"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + # Ensure valid parameters + page = max(1, page) + page_size = min(max(1, page_size), 100) # Max 100 per page + + result = db.get_attackers_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching attackers: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for fetching all IPs (all categories) + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/all-ips" + ): + self.send_response(200) + 
self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + + # Parse query parameters + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["25"])[0]) + sort_by = query_params.get("sort_by", ["total_requests"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + # Ensure valid parameters + page = max(1, page) + page_size = min(max(1, page_size), 100) # Max 100 per page + + result = db.get_all_ips_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching all IPs: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for fetching IP stats + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/ip-stats/" + ): + ip_address = self.path.replace( + f"{self.config.dashboard_secret_path}/api/ip-stats/", "" + ) + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + # Prevent browser caching - force fresh data from database every time + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + + db = get_database() + ip_stats = 
db.get_ip_stats_by_ip(ip_address) + if ip_stats: + self.wfile.write(json.dumps(ip_stats).encode()) + else: + self.wfile.write(json.dumps({"error": "IP not found"}).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching IP stats: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated honeypot triggers + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/honeypot" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + sort_by = query_params.get("sort_by", ["count"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_honeypot_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching honeypot data: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated credentials + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/credentials" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + 
self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + sort_by = query_params.get("sort_by", ["timestamp"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_credentials_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching credentials: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated top IPs + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/top-ips" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + sort_by = query_params.get("sort_by", ["count"])[0] + sort_order = 
query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_top_ips_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching top IPs: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated top paths + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/top-paths" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + sort_by = query_params.get("sort_by", ["count"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_top_paths_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching top paths: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated top user agents + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/top-user-agents" + ): + 
self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + sort_by = query_params.get("sort_by", ["count"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_top_user_agents_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching top user agents: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for paginated attack types + if self.config.dashboard_secret_path and self.path.startswith( + f"{self.config.dashboard_secret_path}/api/attack-types" + ): + self.send_response(200) + self.send_header("Content-type", "application/json") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header( + "Cache-Control", "no-store, no-cache, must-revalidate, max-age=0" + ) + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + self.end_headers() + try: + from database import get_database + import json + from urllib.parse import urlparse, parse_qs + + db = get_database() + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + page = int(query_params.get("page", ["1"])[0]) + page_size = int(query_params.get("page_size", ["5"])[0]) + 
sort_by = query_params.get("sort_by", ["timestamp"])[0] + sort_order = query_params.get("sort_order", ["desc"])[0] + + page = max(1, page) + page_size = min(max(1, page_size), 100) + + result = db.get_attack_types_paginated( + page=page, + page_size=page_size, + sort_by=sort_by, + sort_order=sort_order, + ) + self.wfile.write(json.dumps(result).encode()) + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error fetching attack types: {e}") + self.wfile.write(json.dumps({"error": str(e)}).encode()) + return + + # API endpoint for downloading malicious IPs file + if ( + self.config.dashboard_secret_path + and self.path + == f"{self.config.dashboard_secret_path}/api/download/malicious_ips.txt" + ): + import os + + file_path = os.path.join( + os.path.dirname(__file__), "exports", "malicious_ips.txt" + ) + try: + if os.path.exists(file_path): + with open(file_path, "rb") as f: + content = f.read() + self.send_response(200) + self.send_header("Content-type", "text/plain") + self.send_header( + "Content-Disposition", + 'attachment; filename="malicious_ips.txt"', + ) + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + else: + self.send_response(404) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write(b"File not found") + except BrokenPipeError: + pass + except Exception as e: + self.app_logger.error(f"Error serving malicious IPs file: {e}") + self.send_response(500) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write(b"Internal server error") + return + + self.tracker.record_access(client_ip, self.path, user_agent, method="GET") + + # self.analyzer.infer_user_category(client_ip) + # self.analyzer.update_ip_rep_infos(client_ip) if self.tracker.is_suspicious_user_agent(user_agent): - self.access_logger.warning(f"[SUSPICIOUS] {client_ip} - {user_agent[:50]} - {self.path}") + self.access_logger.warning( + 
f"[SUSPICIOUS] {client_ip} - {user_agent[:50]} - {self.path}" + ) if self._should_return_error(): error_code = self._get_random_error_code() - self.access_logger.info(f"Returning error {error_code} to {client_ip} - {self.path}") + self.access_logger.info( + f"Returning error {error_code} to {client_ip} - {self.path}" + ) self.send_response(error_code) self.end_headers() return @@ -512,14 +1026,18 @@ class Handler(BaseHTTPRequestHandler): time.sleep(self.config.delay / 1000.0) self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() try: - self.wfile.write(self.generate_page(self.path).encode()) - + # Increment page visit counter for this IP and get the current count + current_visit_count = self._increment_page_visit(client_ip) + self.wfile.write( + self.generate_page(self.path, current_visit_count).encode() + ) + Handler.counter -= 1 - + if Handler.counter < 0: Handler.counter = self.config.canary_token_tries except BrokenPipeError: diff --git a/src/ip_utils.py b/src/ip_utils.py new file mode 100644 index 0000000..1eab6b8 --- /dev/null +++ b/src/ip_utils.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +""" +IP utility functions for filtering and validating IP addresses. +Provides common IP filtering logic used across the Krawl honeypot. +""" + +import ipaddress +from typing import Optional + + +def is_local_or_private_ip(ip_str: str) -> bool: + """ + Check if an IP address is local, private, or reserved. 
+ + Filters out: + - 127.0.0.1 (localhost) + - 127.0.0.0/8 (loopback) + - 10.0.0.0/8 (private network) + - 172.16.0.0/12 (private network) + - 192.168.0.0/16 (private network) + - 0.0.0.0/8 (this network) + - ::1 (IPv6 localhost) + - ::ffff:127.0.0.0/104 (IPv6-mapped IPv4 loopback) + + Args: + ip_str: IP address string + + Returns: + True if IP is local/private/reserved, False if it's public + """ + try: + ip = ipaddress.ip_address(ip_str) + return ( + ip.is_private + or ip.is_loopback + or ip.is_reserved + or ip.is_link_local + or str(ip) in ("0.0.0.0", "::1") + ) + except ValueError: + # Invalid IP address + return True + + +def is_valid_public_ip(ip: str, server_ip: Optional[str] = None) -> bool: + """ + Check if an IP is public and not the server's own IP. + + Returns True only if: + - IP is not in local/private ranges AND + - IP is not the server's own public IP (if server_ip provided) + + Args: + ip: IP address string to check + server_ip: Server's public IP (optional). If provided, filters out this IP too. + + Returns: + True if IP is a valid public IP to track, False otherwise + """ + return not is_local_or_private_ip(ip) and (server_ip is None or ip != server_ip) diff --git a/src/logger.py b/src/logger.py index 9f09236..9762002 100644 --- a/src/logger.py +++ b/src/logger.py @@ -8,10 +8,26 @@ Provides two loggers: app (application) and access (HTTP access logs). 
import logging
import os
from logging.handlers import RotatingFileHandler
from datetime import datetime


class TimezoneFormatter(logging.Formatter):
    """Log formatter that renders record timestamps in a chosen timezone.

    Args:
        fmt: Log message format string, passed to logging.Formatter.
        datefmt: strftime pattern for %(asctime)s, passed to logging.Formatter.
        tz: Optional datetime.tzinfo used to render timestamps. When None
            (the default) timestamps are rendered in the process's local
            time, matching the formatter's previous behaviour.
    """

    def __init__(self, fmt=None, datefmt=None, tz=None):
        super().__init__(fmt, datefmt)
        self.tz = tz

    def formatTime(self, record, datefmt=None):
        """Return the record's creation time as text.

        Args:
            record: The LogRecord being formatted; only record.created
                (a POSIX timestamp) is read.
            datefmt: Optional strftime pattern. When empty or None the
                timestamp is rendered with datetime.isoformat().

        Returns:
            The formatted timestamp string.
        """
        # fromtimestamp(..., tz=None) yields naive local time; with a tzinfo
        # it yields an aware datetime converted to that zone.
        moment = datetime.fromtimestamp(record.created, tz=self.tz)
        if datefmt:
            return moment.strftime(datefmt)
        return moment.isoformat()
logging.Formatter("%(message)s") - + credential_format = TimezoneFormatter("%(message)s") + credential_file_handler = RotatingFileHandler( os.path.join(log_dir, "credentials.log"), maxBytes=max_bytes, - backupCount=backup_count + backupCount=backup_count, ) credential_file_handler.setFormatter(credential_format) self._credential_logger.addHandler(credential_file_handler) diff --git a/src/migrations/add_category_history.py b/src/migrations/add_category_history.py new file mode 100644 index 0000000..622b61c --- /dev/null +++ b/src/migrations/add_category_history.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +""" +Migration script to add CategoryHistory table to existing databases. +Run this once to upgrade your database schema. +""" + +import sys +from pathlib import Path + +# Add parent directory to path to import modules +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from database import get_database, DatabaseManager +from models import Base, CategoryHistory + + +def migrate(): + """Create CategoryHistory table if it doesn't exist.""" + print("Starting migration: Adding CategoryHistory table...") + + try: + db = get_database() + + # Initialize database if not already done + if not db._initialized: + db.initialize() + + # Create only the CategoryHistory table + CategoryHistory.__table__.create(db._engine, checkfirst=True) + + print("✓ Migration completed successfully!") + print(" - CategoryHistory table created") + + except Exception as e: + print(f"✗ Migration failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + migrate() diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..2dbeb30 --- /dev/null +++ b/src/models.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 + +""" +SQLAlchemy ORM models for the Krawl honeypot database. +Stores access logs, credential attempts, attack detections, and IP statistics. 
+""" + +from datetime import datetime +from typing import Optional, List, Dict + +from sqlalchemy import ( + String, + Integer, + Boolean, + DateTime, + Float, + ForeignKey, + Index, + JSON, +) +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship + +from sanitizer import ( + MAX_IP_LENGTH, + MAX_PATH_LENGTH, + MAX_USER_AGENT_LENGTH, + MAX_CREDENTIAL_LENGTH, + MAX_ATTACK_PATTERN_LENGTH, + MAX_CITY_LENGTH, + MAX_ASN_ORG_LENGTH, + MAX_REPUTATION_SOURCE_LENGTH, +) + + +class Base(DeclarativeBase): + """Base class for all ORM models.""" + + pass + + +class AccessLog(Base): + """ + Records all HTTP requests to the honeypot. + + Stores request metadata, suspicious activity flags, and timestamps + for analysis and dashboard display. + """ + + __tablename__ = "access_logs" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + # ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True, ForeignKey('ip_logs.id', ondelete='CASCADE')) + ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True) + path: Mapped[str] = mapped_column(String(MAX_PATH_LENGTH), nullable=False) + user_agent: Mapped[Optional[str]] = mapped_column( + String(MAX_USER_AGENT_LENGTH), nullable=True + ) + method: Mapped[str] = mapped_column(String(10), nullable=False, default="GET") + is_suspicious: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + is_honeypot_trigger: Mapped[bool] = mapped_column( + Boolean, nullable=False, default=False + ) + timestamp: Mapped[datetime] = mapped_column( + DateTime, nullable=False, default=datetime.utcnow, index=True + ) + + # Relationship to attack detections + attack_detections: Mapped[List["AttackDetection"]] = relationship( + "AttackDetection", back_populates="access_log", cascade="all, delete-orphan" + ) + + # Indexes for common queries + __table_args__ = ( + Index("ix_access_logs_ip_timestamp", "ip", "timestamp"), + 
Index("ix_access_logs_is_suspicious", "is_suspicious"), + Index("ix_access_logs_is_honeypot_trigger", "is_honeypot_trigger"), + ) + + def __repr__(self) -> str: + return f"" + + +class CredentialAttempt(Base): + """ + Records captured login attempts from honeypot login forms. + + Stores the submitted username and password along with request metadata. + """ + + __tablename__ = "credential_attempts" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True) + path: Mapped[str] = mapped_column(String(MAX_PATH_LENGTH), nullable=False) + username: Mapped[Optional[str]] = mapped_column( + String(MAX_CREDENTIAL_LENGTH), nullable=True + ) + password: Mapped[Optional[str]] = mapped_column( + String(MAX_CREDENTIAL_LENGTH), nullable=True + ) + timestamp: Mapped[datetime] = mapped_column( + DateTime, nullable=False, default=datetime.utcnow, index=True + ) + + # Composite index for common queries + __table_args__ = (Index("ix_credential_attempts_ip_timestamp", "ip", "timestamp"),) + + def __repr__(self) -> str: + return f"" + + +class AttackDetection(Base): + """ + Records detected attack patterns in requests. + + Linked to the parent AccessLog record. Multiple attack types can be + detected in a single request. 
+ """ + + __tablename__ = "attack_detections" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + access_log_id: Mapped[int] = mapped_column( + Integer, + ForeignKey("access_logs.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + attack_type: Mapped[str] = mapped_column(String(50), nullable=False) + matched_pattern: Mapped[Optional[str]] = mapped_column( + String(MAX_ATTACK_PATTERN_LENGTH), nullable=True + ) + + # Relationship back to access log + access_log: Mapped["AccessLog"] = relationship( + "AccessLog", back_populates="attack_detections" + ) + + def __repr__(self) -> str: + return f"" + + +class IpStats(Base): + """ + Aggregated statistics per IP address. + + Includes fields for future GeoIP and reputation enrichment. + Updated on each request from an IP. + """ + + __tablename__ = "ip_stats" + + ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), primary_key=True) + total_requests: Mapped[int] = mapped_column(Integer, nullable=False, default=0) + first_seen: Mapped[datetime] = mapped_column( + DateTime, nullable=False, default=datetime.utcnow + ) + last_seen: Mapped[datetime] = mapped_column( + DateTime, nullable=False, default=datetime.utcnow + ) + + # GeoIP fields (populated by future enrichment) + country_code: Mapped[Optional[str]] = mapped_column(String(2), nullable=True) + city: Mapped[Optional[str]] = mapped_column(String(MAX_CITY_LENGTH), nullable=True) + latitude: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + longitude: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + asn: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + asn_org: Mapped[Optional[str]] = mapped_column( + String(MAX_ASN_ORG_LENGTH), nullable=True + ) + list_on: Mapped[Optional[Dict[str, str]]] = mapped_column(JSON, nullable=True) + + # Reputation fields (populated by future enrichment) + reputation_score: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + reputation_source: 
Mapped[Optional[str]] = mapped_column( + String(MAX_REPUTATION_SOURCE_LENGTH), nullable=True + ) + reputation_updated: Mapped[Optional[datetime]] = mapped_column( + DateTime, nullable=True + ) + + # Analyzed metrics, category and category scores + analyzed_metrics: Mapped[Dict[str, object]] = mapped_column(JSON, nullable=True) + category: Mapped[str] = mapped_column(String, nullable=True) + category_scores: Mapped[Dict[str, int]] = mapped_column(JSON, nullable=True) + manual_category: Mapped[bool] = mapped_column(Boolean, default=False, nullable=True) + last_analysis: Mapped[datetime] = mapped_column(DateTime, nullable=True) + + def __repr__(self) -> str: + return f"" + + +class CategoryHistory(Base): + """ + Records category changes for IP addresses over time. + + Tracks when an IP's category changes, storing both the previous + and new category along with timestamp for timeline visualization. + """ + + __tablename__ = "category_history" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + ip: Mapped[str] = mapped_column(String(MAX_IP_LENGTH), nullable=False, index=True) + old_category: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + new_category: Mapped[str] = mapped_column(String(50), nullable=False) + timestamp: Mapped[datetime] = mapped_column( + DateTime, nullable=False, default=datetime.utcnow, index=True + ) + + # Composite index for efficient IP-based timeline queries + __table_args__ = (Index("ix_category_history_ip_timestamp", "ip", "timestamp"),) + + def __repr__(self) -> str: + return f" {self.new_category})>" + + +# class IpLog(Base): +# """ +# Records all IPs that have accessed the honeypot, along with aggregated stats and inferred user category. 
"""
Sanitization utilities for safe database storage and HTML output.
Protects against SQL injection payloads, XSS, and storage exhaustion attacks.
"""

import html
import re
from typing import Optional, Dict

# Field length limits for database storage
MAX_IP_LENGTH = 45  # IPv6 max length
MAX_PATH_LENGTH = 2048  # URL max practical length
MAX_USER_AGENT_LENGTH = 512
MAX_CREDENTIAL_LENGTH = 256
MAX_ATTACK_PATTERN_LENGTH = 256
MAX_CITY_LENGTH = 128
MAX_ASN_ORG_LENGTH = 256
MAX_REPUTATION_SOURCE_LENGTH = 64

# Control characters 0x00-0x1F and 0x7F, except tab (0x09), newline (0x0A)
# and carriage return (0x0D), which are kept. Compiled once at import.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")


def sanitize_for_storage(value: Optional[str], max_length: int) -> str:
    """
    Sanitize and truncate string for safe database storage.

    Removes null bytes and control characters that could cause issues
    with database storage or log processing. Tab, newline and carriage
    return are preserved.

    Args:
        value: The string to sanitize
        max_length: Maximum length to truncate to

    Returns:
        Sanitized and truncated string, empty string if input is None/empty
    """
    if not value:
        return ""

    # Non-string input is stringified first; stripping happens before
    # truncation so removed characters do not count against max_length.
    cleaned = _CONTROL_CHARS_RE.sub("", str(value))
    return cleaned[:max_length]


def sanitize_ip(value: Optional[str]) -> str:
    """Sanitize IP address for storage."""
    return sanitize_for_storage(value, MAX_IP_LENGTH)


def sanitize_path(value: Optional[str]) -> str:
    """Sanitize URL path for storage."""
    return sanitize_for_storage(value, MAX_PATH_LENGTH)


def sanitize_user_agent(value: Optional[str]) -> str:
    """Sanitize user agent string for storage."""
    return sanitize_for_storage(value, MAX_USER_AGENT_LENGTH)


def sanitize_credential(value: Optional[str]) -> str:
    """Sanitize username or password for storage."""
    return sanitize_for_storage(value, MAX_CREDENTIAL_LENGTH)


def sanitize_attack_pattern(value: Optional[str]) -> str:
    """Sanitize matched attack pattern for storage."""
    return sanitize_for_storage(value, MAX_ATTACK_PATTERN_LENGTH)


def escape_html(value: Optional[str]) -> str:
    """
    Escape HTML special characters for safe display in web pages.

    Prevents stored XSS attacks when displaying user-controlled data
    in the dashboard.

    Args:
        value: The string to escape

    Returns:
        HTML-escaped string, empty string if input is None/empty
    """
    if not value:
        return ""
    return html.escape(str(value))


def escape_html_truncated(value: Optional[str], max_display_length: int) -> str:
    """
    Escape HTML and truncate for display.

    Args:
        value: The string to escape and truncate
        max_display_length: Maximum display length (truncation happens before escaping)

    Returns:
        HTML-escaped and truncated string; "..." is appended when the
        input was longer than max_display_length
    """
    if not value:
        return ""

    text = str(value)
    if len(text) > max_display_length:
        # Truncate first so an escape sequence such as "&lt;" is never cut
        # in half.
        text = text[:max_display_length] + "..."

    return html.escape(text)


def sanitize_dict(
    value: Optional[Dict[str, str]], max_display_length: int
) -> Dict[str, str]:
    """
    Sanitize every value of a string dictionary for storage.

    Args:
        value: Mapping whose values should be sanitized; may be None
        max_display_length: Maximum length applied to each value

    Returns:
        New dict with the same keys and sanitized values; an empty dict
        when the input is None or empty
    """
    if not value:
        # The parameter is Optional: None must not raise on .items()
        return {}
    return {
        key: sanitize_for_storage(item, max_display_length)
        for key, item in value.items()
    }
showing token (default: 10)') - print(' DASHBOARD_SECRET_PATH - Secret path for dashboard (auto-generated if not set)') - print(' PROBABILITY_ERROR_CODES - Probability (0-100) to return HTTP error codes (default: 0)') - print(' CHAR_SPACE - Characters for random links') - print(' SERVER_HEADER - HTTP Server header for deception (default: Apache/2.2.22 (Ubuntu))') + print(f"Usage: {sys.argv[0]} [FILE]\n") + print("FILE is file containing a list of webpage names to serve, one per line.") + print("If no file is provided, random links will be generated.\n") + print("Configuration:") + print(" Configuration is loaded from a YAML file (default: config.yaml)") + print("Set CONFIG_LOCATION environment variable to use a different file.\n") + print("Example config.yaml structure:") + print("server:") + print("port: 5000") + print("delay: 100") + print("links:") + print("min_length: 5") + print("max_length: 15") + print("min_per_page: 10") + print("max_per_page: 15") + print("canary:") + print("token_url: null") + print("token_tries: 10") + print("dashboard:") + print("secret_path: null # auto-generated if not set") + print("database:") + print('path: "data/krawl.db"') + print("retention_days: 30") + print("behavior:") + print("probability_error_codes: 0") def main(): """Main entry point for the deception server""" - if '-h' in sys.argv or '--help' in sys.argv: + if "-h" in sys.argv or "--help" in sys.argv: print_usage() exit(0) - # Initialize logging + config = get_config() + + # Initialize logging with timezone initialize_logging() app_logger = get_app_logger() access_logger = get_access_logger() credential_logger = get_credential_logger() - config = Config.from_env() + # Initialize database for persistent storage + try: + initialize_database(config.database_path) + app_logger.info(f"Database initialized at: {config.database_path}") + except Exception as e: + app_logger.warning( + f"Database initialization failed: {e}. Continuing with in-memory only." 
+ ) - tracker = AccessTracker() + tracker = AccessTracker(config.max_pages_limit, config.ban_duration_seconds) + analyzer = Analyzer() Handler.config = config Handler.tracker = tracker + Handler.analyzer = analyzer Handler.counter = config.canary_token_tries Handler.app_logger = app_logger Handler.access_logger = access_logger @@ -60,35 +87,55 @@ def main(): if len(sys.argv) == 2: try: - with open(sys.argv[1], 'r') as f: + with open(sys.argv[1], "r") as f: Handler.webpages = f.readlines() if not Handler.webpages: - app_logger.warning('The file provided was empty. Using randomly generated links.') + app_logger.warning( + "The file provided was empty. Using randomly generated links." + ) Handler.webpages = None except IOError: app_logger.warning("Can't read input file. Using randomly generated links.") - try: - app_logger.info(f'Starting deception server on port {config.port}...') - app_logger.info(f'Dashboard available at: {config.dashboard_secret_path}') - if config.canary_token_url: - app_logger.info(f'Canary token will appear after {config.canary_token_tries} tries') - else: - app_logger.info('No canary token configured (set CANARY_TOKEN_URL to enable)') + # tasks master init + tasks_master = get_tasksmaster() + tasks_master.run_scheduled_tasks() - server = HTTPServer(('0.0.0.0', config.port), Handler) - app_logger.info('Server started. Use to stop.') + try: + + banner = f""" + +============================================================ +DASHBOARD AVAILABLE AT +{config.dashboard_secret_path} +============================================================ + """ + app_logger.info(banner) + app_logger.info(f"Starting deception server on port {config.port}...") + if config.canary_token_url: + app_logger.info( + f"Canary token will appear after {config.canary_token_tries} tries" + ) + else: + app_logger.info( + "No canary token configured (set CANARY_TOKEN_URL to enable)" + ) + + server = HTTPServer(("0.0.0.0", config.port), Handler) + app_logger.info("Server started. 
Use to stop.") server.serve_forever() except KeyboardInterrupt: - app_logger.info('Stopping server...') + app_logger.info("Stopping server...") server.socket.close() - app_logger.info('Server stopped') + app_logger.info("Server stopped") except Exception as e: - app_logger.error(f'Error starting HTTP server on port {config.port}: {e}') - app_logger.error(f'Make sure you are root, if needed, and that port {config.port} is open.') + app_logger.error(f"Error starting HTTP server on port {config.port}: {e}") + app_logger.error( + f"Make sure you are root, if needed, and that port {config.port} is open." + ) exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/server_errors.py b/src/server_errors.py index 7591c64..7b55654 100644 --- a/src/server_errors.py +++ b/src/server_errors.py @@ -7,13 +7,13 @@ from wordlists import get_wordlists def generate_server_error() -> tuple[str, str]: wl = get_wordlists() server_errors = wl.server_errors - + if not server_errors: return ("500 Internal Server Error", "text/html") - + server_type = random.choice(list(server_errors.keys())) server_config = server_errors[server_type] - + error_codes = { 400: "Bad Request", 401: "Unauthorized", @@ -21,45 +21,45 @@ def generate_server_error() -> tuple[str, str]: 404: "Not Found", 500: "Internal Server Error", 502: "Bad Gateway", - 503: "Service Unavailable" + 503: "Service Unavailable", } - + code = random.choice(list(error_codes.keys())) message = error_codes[code] - - template = server_config.get('template', '') - version = random.choice(server_config.get('versions', ['1.0'])) - - html = template.replace('{code}', str(code)) - html = html.replace('{message}', message) - html = html.replace('{version}', version) - - if server_type == 'apache': - os = random.choice(server_config.get('os', ['Ubuntu'])) - html = html.replace('{os}', os) - html = html.replace('{host}', 'localhost') - + + template = server_config.get("template", "") + version = 
# Injection categories in priority order; the first category with a matching
# pattern wins. Patterns are matched against the lower-cased query string and
# are compiled once at import time instead of on every request.
_SQLI_PATTERNS = tuple(
    (category, tuple(re.compile(expr) for expr in expressions))
    for category, expressions in (
        ("quote", [r"'", r'"', r"`"]),
        ("comment", [r"--", r"#", r"/\*", r"\*/"]),
        ("union", [r"\bunion\b", r"\bunion\s+select\b"]),
        ("boolean", [r"\bor\b.*=.*", r"\band\b.*=.*", r"'.*or.*'.*=.*'"]),
        ("time_based", [r"\bsleep\b", r"\bwaitfor\b", r"\bdelay\b", r"\bbenchmark\b"]),
        ("stacked", [r";.*select", r";.*drop", r";.*insert", r";.*update", r";.*delete"]),
        ("command", [r"\bexec\b", r"\bexecute\b", r"\bxp_cmdshell\b"]),
        ("info_schema", [r"information_schema", r"table_schema", r"table_name"]),
    )
)


def detect_sql_injection_pattern(query_string: str) -> Optional[str]:
    """
    Classify a query string by the first SQL-injection pattern it matches.

    Matching is case-insensitive (the input is lower-cased first) and
    categories are tried in a fixed order: quote, comment, union, boolean,
    time_based, stacked, command, info_schema. Because quote characters are
    checked first, any input containing ', " or ` is reported as "quote"
    even if it also matches a later category.

    Args:
        query_string: Raw query string or parameter text to inspect

    Returns:
        The name of the first matching category, or None when the input is
        empty/None or nothing matches
    """
    if not query_string:
        return None

    lowered = query_string.lower()

    for injection_type, compiled in _SQLI_PATTERNS:
        if any(regex.search(lowered) for regex in compiled):
            return injection_type

    return None
error_message: + tables = ["users", "products", "orders", "customers", "accounts", "sessions"] + error_message = error_message.replace("{table}", random.choice(tables)) + + if "{column}" in error_message: + columns = ["id", "name", "email", "password", "username", "created_at"] + error_message = error_message.replace("{column}", random.choice(columns)) + return (error_message, "text/plain") -def generate_sql_error_response(query_string: str, db_type: str = None) -> Tuple[str, str, int]: +def generate_sql_error_response( + query_string: str, db_type: str = None +) -> Tuple[str, str, int]: injection_type = detect_sql_injection_pattern(query_string) - + if not injection_type: return (None, None, None) - + error_message, content_type = get_random_sql_error(db_type, injection_type) - + status_code = 500 - + if random.random() < 0.3: status_code = 200 - + return (error_message, content_type, status_code) def get_sql_response_with_data(path: str, params: str) -> str: import json from generators import random_username, random_email, random_password - + injection_type = detect_sql_injection_pattern(params) - - if injection_type in ['union', 'boolean', 'stacked']: + + if injection_type in ["union", "boolean", "stacked"]: data = { "success": True, "results": [ @@ -98,15 +102,14 @@ def get_sql_response_with_data(path: str, params: str) -> str: "username": random_username(), "email": random_email(), "password_hash": random_password(), - "role": random.choice(["admin", "user", "moderator"]) + "role": random.choice(["admin", "user", "moderator"]), } for i in range(1, random.randint(2, 5)) - ] + ], } return json.dumps(data, indent=2) - - return json.dumps({ - "success": True, - "message": "Query executed successfully", - "results": [] - }, indent=2) + + return json.dumps( + {"success": True, "message": "Query executed successfully", "results": []}, + indent=2, + ) diff --git a/src/tasks/analyze_ips.py b/src/tasks/analyze_ips.py new file mode 100644 index 0000000..7602f18 --- 
def main():
    """Classify every IP seen in the last minute using its full access history.

    For each IP with recent activity the complete access log is loaded and five
    indicators are evaluated against the configured thresholds: risky HTTP
    methods, robots.txt violations, uneven request timing, number of distinct
    user agents, and attack-pattern hits in the requested paths. Every
    indicator sets one boolean flag per category; the flags are multiplied by
    ``weights`` and summed, and the highest-scoring category is persisted with
    ``db_manager.update_ip_stats_analysis``. IPs with fewer than three recorded
    requests are stored as ``"unknown"`` with all-zero scores.

    One IP never stops the analysis of the others: IPs that cannot be scored
    yet are skipped with ``continue``.

    Returns:
        None.
    """
    config = get_config()
    db_manager = get_database()
    app_logger = get_app_logger()

    http_risky_methods_threshold = config.http_risky_methods_threshold
    violated_robots_threshold = config.violated_robots_threshold
    uneven_request_timing_threshold = config.uneven_request_timing_threshold
    user_agents_used_threshold = config.user_agents_used_threshold
    attack_urls_threshold = config.attack_urls_threshold
    uneven_request_timing_time_window_seconds = (
        config.uneven_request_timing_time_window_seconds
    )
    app_logger.debug(f"http_risky_methods_threshold: {http_risky_methods_threshold}")

    # Category order matters: max() below resolves ties in favour of the
    # first category listed.
    categories = ("attacker", "good_crawler", "bad_crawler", "regular_user")
    indicators = (
        "risky_http_methods",
        "robots_violations",
        "uneven_request_timing",
        "different_user_agents",
        "attack_url",
    )
    risky_methods = {"POST", "PUT", "DELETE", "OPTIONS", "PATCH"}

    # 1-3 low, 4-6 mid, 7-9 high, 10-20 extreme
    weights = {
        "attacker": {
            "risky_http_methods": 6,
            "robots_violations": 4,
            "uneven_request_timing": 3,
            "different_user_agents": 8,
            "attack_url": 15,
        },
        "good_crawler": {
            "risky_http_methods": 1,
            "robots_violations": 0,
            "uneven_request_timing": 0,
            "different_user_agents": 0,
            "attack_url": 0,
        },
        "bad_crawler": {
            "risky_http_methods": 2,
            "robots_violations": 7,
            "uneven_request_timing": 0,
            "different_user_agents": 5,
            "attack_url": 5,
        },
        "regular_user": {
            "risky_http_methods": 0,
            "robots_violations": 0,
            "uneven_request_timing": 8,
            "different_user_agents": 3,
            "attack_url": 0,
        },
    }

    # Get IPs with recent activity (last minute to match cron schedule)
    recent_accesses = db_manager.get_access_logs(limit=999999999, since_minutes=1)
    ips_to_analyze = {item["ip"] for item in recent_accesses}

    if not ips_to_analyze:
        app_logger.debug("[Background Task] analyze-ips: No recent activity, skipping")
        return

    # robots.txt does not depend on the IP, so it is parsed once per run.
    robots_disallows = []
    robots_path = Path(__file__).parent.parent / "templates" / "html" / "robots.txt"
    with open(robots_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # partition() keeps any further ':' inside the path and tolerates a
            # directive written without a colon.
            directive, separator, value = line.partition(":")
            if directive == "Disallow" and separator:
                robots_disallows.append(value.rstrip("/").strip())

    for ip in ips_to_analyze:
        # Get full history for this IP to perform accurate analysis
        ip_accesses = db_manager.get_access_logs(limit=999999999, ip_filter=ip)
        total_accesses_count = len(ip_accesses)
        if total_accesses_count <= 0:
            continue

        # Set category as "unknown" for the first 3 requests
        if total_accesses_count < 3:
            category = "unknown"
            analyzed_metrics = {}
            category_scores = {
                "attacker": 0,
                "good_crawler": 0,
                "bad_crawler": 0,
                "regular_user": 0,
                "unknown": 0,
            }
            last_analysis = datetime.now()
            db_manager.update_ip_stats_analysis(
                ip, analyzed_metrics, category, category_scores, last_analysis
            )
            continue

        # Fresh flags for every IP so nothing leaks from the previous one.
        flags = {
            category: dict.fromkeys(indicators, False) for category in categories
        }

        # --------------------- HTTP Methods ---------------------
        risky_accesses_count = sum(
            1 for item in ip_accesses if item["method"] in risky_methods
        )
        # NOTE(review): this compares a request *count* with what is used as a
        # *ratio* threshold two lines below - kept as in the original; confirm
        # whether a separate minimum-sample setting was intended.
        if total_accesses_count > http_risky_methods_threshold:
            http_method_attacker_score = risky_accesses_count / total_accesses_count
        else:
            http_method_attacker_score = 0
        risky = http_method_attacker_score >= http_risky_methods_threshold
        flags["attacker"]["risky_http_methods"] = risky
        flags["bad_crawler"]["risky_http_methods"] = risky
        # A good crawler is credited when it does NOT use risky methods.
        flags["good_crawler"]["risky_http_methods"] = not risky

        # --------------------- Robots Violations ---------------------
        # respect robots.txt and login/config pages access frequency
        violated_robots_count = sum(
            1
            for item in ip_accesses
            if any(
                item["path"].rstrip("/").startswith(disallow)
                for disallow in robots_disallows
            )
        )
        violated_robots_ratio = violated_robots_count / total_accesses_count
        violates_robots = violated_robots_ratio >= violated_robots_threshold
        flags["attacker"]["robots_violations"] = violates_robots
        flags["bad_crawler"]["robots_violations"] = violates_robots

        # --------------------- Requests Timing ---------------------
        # Request rate and timing: steady, throttled, polite vs attackers'
        # bursty, aggressive, or oddly rhythmic behavior
        # NOTE(review): datetime.now() is naive local time; this presumes the
        # stored timestamps are naive and in the same zone - confirm.
        now = datetime.now()
        window = timedelta(seconds=uneven_request_timing_time_window_seconds)
        timestamps = sorted(
            (
                ts
                for ts in (
                    datetime.fromisoformat(item["timestamp"]) for item in ip_accesses
                )
                if now - ts <= window
            ),
            reverse=True,
        )
        time_diffs = [
            (newer - older).total_seconds()
            for newer, older in zip(timestamps, timestamps[1:])
        ]

        mean = 0
        variance = 0
        std = 0
        cv = 0
        if time_diffs:
            mean = sum(time_diffs) / len(time_diffs)
            variance = sum((x - mean) ** 2 for x in time_diffs) / len(time_diffs)
            std = variance**0.5
            # mean is 0 when every request carries the same timestamp
            cv = std / mean if mean else 0
        app_logger.debug(
            f"Mean: {mean} - Variance {variance} - Standard Deviation {std} - Coefficient of Variation: {cv}"
        )
        uneven_timing = cv >= uneven_request_timing_threshold
        flags["attacker"]["uneven_request_timing"] = uneven_timing
        flags["regular_user"]["uneven_request_timing"] = uneven_timing

        # --------------------- Different User Agents ---------------------
        # Header Quality and Consistency: Crawlers tend to use complete and
        # consistent headers, attackers might miss, fake, or change headers
        user_agents_used = list(
            dict.fromkeys(item["user_agent"] for item in ip_accesses)
        )
        many_user_agents = len(user_agents_used) >= user_agents_used_threshold
        flags["attacker"]["different_user_agents"] = many_user_agents
        flags["bad_crawler"]["different_user_agents"] = many_user_agents

        # --------------------- Attack URLs ---------------------
        attack_urls_found = []
        wl = get_wordlists()
        if wl.attack_patterns:
            for item in ip_accesses:
                queried_path = item["path"]
                # URL decode the path to catch encoded attacks
                try:
                    decoded_path = urllib.parse.unquote(queried_path)
                    # Double decode to catch double-encoded attacks
                    decoded_path_twice = urllib.parse.unquote(decoded_path)
                except Exception:
                    decoded_path = queried_path
                    decoded_path_twice = queried_path

                # Check original, decoded, and double-decoded paths
                candidates = (queried_path, decoded_path, decoded_path_twice)
                for name, pattern in wl.attack_patterns.items():
                    if any(
                        re.search(pattern, candidate, re.IGNORECASE)
                        for candidate in candidates
                    ):
                        attack_urls_found.append(f"{name}: {pattern}")

        # remove duplicates (first-seen order is kept so the result is stable)
        attack_urls_found_list = list(dict.fromkeys(attack_urls_found))
        flags["attacker"]["attack_url"] = (
            len(attack_urls_found_list) >= attack_urls_threshold
        )

        # --------------------- Calculate score ---------------------
        category_scores = {
            category: sum(
                flags[category][indicator] * weights[category][indicator]
                for indicator in indicators
            )
            for category in categories
        }
        attacker_score = category_scores["attacker"]
        good_crawler_score = category_scores["good_crawler"]
        bad_crawler_score = category_scores["bad_crawler"]
        regular_user_score = category_scores["regular_user"]
        score_details = f"""
        Attacker score: {attacker_score}
        Good Crawler score: {good_crawler_score}
        Bad Crawler score: {bad_crawler_score}
        Regular User score: {regular_user_score}
        """
        app_logger.debug(score_details)
        analyzed_metrics = {
            "risky_http_methods": http_method_attacker_score,
            "robots_violations": violated_robots_ratio,
            "uneven_request_timing": mean,
            "different_user_agents": user_agents_used,
            "attack_url": attack_urls_found_list,
        }
        category = max(category_scores, key=category_scores.get)
        last_analysis = datetime.now()
        db_manager.update_ip_stats_analysis(
            ip, analyzed_metrics, category, category_scores, last_analysis
        )
    return
+ ) + for ip in unenriched_ips: + try: + api_url = "https://iprep.lcrawl.com/api/iprep/" + params = {"cidr": ip} + headers = {"Content-Type": "application/json"} + response = requests.get(api_url, headers=headers, params=params, timeout=10) + payload = response.json() + + if payload.get("results"): + results = payload["results"] + + # Get the most recent result (first in list, sorted by record_added) + most_recent = results[0] + geoip_data = most_recent.get("geoip_data", {}) + list_on = most_recent.get("list_on", {}) + + # Extract standard fields + country_iso_code = geoip_data.get("country_iso_code") + asn = geoip_data.get("asn_autonomous_system_number") + asn_org = geoip_data.get("asn_autonomous_system_organization") + latitude = geoip_data.get("location_latitude") + longitude = geoip_data.get("location_longitude") + + # Extract city from coordinates using reverse geocoding + city = extract_city_from_coordinates(geoip_data) + + sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) + sanitized_asn = sanitize_for_storage(asn, 100) + sanitized_asn_org = sanitize_for_storage(asn_org, 100) + sanitized_city = sanitize_for_storage(city, 100) if city else None + sanitized_list_on = sanitize_dict(list_on, 100000) + + db_manager.update_ip_rep_infos( + ip, + sanitized_country_iso_code, + sanitized_asn, + sanitized_asn_org, + sanitized_list_on, + sanitized_city, + latitude, + longitude, + ) + except requests.RequestException as e: + app_logger.warning(f"Failed to fetch IP rep for {ip}: {e}") + except Exception as e: + app_logger.error(f"Error processing IP {ip}: {e}") diff --git a/src/tasks/memory_cleanup.py b/src/tasks/memory_cleanup.py new file mode 100644 index 0000000..38a27a2 --- /dev/null +++ b/src/tasks/memory_cleanup.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +""" +Memory cleanup task for Krawl honeypot. +Periodically trims unbounded in-memory structures to prevent OOM. 
def main():
    """
    Trim the tracker's in-memory structures and report what was released.

    Reads the tracker's memory statistics, runs its cleanup, reads the
    statistics again and logs the difference. Any exception is logged and
    swallowed so a failed cleanup does not propagate out of the task.
    """
    try:
        # Import here to avoid circular imports
        from handler import Handler

        tracker = Handler.tracker
        if not tracker:
            app_logger.warning("Tracker not initialized, skipping memory cleanup")
            return

        # Snapshot, clean, snapshot again
        before = tracker.get_memory_stats()
        tracker.cleanup_memory()
        after = tracker.get_memory_stats()

        # How many entries each structure lost
        access_log_reduced = before["access_log_size"] - after["access_log_size"]
        cred_reduced = (
            before["credential_attempts_size"] - after["credential_attempts_size"]
        )

        # Only report a trim when something was actually removed
        if any(removed > 0 for removed in (access_log_reduced, cred_reduced)):
            trimmed_message = (
                f"Memory cleanup: Trimmed {access_log_reduced} access logs, "
                f"{cred_reduced} credential attempts"
            )
            app_logger.info(trimmed_message)

        # Log current memory state for monitoring
        state_message = (
            f"Memory stats after cleanup: "
            f"access_logs={after['access_log_size']}, "
            f"credentials={after['credential_attempts_size']}, "
            f"unique_ips={after['unique_ips_tracked']}"
        )
        app_logger.debug(state_message)

    except Exception as e:
        app_logger.error(f"Error during memory cleanup: {e}")
def main():
    """
    Export all attacker IPs to a text file, matching the "Attackers by Total Requests" dashboard table.
    Uses the same query as the dashboard: IpStats where category == "attacker", ordered by total_requests.
    TasksMaster will call this function based on the cron schedule.

    Local/private addresses and the server's own IP are filtered out with
    ``is_valid_public_ip``. The result is written one IP per line to
    ``OUTPUT_FILE``. Failures are logged, never raised. The database session
    is closed only when a database handle was actually obtained.
    """
    task_name = TASK_CONFIG.get("name")
    app_logger.info(f"[Background Task] {task_name} starting...")

    # Bound before the try block so the finally clause can tell whether
    # get_database() succeeded; otherwise a failure there would surface as an
    # UnboundLocalError raised from `finally`.
    db = None
    try:
        db = get_database()
        session = db.session

        # Query attacker IPs from IpStats (same as dashboard "Attackers by Total Requests")
        attackers = (
            session.query(IpStats)
            .filter(IpStats.category == "attacker")
            .order_by(IpStats.total_requests.desc())
            .all()
        )

        # Filter out local/private IPs and the server's own IP
        config = get_config()
        server_ip = config.get_server_ip()

        public_ips = [
            attacker.ip
            for attacker in attackers
            if is_valid_public_ip(attacker.ip, server_ip)
        ]

        # Ensure exports directory exists
        os.makedirs(EXPORTS_DIR, exist_ok=True)

        # Write IPs to file (one per line)
        with open(OUTPUT_FILE, "w") as f:
            f.writelines(f"{ip}\n" for ip in public_ips)

        app_logger.info(
            f"[Background Task] {task_name} exported {len(public_ips)} attacker IPs "
            f"(filtered {len(attackers) - len(public_ips)} local/private IPs) to {OUTPUT_FILE}"
        )

    except Exception as e:
        app_logger.error(f"[Background Task] {task_name} failed: {e}")
    finally:
        if db is not None:
            db.close_session()
class TasksMaster:
    """Discover task modules in ``TASKS_FOLDER`` and register them as cron jobs.

    Every ``*.py`` file in the folder that exposes a ``TASK_CONFIG`` dict and a
    ``main`` callable becomes a task definition; enabled tasks are added to the
    scheduler passed to the constructor.
    """

    TASK_DEFAULT_CRON = "*/15 * * * *"
    TASK_JITTER = 240
    TASKS_FOLDER = os.path.join(os.path.dirname(__file__), "tasks")

    def __init__(self, scheduler: BackgroundScheduler):
        """Load task definitions and subscribe to job executed/error events."""
        self.tasks = self._config_tasks()
        self.scheduler = scheduler
        # job id -> datetime of the most recent completed/failed run
        self.last_run_times = {}
        self.scheduler.add_listener(
            self.job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR
        )

    def _config_tasks(self):
        """
        Loads tasks from the TASKS_FOLDER and logs the folder they came from.
        """
        tasks_defined = self._load_tasks_from_folder(self.TASKS_FOLDER)
        app_logger.info(f"Scheduled Tasks Loaded from folder: {self.TASKS_FOLDER}")
        return tasks_defined

    def _load_tasks_from_folder(self, folder_path):
        """
        Loads and registers task modules from a specified folder.

        This function scans the given folder for Python (.py) files, dynamically
        imports each as a module, and looks for two attributes:
            - TASK_CONFIG: A dictionary containing task metadata, specifically the
              'name' and 'cron' (cron schedule string).
            - main: A callable function that represents the task's execution logic.

        Tasks with both attributes are added to a list with their configuration.
        Files that cannot be imported, or whose cron string is invalid, are
        logged and skipped.

        Args:
            folder_path (str): Path to the folder containing task scripts.

        Returns:
            list[dict]: A list of task definitions with keys:
                - 'name' (str): The name of the task.
                - 'filename' (str): The file the task was loaded from.
                - 'cron' (str): The crontab string for scheduling.
                - 'enabled' (bool): Whether the task is enabled.
                - 'run_when_loaded' (bool): Whether to run the task immediately.
        """
        tasks = []

        if not os.path.exists(folder_path):
            app_logger.error(f"{folder_path} does not exist! Unable to load tasks!")
            return tasks

        # we sort the files so that we have a set order, which helps with debugging
        for filename in sorted(os.listdir(folder_path)):

            # skip any non python files, as well as any __pycache__ or .pyc files that might creep in there
            if not filename.endswith(".py") or filename.startswith("__"):
                continue

            path = os.path.join(folder_path, filename)
            module_name = filename[:-3]
            spec = importlib.util.spec_from_file_location(f"tasks.{module_name}", path)

            # spec_from_file_location may return None (or a spec without a
            # loader); treat that as one failed file, not a failed scan.
            if spec is None or spec.loader is None:
                app_logger.error(
                    f"Failed to import {filename}: no import spec or loader available"
                )
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
                # registered so _schedule_task can import it by name later
                sys.modules[f"tasks.{module_name}"] = module
            except Exception as e:
                app_logger.error(f"Failed to import {filename}: {e}")
                continue

            # if we have a tasks config and a main function, we attempt to schedule it
            if hasattr(module, "TASK_CONFIG") and hasattr(module, "main"):

                # ensure task_config is a dict
                if not isinstance(module.TASK_CONFIG, dict):
                    app_logger.error(
                        f"TASK_CONFIG is not a dict in {filename}. Skipping task."
                    )
                    continue

                task_cron = module.TASK_CONFIG.get("cron") or self.TASK_DEFAULT_CRON
                task_name = module.TASK_CONFIG.get("name", module_name)

                # ensure the task_cron is a valid cron value
                try:
                    CronTrigger.from_crontab(task_cron)
                except ValueError as ve:
                    app_logger.error(
                        f"Invalid cron format for task {task_name}: {ve} - Skipping this task"
                    )
                    continue

                tasks.append(
                    {
                        "name": task_name,
                        "filename": filename,
                        "cron": task_cron,
                        "enabled": module.TASK_CONFIG.get("enabled", False),
                        "run_when_loaded": module.TASK_CONFIG.get(
                            "run_when_loaded", False
                        ),
                    }
                )

            # we are missing things, and we log what's missing
            else:
                if not hasattr(module, "TASK_CONFIG"):
                    app_logger.warning(f"Missing TASK_CONFIG in {filename}")
                elif not hasattr(module, "main"):
                    app_logger.warning(f"Missing main() in {filename}")

        return tasks

    def _add_jobs(self):
        """Schedule every enabled task whose source file still exists."""
        # for each task in the tasks config file...
        for task_to_run in self.tasks:

            # remember, these tasks, are built from the "load_tasks_from_folder" function,
            # if you want to pass data from the TASKS_CONFIG dict, you need to pass it there to get it here.
            task_name = task_to_run.get("name")
            run_when_loaded = task_to_run.get("run_when_loaded")
            module_name = os.path.splitext(task_to_run.get("filename"))[0]
            task_enabled = task_to_run.get("enabled", False)

            # if no crontab set for this task, we use 15 as the default.
            task_cron = task_to_run.get("cron") or self.TASK_DEFAULT_CRON

            # if task is disabled, skip this one
            if not task_enabled:
                app_logger.info(
                    f"{task_name} is disabled in client config. Skipping task"
                )
                continue
            try:
                if os.path.isfile(
                    os.path.join(self.TASKS_FOLDER, task_to_run.get("filename"))
                ):
                    # schedule the task now that everything has checked out above...
                    self._schedule_task(
                        task_name, module_name, task_cron, run_when_loaded
                    )
                    app_logger.info(
                        f"Scheduled {module_name} cron is set to {task_cron}.",
                        extra={"task": task_to_run},
                    )
                else:
                    app_logger.info(
                        f"Skipping invalid or unsafe file: {task_to_run.get('filename')}",
                        extra={"task": task_to_run},
                    )

            except Exception as e:
                app_logger.error(
                    f"Error scheduling task: {e}", extra={"tasks": task_to_run}
                )

    def _schedule_task(self, task_name, module_name, task_cron, run_when_loaded):
        """Add one task's ``main`` function to the scheduler as a cron job.

        Args:
            task_name: Display name of the job.
            module_name: Task module name (file name without ``.py``).
            task_cron: Crontab expression; ``None`` falls back to the default.
            run_when_loaded: When truthy the job's first run time is set to now.
        """
        try:
            # Dynamically import the module
            module = importlib.import_module(f"tasks.{module_name}")

            # Check if the module has a 'main' function
            if not hasattr(module, "main"):
                app_logger.error(f"{module_name} does not define a 'main' function.")
                return

            app_logger.info(f"Scheduling {task_name} - {module_name} Main Function")

            # unique_job_id
            job_identifier = f"{module_name}__{task_name}"

            # little insurance to make sure the cron is set to something and not none
            if task_cron is None:
                task_cron = self.TASK_DEFAULT_CRON

            trigger = CronTrigger.from_crontab(task_cron)

            # NOTE(review): `jitter` is forwarded to add_job exactly as before;
            # APScheduler may ignore trigger arguments when a ready-made trigger
            # instance is supplied - confirm whether the jitter takes effect.
            job_options = {
                "id": job_identifier,
                "jitter": self.TASK_JITTER,
                "name": task_name,
                "max_instances": 1,
                # Re-registering the same task (e.g. a second call to
                # run_scheduled_tasks) replaces the job instead of failing with
                # a job-id conflict.
                "replace_existing": True,
            }

            # schedule the task / job
            if run_when_loaded:
                app_logger.info(
                    f"Task: {task_name} is set to run instantly. Scheduling to run on scheduler start"
                )
                job_options["next_run_time"] = datetime.datetime.now()

            self.scheduler.add_job(module.main, trigger, **job_options)

        except Exception as e:
            app_logger.error(f"Failed to load {module_name}: {e}")

    def job_listener(self, event):
        """Record the finish time of a job and log whether it failed."""
        job_id = event.job_id
        self.last_run_times[job_id] = datetime.datetime.now()

        if event.exception:
            app_logger.error(f"Job {event.job_id} failed: {event.exception}")
        else:
            app_logger.info(f"Job {event.job_id} completed successfully.")

    def list_jobs(self):
        """Return id, name and next run time of every scheduled job."""
        return [
            {
                "id": job.id,
                "name": job.name,
                "next_run": job.next_run_time,
            }
            for job in self.scheduler.get_jobs()
        ]

    def run_scheduled_tasks(self):
        """
        Runs and schedules enabled tasks using the background scheduler.

        This method performs the following:
            1. Adds the enabled tasks loaded at construction time to the scheduler.
            2. Starts the scheduler if it is not already running.
        """

        # Add enabled tasks to the scheduler
        self._add_jobs()

        # Start the scheduler to begin executing the scheduled tasks (if not already running)
        if not self.scheduler.running:
            self.scheduler.start()
def format_timestamp(iso_timestamp: str, time_only: bool = False) -> str:
    """Format an ISO timestamp string for display.

    The value is rendered exactly as parsed; no timezone conversion is
    performed here.

    Args:
        iso_timestamp: ISO format timestamp string. ``None`` yields an empty
            string; other non-string values are converted with ``str()``.
        time_only: If True, return only HH:MM:SS, otherwise full datetime

    Returns:
        ``YYYY-MM-DD HH:MM:SS`` (or ``HH:MM:SS`` when ``time_only`` is set).
        If the value cannot be parsed, the first eight characters after the
        first ``T`` are returned, or the value unchanged when it has no ``T``.
    """
    # A missing timestamp renders as an empty cell instead of raising.
    if iso_timestamp is None:
        return ""

    text = iso_timestamp if isinstance(iso_timestamp, str) else str(iso_timestamp)
    try:
        dt = datetime.fromisoformat(text)
    except ValueError:
        # Fallback for old format
        return text.split("T")[1][:8] if "T" in text else text

    return dt.strftime("%H:%M:%S" if time_only else "%Y-%m-%d %H:%M:%S")
+
Loading stats...
+
+ + """ for log in stats["recent_suspicious"][-10:]]) + or 'No suspicious activity detected' + ) return f""" Krawl Dashboard + + + +
-

🕷️ Krawl Dashboard

- + + +

Krawl Dashboard

+
{stats['total_accesses']}
@@ -169,26 +685,20 @@ def generate_dashboard(stats: dict) -> str:
{len(stats.get('credential_attempts', []))}
Credentials Captured
+
+
{stats.get('unique_attackers', 0)}
+
Unique Attackers
+
-
-

🍯 Honeypot Triggers

- - - - - - - - - - {honeypot_rows} - -
IP AddressAccessed PathsCount
+ -
-

⚠️ Recent Suspicious Activity

+
+
+

Recent Suspicious Activity

@@ -205,89 +715,1951 @@ def generate_dashboard(stats: dict) -> str:
-

🔑 Captured Credentials

-
+
+

Honeypot Triggers by IP

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+
+ + + + + + + + + +
#IP AddressAccessed PathsCount
Loading...
+
+ +
+
+
+

Top IP Addresses

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+ + + + + + + + + + + +
#IP AddressAccess Count
Loading...
+
+ +
+
+

Top User-Agents

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+ + + + + + + + + + + +
#User-AgentCount
Loading...
+
+
+
+ +
+
+
+

IP Origins Map

+
+ + + + + +
+
+
+
Loading map...
+
+
+ +
+
+

Attackers by Total Requests

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+ + + + + + + + + + + + + + + +
#IP AddressTotal RequestsFirst SeenLast SeenLocation
+
+ +
+
+

Captured Credentials

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+ + + + - + - - {credential_rows} + +
# IP Address Username Password PathTimeTime
Loading...
-

😈 Detected Attack Types

- +
+

Detected Attack Types

+
+
+ Page 1/1 + + 0 total +
+ + +
+
+
+ - + - - {attack_type_rows} + +
# IP Address Path Attack Types User-AgentTimeTime
Loading...
-
-

Top IP Addresses

- - - - - - - - - - {top_ips_rows} - -
#IP AddressAccess Count
+
+
+

Most Recurring Attack Types

+
Top 10 Attack Vectors
+
+
+ +
+
-
-

Top Paths

- - - - - - - - - - {top_paths_rows} - -
#PathAccess Count
-
- -
-

Top User-Agents

- - - - - - - - - - {top_ua_rows} - -
#User-AgentCount
+
+
+ +
+ +
+
+ """ diff --git a/src/templates/html/main_page.html b/src/templates/html/main_page.html new file mode 100644 index 0000000..c72dbe7 --- /dev/null +++ b/src/templates/html/main_page.html @@ -0,0 +1,106 @@ + + + + + Krawl me! + + + +
+

Krawl me!

+
{counter}
+ + +
+ + \ No newline at end of file diff --git a/src/templates/html_templates.py b/src/templates/html_templates.py index a7cefbc..50d94dc 100644 --- a/src/templates/html_templates.py +++ b/src/templates/html_templates.py @@ -60,3 +60,8 @@ def product_search() -> str: def input_form() -> str: """Generate input form page for XSS honeypot""" return load_template("input_form") + + +def main_page(counter: int, content: str) -> str: + """Generate main Krawl page with links and canary token""" + return load_template("main_page", counter=counter, content=content) diff --git a/src/templates/static/krawl-svg.svg b/src/templates/static/krawl-svg.svg new file mode 100644 index 0000000..2d15e51 --- /dev/null +++ b/src/templates/static/krawl-svg.svg @@ -0,0 +1,95 @@ + + + + diff --git a/src/templates/template_loader.py b/src/templates/template_loader.py index fd1febc..fe53bf5 100644 --- a/src/templates/template_loader.py +++ b/src/templates/template_loader.py @@ -11,6 +11,7 @@ from typing import Dict class TemplateNotFoundError(Exception): """Raised when a template file cannot be found.""" + pass @@ -42,11 +43,11 @@ def load_template(name: str, **kwargs) -> str: """ # debug # print(f"Loading Template: {name}") - + # Check cache first if name not in _template_cache: # Determine file path based on whether name has an extension - if '.' in name: + if "." 
in name: file_path = _TEMPLATE_DIR / name else: file_path = _TEMPLATE_DIR / f"{name}.html" @@ -54,7 +55,7 @@ def load_template(name: str, **kwargs) -> str: if not file_path.exists(): raise TemplateNotFoundError(f"Template '{name}' not found at {file_path}") - _template_cache[name] = file_path.read_text(encoding='utf-8') + _template_cache[name] = file_path.read_text(encoding="utf-8") template = _template_cache[name] diff --git a/src/tracker.py b/src/tracker.py index 8465031..60e05f0 100644 --- a/src/tracker.py +++ b/src/tracker.py @@ -1,44 +1,114 @@ #!/usr/bin/env python3 -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional from collections import defaultdict from datetime import datetime +from zoneinfo import ZoneInfo import re import urllib.parse + from wordlists import get_wordlists +from database import get_database, DatabaseManager +from ip_utils import is_local_or_private_ip, is_valid_public_ip class AccessTracker: - """Track IP addresses and paths accessed""" - def __init__(self): + """ + Track IP addresses and paths accessed. + + Maintains in-memory structures for fast dashboard access and + persists data to SQLite for long-term storage and analysis. + """ + + def __init__( + self, + max_pages_limit, + ban_duration_seconds, + db_manager: Optional[DatabaseManager] = None, + ): + """ + Initialize the access tracker. + + Args: + db_manager: Optional DatabaseManager for persistence. + If None, will use the global singleton. 
+ """ + self.max_pages_limit = max_pages_limit + self.ban_duration_seconds = ban_duration_seconds self.ip_counts: Dict[str, int] = defaultdict(int) self.path_counts: Dict[str, int] = defaultdict(int) self.user_agent_counts: Dict[str, int] = defaultdict(int) self.access_log: List[Dict] = [] self.credential_attempts: List[Dict] = [] + + # Memory limits for in-memory lists (prevents unbounded growth) + self.max_access_log_size = 10_000 # Keep only recent 10k accesses + self.max_credential_log_size = 5_000 # Keep only recent 5k attempts + self.max_counter_keys = 100_000 # Max unique IPs/paths/user agents + + # Track pages visited by each IP (for good crawler limiting) + self.ip_page_visits: Dict[str, Dict[str, object]] = defaultdict(dict) + self.suspicious_patterns = [ - 'bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python-requests', - 'scanner', 'nikto', 'sqlmap', 'nmap', 'masscan', 'nessus', 'acunetix', - 'burp', 'zap', 'w3af', 'metasploit', 'nuclei', 'gobuster', 'dirbuster' + "bot", + "crawler", + "spider", + "scraper", + "curl", + "wget", + "python-requests", + "scanner", + "nikto", + "sqlmap", + "nmap", + "masscan", + "nessus", + "acunetix", + "burp", + "zap", + "w3af", + "metasploit", + "nuclei", + "gobuster", + "dirbuster", ] # Load attack patterns from wordlists wl = get_wordlists() self.attack_types = wl.attack_patterns - + # Fallback if wordlists not loaded if not self.attack_types: self.attack_types = { - 'path_traversal': r'\.\.', - 'sql_injection': r"('|--|;|\bOR\b|\bUNION\b|\bSELECT\b|\bDROP\b)", - 'xss_attempt': r'( Optional[DatabaseManager]: + """ + Get the database manager, lazily initializing if needed. 
+ + Returns: + DatabaseManager instance or None if not available + """ + if self._db_manager is None: + try: + self._db_manager = get_database() + except Exception: + # Database not initialized, persistence disabled + pass + return self._db_manager + def parse_credentials(self, post_data: str) -> Tuple[str, str]: """ Parse username and password from POST data. @@ -46,82 +116,182 @@ class AccessTracker: """ if not post_data: return None, None - + username = None password = None - + try: # Parse URL-encoded form data parsed = urllib.parse.parse_qs(post_data) - + # Common username field names - username_fields = ['username', 'user', 'login', 'email', 'log', 'userid', 'account'] + username_fields = [ + "username", + "user", + "login", + "email", + "log", + "userid", + "account", + ] for field in username_fields: if field in parsed and parsed[field]: username = parsed[field][0] break - + # Common password field names - password_fields = ['password', 'pass', 'passwd', 'pwd', 'passphrase'] + password_fields = ["password", "pass", "passwd", "pwd", "passphrase"] for field in password_fields: if field in parsed and parsed[field]: password = parsed[field][0] break - + except Exception: # If parsing fails, try simple regex patterns - username_match = re.search(r'(?:username|user|login|email|log)=([^&\s]+)', post_data, re.IGNORECASE) - password_match = re.search(r'(?:password|pass|passwd|pwd)=([^&\s]+)', post_data, re.IGNORECASE) - + username_match = re.search( + r"(?:username|user|login|email|log)=([^&\s]+)", post_data, re.IGNORECASE + ) + password_match = re.search( + r"(?:password|pass|passwd|pwd)=([^&\s]+)", post_data, re.IGNORECASE + ) + if username_match: username = urllib.parse.unquote_plus(username_match.group(1)) if password_match: password = urllib.parse.unquote_plus(password_match.group(1)) - + return username, password - def record_credential_attempt(self, ip: str, path: str, username: str, password: str): - """Record a credential login attempt""" - 
self.credential_attempts.append({ - 'ip': ip, - 'path': path, - 'username': username, - 'password': password, - 'timestamp': datetime.now().isoformat() - }) + def record_credential_attempt( + self, ip: str, path: str, username: str, password: str + ): + """ + Record a credential login attempt. + + Stores in both in-memory list and SQLite database. + Skips recording if the IP is the server's own public IP. + """ + # Skip if this is the server's own IP + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + if server_ip and ip == server_ip: + return + + # In-memory storage for dashboard + self.credential_attempts.append( + { + "ip": ip, + "path": path, + "username": username, + "password": password, + "timestamp": datetime.now().isoformat(), + } + ) + + # Trim if exceeding max size (prevent unbounded growth) + if len(self.credential_attempts) > self.max_credential_log_size: + self.credential_attempts = self.credential_attempts[ + -self.max_credential_log_size : + ] + + # Persist to database + if self.db: + try: + self.db.persist_credential( + ip=ip, path=path, username=username, password=password + ) + except Exception: + # Don't crash if database persistence fails + pass + + def record_access( + self, + ip: str, + path: str, + user_agent: str = "", + body: str = "", + method: str = "GET", + ): + """ + Record an access attempt. + + Stores in both in-memory structures and SQLite database. + Skips recording if the IP is the server's own public IP. 
+ + Args: + ip: Client IP address + path: Requested path + user_agent: Client user agent string + body: Request body (for POST/PUT) + method: HTTP method + """ + # Skip if this is the server's own IP + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + if server_ip and ip == server_ip: + return - def record_access(self, ip: str, path: str, user_agent: str = '', body: str = ''): - """Record an access attempt""" self.ip_counts[ip] += 1 self.path_counts[path] += 1 if user_agent: self.user_agent_counts[user_agent] += 1 - - # path attack type detection + + # Path attack type detection attack_findings = self.detect_attack_type(path) - # post / put data + # POST/PUT body attack detection if len(body) > 0: attack_findings.extend(self.detect_attack_type(body)) - is_suspicious = self.is_suspicious_user_agent(user_agent) or self.is_honeypot_path(path) or len(attack_findings) > 0 + is_suspicious = ( + self.is_suspicious_user_agent(user_agent) + or self.is_honeypot_path(path) + or len(attack_findings) > 0 + ) + is_honeypot = self.is_honeypot_path(path) - # Track if this IP accessed a honeypot path - if self.is_honeypot_path(path): + if is_honeypot: self.honeypot_triggered[ip].append(path) - - self.access_log.append({ - 'ip': ip, - 'path': path, - 'user_agent': user_agent, - 'suspicious': is_suspicious, - 'honeypot_triggered': self.is_honeypot_path(path), - 'attack_types':attack_findings, - 'timestamp': datetime.now().isoformat() - }) - def detect_attack_type(self, data:str) -> list[str]: + # In-memory storage for dashboard + self.access_log.append( + { + "ip": ip, + "path": path, + "user_agent": user_agent, + "suspicious": is_suspicious, + "honeypot_triggered": self.is_honeypot_path(path), + "attack_types": attack_findings, + "timestamp": datetime.now().isoformat(), + } + ) + + # Trim if exceeding max size (prevent unbounded growth) + if len(self.access_log) > self.max_access_log_size: + self.access_log = 
self.access_log[-self.max_access_log_size :] + + # Persist to database + if self.db: + try: + self.db.persist_access( + ip=ip, + path=path, + user_agent=user_agent, + method=method, + is_suspicious=is_suspicious, + is_honeypot_trigger=is_honeypot, + attack_types=attack_findings if attack_findings else None, + ) + except Exception: + # Don't crash if database persistence fails + pass + + def detect_attack_type(self, data: str) -> list[str]: """ Returns a list of all attack types found in path data """ @@ -134,27 +304,37 @@ class AccessTracker: def is_honeypot_path(self, path: str) -> bool: """Check if path is one of the honeypot traps from robots.txt""" honeypot_paths = [ - '/admin', - '/admin/', - '/backup', - '/backup/', - '/config', - '/config/', - '/private', - '/private/', - '/database', - '/database/', - '/credentials.txt', - '/passwords.txt', - '/admin_notes.txt', - '/api_keys.json', - '/.env', - '/wp-admin', - '/wp-admin/', - '/phpmyadmin', - '/phpMyAdmin/' + "/admin", + "/admin/", + "/backup", + "/backup/", + "/config", + "/config/", + "/private", + "/private/", + "/database", + "/database/", + "/credentials.txt", + "/passwords.txt", + "/admin_notes.txt", + "/api_keys.json", + "/.env", + "/wp-admin", + "/wp-admin/", + "/phpmyadmin", + "/phpMyAdmin/", ] - return path in honeypot_paths or any(hp in path.lower() for hp in ['/backup', '/admin', '/config', '/private', '/database', 'phpmyadmin']) + return path in honeypot_paths or any( + hp in path.lower() + for hp in [ + "/backup", + "/admin", + "/config", + "/private", + "/database", + "phpmyadmin", + ] + ) def is_suspicious_user_agent(self, user_agent: str) -> bool: """Check if user agent matches suspicious patterns""" @@ -163,48 +343,340 @@ class AccessTracker: ua_lower = user_agent.lower() return any(pattern in ua_lower for pattern in self.suspicious_patterns) + def get_category_by_ip(self, client_ip: str) -> str: + """ + Check if an IP has been categorized as a 'good crawler' in the database. 
+ Uses the IP category from IpStats table. + + Args: + client_ip: The client IP address (will be sanitized) + + Returns: + True if the IP is categorized as 'good crawler', False otherwise + """ + try: + from sanitizer import sanitize_ip + + # Sanitize the IP address + safe_ip = sanitize_ip(client_ip) + + # Query the database for this IP's category + db = self.db + if not db: + return False + + ip_stats = db.get_ip_stats_by_ip(safe_ip) + if not ip_stats or not ip_stats.get("category"): + return False + + # Check if category matches "good crawler" + category = ip_stats.get("category", "").lower().strip() + return category + + except Exception as e: + # Log but don't crash on database errors + import logging + + logging.error(f"Error checking IP category for {client_ip}: {str(e)}") + return False + + def increment_page_visit(self, client_ip: str) -> int: + """ + Increment page visit counter for an IP and return the new count. + Implements incremental bans: each violation increases ban duration exponentially. 
+ + Ban duration formula: base_duration * (2 ^ violation_count) + - 1st violation: base_duration (e.g., 60 seconds) + - 2nd violation: base_duration * 2 (120 seconds) + - 3rd violation: base_duration * 4 (240 seconds) + - Nth violation: base_duration * 2^(N-1) + + Args: + client_ip: The client IP address + + Returns: + The updated page visit count for this IP + """ + # Skip if this is the server's own IP + from config import get_config + + config = get_config() + server_ip = config.get_server_ip() + if server_ip and client_ip == server_ip: + return 0 + + try: + # Initialize if not exists + if client_ip not in self.ip_page_visits: + self.ip_page_visits[client_ip] = { + "count": 0, + "ban_timestamp": None, + "total_violations": 0, + "ban_multiplier": 1, + } + + # Increment count + self.ip_page_visits[client_ip]["count"] += 1 + + # Set ban if reached limit + if self.ip_page_visits[client_ip]["count"] >= self.max_pages_limit: + # Increment violation counter + self.ip_page_visits[client_ip]["total_violations"] += 1 + violations = self.ip_page_visits[client_ip]["total_violations"] + + # Calculate exponential ban multiplier: 2^(violations - 1) + # Violation 1: 2^0 = 1x + # Violation 2: 2^1 = 2x + # Violation 3: 2^2 = 4x + # Violation 4: 2^3 = 8x, etc. + self.ip_page_visits[client_ip]["ban_multiplier"] = 2 ** (violations - 1) + + # Set ban timestamp + self.ip_page_visits[client_ip][ + "ban_timestamp" + ] = datetime.now().isoformat() + + return self.ip_page_visits[client_ip]["count"] + + except Exception: + return 0 + + def is_banned_ip(self, client_ip: str) -> bool: + """ + Check if an IP is currently banned due to exceeding page visit limits. + Uses incremental ban duration based on violation count. + + Ban duration = base_duration * (2 ^ (violations - 1)) + Each time an IP is banned again, duration doubles. 
+ + Args: + client_ip: The client IP address + Returns: + True if the IP is banned, False otherwise + """ + try: + if client_ip in self.ip_page_visits: + ban_timestamp = self.ip_page_visits[client_ip].get("ban_timestamp") + if ban_timestamp is not None: + # Get the ban multiplier for this violation + ban_multiplier = self.ip_page_visits[client_ip].get( + "ban_multiplier", 1 + ) + + # Calculate effective ban duration based on violations + effective_ban_duration = self.ban_duration_seconds * ban_multiplier + + # Check if ban period has expired + ban_time = datetime.fromisoformat(ban_timestamp) + time_diff = datetime.now() - ban_time + + if time_diff.total_seconds() > effective_ban_duration: + # Ban expired, reset for next cycle + # Keep violation count for next offense + self.ip_page_visits[client_ip]["count"] = 0 + self.ip_page_visits[client_ip]["ban_timestamp"] = None + return False + else: + # Still banned + return True + + return False + + except Exception: + return False + + def get_ban_info(self, client_ip: str) -> dict: + """ + Get detailed ban information for an IP. 
+ + Returns: + Dictionary with ban status, violations, and remaining ban time + """ + try: + if client_ip not in self.ip_page_visits: + return { + "is_banned": False, + "violations": 0, + "ban_multiplier": 1, + "remaining_ban_seconds": 0, + } + + ip_data = self.ip_page_visits[client_ip] + ban_timestamp = ip_data.get("ban_timestamp") + + if ban_timestamp is None: + return { + "is_banned": False, + "violations": ip_data.get("total_violations", 0), + "ban_multiplier": ip_data.get("ban_multiplier", 1), + "remaining_ban_seconds": 0, + } + + # Ban is active, calculate remaining time + ban_multiplier = ip_data.get("ban_multiplier", 1) + effective_ban_duration = self.ban_duration_seconds * ban_multiplier + + ban_time = datetime.fromisoformat(ban_timestamp) + time_diff = datetime.now() - ban_time + remaining_seconds = max( + 0, effective_ban_duration - time_diff.total_seconds() + ) + + return { + "is_banned": remaining_seconds > 0, + "violations": ip_data.get("total_violations", 0), + "ban_multiplier": ban_multiplier, + "effective_ban_duration_seconds": effective_ban_duration, + "remaining_ban_seconds": remaining_seconds, + } + + except Exception: + return { + "is_banned": False, + "violations": 0, + "ban_multiplier": 1, + "remaining_ban_seconds": 0, + } + """ + Get the current page visit count for an IP. 
+ + Args: + client_ip: The client IP address + + Returns: + The page visit count for this IP + """ + try: + return self.ip_page_visits.get(client_ip, 0) + except Exception: + return 0 + def get_top_ips(self, limit: int = 10) -> List[Tuple[str, int]]: - """Get top N IP addresses by access count""" - return sorted(self.ip_counts.items(), key=lambda x: x[1], reverse=True)[:limit] + """Get top N IP addresses by access count (excludes local/private IPs)""" + filtered = [ + (ip, count) + for ip, count in self.ip_counts.items() + if not is_local_or_private_ip(ip) + ] + return sorted(filtered, key=lambda x: x[1], reverse=True)[:limit] def get_top_paths(self, limit: int = 10) -> List[Tuple[str, int]]: """Get top N paths by access count""" - return sorted(self.path_counts.items(), key=lambda x: x[1], reverse=True)[:limit] + return sorted(self.path_counts.items(), key=lambda x: x[1], reverse=True)[ + :limit + ] def get_top_user_agents(self, limit: int = 10) -> List[Tuple[str, int]]: """Get top N user agents by access count""" - return sorted(self.user_agent_counts.items(), key=lambda x: x[1], reverse=True)[:limit] + return sorted(self.user_agent_counts.items(), key=lambda x: x[1], reverse=True)[ + :limit + ] def get_suspicious_accesses(self, limit: int = 20) -> List[Dict]: - """Get recent suspicious accesses""" - suspicious = [log for log in self.access_log if log.get('suspicious', False)] + """Get recent suspicious accesses (excludes local/private IPs)""" + suspicious = [ + log + for log in self.access_log + if log.get("suspicious", False) + and not is_local_or_private_ip(log.get("ip", "")) + ] return suspicious[-limit:] def get_attack_type_accesses(self, limit: int = 20) -> List[Dict]: - """Get recent accesses with detected attack types""" - attacks = [log for log in self.access_log if log.get('attack_types')] + """Get recent accesses with detected attack types (excludes local/private IPs)""" + attacks = [ + log + for log in self.access_log + if log.get("attack_types") and 
not is_local_or_private_ip(log.get("ip", "")) + ] return attacks[-limit:] def get_honeypot_triggered_ips(self) -> List[Tuple[str, List[str]]]: - """Get IPs that accessed honeypot paths""" - return [(ip, paths) for ip, paths in self.honeypot_triggered.items()] + """Get IPs that accessed honeypot paths (excludes local/private IPs)""" + return [ + (ip, paths) + for ip, paths in self.honeypot_triggered.items() + if not is_local_or_private_ip(ip) + ] def get_stats(self) -> Dict: - """Get statistics summary""" - suspicious_count = sum(1 for log in self.access_log if log.get('suspicious', False)) - honeypot_count = sum(1 for log in self.access_log if log.get('honeypot_triggered', False)) + """Get statistics summary from database.""" + if not self.db: + raise RuntimeError("Database not available for dashboard stats") + + # Get aggregate counts from database + stats = self.db.get_dashboard_counts() + + # Add detailed lists from database + stats["top_ips"] = self.db.get_top_ips(10) + stats["top_paths"] = self.db.get_top_paths(10) + stats["top_user_agents"] = self.db.get_top_user_agents(10) + stats["recent_suspicious"] = self.db.get_recent_suspicious(20) + stats["honeypot_triggered_ips"] = self.db.get_honeypot_triggered_ips() + stats["attack_types"] = self.db.get_recent_attacks(20) + stats["credential_attempts"] = self.db.get_credential_attempts(limit=50) + + return stats + + def cleanup_memory(self) -> None: + """ + Clean up in-memory structures to prevent unbounded growth. + Should be called periodically (e.g., every 5 minutes). 
+ + Trimming strategy: + - Keep most recent N entries in logs + - Remove oldest entries when limit exceeded + - Clean expired ban entries from ip_page_visits + """ + # Trim access_log to max size (keep most recent) + if len(self.access_log) > self.max_access_log_size: + self.access_log = self.access_log[-self.max_access_log_size :] + + # Trim credential_attempts to max size (keep most recent) + if len(self.credential_attempts) > self.max_credential_log_size: + self.credential_attempts = self.credential_attempts[ + -self.max_credential_log_size : + ] + + # Clean expired ban entries from ip_page_visits + current_time = datetime.now() + ips_to_clean = [] + for ip, data in self.ip_page_visits.items(): + ban_timestamp = data.get("ban_timestamp") + if ban_timestamp is not None: + try: + ban_time = datetime.fromisoformat(ban_timestamp) + time_diff = (current_time - ban_time).total_seconds() + if time_diff > self.ban_duration_seconds: + # Ban expired, reset the entry + data["count"] = 0 + data["ban_timestamp"] = None + except (ValueError, TypeError): + pass + + # Optional: Remove IPs with zero activity (advanced cleanup) + # Comment out to keep indefinite history of zero-activity IPs + # ips_to_remove = [ + # ip + # for ip, data in self.ip_page_visits.items() + # if data.get("count", 0) == 0 and data.get("ban_timestamp") is None + # ] + # for ip in ips_to_remove: + # del self.ip_page_visits[ip] + + def get_memory_stats(self) -> Dict[str, int]: + """ + Get current memory usage statistics for monitoring. 
+ + Returns: + Dictionary with counts of in-memory items + """ return { - 'total_accesses': len(self.access_log), - 'unique_ips': len(self.ip_counts), - 'unique_paths': len(self.path_counts), - 'suspicious_accesses': suspicious_count, - 'honeypot_triggered': honeypot_count, - 'honeypot_ips': len(self.honeypot_triggered), - 'top_ips': self.get_top_ips(10), - 'top_paths': self.get_top_paths(10), - 'top_user_agents': self.get_top_user_agents(10), - 'recent_suspicious': self.get_suspicious_accesses(20), - 'honeypot_triggered_ips': self.get_honeypot_triggered_ips(), - 'attack_types': self.get_attack_type_accesses(20), - 'credential_attempts': self.credential_attempts[-50:] # Last 50 attempts + "access_log_size": len(self.access_log), + "credential_attempts_size": len(self.credential_attempts), + "unique_ips_tracked": len(self.ip_counts), + "unique_paths_tracked": len(self.path_counts), + "unique_user_agents": len(self.user_agent_counts), + "unique_ip_page_visits": len(self.ip_page_visits), + "honeypot_triggered_ips": len(self.honeypot_triggered), } diff --git a/src/wordlists.py b/src/wordlists.py index bfa6f1a..1910fc7 100644 --- a/src/wordlists.py +++ b/src/wordlists.py @@ -13,124 +13,128 @@ from logger import get_app_logger class Wordlists: """Loads and provides access to wordlists from wordlists.json""" - + def __init__(self): self._data = self._load_config() - + def _load_config(self): """Load wordlists from JSON file""" - config_path = Path(__file__).parent.parent / 'wordlists.json' + config_path = Path(__file__).parent.parent / "wordlists.json" try: - with open(config_path, 'r') as f: + with open(config_path, "r") as f: return json.load(f) except FileNotFoundError: - get_app_logger().warning(f"Wordlists file {config_path} not found, using default values") + get_app_logger().warning( + f"Wordlists file {config_path} not found, using default values" + ) return self._get_defaults() except json.JSONDecodeError as e: get_app_logger().warning(f"Invalid JSON in 
{config_path}: {e}") return self._get_defaults() - + def _get_defaults(self): """Fallback default wordlists if JSON file is missing or invalid""" return { "usernames": { "prefixes": ["admin", "user", "root"], - "suffixes": ["", "_prod", "_dev"] + "suffixes": ["", "_prod", "_dev"], }, "passwords": { "prefixes": ["P@ssw0rd", "Admin"], - "simple": ["test", "demo", "password"] - }, - "emails": { - "domains": ["example.com", "test.com"] - }, - "api_keys": { - "prefixes": ["sk_live_", "api_", ""] + "simple": ["test", "demo", "password"], }, + "emails": {"domains": ["example.com", "test.com"]}, + "api_keys": {"prefixes": ["sk_live_", "api_", ""]}, "databases": { "names": ["production", "main_db"], - "hosts": ["localhost", "db.internal"] + "hosts": ["localhost", "db.internal"], }, - "applications": { - "names": ["WebApp", "Dashboard"] - }, - "users": { - "roles": ["Administrator", "User"] - } + "applications": {"names": ["WebApp", "Dashboard"]}, + "users": {"roles": ["Administrator", "User"]}, + "server_headers": ["Apache/2.4.41 (Ubuntu)", "nginx/1.18.0"], } - + @property def username_prefixes(self): return self._data.get("usernames", {}).get("prefixes", []) - + @property def username_suffixes(self): return self._data.get("usernames", {}).get("suffixes", []) - + @property def password_prefixes(self): return self._data.get("passwords", {}).get("prefixes", []) - + @property def simple_passwords(self): return self._data.get("passwords", {}).get("simple", []) - + @property def email_domains(self): return self._data.get("emails", {}).get("domains", []) - + @property def api_key_prefixes(self): return self._data.get("api_keys", {}).get("prefixes", []) - + @property def database_names(self): return self._data.get("databases", {}).get("names", []) - + @property def database_hosts(self): return self._data.get("databases", {}).get("hosts", []) - + @property def application_names(self): return self._data.get("applications", {}).get("names", []) - + @property def user_roles(self): 
return self._data.get("users", {}).get("roles", []) - + @property def directory_files(self): return self._data.get("directory_listing", {}).get("files", []) - + @property def directory_dirs(self): return self._data.get("directory_listing", {}).get("directories", []) - + @property def error_codes(self): return self._data.get("error_codes", []) - + @property def sql_errors(self): return self._data.get("sql_errors", {}) - + @property def attack_patterns(self): return self._data.get("attack_patterns", {}) - + @property def server_errors(self): return self._data.get("server_errors", {}) + @property + def server_headers(self): + return self._data.get("server_headers", []) + + @property + def attack_urls(self): + """Deprecated: use attack_patterns instead. Returns attack_patterns for backward compatibility.""" + return self._data.get("attack_patterns", {}) + _wordlists_instance = None + def get_wordlists(): """Get the singleton Wordlists instance""" global _wordlists_instance if _wordlists_instance is None: _wordlists_instance = Wordlists() return _wordlists_instance - diff --git a/src/xss_detector.py b/src/xss_detector.py index 0f3da14..618ccb2 100644 --- a/src/xss_detector.py +++ b/src/xss_detector.py @@ -8,25 +8,25 @@ from wordlists import get_wordlists def detect_xss_pattern(input_string: str) -> bool: if not input_string: return False - + wl = get_wordlists() - xss_pattern = wl.attack_patterns.get('xss_attempt', '') - + xss_pattern = wl.attack_patterns.get("xss_attempt", "") + if not xss_pattern: - xss_pattern = r'( str: xss_detected = False reflected_content = [] - + for key, value in input_data.items(): if detect_xss_pattern(value): xss_detected = True reflected_content.append(f"

{key}: {value}

") - + if xss_detected: html = f""" @@ -51,7 +51,7 @@ def generate_xss_response(input_data: dict) -> str: """ return html - + return """ diff --git a/tests/test_credentials.sh b/tests/test_credentials.sh new file mode 100755 index 0000000..68ee2c0 --- /dev/null +++ b/tests/test_credentials.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# This script sends various POST requests with credentials to the honeypot + +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +# Configuration +HOST="localhost" +PORT="5000" +BASE_URL="http://${HOST}:${PORT}" + +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE}Krawl Credential Logging Test Script${NC}" +echo -e "${BLUE}========================================${NC}\n" + +# Check if server is running +echo -e "${YELLOW}Checking if server is running on ${BASE_URL}...${NC}" +if ! curl -s -f "${BASE_URL}/health" > /dev/null 2>&1; then + echo -e "${RED}❌ Server is not running. Please start the Krawl server first.${NC}" + echo -e "${YELLOW}Run: python3 src/server.py${NC}" + exit 1 +fi +echo -e "${GREEN}✓ Server is running${NC}\n" + +# Test 1: Simple login form POST +echo -e "${YELLOW}Test 1: POST to /login with form data${NC}" +curl -s -X POST "${BASE_URL}/login" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=admin&password=admin123" \ + > /dev/null +echo -e "${GREEN}✓ Sent: admin / admin123${NC}\n" + +sleep 1 + +# Test 2: Admin panel login +echo -e "${YELLOW}Test 2: POST to /admin with credentials${NC}" +curl -s -X POST "${BASE_URL}/admin" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "user=root&pass=toor&submit=Login" \ + > /dev/null +echo -e "${GREEN}✓ Sent: root / toor${NC}\n" + +sleep 1 + +# Test 3: WordPress login attempt +echo -e "${YELLOW}Test 3: POST to /wp-login.php${NC}" +curl -s -X POST "${BASE_URL}/wp-login.php" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "log=wpuser&pwd=Password1&wp-submit=Log+In" 
\ + > /dev/null +echo -e "${GREEN}✓ Sent: wpuser / Password1${NC}\n" + +sleep 1 + +# Test 4: JSON formatted credentials +echo -e "${YELLOW}Test 4: POST to /api/login with JSON${NC}" +curl -s -X POST "${BASE_URL}/api/login" \ + -H "Content-Type: application/json" \ + -d '{"username":"apiuser","password":"apipass123","remember":true}' \ + > /dev/null +echo -e "${GREEN}✓ Sent: apiuser / apipass123${NC}\n" + +sleep 1 + +# Test 5: SSH-style login (an HTTP form POST to /ssh, not the SSH protocol) +echo -e "${YELLOW}Test 5: POST to /ssh with credentials${NC}" +curl -s -X POST "${BASE_URL}/ssh" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "username=sshuser&password=P@ssw0rd!" \ + > /dev/null +echo -e "${GREEN}✓ Sent: sshuser / P@ssw0rd!${NC}\n" + +sleep 1 + +# Test 6: Database admin +echo -e "${YELLOW}Test 6: POST to /phpmyadmin with credentials${NC}" +curl -s -X POST "${BASE_URL}/phpmyadmin" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "pma_username=dbadmin&pma_password=dbpass123&server=1" \ + > /dev/null +echo -e "${GREEN}✓ Sent: dbadmin / dbpass123${NC}\n" + +sleep 1 + +# Test 7: Multiple fields with email +echo -e "${YELLOW}Test 7: POST to /register with email${NC}" +curl -s -X POST "${BASE_URL}/register" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "email=test@example.com&username=newuser&password=NewPass123&confirm_password=NewPass123" \ + > /dev/null +echo -e "${GREEN}✓ Sent: newuser / NewPass123 (email: test@example.com)${NC}\n" + +sleep 1 + +# Test 8: FTP credentials +echo -e "${YELLOW}Test 8: POST to /ftp/login${NC}" +curl -s -X POST "${BASE_URL}/ftp/login" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "ftpuser=ftpadmin&ftppass=ftp123456" \ + > /dev/null +echo -e "${GREEN}✓ Sent: ftpadmin / ftp123456${NC}\n" + +sleep 1 + +# Test 9: Common brute force attempt (three passwords for the same user, 0.5 s apart) +echo -e "${YELLOW}Test 9: Multiple attempts (simulating brute force)${NC}" +for i in {1..3}; do + curl -s -X POST "${BASE_URL}/login" \ + -H "Content-Type:
application/x-www-form-urlencoded" \ + -d "username=admin&password=pass${i}" \ + > /dev/null + echo -e "${GREEN}✓ Attempt $i: admin / pass${i}${NC}" + sleep 0.5 +done +echo "" + +sleep 1 + +# Test 10: Special characters in credentials (--data-urlencode makes curl percent-encode the values) +echo -e "${YELLOW}Test 10: POST with special characters${NC}" +curl -s -X POST "${BASE_URL}/login" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + --data-urlencode "username=user@domain.com" \ + --data-urlencode "password=P@\$\$w0rd!#%" \ + > /dev/null +echo -e "${GREEN}✓ Sent: user@domain.com / P@\$\$w0rd!#%${NC}\n" + +echo -e "${BLUE}========================================${NC}" +echo -e "${GREEN}✓ All credential tests completed!${NC}" +echo -e "${BLUE}========================================${NC}\n" + +echo -e "${YELLOW}Check the results:${NC}" +echo -e " 1. View the log file: ${GREEN}tail -20 logs/credentials.log${NC}" +echo -e " 2. View the dashboard: ${GREEN}${BASE_URL}/dashboard${NC}" +echo -e " 3. Check recent logs: ${GREEN}tail -20 logs/access.log ${NC}\n" + +# Display last 10 credential entries if log file exists. NOTE(review): this checks src/logs/credentials.log while the hint printed above says logs/credentials.log - confirm which path the server actually writes to +if [ -f "src/logs/credentials.log" ]; then + echo -e "${BLUE}========================================${NC}" + echo -e "${BLUE}Last 10 Captured Credentials:${NC}" + echo -e "${BLUE}========================================${NC}" + tail -10 src/logs/credentials.log + echo "" +fi + +echo -e "${YELLOW}💡 Tip: Open ${BASE_URL}/dashboard in your browser to see the credentials in real-time!${NC}" diff --git a/tests/test_insert_fake_ips.py b/tests/test_insert_fake_ips.py new file mode 100644 index 0000000..6a2c621 --- /dev/null +++ b/tests/test_insert_fake_ips.py @@ -0,0 +1,572 @@ +#!/usr/bin/env python3 + +""" +Test script to insert fake external IPs into the database for testing the dashboard. +This generates realistic-looking test data including: +- Access logs with various suspicious activities +- Credential attempts +- Attack detections (SQL injection, XSS, etc.)
+- Category behavior changes for timeline demonstration +- Geolocation data fetched from API with reverse geocoded city names +- Real good crawler IPs (Googlebot, Bingbot, etc.) + +Usage: + python test_insert_fake_ips.py [num_ips] [logs_per_ip] [credentials_per_ip] [--no-cleanup] + +Examples: + python test_insert_fake_ips.py # Generate 20 IPs with defaults, cleanup DB first + python test_insert_fake_ips.py 30 # Generate 30 IPs with defaults + python test_insert_fake_ips.py 30 20 5 # Generate 30 IPs, 5 credentials each (the logs_per_ip value, 20, is accepted but not read by generate_fake_data) + python test_insert_fake_ips.py --no-cleanup # Generate data without cleaning DB first + +Note: This script will make API calls to fetch geolocation data, so it may take a while. +""" + +import random +import time +import sys +from datetime import datetime, timedelta +from zoneinfo import ZoneInfo +from pathlib import Path +import requests + +# Add parent src directory to path so we can import database, logger and geo_utils +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from database import get_database +from logger import get_app_logger +from geo_utils import extract_city_from_coordinates + +# ---------------------- +# TEST DATA GENERATORS +# ---------------------- + +# Fake IPs for testing - geolocation data will be fetched from API +# These are meant to be real public IPs from various locations around the world (NOTE(review): 198.51.100.89 lies in 198.51.100.0/24, the RFC 5737 TEST-NET-2 documentation range) +FAKE_IPS = [ + # United States + "45.142.120.10", + "107.189.10.143", + "162.243.175.23", + "198.51.100.89", + # Europe + "185.220.101.45", + "195.154.133.20", + "178.128.83.165", + "87.251.67.90", + "91.203.5.165", + "46.105.57.169", + "217.182.143.207", + "188.166.123.45", + # Asia + "103.253.145.36", + "42.112.28.216", + "118.163.74.160", + "43.229.53.35", + "115.78.208.140", + "14.139.56.18", + "61.19.25.207", + "121.126.219.198", + "202.134.4.212", + "171.244.140.134", + # South America + "177.87.169.20", + "200.21.19.58", + "181.13.140.98", + "190.150.24.34", + # Middle East & Africa + "41.223.53.141", + "196.207.35.152", +
"5.188.62.214", + "37.48.93.125", + "102.66.137.29", + # Australia & Oceania + "103.28.248.110", + "202.168.45.33", + # Additional European IPs + "94.102.49.190", + "213.32.93.140", + "79.137.79.167", + "37.9.169.146", + "188.92.80.123", + "80.240.25.198", +] + +# Real good crawler IPs (Googlebot, Bingbot, etc.) - geolocation will be fetched from API +GOOD_CRAWLER_IPS = [ + "66.249.66.1", # Googlebot + "66.249.79.23", # Googlebot + "40.77.167.52", # Bingbot + "157.55.39.145", # Bingbot + "17.58.98.100", # Applebot + "199.59.150.39", # Twitterbot + "54.236.1.15", # Amazon Bot +] + +FAKE_PATHS = [ + "/admin", + "/login", + "/admin/login", + "/api/users", + "/wp-admin", + "/.env", + "/config.php", + "/admin.php", + "/shell.php", + "/../../../etc/passwd", + "/sqlmap", + "/w00t.php", + "/shell", + "/joomla/administrator", +] + +FAKE_USER_AGENTS = [ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", + "Nmap Scripting Engine", + "curl/7.68.0", + "python-requests/2.28.1", + "sqlmap/1.6.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", + "ZmEu", + "nikto/2.1.6", +] + +FAKE_CREDENTIALS = [ + ("admin", "admin"), + ("admin", "password"), + ("root", "123456"), + ("test", "test"), + ("guest", "guest"), + ("user", "12345"), +] + +ATTACK_TYPES = [ + "sql_injection", + "xss_attempt", + "path_traversal", + "suspicious_pattern", + "credential_submission", +] + +CATEGORIES = [ + "attacker", + "bad_crawler", + "good_crawler", + "regular_user", + "unknown", +] + + +def generate_category_scores(): + """Return a dict with an independent random integer score in 0..100 for each of the five category names.""" + scores = { + "attacker": random.randint(0, 100), + "good_crawler": random.randint(0, 100), + "bad_crawler": random.randint(0, 100), + "regular_user": random.randint(0, 100), + "unknown": random.randint(0, 100), + } + return scores + + +def generate_analyzed_metrics(): + """Return a dict of random metrics: request_frequency and attack_diversity as floats, suspicious_patterns and credential_attempts as ints.""" + return { + "request_frequency": random.uniform(0.1, 100.0), + "suspicious_patterns": random.randint(0, 20), + "credential_attempts":
random.randint(0, 10), + "attack_diversity": random.uniform(0, 1.0), + } + + +def cleanup_database(db_manager, app_logger): + """ + Delete every row from the AttackDetection, AccessLog, CredentialAttempt, CategoryHistory and IpStats tables and commit; on any error the session is rolled back and the exception re-raised. + + Args: + db_manager: Database manager instance + app_logger: Logger instance + """ + from models import ( + AccessLog, + CredentialAttempt, + AttackDetection, + IpStats, + CategoryHistory, + ) + + app_logger.info("=" * 60) + app_logger.info("Cleaning up existing database data") + app_logger.info("=" * 60) + + session = db_manager.session + try: + # Delete all records from each table (AttackDetection first - presumably because it references AccessLog; TODO confirm) + deleted_attack_detections = session.query(AttackDetection).delete() + deleted_access_logs = session.query(AccessLog).delete() + deleted_credentials = session.query(CredentialAttempt).delete() + deleted_category_history = session.query(CategoryHistory).delete() + deleted_ip_stats = session.query(IpStats).delete() + + session.commit() + + app_logger.info(f"Deleted {deleted_access_logs} access logs") + app_logger.info(f"Deleted {deleted_attack_detections} attack detections") + app_logger.info(f"Deleted {deleted_credentials} credential attempts") + app_logger.info(f"Deleted {deleted_category_history} category history records") + app_logger.info(f"Deleted {deleted_ip_stats} IP statistics") + app_logger.info("✓ Database cleanup complete") + except Exception as e: + session.rollback() + app_logger.error(f"Error during database cleanup: {e}") + raise + finally: + db_manager.close_session() + + +def fetch_geolocation_from_api(ip: str, app_logger) -> tuple: + """ + Fetch geolocation data from the IP reputation API. + Uses the first entry of the response's "results" list and derives the city via extract_city_from_coordinates.
+ + Args: + ip: IP address to lookup + app_logger: Logger instance + + Returns: + Tuple of (country_code, city, asn, asn_org), or None on a request error, a non-200 status, an empty "results" list, or a processing error + """ + try: + api_url = "https://iprep.lcrawl.com/api/iprep/" + params = {"cidr": ip} + headers = {"Content-Type": "application/json"} + response = requests.get(api_url, headers=headers, params=params, timeout=10) + + if response.status_code == 200: + payload = response.json() + if payload.get("results"): + results = payload["results"] + + # Take the first result (NOTE(review): assumes the API returns results sorted newest-first by record_added - confirm) + most_recent = results[0] + geoip_data = most_recent.get("geoip_data", {}) + + country_code = geoip_data.get("country_iso_code") + asn = geoip_data.get("asn_autonomous_system_number") + asn_org = geoip_data.get("asn_autonomous_system_organization") + + # Extract city from coordinates using reverse geocoding + city = extract_city_from_coordinates(geoip_data) + + return (country_code, city, asn, asn_org) + except requests.RequestException as e: + app_logger.warning(f"Failed to fetch geolocation for {ip}: {e}") + except Exception as e: + app_logger.error(f"Error processing geolocation for {ip}: {e}") + + return None + + +def generate_fake_data( + num_ips: int = 20, + logs_per_ip: int = 15, + credentials_per_ip: int = 3, + include_good_crawlers: bool = True, + cleanup: bool = True, +): + """ + Generate and insert fake test data into the database.
+ + Args: + num_ips: Number of unique fake IPs to generate, capped at len(FAKE_IPS) (default: 20) + logs_per_ip: Currently not read by this function - per-IP log counts come from the randomized request_counts distribution built below (default: 15) + credentials_per_ip: Number of credential attempts per IP (default: 3) + include_good_crawlers: Whether to add real good crawler IPs with API-fetched geolocation (default: True) + cleanup: Whether to clean up existing database data before generating new data (default: True) + """ + db_manager = get_database() + app_logger = get_app_logger() + + # Ensure database is initialized + if not db_manager._initialized: + db_manager.initialize() + + # Clean up existing data if requested + if cleanup: + cleanup_database(db_manager, app_logger) + print() # Add blank line for readability + + app_logger.info("=" * 60) + app_logger.info("Starting fake IP data generation for testing") + app_logger.info("=" * 60) + + total_logs = 0 + total_credentials = 0 + total_attacks = 0 + total_category_changes = 0 + + # Select random IPs from the pool + selected_ips = random.sample(FAKE_IPS, min(num_ips, len(FAKE_IPS))) + + # Create a varied distribution of request counts for better visualization + # Some IPs with very few requests, some with moderate, some with high + request_counts = [] + for i in range(len(selected_ips)): + if i < len(selected_ips) // 5: # 20% high-traffic IPs + count = random.randint(1000, 10000) + elif i < len(selected_ips) // 2: # 30% medium-traffic IPs + count = random.randint(100, 1000) + else: # 50% low-traffic IPs + count = random.randint(5, 100) + request_counts.append(count) + + random.shuffle(request_counts) # Randomize the order + + for idx, ip in enumerate(selected_ips): + current_logs_count = request_counts[idx] + app_logger.info( + f"\nGenerating data for IP: {ip} ({current_logs_count} requests)" + ) + + # Generate access logs for this IP (count comes from request_counts, not logs_per_ip) + for _ in range(current_logs_count): + path = random.choice(FAKE_PATHS) + user_agent = random.choice(FAKE_USER_AGENTS) + is_suspicious = random.choice( + [True, False, False] + ) # 33% chance
of suspicious + is_honeypot = random.choice( + [True, False, False, False] + ) # 25% chance of honeypot trigger + + # Randomly decide if this log has attack detections + attack_types = None + if random.choice([True, False, False]): # 33% chance + num_attacks = random.randint(1, 3) + attack_types = random.sample(ATTACK_TYPES, num_attacks) + + log_id = db_manager.persist_access( + ip=ip, + path=path, + user_agent=user_agent, + method=random.choice(["GET", "POST"]), + is_suspicious=is_suspicious, + is_honeypot_trigger=is_honeypot, + attack_types=attack_types, + ) + + if log_id: + total_logs += 1 + if attack_types: + total_attacks += len(attack_types) + + # Generate credential attempts for this IP + for _ in range(credentials_per_ip): + username, password = random.choice(FAKE_CREDENTIALS) + path = random.choice(["/login", "/admin/login", "/api/auth"]) + + cred_id = db_manager.persist_credential( + ip=ip, + path=path, + username=username, + password=password, + ) + + if cred_id: + total_credentials += 1 + + app_logger.info(f" ✓ Generated {current_logs_count} access logs") + app_logger.info(f" ✓ Generated {credentials_per_ip} credential attempts") + + # Fetch geolocation data from API + app_logger.info(f" 🌍 Fetching geolocation from API...") + geo_data = fetch_geolocation_from_api(ip, app_logger) + + if geo_data: + country_code, city, asn, asn_org = geo_data + db_manager.update_ip_rep_infos( + ip=ip, + country_code=country_code, + asn=asn if asn else 12345, # 12345 is a placeholder ASN used when the API returned none + asn_org=asn_org or "Unknown", + list_on={}, + city=city, + ) + location_display = ( + f"{city}, {country_code}" if city else country_code or "Unknown" + ) + app_logger.info( + f" 📍 API-fetched geolocation: {location_display} ({asn_org or 'Unknown'})" + ) + else: + app_logger.warning(f" ⚠ Could not fetch geolocation for {ip}") + + # Small delay to be nice to the API + time.sleep(0.5) + + # Trigger behavior/category changes to demonstrate timeline feature + # First analysis + initial_category =
random.choice(CATEGORIES) + app_logger.info( + f" ⟳ Analyzing behavior - Initial category: {initial_category}" + ) + + db_manager.update_ip_stats_analysis( + ip=ip, + analyzed_metrics=generate_analyzed_metrics(), + category=initial_category, + category_scores=generate_category_scores(), + last_analysis=datetime.now(tz=ZoneInfo("UTC")), + ) + total_category_changes += 1 + + # Small delay to ensure timestamps are different + time.sleep(0.1) + + # Second analysis with potential category change (70% chance) + if random.random() < 0.7: + new_category = random.choice( + [c for c in CATEGORIES if c != initial_category] + ) + app_logger.info( + f" ⟳ Behavior change detected: {initial_category} → {new_category}" + ) + + db_manager.update_ip_stats_analysis( + ip=ip, + analyzed_metrics=generate_analyzed_metrics(), + category=new_category, + category_scores=generate_category_scores(), + last_analysis=datetime.now(tz=ZoneInfo("UTC")), + ) + total_category_changes += 1 + + # Optional third change (40% chance; it uses new_category, so it only applies after a second change) + if random.random() < 0.4: + final_category = random.choice( + [c for c in CATEGORIES if c != new_category] + ) + app_logger.info( + f" ⟳ Another behavior change: {new_category} → {final_category}" + ) + + time.sleep(0.1) + db_manager.update_ip_stats_analysis( + ip=ip, + analyzed_metrics=generate_analyzed_metrics(), + category=final_category, + category_scores=generate_category_scores(), + last_analysis=datetime.now(tz=ZoneInfo("UTC")), + ) + total_category_changes += 1 + + # Add good crawler IPs with real geolocation from API + total_good_crawlers = 0 + if include_good_crawlers: + app_logger.info("\n" + "=" * 60) + app_logger.info("Adding Good Crawler IPs with API-fetched geolocation") + app_logger.info("=" * 60) + + for crawler_ip in GOOD_CRAWLER_IPS: + app_logger.info(f"\nProcessing Good Crawler: {crawler_ip}") + + # Fetch real geolocation from API + geo_data = fetch_geolocation_from_api(crawler_ip, app_logger) + + # Don't generate access logs for good crawlers to
prevent re-categorization + # We'll just create the IP stats entry with the category set + app_logger.info( + f" ✓ Adding as good crawler (no logs to prevent re-categorization)" + ) + + # First, we need to create the IP in the database via persist_access + # (but we'll only create one minimal log entry) + db_manager.persist_access( + ip=crawler_ip, + path="/robots.txt", # Minimal, normal crawler behavior + user_agent="Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", # the Googlebot UA is used for every crawler IP, including the non-Google ones + method="GET", + is_suspicious=False, + is_honeypot_trigger=False, + attack_types=None, + ) + + # Add geolocation if API fetch was successful + if geo_data: + country_code, city, asn, asn_org = geo_data + db_manager.update_ip_rep_infos( + ip=crawler_ip, + country_code=country_code, + asn=asn if asn else 12345, # 12345 is a placeholder ASN used when the API returned none + asn_org=asn_org, + list_on={}, + city=city, + ) + app_logger.info( + f" 📍 API-fetched geolocation: {city}, {country_code} ({asn_org})" + ) + else: + app_logger.warning(f" ⚠ Could not fetch geolocation for {crawler_ip}") + + # Set category to good_crawler. NOTE(review): this comment originally claimed the call sets manual_category=True to prevent re-analysis, but no such argument is passed here - confirm in update_ip_stats_analysis + db_manager.update_ip_stats_analysis( + ip=crawler_ip, + analyzed_metrics={ + "request_frequency": 0.1, # Very low frequency + "suspicious_patterns": 0, + "credential_attempts": 0, + "attack_diversity": 0.0, + }, + category="good_crawler", + category_scores={ + "attacker": 0, + "good_crawler": 100, + "bad_crawler": 0, + "regular_user": 0, + "unknown": 0, + }, + last_analysis=datetime.now(tz=ZoneInfo("UTC")), + ) + total_good_crawlers += 1 + time.sleep(0.5) # Small delay between API calls + + # Print summary + app_logger.info("\n" + "=" * 60) + app_logger.info("Test Data Generation Complete!") + app_logger.info("=" * 60) + app_logger.info(f"Total IPs created: {len(selected_ips) + total_good_crawlers}") + app_logger.info(f" - Attackers/Mixed: {len(selected_ips)}") + app_logger.info(f" - Good Crawlers: {total_good_crawlers}") + app_logger.info(f"Total access logs:
{total_logs}") + app_logger.info(f"Total attack detections: {total_attacks}") + app_logger.info(f"Total credential attempts: {total_credentials}") + app_logger.info(f"Total category changes: {total_category_changes}") + app_logger.info("=" * 60) + app_logger.info("\nYou can now view the dashboard with this test data.") + app_logger.info( + "The 'Behavior Timeline' will show category transitions for each IP." + ) + app_logger.info( + "All IPs have API-fetched geolocation with reverse geocoded city names." + ) + app_logger.info("Run: python server.py") + app_logger.info("=" * 60) + + +if __name__ == "__main__": + import sys + + # Positional values come from the non-flag tokens only, so --no-cleanup may appear in any position (int("--no-cleanup") used to raise ValueError) + positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")] + num_ips = int(positional[0]) if len(positional) > 0 else 20 + logs_per_ip = int(positional[1]) if len(positional) > 1 else 15 + credentials_per_ip = int(positional[2]) if len(positional) > 2 else 3 + cleanup = "--no-cleanup" not in sys.argv # --no-cleanup skips the database cleanup + + generate_fake_data( + num_ips, + logs_per_ip, + credentials_per_ip, + include_good_crawlers=True, + cleanup=cleanup, + ) diff --git a/wordlists.json b/wordlists.json index c0f1a17..3ea6f40 100644 --- a/wordlists.json +++ b/wordlists.json @@ -353,10 +353,21 @@ } }, "attack_patterns": { - "path_traversal": "\\.\\.", + "path_traversal": "(\\.\\.|%2e%2e|%252e%252e|\\.{2,}|%c0%ae|%c1%9c)", "sql_injection": "('|\"|`|--|#|/\\*|\\*/|\\bunion\\b|\\bunion\\s+select\\b|\\bor\\b.*=.*|\\band\\b.*=.*|'.*or.*'.*=.*'|\\bsleep\\b|\\bwaitfor\\b|\\bdelay\\b|\\bbenchmark\\b|;.*select|;.*drop|;.*insert|;.*update|;.*delete|\\bexec\\b|\\bexecute\\b|\\bxp_cmdshell\\b|information_schema|table_schema|table_name)", "xss_attempt": "(