feat: initial Dockerized domain intelligence dashboard
- FastAPI backend with DuckDB pushdown queries on 72M parquet - Async enrichment worker: HTTP, SSL, DNS MX, CMS fingerprint, ip-api.com - Resumable parquet download with HTTP Range support - Lead scoring engine (max 100 pts, target countries ES,GB,DE,FR,RO,PT,AD,IT) - Single-file Alpine.js + Chart.js dashboard on port 6677 - SQLite enrichment DB with job queue and scores tables - Dockerized with persistent /data volume Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
270
app/enricher.py
Normal file
270
app/enricher.py
Normal file
@@ -0,0 +1,270 @@
|
||||
import asyncio
|
||||
import os
|
||||
import ssl
|
||||
import socket
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
import dns.resolver
|
||||
import aiosqlite
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from app.db import SQLITE_PATH
|
||||
from app.scorer import score
|
||||
|
||||
# Module-level logger for the enrichment worker.
logger = logging.getLogger(__name__)

# Maximum number of domains enriched concurrently (overridable via env var).
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "50"))

# ip-api.com free tier: 45 req/min → ~1.33/s.  Lookups are funneled through a
# width-1 semaphore that is created lazily by get_ip_semaphore().
IP_API_SEMAPHORE: Optional[asyncio.Semaphore] = None
IP_API_RATE = 45  # allowed ip-api.com requests per minute

# Background-worker bookkeeping shared by start/pause/resume/is_running.
_worker_task: Optional[asyncio.Task] = None
_paused = False


def get_ip_semaphore():
    """Return the shared width-1 semaphore that serializes ip-api.com calls.

    Created on first use (rather than at import time) so it is bound to the
    event loop that is actually running the worker.
    """
    global IP_API_SEMAPHORE
    sem = IP_API_SEMAPHORE
    if sem is None:
        sem = IP_API_SEMAPHORE = asyncio.Semaphore(1)
    return sem
|
||||
|
||||
|
||||
# Substring fingerprints used to identify the CMS behind a site.  Each entry is
# matched case-insensitively against the page HTML plus the response headers.
CMS_SIGNATURES = {
    "wordpress": ["/wp-content/", "/wp-includes/", 'name="generator" content="WordPress'],
    "joomla": ["/components/com_", "Joomla!", 'name="generator" content="Joomla'],
    "drupal": ["/sites/default/files/", "Drupal.settings", 'name="generator" content="Drupal'],
    "wix": ["wix.com", "X-Wix-"],
    "squarespace": ["squarespace.com", "X-Squarespace-"],
    "shopify": ["cdn.shopify.com", "Shopify.theme"],
    "prestashop": ["PrestaShop", "/modules/"],
    "magento": ["Mage.Cookies", "X-Magento-"],
    "typo3": ["typo3", "TYPO3 CMS"],
    "opencart": ["route=common/home", "OpenCart"],
}


def detect_cms(html: str, headers: dict) -> Optional[str]:
    """Best-effort CMS detection from page HTML and response headers.

    Args:
        html: Raw response body; only the first 50 kB are inspected.
        headers: Response headers as a plain dict (name -> value).

    Returns:
        The first matching CMS key from CMS_SIGNATURES, or None if nothing
        matches.
    """
    # Build and lowercase the searched text ONCE, instead of re-lowercasing
    # the whole combined string for every signature as the original did.
    # A space separates the HTML slice from the header dump so a signature
    # cannot falsely match across the concatenation boundary.
    haystack = (
        html[:50000] + " " + " ".join(f"{k}:{v}" for k, v in headers.items())
    ).lower()
    for cms, signatures in CMS_SIGNATURES.items():
        if any(sig.lower() in haystack for sig in signatures):
            return cms
    return None
|
||||
|
||||
|
||||
async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
    """Check whether *domain* presents a valid TLS certificate on port 443.

    Args:
        domain: Hostname to probe (connected directly on port 443).

    Returns:
        ``(True, days_until_expiry)`` on a successful, validated handshake;
        ``(False, None)`` on any failure (connection refused, invalid cert,
        unparsable expiry, timeout).
    """
    try:
        ctx = ssl.create_default_context()
        # get_running_loop() is the modern, non-deprecated spelling inside a
        # coroutine (get_event_loop() is deprecated here since 3.10).
        loop = asyncio.get_running_loop()

        def _check():
            # Blocking socket work — executed in the default thread pool so
            # the event loop is not stalled for up to 5 s.
            with socket.create_connection((domain, 443), timeout=5) as sock:
                with ctx.wrap_socket(sock, server_hostname=domain) as ssock:
                    cert = ssock.getpeercert()
                    # notAfter looks like "Jun  1 12:00:00 2025 GMT" and is
                    # naive UTC; a missing field makes strptime raise, which
                    # the outer except maps to (False, None).
                    expiry_str = cert.get("notAfter", "")
                    expiry = datetime.datetime.strptime(expiry_str, "%b %d %H:%M:%S %Y %Z")
                    # Compare naive-UTC against naive-UTC; utcnow() is
                    # deprecated, so derive the same value from an aware now.
                    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
                    return True, (expiry - now).days

        return await loop.run_in_executor(None, _check)
    except Exception:
        # Any failure counts as "no valid SSL" rather than raising.
        return False, None
|
||||
|
||||
|
||||
async def check_mx(domain: str) -> bool:
    """Return True if *domain* publishes at least one MX record.

    dnspython's resolver is synchronous/blocking, so the lookup runs in the
    default thread-pool executor with a 5 s lifetime.  Any failure —
    NXDOMAIN, timeout, no answer, resolver misconfiguration — is reported
    as False rather than raised.
    """
    try:
        # Modern replacement for the deprecated get_event_loop()-in-coroutine
        # pattern; behavior is identical here since a loop is running.
        loop = asyncio.get_running_loop()

        def _check() -> bool:
            try:
                answers = dns.resolver.resolve(domain, "MX", lifetime=5)
                return len(answers) > 0
            except Exception:
                # Deliberate best-effort: absence of MX is a signal, not an error.
                return False

        return await loop.run_in_executor(None, _check)
    except Exception:
        # Defensive outer guard (e.g. executor shutdown); mirrors original.
        return False
|
||||
|
||||
|
||||
# Loop-clock timestamp (loop.time()) of the last ip-api.com request; used by
# get_ip_country() to space requests out to the free-tier rate limit.
_ip_last_call = 0.0
# Original read `asyncio.Lock() if False else None`, a dead conditional that
# always yields None.  NOTE(review): _ip_lock is never acquired anywhere in
# this module — rate limiting is serialized by the width-1 semaphore instead.
_ip_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
async def get_ip_country(ip: str) -> Optional[str]:
    """Look up the ISO country code for *ip* via ip-api.com's free tier.

    Calls are serialized through the shared width-1 semaphore and spaced to
    at most IP_API_RATE requests per minute (45/min ≈ one per 1.33 s).

    Args:
        ip: Dotted-quad IP address to geolocate.

    Returns:
        Two-letter country code, or None on any failure (network error,
        non-200 response, rate-limit rejection).
    """
    global _ip_last_call
    async with get_ip_semaphore():
        # Enforce 45 req/min => one request every ~1.33 s, measured on the
        # loop clock.  get_running_loop() replaces the deprecated
        # get_event_loop()-in-coroutine pattern; same loop, same clock.
        loop = asyncio.get_running_loop()
        now = loop.time()
        wait = (1 / (IP_API_RATE / 60)) - (now - _ip_last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _ip_last_call = loop.time()

        try:
            async with httpx.AsyncClient(timeout=5) as client:
                resp = await client.get(f"http://ip-api.com/json/{ip}?fields=countryCode")
                if resp.status_code == 200:
                    return resp.json().get("countryCode")
        except Exception:
            # Best-effort enrichment: geo-lookup failures are non-fatal.
            pass
    return None
|
||||
|
||||
|
||||
async def enrich_domain(domain: str) -> dict:
    """Probe one domain and return a flat dict of enrichment fields.

    Performs, in order: an HTTP GET (liveness, status, server header, page
    title, CMS fingerprint), a DNS A lookup feeding a geo-IP country lookup,
    an independent TLS certificate check, an MX lookup, and finally lead
    scoring.  Never raises: HTTP-phase failures are recorded in "error",
    everything else degrades to the defaults below.
    """
    # Default record — every key is always present so save_enriched() can
    # rely on the dict shape regardless of which probes succeeded.
    result = {
        "domain": domain,
        "is_live": False,
        "status_code": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        "cms": None,
        "has_mx": False,
        "ip_country": None,
        "page_title": None,
        "server": None,
        "enriched_at": datetime.datetime.utcnow().isoformat(),
        "error": None,
    }

    try:
        async with httpx.AsyncClient(
            timeout=10,
            follow_redirects=True,
            # verify=False looks deliberate: the goal is detecting live sites
            # even with broken certs; cert validity is checked separately by
            # check_ssl() below.  NOTE(review): confirm this is intended.
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            resp = await client.get(f"http://{domain}")
            # With follow_redirects=True this is the FINAL hop's status, so
            # the 3xx codes here only match redirect loops/dead-ends.
            result["is_live"] = resp.status_code in (200, 301, 302, 303, 307, 308)
            result["status_code"] = resp.status_code
            result["server"] = resp.headers.get("server")

            html = resp.text
            soup = BeautifulSoup(html, "html.parser")
            title_tag = soup.find("title")
            # Title capped at 500 chars — presumably to fit the DB column.
            result["page_title"] = title_tag.get_text(strip=True)[:500] if title_tag else None
            result["cms"] = detect_cms(html, dict(resp.headers))

            # Resolve the IP for the country lookup.  gethostbyname is
            # blocking, so it runs in the executor; geo failures are
            # swallowed on purpose (best-effort field).
            try:
                loop = asyncio.get_event_loop()
                ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
                result["ip_country"] = await get_ip_country(ip)
            except Exception:
                pass

    except Exception as e:
        # HTTP phase failed entirely; keep the defaults and record why
        # (truncated to 500 chars for storage).
        result["error"] = str(e)[:500]

    # SSL check — independent of the HTTP probe above, so a dead HTTP site
    # can still report certificate state.
    ssl_valid, ssl_days = await check_ssl(domain)
    result["ssl_valid"] = ssl_valid
    result["ssl_expiry_days"] = ssl_days

    # MX check (email capability signal).
    result["has_mx"] = await check_mx(domain)

    # Lead score computed from the assembled record.
    result["score"] = score(result)

    return result
|
||||
|
||||
|
||||
async def save_enriched(data: dict):
    """Upsert one enrichment record into SQLite.

    Writes the full record to `enriched_domains` and the score to the
    separate `scores` table, then commits both in a single transaction.
    Expects *data* in the exact shape produced by enrich_domain()
    (all keys present, "score" included).
    """
    async with aiosqlite.connect(SQLITE_PATH) as db:
        # Upsert keyed on domain: insert new rows, overwrite every mutable
        # column on conflict so re-enrichment refreshes the record.
        await db.execute(
            """INSERT INTO enriched_domains
            (domain, is_live, status_code, ssl_valid, ssl_expiry_days, cms,
            has_mx, ip_country, page_title, server, enriched_at, error, score)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)
            ON CONFLICT(domain) DO UPDATE SET
            is_live=excluded.is_live, status_code=excluded.status_code,
            ssl_valid=excluded.ssl_valid, ssl_expiry_days=excluded.ssl_expiry_days,
            cms=excluded.cms, has_mx=excluded.has_mx, ip_country=excluded.ip_country,
            page_title=excluded.page_title, server=excluded.server,
            enriched_at=excluded.enriched_at, error=excluded.error, score=excluded.score""",
            (
                data["domain"], data["is_live"], data["status_code"],
                data["ssl_valid"], data["ssl_expiry_days"], data["cms"],
                data["has_mx"], data["ip_country"], data["page_title"],
                data["server"], data["enriched_at"], data["error"], data["score"],
            ),
        )
        # Mirror the score into the scores table; scored_at refreshed on update.
        await db.execute(
            """INSERT INTO scores (domain, score) VALUES (?,?)
            ON CONFLICT(domain) DO UPDATE SET score=excluded.score, scored_at=datetime('now')""",
            (data["domain"], data["score"]),
        )
        await db.commit()
|
||||
|
||||
|
||||
async def mark_job(domain: str, status: str, error: Optional[str] = None):
    """Record a job_queue state transition for *domain*.

    Args:
        domain: Primary key of the job row to update.
        status: "running" stamps started_at; "done"/"failed" stamp
            completed_at and store *error*.  Any other value performs no
            UPDATE (the commit still runs).
        error: Failure description for "failed" jobs; None otherwise.
    """
    async with aiosqlite.connect(SQLITE_PATH) as db:
        if status == "running":
            await db.execute(
                "UPDATE job_queue SET status=?, started_at=datetime('now') WHERE domain=?",
                (status, domain),
            )
        elif status in ("done", "failed"):
            await db.execute(
                "UPDATE job_queue SET status=?, completed_at=datetime('now'), error=? WHERE domain=?",
                (status, error, domain),
            )
        await db.commit()
|
||||
|
||||
|
||||
async def worker_loop():
    """Background loop: drain pending enrichment jobs from SQLite forever.

    Repeatedly fetches up to 100 'pending' domains and enriches them with
    concurrency bounded by CONCURRENCY_LIMIT.  Honors the module-level
    pause flag between batches (in-flight batch always completes).
    """
    global _paused  # NOTE(review): only read here — declaration is redundant but harmless
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async def process(domain: str):
        # Enrich a single domain under the semaphore, bracketing the work
        # with job-state transitions; its own failures become 'failed' rows.
        async with sem:
            await mark_job(domain, "running")
            try:
                data = await enrich_domain(domain)
                await save_enriched(data)
                await mark_job(domain, "done")
            except Exception as e:
                await mark_job(domain, "failed", str(e)[:500])

    while True:
        if _paused:
            # Paused: idle-poll until resumed.
            await asyncio.sleep(1)
            continue

        # Fetch the next batch.  Rows stay 'pending' until process() flips
        # them to 'running', but the gather below finishes the whole batch
        # before the next SELECT, so a batch is not fetched twice by this
        # single worker.  NOTE(review): not safe with multiple worker
        # processes — confirm only one worker_loop runs per DB.
        async with aiosqlite.connect(SQLITE_PATH) as db:
            async with db.execute(
                "SELECT domain FROM job_queue WHERE status='pending' LIMIT 100"
            ) as cur:
                rows = await cur.fetchall()

        if not rows:
            # Queue empty: back off briefly before polling again.
            await asyncio.sleep(2)
            continue

        tasks = [asyncio.create_task(process(r[0])) for r in rows]
        # return_exceptions=True keeps one misbehaving task from killing
        # the loop; per-domain errors are already persisted by process().
        await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
|
||||
def start_worker():
    """Launch the background enrichment worker (idempotent) and unpause it.

    Must be called from inside a running event loop (uses
    asyncio.create_task).  If a worker task is already alive, only the
    pause flag is cleared.
    """
    # Bug fix: the original assigned `_paused = False` WITHOUT a `global`
    # declaration, creating a dead local — starting the worker never
    # actually cleared the module-level pause flag.
    global _worker_task, _paused
    if _worker_task is None or _worker_task.done():
        _worker_task = asyncio.create_task(worker_loop())
    _paused = False
|
||||
|
||||
|
||||
def pause_worker():
    """Ask the worker loop to stop picking up new batches.

    In-flight jobs run to completion; the loop idles until resumed.
    """
    global _paused
    _paused = True
|
||||
|
||||
|
||||
def resume_worker():
    """Clear the pause flag and (re)start the worker task if it died."""
    global _paused
    _paused = False
    # Delegate to start_worker so a crashed/finished task is recreated.
    start_worker()
|
||||
|
||||
|
||||
def is_running() -> bool:
    """Report whether the enrichment worker is actively processing.

    True only when a worker task exists, has not finished, and the
    pause flag is clear.
    """
    task = _worker_task
    if task is None or task.done():
        return False
    return not _paused
|
||||
Reference in New Issue
Block a user