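# File: DomGod/app/main.py (243 lines, 7.9 KiB, Python)
# NOTE: this copy was captured from a repository web view; the page chrome
# ("Files", "Raw", "Normal View", "History") was not part of the source file.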

import os
import asyncio
import logging
from pathlib import Path
from contextlib import asynccontextmanager
import httpx
import aiosqlite
from typing import Optional
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from dotenv import load_dotenv
load_dotenv()
from app.db import (
DATA_DIR, PARQUET_PATH, SQLITE_PATH,
init_db, get_stats, get_domains, get_enriched,
queue_domains, get_queue_status, build_duckdb_index, index_status,
queue_ai, get_ai_queue_status, save_ai_assessment,
)
from app.enricher import start_worker, pause_worker, resume_worker, is_running
from app.scorer import run_scoring
# Process-wide logging: timestamped records at INFO level and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Remote location of the parquet dataset; empty string when the variable is unset.
PARQUET_URL = os.environ.get("PARQUET_URL", "")
async def download_parquet():
    """Make sure the parquet dataset exists at ``PARQUET_PATH``.

    If the file is already present nothing is downloaded. Otherwise the body
    of ``PARQUET_URL`` is streamed into a sibling ``.tmp`` file which is
    renamed to ``PARQUET_PATH`` once the transfer finishes. A ``.tmp`` file
    left over from an earlier attempt is resumed with an HTTP ``Range``
    request.

    Raises:
        RuntimeError: no cached file exists and ``PARQUET_URL`` is empty.
        httpx.HTTPStatusError: the server answered with an error status
            (other than 416, which is handled below).
    """
    if PARQUET_PATH.exists():
        logger.info("Using cached parquet at %s", PARQUET_PATH)
        return

    if not PARQUET_URL:
        # Fail with an actionable message instead of an opaque httpx URL error.
        raise RuntimeError(
            f"PARQUET_URL is not set and no cached parquet exists at {PARQUET_PATH}"
        )

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    tmp_path = PARQUET_PATH.with_suffix(".tmp")
    downloaded = tmp_path.stat().st_size if tmp_path.exists() else 0
    headers = {"Range": f"bytes={downloaded}-"} if downloaded > 0 else {}
    logger.info("Downloading parquet from %s (offset=%d)...", PARQUET_URL, downloaded)

    async with httpx.AsyncClient(follow_redirects=True, timeout=None) as client:
        async with client.stream("GET", PARQUET_URL, headers=headers) as resp:
            if resp.status_code == 416:
                # The requested offset is not satisfiable; this is treated as
                # "the partial file already holds the whole payload".
                tmp_path.rename(PARQUET_PATH)
                return
            resp.raise_for_status()

            if downloaded > 0 and resp.status_code != 206:
                # The server ignored the Range header and is sending the full
                # body again. Appending it to the partial file would corrupt
                # the result, so start over from byte zero.
                logger.warning(
                    "Server did not honour Range request (status %d); restarting download",
                    resp.status_code,
                )
                downloaded = 0

            content_length = int(resp.headers.get("content-length", 0))
            # Only report progress when the server told us how much is coming.
            total = content_length + downloaded if content_length else 0
            mode = "ab" if downloaded > 0 else "wb"
            with open(tmp_path, mode) as f:
                received = downloaded
                async for chunk in resp.aiter_bytes(chunk_size=1024 * 1024):
                    f.write(chunk)
                    received += len(chunk)
                    if total:
                        logger.info(
                            "Download: %.1f%% (%d/%d)",
                            received / total * 100, received, total,
                        )

    # Publish the file only after the stream has been fully written and closed.
    tmp_path.rename(PARQUET_PATH)
    logger.info("Parquet download complete")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup hook passed to ``FastAPI(lifespan=...)``.

    Before the app starts serving it downloads the parquet file (if needed),
    initialises the database, schedules the DuckDB index build as a background
    task and starts the enrichment worker. Nothing is done after ``yield``.
    """
    await download_parquet()
    await init_db()
    # Build DuckDB index in background — queries still work (slower) while building.
    # The event loop only keeps weak references to tasks, so the task is stored
    # on the application state to stop it being garbage-collected mid-build.
    app.state.index_task = asyncio.create_task(build_duckdb_index())
    start_worker()
    logger.info("DomGod ready on port 6677")
    yield


app = FastAPI(title="DomGod", lifespan=lifespan)
# ── API ──────────────────────────────────────────────────────────────────────
@app.get("/api/stats")
async def stats():
    """Serve whatever ``get_stats`` returns as the response body."""
    summary = await get_stats()
    return summary
@app.get("/api/index/status")
async def get_index_status():
    """Serve the current value of ``index_status()`` (a synchronous call)."""
    current = index_status()
    return current
@app.get("/api/domains")
async def domains(
    tld: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=500),
    live_only: bool = Query(False),
    alpha_only: bool = Query(False),
    no_sld: bool = Query(False),
    keyword: str = Query(None),
):
    """Return one page of domains matching the given query filters.

    All filter values are forwarded unchanged to ``get_domains``; the response
    echoes ``page`` and ``limit`` alongside the total count and the rows.
    """
    filters = {
        "tld": tld,
        "alpha_only": alpha_only,
        "no_sld": no_sld,
        "keyword": keyword,
        "live_only": live_only,
    }
    match_count, page_rows = await get_domains(page=page, limit=limit, **filters)
    return dict(page=page, limit=limit, total=match_count, results=page_rows)
@app.post("/api/enrich/batch")
async def enrich_batch(body: dict):
    """Queue the domains listed under ``body["domains"]`` for enrichment.

    Responds with HTTP 400 when the list is missing or empty; otherwise the
    domains are queued, the worker is resumed and the queued count returned.
    """
    requested = body.get("domains", [])
    if requested:
        await queue_domains(requested)
        resume_worker()
        return {"queued": len(requested)}
    return JSONResponse({"error": "no domains provided"}, status_code=400)
@app.get("/api/enrich/status")
async def enrich_status():
    """Return the queue status augmented with a ``worker_running`` flag."""
    queue_state = await get_queue_status()
    # The flag is added in place to the mapping returned by get_queue_status().
    queue_state["worker_running"] = is_running()
    return queue_state
@app.post("/api/enrich/retry")
async def enrich_retry():
    """Reset every failed ``job_queue`` row to pending and resume the worker."""
    reset_failed_sql = (
        "UPDATE job_queue SET status='pending', error=NULL WHERE status='failed'"
    )
    async with aiosqlite.connect(SQLITE_PATH) as conn:
        await conn.execute(reset_failed_sql)
        await conn.commit()
    resume_worker()
    return dict(status="retrying")
@app.post("/api/enrich/pause")
async def enrich_pause():
    """Call ``pause_worker`` and acknowledge with a fixed status payload."""
    pause_worker()
    return dict(status="paused")
@app.post("/api/enrich/resume")
async def enrich_resume():
    """Call ``resume_worker`` and acknowledge with a fixed status payload."""
    resume_worker()
    return dict(status="resumed")
@app.get("/api/enriched")
async def enriched(
    min_score: int = Query(0, ge=0, le=100),
    cms: str = Query(None),
    country: str = Query(None),
    kit_digital: Optional[bool] = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=1000),
):
    """Return one page of enriched domains matching the given query filters.

    All values are forwarded unchanged to ``get_enriched``; the response echoes
    ``page`` and ``limit`` alongside the total count and the rows.
    """
    criteria = {
        "min_score": min_score,
        "cms": cms,
        "country": country,
        "kit_digital": kit_digital,
    }
    match_count, page_rows = await get_enriched(page=page, limit=limit, **criteria)
    return dict(page=page, limit=limit, total=match_count, results=page_rows)
# ── AI assessment endpoints ───────────────────────────────────────────────────
@app.post("/api/ai/assess/batch")
async def ai_assess_batch(body: dict):
    """Queue the domains listed under ``body["domains"]`` for AI assessment.

    Responds with HTTP 400 when the list is missing or empty; otherwise the
    domains are handed to ``queue_ai`` and the queued count is returned.
    """
    requested = body.get("domains", [])
    if requested:
        await queue_ai(requested)
        return {"queued": len(requested)}
    return JSONResponse({"error": "no domains provided"}, status_code=400)
@app.get("/api/ai/status")
async def ai_status():
    """Serve whatever ``get_ai_queue_status`` returns as the response body."""
    queue_state = await get_ai_queue_status()
    return queue_state
@app.post("/api/ai/assess/single")
async def ai_assess_single(body: dict):
    """Immediate (blocking) AI assessment — does fresh scrape, no pre-enrichment needed.

    Reads ``body["domain"]``, runs ``analyze_site`` on it, passes the analysis
    to the Gemini assessor, stores both via ``save_ai_assessment`` and returns
    the assessment merged with the analysis under ``"site_analysis"``.
    Responds with HTTP 400 when no domain is supplied.
    """
    domain = body.get("domain")
    if not domain:
        return JSONResponse({"error": "no domain"}, status_code=400)

    # Imported inside the handler, as in the original; presumably to defer
    # loading these modules until first use — confirm before hoisting.
    from app.site_analyzer import analyze_site
    from app.replicate_ai import assess_domain as gemini_assess

    analysis = await analyze_site(domain)
    assessment = await gemini_assess(analysis)
    await save_ai_assessment(domain, assessment, site_analysis=analysis)
    return {**assessment, "site_analysis": analysis}
@app.get("/api/export")
async def export_csv(
    min_score: int = Query(0),
    cms: str = Query(None),
    country: str = Query(None),
    tier: str = Query(None),
):
    """Stream enriched domains as a CSV attachment.

    ``tier`` overrides the score window: ``"hot"`` sets ``min_score`` to 80,
    ``"warm"`` restricts scores to 50..79. For any other value the supplied
    ``min_score`` is used and rows scoring above 100 are skipped. Rows are
    fetched from ``get_enriched`` 500 at a time until an empty page is returned.
    """
    import csv
    import io
    import re

    if tier == "hot":
        min_score = 80
    elif tier == "warm":
        min_score = 50
    max_score = 79 if tier == "warm" else 100

    columns = [
        "domain", "score", "cms", "ssl_expiry_days", "ip_country",
        "is_live", "status_code", "has_mx", "server", "page_title", "enriched_at",
    ]

    def to_csv_line(values):
        # csv.writer escapes embedded quotes by doubling them and renders None
        # as an empty field, while keeping legitimate falsy values such as 0.
        buf = io.StringIO()
        csv.writer(buf, quoting=csv.QUOTE_ALL, lineterminator="\n").writerow(values)
        return buf.getvalue()

    async def generate():
        # The header row is emitted unquoted, exactly as before.
        yield ",".join(columns) + "\n"
        p = 1
        while True:
            _, rows = await get_enriched(
                min_score=min_score, cms=cms, country=country, page=p, limit=500,
            )
            if not rows:
                break
            for r in rows:
                # A missing or NULL score is treated as 0 rather than raising.
                if (r.get("score") or 0) > max_score:
                    continue
                # NOTE(review): values such as page_title come from scraped
                # sites; spreadsheet formula injection is not mitigated here.
                yield to_csv_line([r.get(col) for col in columns])
            p += 1

    # Keep only filename-safe characters from the user-supplied tier so it
    # cannot break out of the quoted Content-Disposition value.
    safe_tier = re.sub(r"[^A-Za-z0-9_-]", "", tier or "") or "export"
    fname = f"domgod_{safe_tier}_score{min_score}.csv"
    return StreamingResponse(
        generate(), media_type="text/csv",
        headers={"Content-Disposition": f'attachment; filename="{fname}"'},
    )
@app.post("/api/score/run")
async def score_run():
    """Run ``run_scoring`` and serve its result as the response body."""
    outcome = await run_scoring()
    return outcome
# ── Static UI ────────────────────────────────────────────────────────────────
# The UI lives in the "static" directory next to this module. It is mounted
# at "/" after the API routes have been registered.
static_dir = Path(__file__).parent.joinpath("static")
app.mount(
    "/",
    StaticFiles(html=True, directory=str(static_dir)),
    name="static",
)
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 6677.
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=6677,
        log_level="info",
    )