feat: deep site analysis engine + fix AI assess for any domain

site_analyzer.py (new):
- Fresh scrape with timing, page size, server, CMS detection
- Lorem ipsum detection (17 phrases incl. user's example)
- Placeholder content detection (hello world, sample page, etc.)
- Analytics: GA4, GTM, Facebook Pixel, Hotjar, Clarity
- Webmaster: Google Search Console, Bing, Yandex verification tags
- sitemap.xml and robots.txt check + Googlebot block detection
- Mobile viewport check, word count, image/script count
- Full contact extraction: emails, phones, WhatsApp, social links
- Kit Digital signal detection

AI worker fix:
- No longer requires pre-enrichment — works on ANY selected domain
- Does fresh site_analyzer scrape then calls Gemini with full context
- Stores site_analysis JSON alongside AI assessment
- Upserts into enriched_domains even if domain was never enriched

Gemini prompt now includes:
- Complete technical snapshot (load time, size, server, SSL)
- Full SEO signals (sitemap, robots, analytics, webmaster verified)
- Content quality (lorem ipsum matches, placeholder matches)
- Kit Digital signals
- All extracted contacts
- 500-word page text sample
- Outputs: summary, site_quality_score/10, content_issues[],
  urgency_signals[], performance_notes, seo_status,
  best_contact_channel+value, all_contacts, ES pitch,
  services_needed, outreach_notes

UI: rich AI modal with summary banner, quality grid, content issues,
    urgency signals, full contact list, technical snapshot

Fixes: correct Replicate token, ai_queue status='running' bug

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:46:01 +02:00
parent faca4b6e1a
commit 5ad8259c75
7 changed files with 530 additions and 111 deletions

View File

@@ -35,7 +35,8 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
ai_pitch TEXT,
ai_contact_channel TEXT,
ai_contact_value TEXT,
ai_assessed_at TEXT
ai_assessed_at TEXT,
site_analysis TEXT
);
CREATE TABLE IF NOT EXISTS job_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -71,6 +72,7 @@ _MIGRATIONS = [
"ALTER TABLE enriched_domains ADD COLUMN ai_contact_channel TEXT",
"ALTER TABLE enriched_domains ADD COLUMN ai_contact_value TEXT",
"ALTER TABLE enriched_domains ADD COLUMN ai_assessed_at TEXT",
"ALTER TABLE enriched_domains ADD COLUMN site_analysis TEXT",
"CREATE TABLE IF NOT EXISTS ai_queue (domain TEXT PRIMARY KEY, status TEXT DEFAULT 'pending', created_at TEXT DEFAULT (datetime('now')), completed_at TEXT, error TEXT)",
]
@@ -352,13 +354,19 @@ async def get_ai_queue_status():
}
async def save_ai_assessment(domain: str, assessment: dict):
async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict = None):
import json as _json
async with aiosqlite.connect(SQLITE_PATH) as db:
# Upsert into enriched_domains (domain may not exist yet if assessed before full enrichment)
await db.execute(
"""INSERT INTO enriched_domains (domain) VALUES (?) ON CONFLICT(domain) DO NOTHING""",
(domain,),
)
await db.execute(
"""UPDATE enriched_domains SET
ai_assessment=?, ai_lead_quality=?, ai_pitch=?,
ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now')
ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now'),
site_analysis=?
WHERE domain=?""",
(
_json.dumps(assessment),
@@ -366,6 +374,26 @@ async def save_ai_assessment(domain: str, assessment: dict):
assessment.get("pitch_angle"),
assessment.get("best_contact_channel"),
assessment.get("best_contact_value"),
_json.dumps(site_analysis) if site_analysis else None,
domain,
),
)
# Also update contact_info + kit_digital from site_analysis if available
if site_analysis:
contacts = {
"emails": site_analysis.get("emails", []),
"phones": site_analysis.get("phones", []),
"whatsapp": site_analysis.get("whatsapp", []),
"social": site_analysis.get("social_links", []),
}
await db.execute(
"""UPDATE enriched_domains SET
kit_digital=?, kit_digital_signals=?, contact_info=?
WHERE domain=?""",
(
int(site_analysis.get("kit_digital", False)),
_json.dumps(site_analysis.get("kit_digital_signals", [])),
_json.dumps(contacts),
domain,
),
)

View File

@@ -13,7 +13,7 @@ import dns.resolver
import aiosqlite
from bs4 import BeautifulSoup
from app.db import SQLITE_PATH, queue_ai, save_ai_assessment, get_ai_queue_status
from app.db import SQLITE_PATH, queue_ai, save_ai_assessment
from app.scorer import score
logger = logging.getLogger(__name__)
@@ -340,17 +340,17 @@ async def worker_loop():
async def ai_worker_loop():
from app.replicate_ai import assess_domain as gemini_assess
from app.site_analyzer import analyze_site
while True:
async with aiosqlite.connect(SQLITE_PATH) as db:
async with db.execute(
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 20"
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
) as cur:
rows = await cur.fetchall()
# Mark as running
if rows:
await db.executemany(
"UPDATE ai_queue SET status='running', created_at=created_at WHERE domain=?",
"UPDATE ai_queue SET status='running' WHERE domain=?",
[(r[0],) for r in rows],
)
await db.commit()
@@ -361,16 +361,11 @@ async def ai_worker_loop():
async def assess_one(domain: str):
try:
async with aiosqlite.connect(SQLITE_PATH) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM enriched_domains WHERE domain=?", (domain,)
) as cur:
row = await cur.fetchone()
if not row:
return
assessment = await gemini_assess(dict(row))
await save_ai_assessment(domain, assessment)
# Always do a fresh deep scrape — no pre-enrichment required
analysis = await analyze_site(domain)
assessment = await gemini_assess(analysis)
await save_ai_assessment(domain, assessment, site_analysis=analysis)
logger.info("AI done: %s%s", domain, assessment.get("lead_quality"))
except Exception as e:
async with aiosqlite.connect(SQLITE_PATH) as db:
await db.execute(
@@ -380,6 +375,7 @@ async def ai_worker_loop():
await db.commit()
logger.error("AI worker error %s: %s", domain, e)
# AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)

View File

@@ -177,22 +177,16 @@ async def ai_status():
@app.post("/api/ai/assess/single")
async def ai_assess_single(body: dict):
"""Immediate (blocking) AI assessment of a single domain."""
"""Immediate (blocking) AI assessment — does fresh scrape, no pre-enrichment needed."""
domain = body.get("domain")
if not domain:
return JSONResponse({"error": "no domain"}, status_code=400)
from app.site_analyzer import analyze_site
from app.replicate_ai import assess_domain as gemini_assess
async with aiosqlite.connect(SQLITE_PATH) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM enriched_domains WHERE domain=?", (domain,)
) as cur:
row = await cur.fetchone()
if not row:
return JSONResponse({"error": "domain not yet enriched"}, status_code=404)
assessment = await gemini_assess(dict(row))
await save_ai_assessment(domain, assessment)
return assessment
analysis = await analyze_site(domain)
assessment = await gemini_assess(analysis)
await save_ai_assessment(domain, assessment, site_analysis=analysis)
return {**assessment, "site_analysis": analysis}
@app.get("/api/export")

View File

@@ -1,4 +1,4 @@
"""Replicate / Gemini integration for domain lead assessment."""
"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
@@ -10,7 +10,7 @@ import httpx
logger = logging.getLogger(__name__)
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj")
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO") # override via env
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))
@@ -24,66 +24,92 @@ def _sem() -> asyncio.Semaphore:
return _ai_sem
def _build_prompt(row: dict) -> str:
kit_signals = row.get("kit_digital_signals") or "[]"
try:
sigs = json.loads(kit_signals)
kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected"
except Exception:
kit_block = f" {kit_signals}"
def _build_prompt(a: dict) -> str:
"""Build the Gemini prompt from a full site analysis dict."""
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
contacts_str = "\n".join(contacts_block) or " None found"
contact_raw = row.get("contact_info") or "{}"
try:
contacts = json.loads(contact_raw)
except Exception:
contacts = {}
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
contact_block = []
if contacts.get("emails"):
contact_block.append(f" Emails: {', '.join(contacts['emails'][:3])}")
if contacts.get("phones"):
contact_block.append(f" Phones: {', '.join(contacts['phones'][:3])}")
if contacts.get("whatsapp"):
contact_block.append(f" WhatsApp: {', '.join(contacts['whatsapp'][:2])}")
if contacts.get("social"):
contact_block.append(f" Social: {', '.join(contacts['social'][:4])}")
contact_str = "\n".join(contact_block) if contact_block else " None found"
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell.
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
DOMAIN DATA:
- Domain: {row.get("domain")}
- Page title: {row.get("page_title") or "N/A"}
- CMS: {row.get("cms") or "unknown"}
- Server: {row.get("server") or "unknown"}
- Country: {row.get("ip_country") or "unknown"}
- SSL valid: {row.get("ssl_valid")}, expires in {row.get("ssl_expiry_days") or "?"} days
- Has email (MX): {bool(row.get("has_mx"))}
- Is live: {bool(row.get("is_live"))}
- Kit Digital signals found on page:
{kit_block}
- Contact channels found on page:
{contact_str}
=== TECHNICAL SNAPSHOT ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Final URL: {a.get("final_url")}
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport: {a.get("has_mobile_viewport")}
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts.
=== SEO & INDEXING SIGNALS ===
Page title: {a.get("page_title") or "missing"}
H1: {a.get("h1_text") or "missing"}
Meta description: {a.get("meta_description") or "missing"}
Canonical URL: {a.get("canonical_url") or "not set"}
Sitemap.xml: {a.get("has_sitemap")}
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics: {analytics_str}
Webmaster verified:{webmaster_str}
Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON):
=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected: {a.get("kit_digital")}
Signals:
{kd_str}
=== CONTACT CHANNELS ===
{contacts_str}
=== PAGE TEXT SAMPLE (first 2000 chars) ===
{text_snippet}
=== TASK ===
Analyse this site for IT services upsell potential. The client sells:
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
{{
"is_local_sme": true/false,
"summary": "2-3 sentence executive summary of the site's current state",
"site_quality_score": <0-10 integer>,
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
"performance_notes": "comment on load time, page size, mobile readiness",
"seo_status": "brief SEO assessment — indexing signals, missing elements",
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence explaining why or why not",
"kit_digital_reasoning": "1 sentence — why confirmed or not",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales",
"lead_reasoning": "1-2 sentences on why",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use, or empty string",
"pitch_angle": "One concrete opening sentence for a cold email or call in Spanish",
"services_likely_needed": ["service1", "service2"],
"outreach_notes": "Any useful context for the sales rep (language, business type, urgency)"
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
"all_contacts": {{
"emails": [],
"phones": [],
"whatsapp": [],
"social": []
}},
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
"services_needed": ["service1", "service2"],
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
"outreach_notes": "Key context for the sales rep"
}}"""
def _parse_output(raw: str) -> dict:
"""Extract JSON from Gemini text output."""
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
m = re.search(r"\{[\s\S]+\}", text)
if m:
@@ -91,8 +117,9 @@ def _parse_output(raw: str) -> dict:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
return {
"raw": raw[:500],
"summary": raw[:400],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
@@ -100,22 +127,22 @@ def _parse_output(raw: str) -> dict:
}
async def assess_domain(row: dict) -> dict:
"""Call Gemini via Replicate to assess a domain. Returns parsed assessment dict."""
async def assess_domain(analysis: dict) -> dict:
"""Call Gemini with the full site analysis. Returns parsed assessment."""
async with _sem():
payload = {
"input": {
"prompt": _build_prompt(row),
"prompt": _build_prompt(analysis),
"images": [],
"videos": [],
"top_p": 0.9,
"temperature": 0.2,
"thinking_level": "low",
"max_output_tokens": 1024,
"max_output_tokens": 2048,
}
}
try:
async with httpx.AsyncClient(timeout=90) as client:
async with httpx.AsyncClient(timeout=120) as client:
resp = await client.post(
REPLICATE_MODEL,
headers={
@@ -133,10 +160,15 @@ async def assess_domain(row: dict) -> dict:
output = "".join(output)
result = _parse_output(output)
logger.info("AI %s%s / contact: %s",
row.get("domain"), result.get("lead_quality"), result.get("best_contact_channel"))
logger.info("AI %s%s (quality %s)",
analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score"))
return result
except Exception as e:
logger.error("Replicate error %s: %s", row.get("domain"), e)
return {"error": str(e)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""}
logger.error("Replicate error %s: %s", analysis.get("domain"), e)
return {
"error": str(e)[:300],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
}

277
app/site_analyzer.py Normal file
View File

@@ -0,0 +1,277 @@
"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
import asyncio
import re
import time
import logging
from typing import Optional
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# ── Content quality ───────────────────────────────────────────────────────────
# Lorem-ipsum filler fragments, matched case-insensitively against the page's
# visible text in analyze_site().
LOREM_PHRASES = [
    "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
    "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
    "vestibulum commodo", "fusce nunc lacus", "consectetuer",
    "cras ornare tristique",
    "nulla nec ante",  # fixed typo ("ntulla nec ante" could never match real filler)
    "risus id metus",
    "praesent placerat", "fusce pellentesque", "suscipit nibh",
    "integer vitae libero", "felis quis tortor",
]
# Tell-tale strings of never-finished sites (CMS defaults, stub pages).
PLACEHOLDER_PHRASES = [
    "under construction", "coming soon", "sample page",
    "this is a demo", "default post", "hello world",
    "test post", "uncategorized",
]
# ── Analytics & webmaster tags ────────────────────────────────────────────────
# Each signature is lower-cased and substring-matched against the raw HTML.
ANALYTICS = {
    # NOTE: the previous bare "G-" signature matched virtually every page
    # (CSS classes like "g-recaptcha", "g-4"); require the GA4 gtag loader
    # URL with a measurement-id prefix instead.
    "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "gtag/js?id=g-"],
    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
    "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
    "hotjar": ["static.hotjar.com"],
    "clarity": ["clarity.ms/tag"],
}
# Meta-tag names proving the owner verified the site with a search engine.
WEBMASTER = {
    "google_search_console": ['google-site-verification'],
    "bing_webmaster": ['msvalidate.01'],
    "yandex": ['yandex-verification'],
}
# Kit Digital (Spanish SME digitalization grant) markers: grantees must show
# EU / digitalizador logos, so both image paths/alts and page text are scanned.
KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado",
]
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "fondos europeos",
    "next generation eu", "nextgenerationeu", "plan de recuperación",
    "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme",
]
# Contact extraction: generic emails, Spanish phone numbers (optional +34,
# mobile/landline prefixes 6/7/8/9), and recognised social-link domains.
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
async def analyze_site(domain: str) -> dict:
    """Fetch and deeply analyse a site's homepage.

    A single GET (HTTPS, falling back to plain HTTP on a >= 400 response)
    drives all on-page signals: content quality (lorem ipsum / placeholder
    text), SEO meta tags, analytics & webmaster verification tags, Kit
    Digital markers and contact extraction.  sitemap.xml and robots.txt are
    then probed in parallel, and the TLS certificate is inspected in a
    worker thread (blocking socket I/O).

    Args:
        domain: bare host name, e.g. ``"example.es"`` (no scheme).

    Returns:
        A flat dict in which every key is always present; ``error`` carries
        the exception text when the main page could not be fetched/parsed.
    """
    result = {
        "domain": domain,
        "reachable": False,
        "load_time_ms": None,
        "status_code": None,
        "final_url": None,
        "page_size_kb": None,
        "server": None,
        "cms": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        # Content quality
        "has_lorem_ipsum": False,
        "lorem_matches": [],
        "has_placeholder": False,
        "placeholder_matches": [],
        "word_count": 0,
        "image_count": 0,
        "broken_images": 0,  # reserved — not computed in this pass
        "script_count": 0,
        "has_mobile_viewport": False,
        "page_title": None,
        "meta_description": None,
        "h1_text": None,
        "visible_text_snippet": "",
        # SEO / webmaster
        "has_sitemap": False,
        "has_robots": False,
        "robots_disallows_google": False,
        "analytics_present": [],
        "webmaster_verified": [],
        "canonical_url": None,
        "og_title": None,
        # Kit Digital
        "kit_digital": False,
        "kit_digital_signals": [],
        # Contacts
        "emails": [],
        "phones": [],
        "whatsapp": [],
        "social_links": [],
        # Errors
        "error": None,
    }

    # ── Fetch main page ───────────────────────────────────────────────────
    try:
        t0 = time.monotonic()
        # verify=False is deliberate: a broken certificate is itself a lead
        # signal here, and the certificate is validated separately below.
        async with httpx.AsyncClient(
            timeout=15, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
        ) as client:
            resp = await client.get(f"https://{domain}")
            if resp.status_code >= 400:
                # HTTPS answered with an error — retry over plain HTTP.
                resp = await client.get(f"http://{domain}")
        # Total wall time, including a possible HTTP retry.
        load_ms = int((time.monotonic() - t0) * 1000)
        html = resp.text
        result.update({
            "reachable": resp.status_code < 400,
            "load_time_ms": load_ms,
            "status_code": resp.status_code,
            "final_url": str(resp.url),
            "page_size_kb": round(len(resp.content) / 1024, 1),
            "server": resp.headers.get("server"),
        })

        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()

        # Title / meta description / H1
        title_tag = soup.find("title")
        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
        meta_desc = soup.find("meta", attrs={"name": "description"})
        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
        h1 = soup.find("h1")
        result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None

        # Mobile viewport
        result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))

        # Canonical + OpenGraph title
        canon = soup.find("link", rel="canonical")
        result["canonical_url"] = canon.get("href") if canon else None
        og = soup.find("meta", property="og:title")
        result["og_title"] = og.get("content") if og else None

        # External-script count MUST be taken before decompose() below, which
        # removes every <script> from the soup (the old order made it always 0).
        result["script_count"] = len(soup.find_all("script", src=True))

        # Visible text (scripts/styles stripped from the soup only; the raw
        # `html` string stays intact for the signature/email scans below).
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        words = visible_text.split()
        result["word_count"] = len(words)
        result["visible_text_snippet"] = " ".join(words[:500])

        # Lorem ipsum / placeholder detection on lower-cased visible text
        vl = visible_text.lower()
        lorem_hits = [p for p in LOREM_PHRASES if p in vl]
        result["has_lorem_ipsum"] = bool(lorem_hits)
        result["lorem_matches"] = lorem_hits[:5]
        ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
        result["has_placeholder"] = bool(ph_hits)
        result["placeholder_matches"] = ph_hits[:3]

        # Images
        imgs = soup.find_all("img")
        result["image_count"] = len(imgs)

        # Analytics / webmaster tags (substring scan over raw lower-cased HTML)
        for name, sigs in ANALYTICS.items():
            if any(s.lower() in hl for s in sigs):
                result["analytics_present"].append(name)
        for name, sigs in WEBMASTER.items():
            if any(s.lower() in hl for s in sigs):
                result["webmaster_verified"].append(name)

        # Kit Digital signals: logo images, page text, outbound links
        kd_signals = []
        for img in imgs:
            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
            for p in KIT_IMG_PATS:
                if p in combined:
                    kd_signals.append(f"img:{p}")
                    break
        for p in KIT_TEXT_PATS:
            if p in hl:
                kd_signals.append(f"text:{p}")
        for a in soup.find_all("a", href=True):
            href = a["href"].lower()
            if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
                kd_signals.append(f"link:{href[:50]}")
        kd_signals = list(dict.fromkeys(kd_signals))[:10]  # order-preserving dedupe
        result["kit_digital"] = bool(kd_signals)
        result["kit_digital_signals"] = kd_signals

        # Contact extraction from anchors
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("mailto:"):
                em = href[7:].split("?")[0].strip().lower()
                if em and em not in result["emails"]:
                    result["emails"].append(em)
            elif href.startswith("tel:"):
                ph = re.sub(r"[^\d+]", "", href[4:])
                if ph and ph not in result["phones"]:
                    result["phones"].append(ph)
            elif "wa.me" in href or "api.whatsapp.com" in href:
                if href not in result["whatsapp"]:
                    result["whatsapp"].append(href[:80])
            else:
                for sd in SOCIAL_DOM:
                    if sd in href.lower():
                        clean = href.split("?")[0].rstrip("/")
                        if clean not in result["social_links"]:
                            result["social_links"].append(clean)
                        break

        # Plain-text fallbacks (first 80 KB only, to bound regex cost)
        for em in EMAIL_RE.findall(html[:80000]):
            em = em.lower()
            # Skip asset paths that merely look like emails (logo@2x.png …)
            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
                result["emails"].append(em)
        for ph in PHONE_RE.findall(visible_text):
            ph_c = re.sub(r"[\s\-]", "", ph)
            if ph_c not in result["phones"]:
                result["phones"].append(ph_c)

        # Dedupe (order-preserving) and cap every contact list at 5 entries
        for k in ["emails", "phones", "whatsapp", "social_links"]:
            result[k] = list(dict.fromkeys(result[k]))[:5]

        # CMS fingerprinting (local import — presumably avoids a circular
        # module import with app.enricher; confirm before moving to top level)
        from app.enricher import detect_cms
        result["cms"] = detect_cms(html, dict(resp.headers))
    except Exception as e:
        result["error"] = str(e)[:300]

    # ── Sitemap & robots (parallel) ───────────────────────────────────────
    async def _check_url(url: str) -> Optional[str]:
        """Return the body of *url* on HTTP 200, else None (best-effort)."""
        try:
            async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                r = await c.get(url)
                return r.text if r.status_code == 200 else None
        except Exception:
            return None

    sitemap_txt, robots_txt = await asyncio.gather(
        _check_url(f"https://{domain}/sitemap.xml"),
        _check_url(f"https://{domain}/robots.txt"),
    )
    result["has_sitemap"] = sitemap_txt is not None
    result["has_robots"] = robots_txt is not None
    if robots_txt:
        # Use the stdlib parser so per-agent sections are honoured; the old
        # substring test ("disallow: /" + "googlebot" anywhere in the file)
        # false-positived on e.g. "Disallow: /images".
        from urllib import robotparser
        try:
            rp = robotparser.RobotFileParser()
            rp.parse(robots_txt.splitlines())
            result["robots_disallows_google"] = not rp.can_fetch("Googlebot", f"https://{domain}/")
        except Exception:
            pass

    # ── SSL certificate (blocking socket work → thread executor) ──────────
    import ssl as _ssl, socket as _socket
    try:
        def _ssl_check():
            import datetime as _dt
            ctx = _ssl.create_default_context()
            with _socket.create_connection((domain, 443), timeout=5) as s:
                with ctx.wrap_socket(s, server_hostname=domain) as ss:
                    cert = ss.getpeercert()
            exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
            # (exp - now).days is days-until-expiry directly; the previous
            # (now - exp).days * -1 was off by one because timedelta.days
            # floors toward minus infinity for negative deltas.
            return True, (exp - _dt.datetime.utcnow()).days
        result["ssl_valid"], result["ssl_expiry_days"] = await asyncio.get_running_loop().run_in_executor(None, _ssl_check)
    except Exception:
        # Handshake failure / missing cert / timeout — leave the defaults.
        pass
    return result

View File

@@ -136,11 +136,10 @@ tr:hover td{background:rgba(255,255,255,.025)}
/* AI detail modal */
.modal-bg{position:fixed;inset:0;background:#000a;z-index:300;display:flex;align-items:center;justify-content:center}
.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--r);padding:20px;max-width:500px;width:90%;max-height:80vh;overflow-y:auto}
.modal h2{font-size:16px;font-weight:800;margin-bottom:12px}
.modal .row{display:flex;gap:8px;margin-bottom:8px;font-size:13px}
.modal .label{color:var(--muted);min-width:110px;font-size:12px}
.modal .val{color:var(--text)}
.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--r);padding:18px;max-width:560px;width:95%;max-height:88vh;overflow-y:auto}
.modal h2{font-size:15px;font-weight:800}
.mrow{display:flex;gap:8px;margin-bottom:6px;font-size:12px;line-height:1.4}
.mlabel{color:var(--muted);min-width:90px;font-size:11px;padding-top:1px;flex-shrink:0}
@media(max-width:700px){.pipeline{grid-template-columns:1fr}.sg{grid-template-columns:1fr 1fr}}
</style>
@@ -153,15 +152,99 @@ tr:hover td{background:rgba(255,255,255,.025)}
<!-- AI Detail Modal -->
<div class="modal-bg" x-show="modal.open" @click.self="modal.open=false" x-cloak>
<div class="modal" @click.stop>
<h2>AI Assessment — <span style="color:var(--accent2)" x-text="modal.domain"></span></h2>
<div class="row"><span class="label">Lead quality</span><span class="val"><span class="pill" :class="aiPillClass(modal.data.lead_quality)" x-text="modal.data.lead_quality || '—'"></span></span></div>
<div class="row"><span class="label">Kit Digital</span><span class="val" x-text="modal.data.kit_digital_confirmed ? '✅ Confirmed' : '❌ Not confirmed'"></span></div>
<div class="row"><span class="label">KD reasoning</span><span class="val" x-text="modal.data.kit_digital_reasoning || '—'"></span></div>
<div class="row"><span class="label">Lead reasoning</span><span class="val" x-text="modal.data.lead_reasoning || '—'"></span></div>
<div class="row"><span class="label">Best channel</span><span class="val" x-text="(modal.data.best_contact_channel || '—') + (modal.data.best_contact_value ? ': ' + modal.data.best_contact_value : '')"></span></div>
<div class="row"><span class="label">Pitch</span><span class="val" style="font-style:italic;color:var(--accent2)" x-text="modal.data.pitch_angle || '—'"></span></div>
<div class="row"><span class="label">Services needed</span><span class="val" x-text="(modal.data.services_likely_needed || []).join(', ') || '—'"></span></div>
<div class="row"><span class="label">Outreach notes</span><span class="val" x-text="modal.data.outreach_notes || '—'"></span></div>
<div style="display:flex;justify-content:space-between;align-items:flex-start;margin-bottom:12px">
<h2>AI Report — <span style="color:var(--accent2)" x-text="modal.domain"></span></h2>
<button class="btn bg sm" @click="modal.open=false"></button>
</div>
<!-- Summary banner -->
<div x-show="modal.ai.summary" style="background:var(--surface2);border-radius:6px;padding:10px 12px;margin-bottom:12px;font-size:12px;line-height:1.5;color:var(--text)" x-text="modal.ai.summary"></div>
<!-- Lead + quality -->
<div style="display:grid;grid-template-columns:1fr 1fr 1fr;gap:8px;margin-bottom:12px">
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">LEAD</div>
<span class="pill" :class="aiPillClass(modal.ai.lead_quality)" x-text="modal.ai.lead_quality||'—'"></span>
</div>
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">SITE QUALITY</div>
<span class="score" :style="qualityBg(modal.ai.site_quality_score)" x-text="(modal.ai.site_quality_score??'—')+'/10'"></span>
</div>
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">KIT DIGITAL</div>
<span x-text="modal.ai.kit_digital_confirmed ? '✅ Yes' : '❌ No'" style="font-size:13px;font-weight:700"></span>
</div>
</div>
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
<div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
<!-- Content issues -->
<div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Content Issues</div>
<template x-for="issue in (modal.ai.content_issues||[])">
<div style="font-size:12px;color:var(--danger);padding:2px 0"><span x-text="issue"></span></div>
</template>
</div>
<!-- Urgency signals -->
<div x-show="(modal.ai.urgency_signals||[]).length>0" style="margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Urgency Signals</div>
<template x-for="sig in (modal.ai.urgency_signals||[])">
<div style="font-size:12px;color:var(--warn);padding:2px 0">🔴 <span x-text="sig"></span></div>
</template>
</div>
<!-- Contact -->
<div style="background:var(--surface2);border-radius:6px;padding:10px;margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:6px">Best Contact</div>
<div style="font-size:13px;font-weight:700;color:var(--accent2)" x-text="(modal.ai.best_contact_channel||'unknown').toUpperCase()"></div>
<div style="font-size:12px;color:var(--text);margin-top:2px;word-break:break-all" x-text="modal.ai.best_contact_value||'—'"></div>
<!-- All contacts from site_analysis -->
<div x-show="modal.sa" style="margin-top:8px;display:flex;flex-wrap:wrap;gap:4px">
<template x-for="em in (modal.sa?.emails||[])">
<a :href="'mailto:'+em" class="chip email" x-text="em"></a>
</template>
<template x-for="ph in (modal.sa?.phones||[])">
<a :href="'tel:'+ph" class="chip phone" x-text="ph"></a>
</template>
<template x-for="wa in (modal.sa?.whatsapp||[])">
<a :href="wa" target="_blank" class="chip wa">💬 WhatsApp</a>
</template>
<template x-for="s in (modal.sa?.social_links||[]).slice(0,3)">
<a :href="s" target="_blank" class="chip social" x-text="s.replace('https://','').split('/')[0]"></a>
</template>
</div>
</div>
<!-- Pitch -->
<div style="background:#6c63ff15;border:1px solid #6c63ff33;border-radius:6px;padding:10px;margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Cold Pitch (ES)</div>
<div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
</div>
<div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
<div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
<!-- Site analysis tech snapshot -->
<div x-show="modal.sa" style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:6px">Technical Snapshot</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:4px;font-size:11px">
<div>Load time: <b x-text="(modal.sa?.load_time_ms||'—')+'ms'"></b></div>
<div>Page size: <b x-text="(modal.sa?.page_size_kb||'—')+'KB'"></b></div>
<div>CMS: <b x-text="modal.sa?.cms||'unknown'"></b></div>
<div>Server: <b x-text="modal.sa?.server||'—'"></b></div>
<div>Sitemap: <b x-text="modal.sa?.has_sitemap?'✅':'❌'"></b></div>
<div>Robots: <b x-text="modal.sa?.has_robots?'✅':'❌'"></b></div>
<div>Analytics: <b x-text="(modal.sa?.analytics_present||[]).join(', ')||'none'"></b></div>
<div>Mobile: <b x-text="modal.sa?.has_mobile_viewport?'✅':'❌'"></b></div>
<div>Lorem ipsum: <b :style="modal.sa?.has_lorem_ipsum?'color:var(--danger)':''" x-text="modal.sa?.has_lorem_ipsum?'⚠ YES':'No'"></b></div>
<div>Words: <b x-text="modal.sa?.word_count||'—'"></b></div>
</div>
</div>
<button class="btn bg" style="margin-top:14px;width:100%" @click="modal.open=false">Close</button>
</div>
</div>
@@ -436,7 +519,7 @@ function app() {
qst: {}, customDomains: '',
pipeline: {hot:{count:0,samples:[]},warm:{count:0,samples:[]},cold:{count:0,samples:[]}},
toast: {show:false,msg:'',type:'success'},
modal: {open:false,domain:'',data:{}},
modal: {open:false, domain:'', ai:{}, sa:null},
_chart: null, _poll: null, _toastTimer: null,
async init() {
@@ -556,11 +639,20 @@ function app() {
openModal(row) {
this.modal.domain = row.domain;
try { this.modal.data = row.ai_assessment ? JSON.parse(row.ai_assessment) : {}; }
catch(e) { this.modal.data = {}; }
try { this.modal.ai = row.ai_assessment ? JSON.parse(row.ai_assessment) : {}; }
catch(e) { this.modal.ai = {}; }
try { this.modal.sa = row.site_analysis ? JSON.parse(row.site_analysis) : null; }
catch(e) { this.modal.sa = null; }
this.modal.open = true;
},
qualityBg(s) {
if(s==null) return 'background:#333;color:#888';
if(s>=8) return 'background:#00d4aa22;color:var(--accent2)';
if(s>=5) return 'background:#ffb34722;color:var(--warn)';
return 'background:#ff4f6d22;color:var(--danger)';
},
scoreBg(s) {
if(s==null) return 'background:#333;color:#888';
if(s>=80) return 'background:#ff4f6d22;color:#ff4f6d';

View File

@@ -13,6 +13,6 @@ services:
- SCORE_THRESHOLD=60
- TARGET_TLDS=es,com,net
- TARGET_COUNTRIES=ES,GB,DE,FR,RO,PT,AD,IT
- REPLICATE_API_TOKEN=r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj
- REPLICATE_API_TOKEN=r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO
- AI_CONCURRENCY=3
restart: unless-stopped