feat: Gemini AI assessment, Kit Digital detection, contact extraction

Kit Digital detection (enricher.py):
- Scans img src/alt/srcset for digitalizadores, kit-digital, fondos-europeos etc
- Scans page text for Kit Digital, Agente Digitalizador, Next Generation EU, PRTR
- Scans links for acelerapyme.es, red.es, kit-digital refs
- +20 score bonus for Kit Digital confirmed sites (proven IT buyers)

Contact extraction (enricher.py):
- Pulls mailto/tel/wa.me links from HTML
- Extracts email addresses via regex, phone numbers (ES format)
- Detects social media links (FB, IG, LinkedIn, Twitter, TikTok)
- Stored as JSON in contact_info column

Gemini via Replicate (replicate_ai.py):
- Assesses lead quality (HOT/WARM/COLD), Kit Digital confirmation
- Identifies best contact channel + actual value (email/phone/WA)
- Writes Spanish cold-call/email pitch angle
- Lists services likely needed + outreach notes
- 3 concurrent requests, 90s timeout, JSON output parsing

DB: migration adds kit_digital, kit_digital_signals, contact_info,
    ai_assessment, ai_lead_quality, ai_pitch, ai_contact_channel/value,
    ai_queue table

UI: Kit Digital 🏅 badge, AI quality pill (clickable modal with full
    assessment), contact chips (email/phone/WA/social), AI Assess button,
    Kit Digital only filter, AI queue status in enrichment tab

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:25:06 +02:00
parent 7acff12242
commit faca4b6e1a
7 changed files with 875 additions and 382 deletions

View File

@@ -1,5 +1,7 @@
import asyncio
import json
import os
import re
import ssl
import socket
import datetime
@@ -11,49 +13,149 @@ import dns.resolver
import aiosqlite
from bs4 import BeautifulSoup
from app.db import SQLITE_PATH
from app.db import SQLITE_PATH, queue_ai, save_ai_assessment, get_ai_queue_status
from app.scorer import score
logger = logging.getLogger(__name__)

# Max number of domains enriched concurrently (overridable via environment).
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "50"))

# ip-api.com free tier: 45 req/min → ~1.33/s. We use a separate slower semaphore.
IP_API_SEMAPHORE: Optional[asyncio.Semaphore] = None
# NOTE(review): the two IP_API_RATE assignments below look like leftover
# old/new diff residue (same value assigned twice) — confirm only one
# survives in the canonical file.
IP_API_RATE = 45 # per minute
IP_API_RATE = 45 # req/min free tier

# Handles for the background enrichment and AI-assessment worker tasks.
_worker_task: Optional[asyncio.Task] = None
_ai_worker_task: Optional[asyncio.Task] = None
# Cooperative pause flag polled by worker_loop.
_paused = False
# Lazily created single-slot semaphore serialising ip-api.com calls.
_ip_sem: Optional[asyncio.Semaphore] = None
# Event-loop timestamp of the most recent ip-api.com request.
_ip_last_call = 0.0

# NOTE(review): both getters below are present in this view (old/new pair
# from the diff); _get_ip_sem is the one referenced by get_ip_country.
def get_ip_semaphore():
    """Return the shared 1-slot ip-api.com semaphore (legacy name), creating it lazily."""
    global IP_API_SEMAPHORE
    if IP_API_SEMAPHORE is None:
        IP_API_SEMAPHORE = asyncio.Semaphore(1)
    return IP_API_SEMAPHORE

def _get_ip_sem():
    """Return the shared 1-slot ip-api.com semaphore, creating it lazily."""
    global _ip_sem
    if _ip_sem is None:
        _ip_sem = asyncio.Semaphore(1)
    return _ip_sem
# ── CMS detection ────────────────────────────────────────────────────────────
# Substring signatures checked case-insensitively against page HTML plus the
# response headers. Dict order matters: the first CMS with a hit wins.
CMS_SIGNATURES = {
    "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
    "joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
    "drupal": ["/sites/default/files/", "Drupal.settings", 'content="Drupal'],
    "wix": ["static.wixstatic.com", "X-Wix-"],
    "squarespace": ["squarespace.com", "X-Squarespace-"],
    "shopify": ["cdn.shopify.com", "Shopify.theme"],
    "prestashop": ["PrestaShop", "/modules/prestashop"],
    "magento": ["Mage.Cookies", "X-Magento-"],
    "typo3": ["typo3temp", "TYPO3 CMS"],
    "opencart": ["route=common/home", "OpenCart"],
}

def detect_cms(html: str, headers: dict) -> Optional[str]:
    """Best-effort CMS detection from page HTML and HTTP response headers.

    Only the first 60 kB of HTML are scanned to bound work on huge pages.
    Returns the matching CMS_SIGNATURES key (e.g. "wordpress") or None.
    """
    combined = html[:60000] + " ".join(f"{k}:{v}" for k, v in headers.items())
    cl = combined.lower()  # lowercase once instead of once per signature
    for cms, sigs in CMS_SIGNATURES.items():
        if any(s.lower() in cl for s in sigs):
            return cms
    return None
# ── Kit Digital detection ────────────────────────────────────────────────────
# Spain's "Kit Digital" digitalisation grants: sites showing the programme's
# badges, wording or links are proven IT-service buyers, so we flag them.
KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "planderecuperacion",
    "acelerapyme", "logo-ue", "recovery-eu", "cofinanciado",
]
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "agentes digitalizadores",
    "fondos europeos", "next generation eu", "nextgenerationeu",
    "plan de recuperación", "plan de recuperacion",
    "plan de digitalización", "digitalización pymes",
    "prtr", "financiado por la unión europea",
    "red.es/kit-digital", "acelerapyme.es",
]
KIT_LINK_PATS = ["acelerapyme", "red.es", "kit-digital", "kitdigital"]

def _kit_link_match(href: str) -> bool:
    """Return True if the (already lowercased) href targets a Kit Digital site.

    Fix: "red.es" is matched against the URL host only — a plain substring
    test false-positives on unrelated domains such as "wired.es".
    """
    for pat in KIT_LINK_PATS:
        if pat == "red.es":
            host = href.split("://", 1)[-1].lstrip("/").split("/", 1)[0].split("?", 1)[0]
            if host == "red.es" or host.endswith(".red.es"):
                return True
        elif pat in href:
            return True
    return False

def detect_kit_digital(soup, html: str) -> tuple[bool, list]:
    """Scan a parsed page for Kit Digital signals in images, text and links.

    Returns (found, signals): signals is an order-preserving deduped list
    (max 15) of "img:<pattern>", "text:<pattern>" and "link:<href[:60]>".
    """
    signals = []
    hl = html.lower()
    # Badge/logo images: src, alt and srcset all carry useful hints.
    for img in soup.find_all("img"):
        combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
        for p in KIT_IMG_PATS:
            if p in combined:
                signals.append(f"img:{p}")
                break  # one signal per image is enough
    # Programme wording anywhere in the raw HTML.
    for p in KIT_TEXT_PATS:
        if p in hl:
            signals.append(f"text:{p}")
    # Outbound links to programme sites.
    for a in soup.find_all("a", href=True):
        href = a["href"].lower()
        if _kit_link_match(href):
            signals.append(f"link:{href[:60]}")
    signals = list(dict.fromkeys(signals))[:15]  # dedupe, keep order, cap
    return len(signals) > 0, signals
# ── Contact extraction ────────────────────────────────────────────────────────
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
# Spanish phone format: optional +34 prefix, 9 digits starting 6/7/8/9.
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOMAINS = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]

def extract_contacts(soup, html: str) -> dict:
    """Extract contact channels from a parsed page.

    Returns {"emails": [...], "phones": [...], "whatsapp": [...], "social": [...]},
    each list deduped (insertion order kept) and capped at 5 entries.
    """
    contacts: dict = {"emails": [], "phones": [], "whatsapp": [], "social": []}
    # mailto / tel / WhatsApp / social links
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("mailto:"):
            # Fix: lowercase here so the regex pass below (which lowercases)
            # dedupes against mailto-sourced addresses correctly.
            em = href[7:].split("?")[0].strip().lower()
            if em and em not in contacts["emails"]:
                contacts["emails"].append(em)
        elif href.startswith("tel:"):
            ph = re.sub(r"[^\d+]", "", href[4:])
            if ph and ph not in contacts["phones"]:
                contacts["phones"].append(ph)
        elif "wa.me" in href or "api.whatsapp.com" in href:
            if href not in contacts["whatsapp"]:
                contacts["whatsapp"].append(href[:80])
        else:
            for sd in SOCIAL_DOMAINS:
                if sd in href.lower():
                    clean = href.split("?")[0].rstrip("/")
                    if clean not in contacts["social"]:
                        contacts["social"].append(clean)
                    break
    # Email regex in raw HTML (catches obfuscated ones); skip asset-like hits.
    for em in EMAIL_RE.findall(html[:100000]):
        em = em.lower()
        if em not in contacts["emails"] and not em.endswith((".png", ".jpg", ".css", ".js")):
            contacts["emails"].append(em)
    # Phone numbers in visible text
    for ph in PHONE_RE.findall(soup.get_text()):
        ph_clean = re.sub(r"[\s\-]", "", ph)
        if ph_clean not in contacts["phones"]:
            contacts["phones"].append(ph_clean)
    # Dedupe + cap every channel at 5
    for k in contacts:
        contacts[k] = list(dict.fromkeys(contacts[k]))[:5]
    return contacts
# ── SSL / MX / IP ─────────────────────────────────────────────────────────────
async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
try:
ctx = ssl.create_default_context()
@@ -63,11 +165,8 @@ async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
with socket.create_connection((domain, 443), timeout=5) as sock:
with ctx.wrap_socket(sock, server_hostname=domain) as ssock:
cert = ssock.getpeercert()
expiry_str = cert.get("notAfter", "")
expiry = datetime.datetime.strptime(expiry_str, "%b %d %H:%M:%S %Y %Z")
days = (expiry - datetime.datetime.utcnow()).days
return True, days
expiry = datetime.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
return True, (expiry - datetime.datetime.utcnow()).days
return await loop.run_in_executor(None, _check)
except Exception:
return False, None
@@ -76,64 +175,53 @@ async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
async def check_mx(domain: str) -> bool:
    """Return True if *domain* publishes at least one MX record.

    The blocking dnspython lookup runs in the default executor with a 5s
    lifetime; any resolver error (NXDOMAIN, timeout, ...) yields False.
    (Resolves diff residue: the old two-line form left an unreachable
    second return after the new one-liner.)
    """
    try:
        loop = asyncio.get_event_loop()

        def _check():
            try:
                return len(dns.resolver.resolve(domain, "MX", lifetime=5)) > 0
            except Exception:
                return False

        return await loop.run_in_executor(None, _check)
    except Exception:
        return False
# Event-loop timestamp of the previous ip-api.com call (rate limiting).
_ip_last_call = 0.0
# Reserved for future use; lazily initialized. Fix: the original
# "asyncio.Lock() if False else None" always evaluated to None anyway.
_ip_lock = None

async def get_ip_country(ip: str) -> Optional[str]:
    """Resolve *ip* to a two-letter country code via ip-api.com.

    Calls are serialised through a single-slot semaphore and spaced so at
    most IP_API_RATE requests go out per minute (free-tier limit).
    Returns None on any HTTP or network failure.
    """
    global _ip_last_call
    # Enforce 45 req/min = 1 req per ~1.33s
    async with _get_ip_sem():
        now = asyncio.get_event_loop().time()
        wait = (60 / IP_API_RATE) - (now - _ip_last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _ip_last_call = asyncio.get_event_loop().time()
        try:
            async with httpx.AsyncClient(timeout=5) as client:
                r = await client.get(f"http://ip-api.com/json/{ip}?fields=countryCode")
                if r.status_code == 200:
                    return r.json().get("countryCode")
        except Exception:
            pass
    return None
# ── Main enrichment ───────────────────────────────────────────────────────────
async def enrich_domain(domain: str) -> dict:
result = {
"domain": domain,
"is_live": False,
"status_code": None,
"ssl_valid": False,
"ssl_expiry_days": None,
"cms": None,
"has_mx": False,
"ip_country": None,
"page_title": None,
"is_live": False, "status_code": None,
"ssl_valid": False, "ssl_expiry_days": None,
"cms": None, "has_mx": False,
"ip_country": None, "page_title": None,
"server": None,
"kit_digital": False, "kit_digital_signals": "[]",
"contact_info": "{}",
"enriched_at": datetime.datetime.utcnow().isoformat(),
"error": None,
}
try:
async with httpx.AsyncClient(
timeout=10,
follow_redirects=True,
verify=False,
timeout=12, follow_redirects=True, verify=False,
headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
) as client:
resp = await client.get(f"http://{domain}")
@@ -143,11 +231,16 @@ async def enrich_domain(domain: str) -> dict:
html = resp.text
soup = BeautifulSoup(html, "html.parser")
title_tag = soup.find("title")
result["page_title"] = title_tag.get_text(strip=True)[:500] if title_tag else None
title = soup.find("title")
result["page_title"] = title.get_text(strip=True)[:500] if title else None
result["cms"] = detect_cms(html, dict(resp.headers))
# Resolve IP for country lookup
kit, signals = detect_kit_digital(soup, html)
result["kit_digital"] = kit
result["kit_digital_signals"] = json.dumps(signals)
result["contact_info"] = json.dumps(extract_contacts(soup, html))
try:
loop = asyncio.get_event_loop()
ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
@@ -158,15 +251,10 @@ async def enrich_domain(domain: str) -> dict:
except Exception as e:
result["error"] = str(e)[:500]
# SSL check (independent of HTTP)
ssl_valid, ssl_days = await check_ssl(domain)
result["ssl_valid"] = ssl_valid
result["ssl_expiry_days"] = ssl_days
# MX check
result["has_mx"] = await check_mx(domain)
# Score
result["score"] = score(result)
return result
@@ -177,19 +265,24 @@ async def save_enriched(data: dict):
await db.execute(
"""INSERT INTO enriched_domains
(domain, is_live, status_code, ssl_valid, ssl_expiry_days, cms,
has_mx, ip_country, page_title, server, enriched_at, error, score)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)
has_mx, ip_country, page_title, server, enriched_at, error, score,
kit_digital, kit_digital_signals, contact_info)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
ON CONFLICT(domain) DO UPDATE SET
is_live=excluded.is_live, status_code=excluded.status_code,
ssl_valid=excluded.ssl_valid, ssl_expiry_days=excluded.ssl_expiry_days,
cms=excluded.cms, has_mx=excluded.has_mx, ip_country=excluded.ip_country,
page_title=excluded.page_title, server=excluded.server,
enriched_at=excluded.enriched_at, error=excluded.error, score=excluded.score""",
enriched_at=excluded.enriched_at, error=excluded.error, score=excluded.score,
kit_digital=excluded.kit_digital,
kit_digital_signals=excluded.kit_digital_signals,
contact_info=excluded.contact_info""",
(
data["domain"], data["is_live"], data["status_code"],
data["ssl_valid"], data["ssl_expiry_days"], data["cms"],
data["has_mx"], data["ip_country"], data["page_title"],
data["server"], data["enriched_at"], data["error"], data["score"],
int(data["kit_digital"]), data["kit_digital_signals"], data["contact_info"],
),
)
await db.execute(
@@ -205,18 +298,17 @@ async def mark_job(domain: str, status: str, error: str = None):
if status == "running":
await db.execute(
"UPDATE job_queue SET status=?, started_at=datetime('now') WHERE domain=?",
(status, domain),
)
(status, domain))
elif status in ("done", "failed"):
await db.execute(
"UPDATE job_queue SET status=?, completed_at=datetime('now'), error=? WHERE domain=?",
(status, error, domain),
)
(status, error, domain))
await db.commit()
# ── Enrichment worker ─────────────────────────────────────────────────────────
async def worker_loop():
global _paused
sem = asyncio.Semaphore(CONCURRENCY_LIMIT)
async def process(domain: str):
@@ -233,26 +325,70 @@ async def worker_loop():
if _paused:
await asyncio.sleep(1)
continue
async with aiosqlite.connect(SQLITE_PATH) as db:
async with db.execute(
"SELECT domain FROM job_queue WHERE status='pending' LIMIT 100"
) as cur:
rows = await cur.fetchall()
if not rows:
await asyncio.sleep(2)
continue
await asyncio.gather(*[asyncio.create_task(process(r[0])) for r in rows], return_exceptions=True)
tasks = [asyncio.create_task(process(r[0])) for r in rows]
await asyncio.gather(*tasks, return_exceptions=True)
# ── AI assessment worker ──────────────────────────────────────────────────────
async def ai_worker_loop():
    """Background loop for AI lead assessment.

    Repeatedly pulls up to 20 'pending' domains from ai_queue, flips them to
    'running', then assesses them concurrently via the Gemini/Replicate
    helper, persisting each result with save_ai_assessment.
    """
    # Imported lazily so the module can be loaded without the AI dependency.
    from app.replicate_ai import assess_domain as gemini_assess
    while True:
        async with aiosqlite.connect(SQLITE_PATH) as db:
            async with db.execute(
                "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 20"
            ) as cur:
                rows = await cur.fetchall()
            # Mark as running
            # NOTE(review): "created_at=created_at" is a no-op column update —
            # confirm whether a real timestamp update was intended here.
            if rows:
                await db.executemany(
                    "UPDATE ai_queue SET status='running', created_at=created_at WHERE domain=?",
                    [(r[0],) for r in rows],
                )
                await db.commit()
        if not rows:
            await asyncio.sleep(3)
            continue

        async def assess_one(domain: str):
            # Each task opens its own connection rather than sharing one
            # across concurrent tasks.
            try:
                async with aiosqlite.connect(SQLITE_PATH) as db:
                    db.row_factory = aiosqlite.Row
                    async with db.execute(
                        "SELECT * FROM enriched_domains WHERE domain=?", (domain,)
                    ) as cur:
                        row = await cur.fetchone()
                    if not row:
                        # Nothing enriched for this domain yet; leave the queue
                        # row as-is (NOTE(review): it stays 'running' — confirm
                        # whether it should be marked failed/done instead).
                        return
                    assessment = await gemini_assess(dict(row))
                    # save_ai_assessment presumably also marks the ai_queue row
                    # done — confirm against app.db.
                    await save_ai_assessment(domain, assessment)
            except Exception as e:
                # Any failure (DB, network, JSON parsing) marks the row failed
                # so it is not retried forever.
                async with aiosqlite.connect(SQLITE_PATH) as db:
                    await db.execute(
                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
                        (domain,),
                    )
                    await db.commit()
                logger.error("AI worker error %s: %s", domain, e)

        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
def start_worker():
    """Start (or restart) the enrichment and AI worker tasks and unpause.

    Fix: `_paused = False` previously assigned a function-local because
    `_paused` was missing from the `global` declaration, so calling
    start_worker never actually unpaused worker_loop. Also drops the
    superseded duplicate `global _worker_task` line (diff residue).
    """
    global _worker_task, _ai_worker_task, _paused
    if _worker_task is None or _worker_task.done():
        _worker_task = asyncio.create_task(worker_loop())
    _paused = False
    if _ai_worker_task is None or _ai_worker_task.done():
        _ai_worker_task = asyncio.create_task(ai_worker_loop())
def pause_worker():