feat: Gemini AI assessment, Kit Digital detection, contact extraction
Kit Digital detection (enricher.py):
- Scans img src/alt/srcset for digitalizadores, kit-digital, fondos-europeos, etc.
- Scans page text for Kit Digital, Agente Digitalizador, Next Generation EU, PRTR
- Scans links for acelerapyme.es, red.es, kit-digital refs
- +20 score bonus for Kit Digital confirmed sites (proven IT buyers)
Contact extraction (enricher.py):
- Pulls mailto/tel/wa.me links from HTML
- Extracts email addresses via regex, phone numbers (ES format)
- Detects social media links (FB, IG, LinkedIn, Twitter, TikTok)
- Stored as JSON in contact_info column
Gemini via Replicate (replicate_ai.py):
- Assesses lead quality (HOT/WARM/COLD), Kit Digital confirmation
- Identifies best contact channel + actual value (email/phone/WA)
- Writes Spanish cold-call/email pitch angle
- Lists services likely needed + outreach notes
- 3 concurrent requests, 90s timeout, JSON output parsing
DB: migration adds kit_digital, kit_digital_signals, contact_info,
ai_assessment, ai_lead_quality, ai_pitch, ai_contact_channel/value,
ai_queue table
UI: Kit Digital 🏅 badge, AI quality pill (clickable modal with full
assessment), contact chips (email/phone/WA/social), AI Assess button,
Kit Digital only filter, AI queue status in enrichment tab
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app/enricher.py
@@ -1,5 +1,7 @@
import asyncio
import json
import os
import re
import ssl
import socket
import datetime
@@ -11,49 +13,149 @@ import dns.resolver
import aiosqlite
from bs4 import BeautifulSoup

from app.db import SQLITE_PATH, queue_ai, save_ai_assessment, get_ai_queue_status
from app.scorer import score

logger = logging.getLogger(__name__)

CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "50"))
# ip-api.com free tier: 45 req/min → ~1.33/s. We use a separate slower semaphore.
IP_API_RATE = 45  # req/min free tier

_worker_task: Optional[asyncio.Task] = None
_ai_worker_task: Optional[asyncio.Task] = None
_paused = False
_ip_sem: Optional[asyncio.Semaphore] = None
_ip_last_call = 0.0


def _get_ip_sem():
    global _ip_sem
    if _ip_sem is None:
        _ip_sem = asyncio.Semaphore(1)
    return _ip_sem
# ── CMS detection ────────────────────────────────────────────────────────────

CMS_SIGNATURES = {
    "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
    "joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
    "drupal": ["/sites/default/files/", "Drupal.settings", 'content="Drupal'],
    "wix": ["static.wixstatic.com", "X-Wix-"],
    "squarespace": ["squarespace.com", "X-Squarespace-"],
    "shopify": ["cdn.shopify.com", "Shopify.theme"],
    "prestashop": ["PrestaShop", "/modules/prestashop"],
    "magento": ["Mage.Cookies", "X-Magento-"],
    "typo3": ["typo3temp", "TYPO3 CMS"],
    "opencart": ["route=common/home", "OpenCart"],
}


def detect_cms(html: str, headers: dict) -> Optional[str]:
    combined = html[:60000] + " ".join(f"{k}:{v}" for k, v in headers.items())
    cl = combined.lower()
    for cms, sigs in CMS_SIGNATURES.items():
        if any(s.lower() in cl for s in sigs):
            return cms
    return None
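As a quick sanity check, here is how detect_cms behaves on a minimal WordPress-style page; the markup and headers are made-up examples, not fixtures from the repo:

html = '<html><head><link href="/wp-content/themes/x/style.css"></head></html>'
headers = {"Server": "nginx"}
# "/wp-content/" is a wordpress signature, so this prints "wordpress".
print(detect_cms(html, headers))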
# ── Kit Digital detection ────────────────────────────────────────────────────

KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "planderecuperacion",
    "acelerapyme", "logo-ue", "recovery-eu", "cofinanciado",
]
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "agentes digitalizadores",
    "fondos europeos", "next generation eu", "nextgenerationeu",
    "plan de recuperación", "plan de recuperacion",
    "plan de digitalización", "digitalización pymes",
    "prtr", "financiado por la unión europea",
    "red.es/kit-digital", "acelerapyme.es",
]
KIT_LINK_PATS = ["acelerapyme", "red.es", "kit-digital", "kitdigital"]


def detect_kit_digital(soup, html: str) -> tuple[bool, list]:
    signals = []
    hl = html.lower()

    for img in soup.find_all("img"):
        combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
        for p in KIT_IMG_PATS:
            if p in combined:
                signals.append(f"img:{p}")
                break

    for p in KIT_TEXT_PATS:
        if p in hl:
            signals.append(f"text:{p}")

    for a in soup.find_all("a", href=True):
        href = a["href"].lower()
        if any(p in href for p in KIT_LINK_PATS):
            signals.append(f"link:{href[:60]}")

    signals = list(dict.fromkeys(signals))[:15]
    return len(signals) > 0, signals
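For illustration, a made-up snippet that trips all three detectors (image, text, and link); note that the text patterns run against the raw HTML, so the acelerapyme.es href also registers as a text signal:

from bs4 import BeautifulSoup

html = """<p>Somos Agente Digitalizador del programa Kit Digital.</p>
<img src="/img/logo-kit-digital.png" alt="Ayudas">
<a href="https://www.acelerapyme.es/">Acelera pyme</a>"""
soup = BeautifulSoup(html, "html.parser")
found, signals = detect_kit_digital(soup, html)
# found   -> True
# signals -> ['img:kit-digital', 'text:kit digital', 'text:agente digitalizador',
#             'text:acelerapyme.es', 'link:https://www.acelerapyme.es/']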
# ── Contact extraction ────────────────────────────────────────────────────────

EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOMAINS = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]


def extract_contacts(soup, html: str) -> dict:
    contacts: dict = {"emails": [], "phones": [], "whatsapp": [], "social": []}

    # mailto links
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("mailto:"):
            em = href[7:].split("?")[0].strip()
            if em and em not in contacts["emails"]:
                contacts["emails"].append(em)
        elif href.startswith("tel:"):
            ph = re.sub(r"[^\d+]", "", href[4:])
            if ph and ph not in contacts["phones"]:
                contacts["phones"].append(ph)
        elif "wa.me" in href or "api.whatsapp.com" in href:
            if href not in contacts["whatsapp"]:
                contacts["whatsapp"].append(href[:80])
        else:
            for sd in SOCIAL_DOMAINS:
                if sd in href.lower():
                    clean = href.split("?")[0].rstrip("/")
                    if clean not in contacts["social"]:
                        contacts["social"].append(clean)
                    break

    # Email regex in raw HTML (catches obfuscated ones)
    for em in EMAIL_RE.findall(html[:100000]):
        em = em.lower()
        if em not in contacts["emails"] and not em.endswith((".png", ".jpg", ".css", ".js")):
            contacts["emails"].append(em)

    # Phone numbers in visible text
    for ph in PHONE_RE.findall(soup.get_text()):
        ph_clean = re.sub(r"[\s\-]", "", ph)
        if ph_clean not in contacts["phones"]:
            contacts["phones"].append(ph_clean)

    # Dedupe + cap
    for k in contacts:
        contacts[k] = list(dict.fromkeys(contacts[k]))[:5]

    return contacts
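A worked example with invented contact details, showing the precedence (explicit mailto:/tel:/wa.me links first, then the regex sweep over raw HTML and visible text):

html = """<a href="mailto:info@example.es?subject=hola">Email</a>
<a href="tel:+34 912 345 678">Llámanos</a>
<a href="https://wa.me/34600111222">WhatsApp</a>
<a href="https://www.instagram.com/miempresa/?hl=es">IG</a>
Atención: 600 111 222"""
soup = BeautifulSoup(html, "html.parser")
print(extract_contacts(soup, html))
# {'emails': ['info@example.es'], 'phones': ['+34912345678', '600111222'],
#  'whatsapp': ['https://wa.me/34600111222'],
#  'social': ['https://www.instagram.com/miempresa']}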
# ── SSL / MX / IP ─────────────────────────────────────────────────────────────

async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
    try:
        ctx = ssl.create_default_context()

@@ -63,11 +165,8 @@ async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:

            with socket.create_connection((domain, 443), timeout=5) as sock:
                with ctx.wrap_socket(sock, server_hostname=domain) as ssock:
                    cert = ssock.getpeercert()
                    expiry = datetime.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
                    return True, (expiry - datetime.datetime.utcnow()).days

        return await loop.run_in_executor(None, _check)
    except Exception:
        return False, None

@@ -76,64 +175,53 @@ async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:

async def check_mx(domain: str) -> bool:
    try:
        loop = asyncio.get_event_loop()

        def _check():
            try:
                return len(dns.resolver.resolve(domain, "MX", lifetime=5)) > 0
            except Exception:
                return False

        return await loop.run_in_executor(None, _check)
    except Exception:
        return False


async def get_ip_country(ip: str) -> Optional[str]:
    global _ip_last_call
    # Enforce 45 req/min = 1 req per 1.33s
    async with _get_ip_sem():
        now = asyncio.get_event_loop().time()
        wait = (60 / IP_API_RATE) - (now - _ip_last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _ip_last_call = asyncio.get_event_loop().time()

    try:
        async with httpx.AsyncClient(timeout=5) as client:
            r = await client.get(f"http://ip-api.com/json/{ip}?fields=countryCode")
            if r.status_code == 200:
                return r.json().get("countryCode")
    except Exception:
        pass
    return None
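Callers just await the lookup; the shared semaphore plus timestamp spaces consecutive API hits 60 / IP_API_RATE ≈ 1.33 s apart. A one-line smoke test (hits the real ip-api.com endpoint, so it needs network access):

# Google's public resolver is a convenient known-answer test.
print(asyncio.run(get_ip_country("8.8.8.8")))  # -> "US" (or None if offline)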
# ── Main enrichment ───────────────────────────────────────────────────────────

async def enrich_domain(domain: str) -> dict:
    result = {
        "domain": domain,
        "is_live": False, "status_code": None,
        "ssl_valid": False, "ssl_expiry_days": None,
        "cms": None, "has_mx": False,
        "ip_country": None, "page_title": None,
        "server": None,
        "kit_digital": False, "kit_digital_signals": "[]",
        "contact_info": "{}",
        "enriched_at": datetime.datetime.utcnow().isoformat(),
        "error": None,
    }

    try:
        async with httpx.AsyncClient(
            timeout=12, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            resp = await client.get(f"http://{domain}")

@@ -143,11 +231,16 @@ async def enrich_domain(domain: str) -> dict:

            html = resp.text
            soup = BeautifulSoup(html, "html.parser")
            title = soup.find("title")
            result["page_title"] = title.get_text(strip=True)[:500] if title else None
            result["cms"] = detect_cms(html, dict(resp.headers))

            kit, signals = detect_kit_digital(soup, html)
            result["kit_digital"] = kit
            result["kit_digital_signals"] = json.dumps(signals)

            result["contact_info"] = json.dumps(extract_contacts(soup, html))

            # Resolve IP for country lookup
            try:
                loop = asyncio.get_event_loop()
                ip = await loop.run_in_executor(None, socket.gethostbyname, domain)

@@ -158,15 +251,10 @@ async def enrich_domain(domain: str) -> dict:

    except Exception as e:
        result["error"] = str(e)[:500]

    ssl_valid, ssl_days = await check_ssl(domain)
    result["ssl_valid"] = ssl_valid
    result["ssl_expiry_days"] = ssl_days
    result["has_mx"] = await check_mx(domain)
    result["score"] = score(result)

    return result
@@ -177,19 +265,24 @@ async def save_enriched(data: dict):

        await db.execute(
            """INSERT INTO enriched_domains
               (domain, is_live, status_code, ssl_valid, ssl_expiry_days, cms,
                has_mx, ip_country, page_title, server, enriched_at, error, score,
                kit_digital, kit_digital_signals, contact_info)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
               ON CONFLICT(domain) DO UPDATE SET
                 is_live=excluded.is_live, status_code=excluded.status_code,
                 ssl_valid=excluded.ssl_valid, ssl_expiry_days=excluded.ssl_expiry_days,
                 cms=excluded.cms, has_mx=excluded.has_mx, ip_country=excluded.ip_country,
                 page_title=excluded.page_title, server=excluded.server,
                 enriched_at=excluded.enriched_at, error=excluded.error, score=excluded.score,
                 kit_digital=excluded.kit_digital,
                 kit_digital_signals=excluded.kit_digital_signals,
                 contact_info=excluded.contact_info""",
            (
                data["domain"], data["is_live"], data["status_code"],
                data["ssl_valid"], data["ssl_expiry_days"], data["cms"],
                data["has_mx"], data["ip_country"], data["page_title"],
                data["server"], data["enriched_at"], data["error"], data["score"],
                int(data["kit_digital"]), data["kit_digital_signals"], data["contact_info"],
            ),
        )
        await db.execute(
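The migration itself lives in app/db.py and is not part of this diff. Judging from the commit message and the column list above, it presumably amounts to something like the following sketch; the column types and defaults are assumptions:

# Hypothetical reconstruction: the real migration is in app/db.py, not shown here.
MIGRATION_SQL = [
    "ALTER TABLE enriched_domains ADD COLUMN kit_digital INTEGER DEFAULT 0",
    "ALTER TABLE enriched_domains ADD COLUMN kit_digital_signals TEXT DEFAULT '[]'",
    "ALTER TABLE enriched_domains ADD COLUMN contact_info TEXT DEFAULT '{}'",
    "ALTER TABLE enriched_domains ADD COLUMN ai_assessment TEXT",
    "ALTER TABLE enriched_domains ADD COLUMN ai_lead_quality TEXT",
    "ALTER TABLE enriched_domains ADD COLUMN ai_pitch TEXT",
    "ALTER TABLE enriched_domains ADD COLUMN ai_contact_channel TEXT",
    "ALTER TABLE enriched_domains ADD COLUMN ai_contact_value TEXT",
    """CREATE TABLE IF NOT EXISTS ai_queue (
           domain TEXT PRIMARY KEY,
           status TEXT DEFAULT 'pending',
           created_at TEXT DEFAULT (datetime('now')),
           completed_at TEXT
       )""",
]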
@@ -205,18 +298,17 @@ async def mark_job(domain: str, status: str, error: str = None):

        if status == "running":
            await db.execute(
                "UPDATE job_queue SET status=?, started_at=datetime('now') WHERE domain=?",
                (status, domain))
        elif status in ("done", "failed"):
            await db.execute(
                "UPDATE job_queue SET status=?, completed_at=datetime('now'), error=? WHERE domain=?",
                (status, error, domain))
        await db.commit()


# ── Enrichment worker ─────────────────────────────────────────────────────────

async def worker_loop():
    global _paused
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async def process(domain: str):

@@ -233,26 +325,70 @@ async def worker_loop():

        if _paused:
            await asyncio.sleep(1)
            continue

        async with aiosqlite.connect(SQLITE_PATH) as db:
            async with db.execute(
                "SELECT domain FROM job_queue WHERE status='pending' LIMIT 100"
            ) as cur:
                rows = await cur.fetchall()

        if not rows:
            await asyncio.sleep(2)
            continue

        tasks = [asyncio.create_task(process(r[0])) for r in rows]
        await asyncio.gather(*tasks, return_exceptions=True)
# ── AI assessment worker ──────────────────────────────────────────────────────

async def ai_worker_loop():
    from app.replicate_ai import assess_domain as gemini_assess

    while True:
        async with aiosqlite.connect(SQLITE_PATH) as db:
            async with db.execute(
                "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 20"
            ) as cur:
                rows = await cur.fetchall()
            # Mark as running
            if rows:
                await db.executemany(
                    "UPDATE ai_queue SET status='running' WHERE domain=?",
                    [(r[0],) for r in rows],
                )
                await db.commit()

        if not rows:
            await asyncio.sleep(3)
            continue

        async def assess_one(domain: str):
            try:
                async with aiosqlite.connect(SQLITE_PATH) as db:
                    db.row_factory = aiosqlite.Row
                    async with db.execute(
                        "SELECT * FROM enriched_domains WHERE domain=?", (domain,)
                    ) as cur:
                        row = await cur.fetchone()
                if not row:
                    return
                assessment = await gemini_assess(dict(row))
                await save_ai_assessment(domain, assessment)
            except Exception as e:
                async with aiosqlite.connect(SQLITE_PATH) as db:
                    await db.execute(
                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
                        (domain,),
                    )
                    await db.commit()
                logger.error("AI worker error %s: %s", domain, e)

        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
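app/replicate_ai.py is not included in this diff. Going only by the commit message (Gemini on Replicate, 3 concurrent requests, 90 s timeout, JSON output parsing), its core plausibly looks like the sketch below; the model slug, prompt wording, and output field names are all assumptions, not the repo's actual code:

import asyncio
import json

import replicate  # official client; reads REPLICATE_API_TOKEN from the env

_SEM = asyncio.Semaphore(3)    # commit message: 3 concurrent requests
MODEL = "google/gemini-flash"  # hypothetical slug; the real one is not in the diff

async def assess_domain(row: dict) -> dict:
    """Sketch: prompt the model with the enriched row, parse the JSON reply."""
    prompt = (
        "Evalúa este lead para una agencia IT española. Responde SOLO JSON: "
        '{"lead_quality": "HOT|WARM|COLD", "kit_digital": true, '
        '"contact_channel": "email|phone|whatsapp", "contact_value": "...", '
        '"pitch": "...", "services": [], "notes": "..."}\n\n'
        + json.dumps(row, default=str)
    )
    async with _SEM:
        loop = asyncio.get_event_loop()
        # replicate.run() blocks, so push it to a thread and cap it at 90 s.
        out = await asyncio.wait_for(
            loop.run_in_executor(
                None, lambda: replicate.run(MODEL, input={"prompt": prompt})
            ),
            timeout=90,
        )
    text = out if isinstance(out, str) else "".join(out)
    # Models often wrap JSON in ``` fences; strip them before parsing.
    text = text.strip().strip("`").removeprefix("json").strip()
    return json.loads(text)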
def start_worker():
    global _worker_task, _ai_worker_task, _paused
    if _worker_task is None or _worker_task.done():
        _worker_task = asyncio.create_task(worker_loop())
    _paused = False
    if _ai_worker_task is None or _ai_worker_task.done():
        _ai_worker_task = asyncio.create_task(ai_worker_loop())


def pause_worker():