diff --git a/app/db.py b/app/db.py
index 486ba75..f093e2f 100644
--- a/app/db.py
+++ b/app/db.py
@@ -436,10 +436,14 @@ async def save_prescreen_results(results: list[dict]):
niche = r.get("niche")
site_type = r.get("type") # DeepSeek returns "type" key
if niche or site_type:
- # Classification-only update (domain row must already exist)
+ # Upsert niche/type — works even if the row was never enriched
await db.execute(
- "UPDATE enriched_domains SET niche=?, site_type=? WHERE domain=?",
- (niche, site_type, domain),
+ """INSERT INTO enriched_domains (domain, niche, site_type)
+ VALUES (?, ?, ?)
+ ON CONFLICT(domain) DO UPDATE SET
+ niche=excluded.niche,
+ site_type=excluded.site_type""",
+ (domain, niche, site_type),
)
else:
# Prescreen status upsert — create row if it doesn't exist yet
diff --git a/app/prescreener.py b/app/prescreener.py
index 7bd038d..50250bd 100644
--- a/app/prescreener.py
+++ b/app/prescreener.py
@@ -175,16 +175,48 @@ def _build_classify_prompt(items: list[dict]) -> str:
def _parse_classify_output(raw: str) -> list[dict]:
+ """Extract JSON array from DeepSeek output.
+ Strategy: search the full raw text first (handles cases where the JSON
+ sits outside or inside <think> blocks), then try with think stripped.
+ DeepSeek-R1 sometimes puts its answer inside the think block; stripping
+ it first would lose the data entirely.
+ """
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
- # Strip DeepSeek <think>…</think> reasoning block if present
- text = re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
- m = re.search(r"\[[\s\S]+\]", text)
- if m:
+
+ def _try_parse(s: str):
+ m = re.search(r"\[[\s\S]+\]", s)
+ if not m:
+ return None
try:
- return json.loads(m.group(0))
+ result = json.loads(m.group(0))
+ if isinstance(result, list) and result:
+ return result
except json.JSONDecodeError:
pass
- logger.warning("DeepSeek classification parse failed: %s", raw[:300])
+ return None
+
+ # 1. Try the full output as-is (handles JSON after </think>)
+ parsed = _try_parse(text)
+ if parsed:
+ logger.info("DeepSeek: parsed %d items from full output", len(parsed))
+ return parsed
+
+ # 2. Try ONLY the content inside <think>...</think> (handles JSON inside the block)
+ think_m = re.search(r"<think>([\s\S]*?)</think>", text)
+ if think_m:
+ parsed = _try_parse(think_m.group(1))
+ if parsed:
+ logger.info("DeepSeek: parsed %d items from <think> block", len(parsed))
+ return parsed
+
+ # 3. Try with think block stripped (standard path)
+ stripped = re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
+ parsed = _try_parse(stripped)
+ if parsed:
+ logger.info("DeepSeek: parsed %d items after stripping <think>", len(parsed))
+ return parsed
+
+ logger.warning("DeepSeek classification parse failed, raw snippet: %.400s", raw)
return []
@@ -217,7 +249,10 @@ async def classify_with_deepseek(live_items: list[dict]) -> list[dict]:
if isinstance(output, list):
output = "".join(output)
- return _parse_classify_output(output)
+ logger.info("DeepSeek raw output (first 500 chars): %.500s", output)
+ result = _parse_classify_output(output)
+ logger.info("DeepSeek classified %d / %d domains", len(result), len(live_items))
+ return result
except Exception as e:
logger.error("DeepSeek classification error: %s", e)