fix: DeepSeek niche/type not saving to DB

Two bugs:
1. _parse_classify_output stripped <think> block before searching for JSON.
   DeepSeek-R1 often puts the JSON array inside the think block (especially
   when it "decides" mid-reasoning), so stripping it first destroyed the data.
   Fix: search full output first, then inside <think>, then stripped — three
   fallback strategies with info logging at each step.

2. Phase 2 save used bare UPDATE WHERE domain=? which silently does nothing
   if the domain row doesn't exist yet in enriched_domains.
   Fix: replace with INSERT ... ON CONFLICT DO UPDATE (true upsert).

Also adds logger.info lines so container logs show raw DeepSeek output
and parse result count for easy debugging.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 21:35:49 +02:00
parent 7fc510f903
commit a0c9db1ef2
2 changed files with 49 additions and 10 deletions

View File

@@ -436,10 +436,14 @@ async def save_prescreen_results(results: list[dict]):
niche = r.get("niche") niche = r.get("niche")
site_type = r.get("type") # DeepSeek returns "type" key site_type = r.get("type") # DeepSeek returns "type" key
if niche or site_type: if niche or site_type:
# Classification-only update (domain row must already exist) # Upsert niche/type — works even if the row was never enriched
await db.execute( await db.execute(
"UPDATE enriched_domains SET niche=?, site_type=? WHERE domain=?", """INSERT INTO enriched_domains (domain, niche, site_type)
(niche, site_type, domain), VALUES (?, ?, ?)
ON CONFLICT(domain) DO UPDATE SET
niche=excluded.niche,
site_type=excluded.site_type""",
(domain, niche, site_type),
) )
else: else:
# Prescreen status upsert — create row if it doesn't exist yet # Prescreen status upsert — create row if it doesn't exist yet

View File

@@ -175,16 +175,48 @@ def _build_classify_prompt(items: list[dict]) -> str:
def _parse_classify_output(raw: str) -> list[dict]:
    """Extract a JSON array of classification dicts from raw DeepSeek output.

    DeepSeek-R1 wraps its reasoning in a <think>...</think> block and sometimes
    emits the JSON answer *inside* that block, so stripping the block before
    searching can destroy the data entirely.  Three fallback strategies are
    tried in order, with an info log on whichever one succeeds:

      1. the full output as-is        (handles JSON after </think>)
      2. only the <think> contents    (handles JSON inside the block)
      3. output with <think> removed  (standard path)

    Returns an empty list when no strategy yields a non-empty JSON list.
    """
    # Drop markdown code fences (``` / ```json) that models often wrap around JSON.
    text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()

    def _try_parse(s: str):
        """Return the first parseable non-empty JSON list found in *s*, else None."""
        # Greedy span from first '[' to last ']' so multi-line arrays are captured.
        m = re.search(r"\[[\s\S]+\]", s)
        if not m:
            return None
        try:
            result = json.loads(m.group(0))
            if isinstance(result, list) and result:
                return result
        except json.JSONDecodeError:
            pass
        return None

    # 1. Try the full output as-is (handles JSON sitting after </think>).
    parsed = _try_parse(text)
    if parsed:
        logger.info("DeepSeek: parsed %d items from full output", len(parsed))
        return parsed

    # 2. Try ONLY the content inside <think> (handles JSON inside the block).
    think_m = re.search(r"<think>([\s\S]*?)</think>", text)
    if think_m:
        parsed = _try_parse(think_m.group(1))
        if parsed:
            logger.info("DeepSeek: parsed %d items from <think> block", len(parsed))
            return parsed

    # 3. Try with the think block stripped (standard path).
    stripped = re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
    parsed = _try_parse(stripped)
    if parsed:
        logger.info("DeepSeek: parsed %d items after stripping <think>", len(parsed))
        return parsed

    logger.warning("DeepSeek classification parse failed, raw snippet: %.400s", raw)
    return []
@@ -217,7 +249,10 @@ async def classify_with_deepseek(live_items: list[dict]) -> list[dict]:
if isinstance(output, list): if isinstance(output, list):
output = "".join(output) output = "".join(output)
return _parse_classify_output(output) logger.info("DeepSeek raw output (first 500 chars): %.500s", output)
result = _parse_classify_output(output)
logger.info("DeepSeek classified %d / %d domains", len(result), len(live_items))
return result
except Exception as e: except Exception as e:
logger.error("DeepSeek classification error: %s", e) logger.error("DeepSeek classification error: %s", e)