From a0c9db1ef26f594ca1d454e1955bf934e054389b Mon Sep 17 00:00:00 2001 From: Malin Date: Fri, 17 Apr 2026 21:35:49 +0200 Subject: [PATCH] fix: DeepSeek niche/type not saving to DB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs: 1. _parse_classify_output stripped <think> block before searching for JSON. DeepSeek-R1 often puts the JSON array inside the think block (especially when it "decides" mid-reasoning), so stripping it first destroyed the data. Fix: search full output first, then inside <think>, then stripped — three fallback strategies with info logging at each step. 2. Phase 2 save used bare UPDATE WHERE domain=? which silently does nothing if the domain row doesn't exist yet in enriched_domains. Fix: replace with INSERT ... ON CONFLICT DO UPDATE (true upsert). Also adds logger.info lines so container logs show raw DeepSeek output and parse result count for easy debugging. Co-Authored-By: Claude Sonnet 4.6 --- app/db.py | 10 +++++++--- app/prescreener.py | 49 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/app/db.py b/app/db.py index 486ba75..f093e2f 100644 --- a/app/db.py +++ b/app/db.py @@ -436,10 +436,14 @@ async def save_prescreen_results(results: list[dict]): niche = r.get("niche") site_type = r.get("type") # DeepSeek returns "type" key if niche or site_type: - # Classification-only update (domain row must already exist) + # Upsert niche/type — works even if the row was never enriched await db.execute( - "UPDATE enriched_domains SET niche=?, site_type=? WHERE domain=?", - (niche, site_type, domain), + """INSERT INTO enriched_domains (domain, niche, site_type) + VALUES (?, ?, ?) 
+ ON CONFLICT(domain) DO UPDATE SET + niche=excluded.niche, + site_type=excluded.site_type""", + (domain, niche, site_type), ) else: # Prescreen status upsert — create row if it doesn't exist yet diff --git a/app/prescreener.py b/app/prescreener.py index 7bd038d..50250bd 100644 --- a/app/prescreener.py +++ b/app/prescreener.py @@ -175,16 +175,48 @@ def _build_classify_prompt(items: list[dict]) -> str: def _parse_classify_output(raw: str) -> list[dict]: + """Extract JSON array from DeepSeek output. + Strategy: search the full raw text first (handles cases where the JSON + sits outside or inside <think> blocks), then try with <think> stripped. + DeepSeek-R1 sometimes puts its answer inside the think block; stripping + it first would lose the data entirely. + """ text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip() - # Strip DeepSeek reasoning block if present - text = re.sub(r"<think>[\s\S]*?</think>", "", text).strip() - m = re.search(r"\[[\s\S]+\]", text) - if m: + + def _try_parse(s: str): + m = re.search(r"\[[\s\S]+\]", s) + if not m: + return None try: - return json.loads(m.group(0)) + result = json.loads(m.group(0)) + if isinstance(result, list) and result: + return result except json.JSONDecodeError: pass - logger.warning("DeepSeek classification parse failed: %s", raw[:300]) + return None + + # 1. Try the full output as-is (handles JSON after </think>) + parsed = _try_parse(text) + if parsed: + logger.info("DeepSeek: parsed %d items from full output", len(parsed)) + return parsed + + # 2. Try ONLY the content inside <think> (handles JSON inside the block) + think_m = re.search(r"<think>([\s\S]*?)</think>", text) + if think_m: + parsed = _try_parse(think_m.group(1)) + if parsed: + logger.info("DeepSeek: parsed %d items from <think> block", len(parsed)) + return parsed + + # 3. 
Try with <think> block stripped (standard path) + stripped = re.sub(r"<think>[\s\S]*?</think>", "", text).strip() + parsed = _try_parse(stripped) + if parsed: + logger.info("DeepSeek: parsed %d items after stripping <think>", len(parsed)) + return parsed + + logger.warning("DeepSeek classification parse failed, raw snippet: %.400s", raw) return [] @@ -217,7 +249,10 @@ async def classify_with_deepseek(live_items: list[dict]) -> list[dict]: if isinstance(output, list): output = "".join(output) - return _parse_classify_output(output) + logger.info("DeepSeek raw output (first 500 chars): %.500s", output) + result = _parse_classify_output(output) + logger.info("DeepSeek classified %d / %d domains", len(result), len(live_items)) + return result except Exception as e: logger.error("DeepSeek classification error: %s", e)