fix: DeepSeek niche/type not saving to DB
Two bugs:

1. _parse_classify_output stripped the <think> block before searching for JSON. DeepSeek-R1 often puts the JSON array inside the think block (especially when it "decides" mid-reasoning), so stripping it first destroyed the data. Fix: search the full output first, then the content inside <think>, then the output with <think> stripped — three fallback strategies, with info logging at each step.

2. The Phase 2 save used a bare UPDATE ... WHERE domain=?, which silently does nothing if the domain row doesn't exist yet in enriched_domains. Fix: replace it with INSERT ... ON CONFLICT DO UPDATE (a true upsert).

Also adds logger.info lines so container logs show the raw DeepSeek output and the parse result count for easy debugging.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
10
app/db.py
10
app/db.py
@@ -436,10 +436,14 @@ async def save_prescreen_results(results: list[dict]):
|
|||||||
niche = r.get("niche")
|
niche = r.get("niche")
|
||||||
site_type = r.get("type") # DeepSeek returns "type" key
|
site_type = r.get("type") # DeepSeek returns "type" key
|
||||||
if niche or site_type:
|
if niche or site_type:
|
||||||
# Classification-only update (domain row must already exist)
|
# Upsert niche/type — works even if the row was never enriched
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"UPDATE enriched_domains SET niche=?, site_type=? WHERE domain=?",
|
"""INSERT INTO enriched_domains (domain, niche, site_type)
|
||||||
(niche, site_type, domain),
|
VALUES (?, ?, ?)
|
||||||
|
ON CONFLICT(domain) DO UPDATE SET
|
||||||
|
niche=excluded.niche,
|
||||||
|
site_type=excluded.site_type""",
|
||||||
|
(domain, niche, site_type),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Prescreen status upsert — create row if it doesn't exist yet
|
# Prescreen status upsert — create row if it doesn't exist yet
|
||||||
|
|||||||
@@ -175,16 +175,48 @@ def _build_classify_prompt(items: list[dict]) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _parse_classify_output(raw: str) -> list[dict]:
|
def _parse_classify_output(raw: str) -> list[dict]:
|
||||||
|
"""Extract JSON array from DeepSeek output.
|
||||||
|
Strategy: search the full raw text first (handles cases where the JSON
|
||||||
|
sits outside or inside <think> blocks), then try with think stripped.
|
||||||
|
DeepSeek-R1 sometimes puts its answer inside the think block; stripping
|
||||||
|
it first would lose the data entirely.
|
||||||
|
"""
|
||||||
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
||||||
# Strip DeepSeek <think>…</think> reasoning block if present
|
|
||||||
text = re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
|
def _try_parse(s: str):
|
||||||
m = re.search(r"\[[\s\S]+\]", text)
|
m = re.search(r"\[[\s\S]+\]", s)
|
||||||
if m:
|
if not m:
|
||||||
|
return None
|
||||||
try:
|
try:
|
||||||
return json.loads(m.group(0))
|
result = json.loads(m.group(0))
|
||||||
|
if isinstance(result, list) and result:
|
||||||
|
return result
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
logger.warning("DeepSeek classification parse failed: %s", raw[:300])
|
return None
|
||||||
|
|
||||||
|
# 1. Try the full output as-is (handles JSON after </think>)
|
||||||
|
parsed = _try_parse(text)
|
||||||
|
if parsed:
|
||||||
|
logger.info("DeepSeek: parsed %d items from full output", len(parsed))
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
# 2. Try ONLY the content inside <think> (handles JSON inside the block)
|
||||||
|
think_m = re.search(r"<think>([\s\S]*?)</think>", text)
|
||||||
|
if think_m:
|
||||||
|
parsed = _try_parse(think_m.group(1))
|
||||||
|
if parsed:
|
||||||
|
logger.info("DeepSeek: parsed %d items from <think> block", len(parsed))
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
# 3. Try with think block stripped (standard path)
|
||||||
|
stripped = re.sub(r"<think>[\s\S]*?</think>", "", text).strip()
|
||||||
|
parsed = _try_parse(stripped)
|
||||||
|
if parsed:
|
||||||
|
logger.info("DeepSeek: parsed %d items after stripping <think>", len(parsed))
|
||||||
|
return parsed
|
||||||
|
|
||||||
|
logger.warning("DeepSeek classification parse failed, raw snippet: %.400s", raw)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
@@ -217,7 +249,10 @@ async def classify_with_deepseek(live_items: list[dict]) -> list[dict]:
|
|||||||
if isinstance(output, list):
|
if isinstance(output, list):
|
||||||
output = "".join(output)
|
output = "".join(output)
|
||||||
|
|
||||||
return _parse_classify_output(output)
|
logger.info("DeepSeek raw output (first 500 chars): %.500s", output)
|
||||||
|
result = _parse_classify_output(output)
|
||||||
|
logger.info("DeepSeek classified %d / %d domains", len(result), len(live_items))
|
||||||
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("DeepSeek classification error: %s", e)
|
logger.error("DeepSeek classification error: %s", e)
|
||||||
|
|||||||
Reference in New Issue
Block a user