mirror of
https://github.com/SuperClaude-Org/SuperClaude_Framework.git
synced 2025-12-29 16:16:08 +00:00
feat: PM Agent plugin architecture with confidence check test suite
## Plugin Architecture (Token Efficiency) - Plugin-based PM Agent (97% token reduction vs slash commands) - Lazy loading: 50 tokens at install, 1,632 tokens on /pm invocation - Skills framework: confidence_check skill for hallucination prevention ## Confidence Check Test Suite - 8 test cases (4 categories × 2 cases each) - Real data from agiletec commit history - Precision/Recall evaluation (target: ≥0.9/≥0.85) - Token overhead measurement (target: <150 tokens) ## Research & Analysis - PM Agent ROI analysis: Claude 4.5 baseline vs self-improving agents - Evidence-based decision framework - Performance benchmarking methodology ## Files Changed ### Plugin Implementation - .claude-plugin/plugin.json: Plugin manifest - .claude-plugin/commands/pm.md: PM Agent command - .claude-plugin/skills/confidence_check.py: Confidence assessment - .claude-plugin/marketplace.json: Local marketplace config ### Test Suite - .claude-plugin/tests/confidence_test_cases.json: 8 test cases - .claude-plugin/tests/run_confidence_tests.py: Evaluation script - .claude-plugin/tests/EXECUTION_PLAN.md: Next session guide - .claude-plugin/tests/README.md: Test suite documentation ### Documentation - TEST_PLUGIN.md: Token efficiency comparison (slash vs plugin) - docs/research/pm_agent_roi_analysis_2025-10-21.md: ROI analysis ### Code Changes - src/superclaude/pm_agent/confidence.py: Updated confidence checks - src/superclaude/pm_agent/token_budget.py: Deleted (replaced by /context) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Pre-execution Confidence Check
|
||||
Pre-implementation Confidence Check
|
||||
|
||||
Prevents wrong-direction execution by assessing confidence BEFORE starting.
|
||||
|
||||
@@ -7,9 +7,16 @@ Token Budget: 100-200 tokens
|
||||
ROI: 25-250x token savings when stopping wrong direction
|
||||
|
||||
Confidence Levels:
|
||||
- High (90-100%): Official docs verified, patterns identified, path clear
|
||||
- High (≥90%): Root cause identified, solution verified, no duplication, architecture-compliant
|
||||
- Medium (70-89%): Multiple approaches possible, trade-offs require consideration
|
||||
- Low (<70%): Requirements unclear, no patterns, domain knowledge insufficient
|
||||
- Low (<70%): Investigation incomplete, unclear root cause, missing official docs
|
||||
|
||||
Required Checks:
|
||||
1. No duplicate implementations (check existing code first)
|
||||
2. Architecture compliance (use existing tech stack, e.g., Supabase not custom API)
|
||||
3. Official documentation verified
|
||||
4. Working OSS implementations referenced
|
||||
5. Root cause identified with high certainty
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
@@ -36,40 +43,56 @@ class ConfidenceChecker:
|
||||
"""
|
||||
Assess confidence level (0.0 - 1.0)
|
||||
|
||||
Checks:
|
||||
1. Official documentation verified? (40%)
|
||||
2. Existing patterns identified? (30%)
|
||||
3. Implementation path clear? (30%)
|
||||
Investigation Phase Checks:
|
||||
1. No duplicate implementations? (25%)
|
||||
2. Architecture compliance? (25%)
|
||||
3. Official documentation verified? (20%)
|
||||
4. Working OSS implementations referenced? (15%)
|
||||
5. Root cause identified? (15%)
|
||||
|
||||
Args:
|
||||
context: Context dict with test/implementation details
|
||||
context: Context dict with task details
|
||||
|
||||
Returns:
|
||||
float: Confidence score (0.0 = no confidence, 1.0 = absolute)
|
||||
float: Confidence score (0.0 = no confidence, 1.0 = absolute certainty)
|
||||
"""
|
||||
score = 0.0
|
||||
checks = []
|
||||
|
||||
# Check 1: Documentation verified (40%)
|
||||
# Check 1: No duplicate implementations (25%)
|
||||
if self._no_duplicates(context):
|
||||
score += 0.25
|
||||
checks.append("✅ No duplicate implementations found")
|
||||
else:
|
||||
checks.append("❌ Check for existing implementations first")
|
||||
|
||||
# Check 2: Architecture compliance (25%)
|
||||
if self._architecture_compliant(context):
|
||||
score += 0.25
|
||||
checks.append("✅ Uses existing tech stack (e.g., Supabase)")
|
||||
else:
|
||||
checks.append("❌ Verify architecture compliance (avoid reinventing)")
|
||||
|
||||
# Check 3: Official documentation verified (20%)
|
||||
if self._has_official_docs(context):
|
||||
score += 0.4
|
||||
checks.append("✅ Official documentation")
|
||||
score += 0.2
|
||||
checks.append("✅ Official documentation verified")
|
||||
else:
|
||||
checks.append("❌ Missing documentation")
|
||||
checks.append("❌ Read official docs first")
|
||||
|
||||
# Check 2: Existing patterns (30%)
|
||||
if self._has_existing_patterns(context):
|
||||
score += 0.3
|
||||
checks.append("✅ Existing patterns found")
|
||||
# Check 4: Working OSS implementations referenced (15%)
|
||||
if self._has_oss_reference(context):
|
||||
score += 0.15
|
||||
checks.append("✅ Working OSS implementation found")
|
||||
else:
|
||||
checks.append("❌ No existing patterns")
|
||||
checks.append("❌ Search for OSS implementations")
|
||||
|
||||
# Check 3: Clear implementation path (30%)
|
||||
if self._has_clear_path(context):
|
||||
score += 0.3
|
||||
checks.append("✅ Implementation path clear")
|
||||
# Check 5: Root cause identified (15%)
|
||||
if self._root_cause_identified(context):
|
||||
score += 0.15
|
||||
checks.append("✅ Root cause identified")
|
||||
else:
|
||||
checks.append("❌ Implementation unclear")
|
||||
checks.append("❌ Continue investigation to identify root cause")
|
||||
|
||||
# Store check results for reporting
|
||||
context["confidence_checks"] = checks
|
||||
@@ -103,6 +126,78 @@ class ConfidenceChecker:
|
||||
|
||||
return False
|
||||
|
||||
def _no_duplicates(self, context: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Check for duplicate implementations
|
||||
|
||||
Before implementing, verify:
|
||||
- No existing similar functions/modules (Glob/Grep)
|
||||
- No helper functions that solve the same problem
|
||||
- No libraries that provide this functionality
|
||||
|
||||
Returns True if no duplicates found (investigation complete)
|
||||
"""
|
||||
# This is a placeholder - actual implementation should:
|
||||
# 1. Search codebase with Glob/Grep for similar patterns
|
||||
# 2. Check project dependencies for existing solutions
|
||||
# 3. Verify no helper modules provide this functionality
|
||||
duplicate_check = context.get("duplicate_check_complete", False)
|
||||
return duplicate_check
|
||||
|
||||
def _architecture_compliant(self, context: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Check architecture compliance
|
||||
|
||||
Verify solution uses existing tech stack:
|
||||
- Supabase project → Use Supabase APIs (not custom API)
|
||||
- Next.js project → Use Next.js patterns (not custom routing)
|
||||
- Turborepo → Use workspace patterns (not manual scripts)
|
||||
|
||||
Returns True if solution aligns with project architecture
|
||||
"""
|
||||
# This is a placeholder - actual implementation should:
|
||||
# 1. Read CLAUDE.md for project tech stack
|
||||
# 2. Verify solution uses existing infrastructure
|
||||
# 3. Check not reinventing provided functionality
|
||||
architecture_check = context.get("architecture_check_complete", False)
|
||||
return architecture_check
|
||||
|
||||
def _has_oss_reference(self, context: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Check if working OSS implementations referenced
|
||||
|
||||
Search for:
|
||||
- Similar open-source solutions
|
||||
- Reference implementations in popular projects
|
||||
- Community best practices
|
||||
|
||||
Returns True if OSS reference found and analyzed
|
||||
"""
|
||||
# This is a placeholder - actual implementation should:
|
||||
# 1. Search GitHub for similar implementations
|
||||
# 2. Read popular OSS projects solving same problem
|
||||
# 3. Verify approach matches community patterns
|
||||
oss_check = context.get("oss_reference_complete", False)
|
||||
return oss_check
|
||||
|
||||
def _root_cause_identified(self, context: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Check if root cause is identified with high certainty
|
||||
|
||||
Verify:
|
||||
- Problem source pinpointed (not guessing)
|
||||
- Solution addresses root cause (not symptoms)
|
||||
- Fix verified against official docs/OSS patterns
|
||||
|
||||
Returns True if root cause clearly identified
|
||||
"""
|
||||
# This is a placeholder - actual implementation should:
|
||||
# 1. Verify problem analysis complete
|
||||
# 2. Check solution addresses root cause
|
||||
# 3. Confirm fix aligns with best practices
|
||||
root_cause_check = context.get("root_cause_identified", False)
|
||||
return root_cause_check
|
||||
|
||||
def _has_existing_patterns(self, context: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Check if existing patterns can be followed
|
||||
@@ -162,8 +257,8 @@ class ConfidenceChecker:
|
||||
str: Recommended action
|
||||
"""
|
||||
if confidence >= 0.9:
|
||||
return "✅ High confidence - Proceed immediately"
|
||||
return "✅ High confidence (≥90%) - Proceed with implementation"
|
||||
elif confidence >= 0.7:
|
||||
return "⚠️ Medium confidence - Present options to user"
|
||||
return "⚠️ Medium confidence (70-89%) - Continue investigation, DO NOT implement yet"
|
||||
else:
|
||||
return "❌ Low confidence - STOP and request clarification"
|
||||
return "❌ Low confidence (<70%) - STOP and continue investigation loop"
|
||||
|
||||
@@ -1,260 +0,0 @@
|
||||
"""
|
||||
Token Budget Management
|
||||
|
||||
Budget-aware operations with complexity-based allocation.
|
||||
|
||||
Budget Levels:
|
||||
- Simple (typo fix): 200 tokens
|
||||
- Medium (bug fix): 1,000 tokens
|
||||
- Complex (feature): 2,500 tokens
|
||||
|
||||
Token Efficiency Strategy:
|
||||
- Compress trial-and-error history (keep only successful path)
|
||||
- Focus on actionable learnings (not full trajectory)
|
||||
- Example: "[Summary] 3 failures (details: failures.json) | Success: proper validation"
|
||||
|
||||
Expected Reduction:
|
||||
- Simple tasks: 80-95% reduction
|
||||
- Medium tasks: 60-80% reduction
|
||||
- Complex tasks: 40-60% reduction
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, Literal, Optional
from enum import Enum


class ComplexityLevel(str, Enum):
    """Task complexity levels used to select a token budget."""
    SIMPLE = "simple"    # typo fix, comment update
    MEDIUM = "medium"    # bug fix, refactoring
    COMPLEX = "complex"  # feature implementation


class TokenBudgetManager:
    """
    Token budget management for complexity-aware operations.

    Usage:
        # Simple task (typo fix)
        budget = TokenBudgetManager(complexity="simple")
        assert budget.limit == 200

        # Medium task (bug fix)
        budget = TokenBudgetManager(complexity="medium")
        assert budget.limit == 1000

        # Complex task (feature implementation)
        budget = TokenBudgetManager(complexity="complex")
        assert budget.limit == 2500

        # Check budget
        if budget.remaining < 100:
            print("⚠️ Low budget - compress output")
    """

    # Budget allocations by complexity
    BUDGETS = {
        ComplexityLevel.SIMPLE: 200,    # Typo fix, comment update
        ComplexityLevel.MEDIUM: 1000,   # Bug fix, refactoring
        ComplexityLevel.COMPLEX: 2500,  # Feature implementation
    }

    def __init__(
        self,
        complexity: Literal["simple", "medium", "complex"] = "medium",
        custom_limit: Optional[int] = None
    ):
        """
        Initialize token budget manager.

        Args:
            complexity: Task complexity level
            custom_limit: Custom token limit (overrides complexity-based)

        Raises:
            ValueError: If complexity is not a valid ComplexityLevel value.
        """
        self.complexity = ComplexityLevel(complexity)

        # An explicit limit wins over the complexity-derived default.
        if custom_limit is not None:
            self.limit = custom_limit
        else:
            self.limit = self.BUDGETS[self.complexity]

        self.used = 0
        # Per-operation ledger: dicts with tokens/operation/total_used keys.
        self.operations = []

    def use(self, tokens: int, operation: str = "") -> bool:
        """
        Use tokens for an operation.

        Args:
            tokens: Number of tokens to use
            operation: Description of operation

        Returns:
            bool: Whether tokens were successfully allocated
                  (False leaves the budget untouched).
        """
        if self.used + tokens > self.limit:
            return False

        self.used += tokens
        self.operations.append({
            "tokens": tokens,
            "operation": operation,
            "total_used": self.used,
        })

        return True

    @property
    def remaining(self) -> int:
        """Get remaining token budget."""
        return self.limit - self.used

    @property
    def usage_percentage(self) -> float:
        """Get budget usage percentage (0.0 when the limit is 0)."""
        return (self.used / self.limit) * 100 if self.limit > 0 else 0.0

    @property
    def is_low(self) -> bool:
        """Check if budget is running low (<20% remaining)."""
        return self.remaining < (self.limit * 0.2)

    @property
    def is_critical(self) -> bool:
        """Check if budget is critical (<10% remaining)."""
        return self.remaining < (self.limit * 0.1)

    def get_status(self) -> Dict[str, Any]:
        """
        Get current budget status.

        Returns:
            Dict with status information (limit, used, remaining,
            usage_percentage, is_low, is_critical, operations_count).
        """
        # NOTE: annotation fixed from Dict[str, any] (builtin function)
        # to Dict[str, Any] (typing sentinel).
        return {
            "complexity": self.complexity.value,
            "limit": self.limit,
            "used": self.used,
            "remaining": self.remaining,
            "usage_percentage": round(self.usage_percentage, 1),
            "is_low": self.is_low,
            "is_critical": self.is_critical,
            "operations_count": len(self.operations),
        }

    def get_recommendation(self) -> str:
        """
        Get recommendation based on current budget status.

        Returns:
            str: Recommendation message
        """
        if self.is_critical:
            return "🚨 CRITICAL: <10% budget remaining - Use symbols only, compress heavily"
        elif self.is_low:
            return "⚠️ LOW: <20% budget remaining - Compress output, avoid verbose explanations"
        elif self.usage_percentage > 50:
            return "📊 MODERATE: >50% budget used - Start token-efficient communication"
        else:
            return "✅ HEALTHY: Budget sufficient for standard operations"

    def format_usage_report(self) -> str:
        """
        Format budget usage report.

        Returns:
            str: Formatted report
        """
        status = self.get_status()

        report = [
            f"🧠 Token Budget Report",
            f"━━━━━━━━━━━━━━━━━━━━━━",
            f"Complexity: {status['complexity']}",
            f"Limit: {status['limit']} tokens",
            f"Used: {status['used']} tokens ({status['usage_percentage']}%)",
            f"Remaining: {status['remaining']} tokens",
            f"",
            f"Recommendation:",
            f"{self.get_recommendation()}",
        ]

        if self.operations:
            report.append(f"")
            report.append(f"Recent Operations:")
            for op in self.operations[-5:]:  # Last 5 operations
                operation_name = op['operation'] or "unnamed"
                report.append(
                    f"  • {operation_name}: {op['tokens']} tokens "
                    f"(total: {op['total_used']})"
                )

        return "\n".join(report)

    def reset(self) -> None:
        """Reset budget usage (keep limit)."""
        self.used = 0
        self.operations = []

    def set_complexity(self, complexity: Literal["simple", "medium", "complex"]) -> None:
        """
        Update complexity level and reset budget.

        Args:
            complexity: New complexity level
        """
        self.complexity = ComplexityLevel(complexity)
        self.limit = self.BUDGETS[self.complexity]
        self.reset()

    @classmethod
    def estimate_complexity(cls, context: Dict[str, Any]) -> ComplexityLevel:
        """
        Estimate complexity level from context.

        Heuristics:
        - Simple: Single file, <50 lines changed, no new files
        - Medium: Multiple files, <200 lines changed, or refactoring
        - Complex: New features, >200 lines, architectural changes

        Args:
            context: Context dict with task information
                     (lines_changed, files_modified, task_type)

        Returns:
            ComplexityLevel: Estimated complexity
        """
        # Check lines changed (strongest signal, evaluated first)
        lines_changed = context.get("lines_changed", 0)
        if lines_changed > 200:
            return ComplexityLevel.COMPLEX

        # Check files modified
        files_modified = context.get("files_modified", 0)
        if files_modified > 3:
            return ComplexityLevel.COMPLEX
        elif files_modified > 1:
            return ComplexityLevel.MEDIUM

        # Check task type keywords as a fallback
        task_type = context.get("task_type", "").lower()
        if any(keyword in task_type for keyword in ["feature", "implement", "add"]):
            return ComplexityLevel.COMPLEX
        elif any(keyword in task_type for keyword in ["fix", "bug", "refactor"]):
            return ComplexityLevel.MEDIUM
        else:
            return ComplexityLevel.SIMPLE

    def __str__(self) -> str:
        """String representation."""
        return (
            f"TokenBudget({self.complexity.value}: "
            f"{self.used}/{self.limit} tokens, "
            f"{self.usage_percentage:.1f}% used)"
        )

    def __repr__(self) -> str:
        """Developer representation."""
        return (
            f"TokenBudgetManager(complexity={self.complexity.value!r}, "
            f"limit={self.limit}, used={self.used})"
        )
|
||||
Reference in New Issue
Block a user