# ── Quality Scoring (Bounty #1) ───────────────────────────────────────────

class QualityScoreRequest(BaseModel):
    """Request body for POST /api/quality/score."""

    submission: str = Field(..., min_length=1, description="Submission content: JSON, markdown, code, or text")


class QualityBenchmarkRequest(BaseModel):
    """Request body for POST /api/quality/benchmark."""

    submissions: list[str] = Field(..., min_length=1, description="Batch submissions for benchmark")


# One shared scorer instance for the process; scoring is stateless per call,
# so reuse across requests is safe.
quality_scorer = QualityScorer()


@app.post("/api/quality/score")
async def quality_score(req: QualityScoreRequest):
    """Score a single submission and return its scorecard dict."""
    return quality_scorer.score(req.submission)


@app.post("/api/quality/benchmark")
async def quality_benchmark(req: QualityBenchmarkRequest):
    """Score a batch of submissions and include wall-clock timing."""
    return quality_scorer.benchmark(req.submissions)
+ ], + "pass_threshold": false + } +] \ No newline at end of file diff --git a/scoring.py b/scoring.py new file mode 100644 index 0000000..c70dfeb --- /dev/null +++ b/scoring.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +import json +import re +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple + +WEIGHTS: Dict[str, float] = { + "completeness": 0.30, + "format_compliance": 0.20, + "coverage": 0.25, + "clarity": 0.15, + "validity": 0.10, +} + + +@dataclass +class ScoreResult: + weighted_score: float + quality_rating: str + scores: Dict[str, float] + feedback: List[str] + pass_threshold: bool + + def as_dict(self) -> Dict[str, Any]: + return { + "weighted_score": round(self.weighted_score, 4), + "quality_rating": self.quality_rating, + "scores": {k: round(v, 4) for k, v in self.scores.items()}, + "feedback": self.feedback, + "pass_threshold": self.pass_threshold, + } + + +class QualityScorer: + def __init__(self, pass_threshold: float = 0.7): + self.pass_threshold = pass_threshold + + def score(self, submission: str) -> Dict[str, Any]: + submission = submission or "" + fmt = self._detect_format(submission) + sections = self._extract_sections(submission, fmt) + + scores = { + "completeness": self._score_completeness(submission, sections), + "format_compliance": self._score_format_compliance(submission, fmt), + "coverage": self._score_coverage(submission, sections), + "clarity": self._score_clarity(submission), + "validity": self._score_validity(submission, fmt), + } + + weighted = sum(scores[k] * WEIGHTS[k] for k in WEIGHTS) + feedback = self._feedback(scores, fmt) + + return ScoreResult( + weighted_score=weighted, + quality_rating=self._rating(weighted), + scores=scores, + feedback=feedback, + pass_threshold=weighted >= self.pass_threshold, + ).as_dict() + + def benchmark(self, submissions: List[str]) -> Dict[str, Any]: + start = time.perf_counter() + scorecards = [self.score(s) for s in submissions] + elapsed = 
time.perf_counter() - start + return { + "count": len(submissions), + "elapsed_seconds": round(elapsed, 4), + "scorecards": scorecards, + } + + def _detect_format(self, text: str) -> str: + stripped = text.strip() + if not stripped: + return "text" + if stripped.startswith("{") or stripped.startswith("["): + try: + json.loads(stripped) + return "json" + except Exception: + pass + if "```" in text or re.search(r"\b(def|class|function|const|let)\b", text): + return "code" + if re.search(r"^#{1,6}\s+", text, flags=re.M) or re.search(r"^[-*]\s+", text, flags=re.M): + return "markdown" + return "text" + + def _extract_sections(self, text: str, fmt: str) -> List[str]: + if fmt == "json": + try: + obj = json.loads(text) + if isinstance(obj, dict): + return list(obj.keys()) + return ["list"] + except Exception: + return [] + return re.findall(r"^#{1,6}\s+(.+)$", text, flags=re.M) + + def _score_completeness(self, text: str, sections: List[str]) -> float: + length = len(text.split()) + section_bonus = min(len(sections) / 5.0, 1.0) * 0.3 + length_score = min(length / 220.0, 1.0) * 0.7 + return min(length_score + section_bonus, 1.0) + + def _score_format_compliance(self, text: str, fmt: str) -> float: + if fmt == "json": + try: + obj = json.loads(text) + return 1.0 if isinstance(obj, (dict, list)) else 0.7 + except Exception: + return 0.2 + if fmt == "code": + return 1.0 if "```" in text else 0.75 + if fmt == "markdown": + has_heading = bool(re.search(r"^#{1,6}\s+", text, flags=re.M)) + has_list = bool(re.search(r"^[-*]\s+", text, flags=re.M)) + return 0.6 + 0.2 * has_heading + 0.2 * has_list + return 0.85 + + def _score_coverage(self, text: str, sections: List[str]) -> float: + key_terms = ["goal", "approach", "result", "risk", "timeline", "test"] + txt = text.lower() + hits = sum(1 for k in key_terms if k in txt) + section_factor = min(len(sections) / 4.0, 1.0) + return min((hits / len(key_terms)) * 0.7 + section_factor * 0.3, 1.0) + + def _score_clarity(self, text: str) -> 
"""Smoke tests for the weighted quality scoring engine."""

import json
import unittest

from scoring import QualityScorer


class TestQualityScorer(unittest.TestCase):
    def setUp(self):
        self.scorer = QualityScorer(pass_threshold=0.7)

    def test_json_submission(self):
        """A well-formed JSON proposal scores high on format compliance."""
        submission = json.dumps(
            {
                "goal": "Build scoring",
                "approach": "Weighted rubric",
                "result": "0-1 output",
                "risk": "edge formats",
                "timeline": "2 days",
                "test": "20-case benchmark",
            }
        )
        out = self.scorer.score(submission)
        self.assertIn("weighted_score", out)
        self.assertIn("scores", out)
        self.assertGreaterEqual(out["scores"]["format_compliance"], 0.95)

    def test_markdown_submission(self):
        """Markdown proposals yield a known rating and list-typed feedback."""
        submission = """# Proposal\n- goal: build engine\n- approach: rubric scoring\n- result: reliable scores\n- risk: malformed input\n- timeline: 2d\n- test: benchmark suite\n"""
        out = self.scorer.score(submission)
        # assertIn / assertIsInstance instead of the old
        # assertEqual(x in {...}, True) and assertTrue(isinstance(...))
        # anti-patterns: failures now report the offending value.
        self.assertIn(out["quality_rating"], {"good", "excellent", "fair", "poor"})
        self.assertIsInstance(out["feedback"], list)

    def test_empty_submission(self):
        """Empty input must not crash and must fail the pass threshold."""
        out = self.scorer.score("")
        self.assertFalse(out["pass_threshold"])
        self.assertIsInstance(out["feedback"], list)

    def test_benchmark_100_under_10s(self):
        """A 100-sample batch finishes well inside the 10-second budget."""
        samples = [
            f"goal approach result risk timeline test sample {i}" for i in range(100)
        ]
        result = self.scorer.benchmark(samples)
        self.assertEqual(result["count"], 100)
        self.assertLess(result["elapsed_seconds"], 10.0)


if __name__ == "__main__":
    unittest.main()