Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import json
import re
from typing import Dict, Any

# Dimension weights defined by the quality scoring specification.
# Keys must match the dimension names used by score_submission(); the
# values are expected to sum to 1.0 so the weighted score stays in [0, 1].
WEIGHTS = {
    'completeness': 0.30,        # structural depth and overall length
    'format_compliance': 0.20,   # adherence to detected-format conventions
    'coverage': 0.25,            # unique-vocabulary density
    'clarity': 0.15,             # blank-line pacing and line lengths
    'validity': 0.10             # balanced brackets/quotes, trailing spaces
}

def detect_format(content: str) -> str:
    """
    Infer the format of *content*.

    Returns one of: "json", "markdown", "code", "text". Valid JSON wins
    outright; otherwise markdown and code heuristics are scored and
    compared, with punctuation density breaking non-zero ties.
    """
    text = content.strip()
    if not text:
        return "text"

    # Anything that parses as JSON is classified as JSON immediately.
    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return "json"

    # (points, matched?) tables for the two competing heuristics.
    markdown_checks = (
        (3, re.search(r'^#+ .+', text, re.MULTILINE)),        # headers
        (2, re.search(r'\[.+\]\(.+\)', text)),                # links
        (1, re.search(r'\*\*.+\*\*', text)),                  # bold
        (1, re.search(r'^\s*[-*] .+', text, re.MULTILINE)),   # bullet lists
        (2, '```' in text),                                   # fenced code
    )
    code_checks = (
        (2, re.search(r'\b(def|class|function|import|export|const|let|var|return)\b', text)),
        (1, re.search(r'\{[^}]+\}', text)),                   # braced blocks
        (1, re.search(r'//|/\*|# noqa', text)),               # comments
        (1, re.search(r'\(.*\)[:\s]*\{?', text)),             # call/def syntax
        (2, re.search(r';\s*$', text, re.MULTILINE)),         # semicolon EOL
    )
    md_score = sum(points for points, hit in markdown_checks if hit)
    code_score = sum(points for points, hit in code_checks if hit)

    if md_score > code_score:
        return "markdown"
    if code_score > md_score:
        return "code"
    if md_score == 0:
        return "text"

    # Non-zero tie-breaker: dense punctuation usually indicates code.
    return "code" if len(re.findall(r'[{};()\[\]=]', text)) > 10 else "text"

def score_completeness(content: str, format_type: str) -> float:
    """
    Rate the submission's structural depth and length.

    The base score grows with character count (capped at 0.7); a
    format-specific structure bonus (capped at 0.3) is added on top.
    Returns a float in [0.0, 1.0].
    """
    if not content:
        return 0.0

    base = min(len(content) / 500.0, 0.7)
    bonus = 0.0

    if format_type == "json":
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            pass  # unparseable JSON earns no structure bonus
        else:
            if isinstance(data, dict):
                bonus = min(len(data) / 10.0, 0.3)
            elif isinstance(data, list):
                bonus = min(len(data) / 20.0, 0.3)
    elif format_type == "markdown":
        header_count = len(re.findall(r'^#+ ', content, re.MULTILINE))
        bonus = min(header_count / 5.0, 0.3)
    elif format_type == "code":
        line_count = content.count('\n') + 1
        bonus = min(line_count / 50.0, 0.3)
    else:
        word_count = len(content.split())
        bonus = min(word_count / 100.0, 0.3)

    return min(base + bonus, 1.0)

def score_format_compliance(content: str, format_type: str) -> float:
    """
    Validate adherence to the detected format's standard conventions.

    Args:
        content: The raw submission text.
        format_type: One of "json", "markdown", "code", "text"
            (as produced by detect_format).

    Returns:
        A float in [0.0, 1.0]. JSON is all-or-nothing (1.0 if it parses,
        0.2 otherwise); the other formats start at a 0.5 baseline and
        earn bonuses for idiomatic constructs.
    """
    if format_type == "json":
        try:
            json.loads(content)
            return 1.0
        except json.JSONDecodeError:
            return 0.2

    elif format_type == "markdown":
        score = 0.5
        # Bug fix: the previous pattern r'\n#+ ' required a preceding
        # newline, so a header on the very first line (the common case)
        # was never credited. Anchor with MULTILINE ^ instead, consistent
        # with detect_format().
        if re.search(r'^#+ ', content, re.MULTILINE): score += 0.2
        if re.search(r'^\s*[-*] .+', content, re.MULTILINE): score += 0.1
        if re.search(r'\[.+\]\(.+\)|\*\*.+\*\*', content): score += 0.2
        return min(score, 1.0)

    elif format_type == "code":
        score = 0.5
        # Indentation (4 spaces or a tab at line start) suggests real code.
        if re.search(r'^ {4}', content, re.MULTILINE) or re.search(r'^\t', content, re.MULTILINE):
            score += 0.3
        if re.search(r'(def|class|function)', content): score += 0.2
        return min(score, 1.0)

    # Plain text: reward sentence punctuation and paragraph breaks.
    score = 0.5
    if re.search(r'[.!?](?:\s+|$)', content): score += 0.3
    if '\n\n' in content: score += 0.2
    return min(score, 1.0)

def score_coverage(content: str, format_type: str) -> float:
    """
    Estimate topic coverage from unique-vocabulary density.

    NOTE(review): the bands are deliberately non-monotonic — a very high
    uniqueness ratio (> 0.8) scores 0.9, slightly below the 0.5–0.8 band's
    1.0, presumably to penalize disjointed word salads; confirm against
    the scoring specification. *format_type* is currently unused.
    """
    tokens = re.findall(r'\b\w+\b', content.lower())
    if not tokens:
        return 0.0

    density = len(set(tokens)) / len(tokens)

    for floor, band_score in ((0.8, 0.9), (0.5, 1.0), (0.3, 0.8), (0.1, 0.5)):
        if density > floor:
            return band_score
    return 0.3

def score_clarity(content: str, format_type: str) -> float:
    """
    Score readability from blank-line pacing and maximum line length.

    Starts at 0.6; a 10-30% blank-line ratio adds 0.4, while an overly
    sparse layout (> 40% blank) subtracts 0.2. Short maximum line length
    (< 120) adds 0.3; very long lines (> 300) subtract 0.3 except for
    JSON, where long lines are expected. Clamped to [0.0, 1.0].
    """
    lines = content.split('\n')
    if not lines:  # unreachable: str.split('\n') always yields >= 1 item
        return 0.0

    blank_count = sum(1 for ln in lines if not ln.strip())
    blank_ratio = blank_count / len(lines)

    result = 0.6
    if 0.1 <= blank_ratio <= 0.3:
        result += 0.4
    elif blank_ratio > 0.4:
        result -= 0.2

    longest = max(len(ln) for ln in lines)
    if longest < 120:
        result += 0.3
    elif longest > 300 and format_type != 'json':
        result -= 0.3

    return max(0.0, min(result, 1.0))

def score_validity(content: str, format_type: str) -> float:
"""
Detects syntax anomalies, unbalanced closures, and trailing spaces.
"""
score = 1.0

# Check for unclosed brackets or quotes
if content.count('(') != content.count(')'): score -= 0.3
if content.count('[') != content.count(']'): score -= 0.3
if content.count('{') != content.count('}'): score -= 0.3
if content.count('"') % 2 != 0: score -= 0.2

trailing_spaces = sum(1 for line in content.split('\n') if len(line) > 0 and line.endswith((' ', '\t')))
if trailing_spaces > 0:
score -= min(0.3, trailing_spaces * 0.05)

return max(0.0, min(score, 1.0))

def generate_nlp_feedback(dim: str, score: float, format_type: str) -> str:
    """
    Return actionable natural-language feedback for one scoring dimension.

    Picks the high/medium/low message for *dim* by score band
    (>= 0.8 / >= 0.5 / below). Unknown dimensions yield "".
    """
    # (high, medium, low) message triple per dimension.
    messages = {
        'completeness': (
            "Submission is comprehensive and well-structured.",
            "Meets basic length requirements; consider adding more detail.",
            "Submission is too brief or lacks expected structural elements.",
        ),
        'format_compliance': (
            f"High adherence to {format_type} conventions.",
            f"Moderate compliance with {format_type} standards; minor formatting issues detected.",
            f"Poor {format_type} formatting. Review standard syntax guidelines.",
        ),
        'coverage': (
            "Excellent vocabulary range denoting good topic coverage.",
            "Adequate concept spread, but somewhat repetitive.",
            "Highly repetitive content with limited vocabulary.",
        ),
        'clarity': (
            "Clear, readable structure with appropriate spacing.",
            "Generally readable; pacing or line lengths could be improved.",
            "Difficult to parse. Break up long lines and use consistent whitespace.",
        ),
        'validity': (
            "Logically sound with balanced syntax.",
            "Mostly valid; minor anomalies like trailing spaces found.",
            "Significant validity issues detected (e.g., unbalanced closures).",
        ),
    }

    if dim not in messages:
        return ""

    high, medium, low = messages[dim]
    if score >= 0.8:
        return high
    if score >= 0.5:
        return medium
    return low

def score_submission(content: str) -> Dict[str, Any]:
    """
    Evaluate a submission across all scoring dimensions.

    Detects the format, scores each dimension, combines them with WEIGHTS,
    and returns the specification-rubric dictionary: weighted score,
    letter grade, per-dimension scores, feedback lines (worst three
    dimensions first), pass flag, and the detected format.
    """
    fmt = detect_format(content)

    scorers = {
        'completeness': score_completeness,
        'format_compliance': score_format_compliance,
        'coverage': score_coverage,
        'clarity': score_clarity,
        'validity': score_validity,
    }
    dims = {name: scorer(content, fmt) for name, scorer in scorers.items()}

    total = round(sum(dims[name] * WEIGHTS[name] for name in dims), 4)

    # Map the weighted score onto descending letter-grade bands.
    grade = 'F'
    for cutoff, letter in ((0.9, 'S'), (0.8, 'A'), (0.7, 'B'), (0.6, 'C')):
        if total >= cutoff:
            grade = letter
            break

    passed = total >= 0.70

    feedback = [f"Detected format: {fmt.upper()}"]

    # The three lowest-scoring dimensions lead the report.
    weakest = sorted(dims.items(), key=lambda item: item[1])[:3]
    for name, value in weakest:
        feedback.append(f"{name.title()}: {generate_nlp_feedback(name, value, fmt)}")

    feedback.append(
        "Submission meets the required quality baseline."
        if passed
        else "Submission failed to meet the quality baseline (>= 0.70)."
    )

    return {
        "weighted_score": total,
        "quality_rating": grade,
        "scores": {name: round(value, 4) for name, value in dims.items()},
        "feedback": feedback,
        "pass_threshold": passed,
        "format_detected": fmt
    }
120 changes: 120 additions & 0 deletions test_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import time
import json
import unittest
from scoring import score_submission, detect_format

# --- BENCHMARK DATA ---
# One representative submission per detected format, repeated to build a
# 100+ item corpus for the performance benchmark.
sample_json = json.dumps({"key1": "value1", "key2": ["a", "b", "c"], "nested": {"a": 1, "b": 2}})
# Bug fix: these literals used "\\n" (a literal backslash + n), so the
# samples contained no real newlines and the multi-line scoring heuristics
# (headers, indentation, blank-line pacing) were never exercised.
sample_markdown = "# Title\n\nHere is some **bold text**.\n\n- item 1\n- item 2"
sample_code = "def add(a, b):\n    return a + b"
sample_text = "This is a plain text submission with a few standard sentences. It should be parsed as plain text and evaluated accordingly."
benchmark_submissions = [sample_json, sample_markdown, sample_code, sample_text] * 26  # 104 submissions

class TestQualityScoring(unittest.TestCase):
    """Unit tests for the quality-scoring pipeline."""

    def setUp(self):
        # 20 diverse test cases.
        # Bug fix: the literals previously used "\\n" (a literal backslash
        # followed by 'n'), so none of the samples contained real newlines
        # and the multi-line heuristics were never exercised. They now use
        # "\n".
        self.samples = [
            # JSON Cases (1-5)
            '{"user_id": 123, "name": "Alice", "active": true, "roles": ["admin", "editor"]}',
            '{"status": "error", "message": "unauthorized"}',  # short JSON
            json.dumps({"data": [{"id": i} for i in range(50)]}),  # large JSON
            '{"broken": "json", missing_quotes}',  # Invalid JSON string
            json.dumps({"key": "value" * 50}),  # Repetitive JSON

            # Markdown Cases (6-10)
            '# Great Post\n\nThis is a **bold** statement and a [link](http://example.com).',
            '## Section 1\n- item A\n- item B\n\n## Section 2\n- item C',
            '# Short',
            '# Guide\n\nHere is some code:\n```python\nprint("hello")\n```',
            '# Repetitive\n\nRepetitive text repetitive text repetitive text repetitive text repetitive text.',

            # Code Cases (11-15)
            'def calculate_score(data):\n    return sum(data.values())',
            'class User:\n    def __init__(self, name):\n        self.name = name',
            'import os\n\nprint(os.environ)\n',
            'function greet(name) {\n    console.log("Hello, " + name);\n}',
            '// simple comment code\nlet x = 10;\nif (x > 5) {\n    return true;\n}',

            # Text Cases (16-20)
            'The quick brown fox jumps over the lazy dog. This is a very standard sentence that contains unique words.',
            'A very short sentence.',
            'This text is extremely repetitive. ' * 20,
            'This paragraph explains the intricate details of quality scoring. It examines metrics like completeness, format compliance, clarity, coverage, and validity.',
            'Line 1\n\nLine 2\n\nLine 3\n\nLine 4\n\nLine 5\n\nLine 6'
        ]

    def test_format_detection(self):
        """Each format's canonical example maps to the expected label."""
        self.assertEqual(detect_format('{"a": 1}'), "json")
        self.assertEqual(detect_format('# Hello\nworld'), "markdown")
        self.assertEqual(detect_format('def foo():\n    pass\n'), "code")
        self.assertEqual(detect_format('Hello world'), "text")

    def test_all_samples(self):
        """Every sample yields a well-formed result with bounded scores."""
        for i, sample in enumerate(self.samples):
            res = score_submission(sample)

            # Check structure
            self.assertIn("weighted_score", res)
            self.assertIn("quality_rating", res)
            self.assertIn("scores", res)
            self.assertIn("feedback", res)
            self.assertIn("pass_threshold", res)

            # Check score bounds
            for dim, score in res['scores'].items():
                self.assertGreaterEqual(score, 0.0)
                self.assertLessEqual(score, 1.0)

            self.assertGreaterEqual(res['weighted_score'], 0.0)
            self.assertLessEqual(res['weighted_score'], 1.0)

            # NLP feedback should have correctly formatted lines
            self.assertTrue(any(":" in fb for fb in res['feedback']))

    def test_weights_sum_to_one(self):
        """The dimension weights must form a proper weighted average."""
        from scoring import WEIGHTS
        self.assertAlmostEqual(sum(WEIGHTS.values()), 1.0, places=4)

    def test_pass_threshold_logic(self):
        """pass_threshold mirrors the weighted_score >= 0.70 rule."""
        res_good = score_submission(self.samples[2])  # large JSON should pass
        res_bad = score_submission(self.samples[7])   # "# Short" should fail

        # Exact value depends on heuristics, but logic should align with weight >= 0.70
        self.assertEqual(res_bad['pass_threshold'], res_bad['weighted_score'] >= 0.70)
        self.assertEqual(res_good['pass_threshold'], res_good['weighted_score'] >= 0.70)


def run_performance_benchmark():
    """
    Run the 100+ submissions < 10s benchmark required by the bounty.

    Scores every entry in benchmark_submissions, prints the elapsed time
    and an example result payload. Returns nothing.
    """
    # Bug fix: these print calls used "\\n" (a literal backslash + n), so
    # they printed the two characters '\' 'n' instead of a newline.
    print("\n" + "=" * 50)
    print(f"Running performance benchmark on {len(benchmark_submissions)} submissions...")
    print("=" * 50)

    start_time = time.time()

    results = [score_submission(sub) for sub in benchmark_submissions]

    duration = time.time() - start_time
    print(f"\nProcessed {len(results)} submissions in {duration:.4f} seconds.")

    if duration < 10.0:
        print("✅ Performance requirement met (<10s).")
    else:
        print("❌ Performance requirement failed.")

    print("\nExample Output format (First JSON submission):")
    print(json.dumps(results[0], indent=2))
    print("\nDone.\n")

if __name__ == "__main__":
    # 1. Run Benchmark
    run_performance_benchmark()

    # 2. Run Unittests
    # NOTE: unittest.main() parses sys.argv and calls sys.exit() when the
    # test run finishes, so it must come last.
    print("Running Unittests...")
    unittest.main()