diff --git a/scoring.py b/scoring.py
new file mode 100644
index 0000000..2e896ec
--- /dev/null
+++ b/scoring.py
@@ -0,0 +1,262 @@
+import json
+import re
+from typing import Dict, Any
+
+# Dimension weights defined by the quality scoring specification
+WEIGHTS = {
+    'completeness': 0.30,
+    'format_compliance': 0.20,
+    'coverage': 0.25,
+    'clarity': 0.15,
+    'validity': 0.10
+}
+
+def detect_format(content: str) -> str:
+    """
+    Infers the format of the provided content string.
+    Supported formats: json, markdown, code, text.
+    """
+    content = content.strip()
+    if not content:
+        return "text"
+
+    # Valid JSON wins outright; anything json.loads accepts is "json".
+    try:
+        json.loads(content)
+        return "json"
+    except json.JSONDecodeError:
+        pass
+
+    md_score = 0
+    code_score = 0
+
+    # Evaluate markdown characteristics
+    if re.search(r'^#+ .+', content, re.MULTILINE): md_score += 3
+    if re.search(r'\[.+\]\(.+\)', content): md_score += 2
+    if re.search(r'\*\*.+\*\*', content): md_score += 1
+    if re.search(r'^\s*[-*] .+', content, re.MULTILINE): md_score += 1
+    if '```' in content: md_score += 2
+
+    # Evaluate code patterns
+    if re.search(r'\b(def|class|function|import|export|const|let|var|return)\b', content): code_score += 2
+    if re.search(r'\{[^}]+\}', content): code_score += 1
+    if re.search(r'//|/\*|# noqa', content): code_score += 1
+    if re.search(r'\(.*\)[:\s]*\{?', content): code_score += 1
+    if re.search(r';\s*$', content, re.MULTILINE): code_score += 2
+
+    if md_score == 0 and code_score == 0:
+        return "text"
+
+    if md_score > code_score:
+        return "markdown"
+    elif code_score > md_score:
+        return "code"
+
+    # Tie-breaker: dense punctuation usually indicates code
+    if len(re.findall(r'[{};()\[\]=]', content)) > 10:
+        return "code"
+
+    return "text"
+
+def score_completeness(content: str, format_type: str) -> float:
+    """
+    Evaluates the submission's structural depth and length.
+    Returns a normalized float between 0.0 and 1.0.
+    """
+    length = len(content)
+    if length == 0:
+        return 0.0
+
+    # Length contributes up to 0.7; format-specific structure adds up to 0.3.
+    base_score = min(length / 500.0, 0.7)
+    structure_bonus = 0.0
+
+    if format_type == "json":
+        try:
+            parsed = json.loads(content)
+            if isinstance(parsed, dict):
+                structure_bonus = min(len(parsed.keys()) / 10.0, 0.3)
+            elif isinstance(parsed, list):
+                structure_bonus = min(len(parsed) / 20.0, 0.3)
+        except json.JSONDecodeError:
+            pass
+    elif format_type == "markdown":
+        headers = len(re.findall(r'^#+ ', content, re.MULTILINE))
+        structure_bonus = min(headers / 5.0, 0.3)
+    elif format_type == "code":
+        lines = len(content.split('\n'))
+        structure_bonus = min(lines / 50.0, 0.3)
+    else:
+        words = len(content.split())
+        structure_bonus = min(words / 100.0, 0.3)
+
+    return min(base_score + structure_bonus, 1.0)
+
+def score_format_compliance(content: str, format_type: str) -> float:
+    """
+    Validates adherence to the detected format's standard conventions.
+    """
+    if format_type == "json":
+        try:
+            json.loads(content)
+            return 1.0
+        except json.JSONDecodeError:
+            return 0.2
+
+    elif format_type == "markdown":
+        score = 0.5
+        if re.search(r'\n#+ ', content): score += 0.2
+        if re.search(r'^\s*[-*] .+', content, re.MULTILINE): score += 0.1
+        if re.search(r'\[.+\]\(.+\)|\*\*.+\*\*', content): score += 0.2
+        return min(score, 1.0)
+
+    elif format_type == "code":
+        score = 0.5
+        if re.search(r'^ {4}', content, re.MULTILINE) or re.search(r'^\t', content, re.MULTILINE):
+            score += 0.3
+        if re.search(r'(def|class|function)', content): score += 0.2
+        return min(score, 1.0)
+
+    # Plain text: reward sentence punctuation and paragraph breaks.
+    score = 0.5
+    if re.search(r'[.!?](?:\s+|$)', content): score += 0.3
+    if '\n\n' in content: score += 0.2
+    return min(score, 1.0)
+
+def score_coverage(content: str, format_type: str) -> float:
+    """
+    Estimates topic coverage by measuring unique vocabulary density.
+    """
+    words = re.findall(r'\b\w+\b', content.lower())
+    if not words:
+        return 0.0
+
+    unique_words = len(set(words))
+    ratio = unique_words / len(words)
+
+    # NOTE(review): deliberately non-monotonic — a moderate unique-word
+    # ratio (0.5-0.8) scores highest; extreme uniqueness (>0.8) is treated
+    # as slightly disjointed. Confirm against the scoring specification.
+    if ratio > 0.8: return 0.9
+    if ratio > 0.5: return 1.0
+    if ratio > 0.3: return 0.8
+    if ratio > 0.1: return 0.5
+    return 0.3
+
+def score_clarity(content: str, format_type: str) -> float:
+    """
+    Scores readability based on line lengths, pacing, and whitespace.
+    """
+    lines = content.split('\n')
+    if not lines:
+        return 0.0
+
+    empty_lines = sum(1 for line in lines if not line.strip())
+    empty_ratio = empty_lines / len(lines)
+
+    score = 0.6
+    if 0.1 <= empty_ratio <= 0.3:
+        score += 0.4
+    elif empty_ratio > 0.4:
+        score -= 0.2
+
+    max_len = max(len(line) for line in lines) if lines else 0
+    if max_len < 120:
+        score += 0.3
+    elif max_len > 300 and format_type != 'json':
+        score -= 0.3
+
+    return max(0.0, min(score, 1.0))
+
+def score_validity(content: str, format_type: str) -> float:
+    """
+    Detects syntax anomalies, unbalanced closures, and trailing spaces.
+    """
+    score = 1.0
+
+    # Check for unclosed brackets or quotes
+    if content.count('(') != content.count(')'): score -= 0.3
+    if content.count('[') != content.count(']'): score -= 0.3
+    if content.count('{') != content.count('}'): score -= 0.3
+    if content.count('"') % 2 != 0: score -= 0.2
+
+    trailing_spaces = sum(1 for line in content.split('\n') if len(line) > 0 and line.endswith((' ', '\t')))
+    if trailing_spaces > 0:
+        score -= min(0.3, trailing_spaces * 0.05)
+
+    return max(0.0, min(score, 1.0))
+
+def generate_nlp_feedback(dim: str, score: float, format_type: str) -> str:
+    """
+    Returns actionable natural language feedback for a given dimension.
+    """
+    if dim == 'completeness':
+        if score >= 0.8: return "Submission is comprehensive and well-structured."
+        if score >= 0.5: return "Meets basic length requirements; consider adding more detail."
+        return "Submission is too brief or lacks expected structural elements."
+
+    elif dim == 'format_compliance':
+        if score >= 0.8: return f"High adherence to {format_type} conventions."
+        if score >= 0.5: return f"Moderate compliance with {format_type} standards; minor formatting issues detected."
+        return f"Poor {format_type} formatting. Review standard syntax guidelines."
+
+    elif dim == 'coverage':
+        if score >= 0.8: return "Excellent vocabulary range denoting good topic coverage."
+        if score >= 0.5: return "Adequate concept spread, but somewhat repetitive."
+        return "Highly repetitive content with limited vocabulary."
+
+    elif dim == 'clarity':
+        if score >= 0.8: return "Clear, readable structure with appropriate spacing."
+        if score >= 0.5: return "Generally readable; pacing or line lengths could be improved."
+        return "Difficult to parse. Break up long lines and use consistent whitespace."
+
+    elif dim == 'validity':
+        if score >= 0.8: return "Logically sound with balanced syntax."
+        if score >= 0.5: return "Mostly valid; minor anomalies like trailing spaces found."
+        return "Significant validity issues detected (e.g., unbalanced closures)."
+
+    return ""
+
+def score_submission(content: str) -> Dict[str, Any]:
+    """
+    Evaluates a submission across all defined dimensions.
+    Returns a dictionary formatted to the specification rubric.
+    """
+    format_type = detect_format(content)
+
+    dims = {
+        'completeness': score_completeness(content, format_type),
+        'format_compliance': score_format_compliance(content, format_type),
+        'coverage': score_coverage(content, format_type),
+        'clarity': score_clarity(content, format_type),
+        'validity': score_validity(content, format_type),
+    }
+
+    # Calculate the final weighted score
+    weighted_score = sum(dims[k] * WEIGHTS[k] for k in dims)
+    weighted_score = round(weighted_score, 4)
+
+    # Assign a letter grade based on ranges
+    quality_rating = 'F'
+    if weighted_score >= 0.9: quality_rating = 'S'
+    elif weighted_score >= 0.8: quality_rating = 'A'
+    elif weighted_score >= 0.7: quality_rating = 'B'
+    elif weighted_score >= 0.6: quality_rating = 'C'
+
+    pass_threshold = weighted_score >= 0.70
+
+    feedback = [f"Detected format: {format_type.upper()}"]
+
+    # Sort dimensions ascending to highlight lowest-scoring areas first
+    sorted_dims = sorted(dims.items(), key=lambda x: x[1])
+    for dim_name, dim_val in sorted_dims[:3]:
+        fb_text = generate_nlp_feedback(dim_name, dim_val, format_type)
+        feedback.append(f"{dim_name.title()}: {fb_text}")
+
+    if pass_threshold:
+        feedback.append("Submission meets the required quality baseline.")
+    else:
+        feedback.append("Submission failed to meet the quality baseline (>= 0.70).")
+
+    return {
+        "weighted_score": weighted_score,
+        "quality_rating": quality_rating,
+        "scores": {k: round(v, 4) for k, v in dims.items()},
+        "feedback": feedback,
+        "pass_threshold": pass_threshold,
+        "format_detected": format_type
+    }
diff --git a/test_scoring.py b/test_scoring.py
new file mode 100644
index 0000000..cfb4242
--- /dev/null
+++ b/test_scoring.py
@@ -0,0 +1,120 @@
+import time
+import json
+import unittest
+from scoring import score_submission, detect_format
+
+# --- BENCHMARK DATA ---
+# BUGFIX: these literals previously used doubled backslashes ("\\n"),
+# embedding a literal backslash-n instead of a newline, so multiline
+# heuristics (clarity spacing, markdown line checks) were never exercised.
+sample_json = json.dumps({"key1": "value1", "key2": ["a", "b", "c"], "nested": {"a": 1, "b": 2}})
+sample_markdown = "# Title\n\nHere is some **bold text**.\n\n- item 1\n- item 2"
+sample_code = "def add(a, b):\n return a + b"
+sample_text = "This is a plain text submission with a few standard sentences. It should be parsed as plain text and evaluated accordingly."
+benchmark_submissions = [sample_json, sample_markdown, sample_code, sample_text] * 26 # 104 submissions
+
+class TestQualityScoring(unittest.TestCase):
+    def setUp(self):
+        # 20 diverse test cases
+        self.samples = [
+            # JSON Cases (1-5)
+            '{"user_id": 123, "name": "Alice", "active": true, "roles": ["admin", "editor"]}',
+            '{"status": "error", "message": "unauthorized"}', # short JSON
+            json.dumps({"data": [{"id": i} for i in range(50)]}), # large JSON
+            '{"broken": "json", missing_quotes}', # Invalid JSON string
+            json.dumps({"key": "value" * 50}), # Repetitive JSON
+
+            # Markdown Cases (6-10)
+            '# Great Post\n\nThis is a **bold** statement and a [link](http://example.com).',
+            '## Section 1\n- item A\n- item B\n\n## Section 2\n- item C',
+            '# Short',
+            '# Guide\n\nHere is some code:\n```python\nprint("hello")\n```',
+            '# Repetitive\n\nRepetitive text repetitive text repetitive text repetitive text repetitive text.',
+
+            # Code Cases (11-15)
+            'def calculate_score(data):\n return sum(data.values())',
+            'class User:\n def __init__(self, name):\n self.name = name',
+            'import os\n\nprint(os.environ)\n',
+            'function greet(name) {\n console.log("Hello, " + name);\n}',
+            '// simple comment code\nlet x = 10;\nif (x > 5) {\n return true;\n}',
+
+            # Text Cases (16-20)
+            'The quick brown fox jumps over the lazy dog. This is a very standard sentence that contains unique words.',
+            'A very short sentence.',
+            'This text is extremely repetitive. ' * 20,
+            'This paragraph explains the intricate details of quality scoring. It examines metrics like completeness, format compliance, clarity, coverage, and validity.',
+            'Line 1\n\nLine 2\n\nLine 3\n\nLine 4\n\nLine 5\n\nLine 6'
+        ]
+
+    def test_format_detection(self):
+        self.assertEqual(detect_format('{"a": 1}'), "json")
+        self.assertEqual(detect_format('# Hello\nworld'), "markdown")
+        self.assertEqual(detect_format('def foo():\n pass\n'), "code")
+        self.assertEqual(detect_format('Hello world'), "text")
+
+    def test_all_samples(self):
+        for i, sample in enumerate(self.samples):
+            res = score_submission(sample)
+
+            # Check structure
+            self.assertIn("weighted_score", res)
+            self.assertIn("quality_rating", res)
+            self.assertIn("scores", res)
+            self.assertIn("feedback", res)
+            self.assertIn("pass_threshold", res)
+
+            # Check score bounds
+            for dim, score in res['scores'].items():
+                self.assertGreaterEqual(score, 0.0)
+                self.assertLessEqual(score, 1.0)
+
+            self.assertGreaterEqual(res['weighted_score'], 0.0)
+            self.assertLessEqual(res['weighted_score'], 1.0)
+
+            # NLP feedback should have correctly formatted lines
+            self.assertTrue(any(":" in fb for fb in res['feedback']))
+
+    def test_weights_sum_to_one(self):
+        from scoring import WEIGHTS
+        self.assertAlmostEqual(sum(WEIGHTS.values()), 1.0, places=4)
+
+    def test_pass_threshold_logic(self):
+        res_good = score_submission(self.samples[2]) # large JSON should pass
+        res_bad = score_submission(self.samples[7]) # "# Short" should fail
+
+        # Exact value depends on heuristics, but logic should align with weight >= 0.70
+        self.assertEqual(res_bad['pass_threshold'], res_bad['weighted_score'] >= 0.70)
+        self.assertEqual(res_good['pass_threshold'], res_good['weighted_score'] >= 0.70)
+
+
+def run_performance_benchmark():
+    """
+    Runs the 100+ submissions < 10s benchmark required by the bounty.
+    """
+    print("\n" + "="*50)
+    print(f"Running performance benchmark on {len(benchmark_submissions)} submissions...")
+    print("="*50)
+
+    start_time = time.time()
+
+    results = []
+    for sub in benchmark_submissions:
+        res = score_submission(sub)
+        results.append(res)
+
+    duration = time.time() - start_time
+    print(f"\nProcessed {len(results)} submissions in {duration:.4f} seconds.")
+
+    if duration < 10.0:
+        print("✅ Performance requirement met (<10s).")
+    else:
+        print("❌ Performance requirement failed.")
+
+    print("\nExample Output format (First JSON submission):")
+    print(json.dumps(results[0], indent=2))
+    print("\nDone.\n")
+
+if __name__ == "__main__":
+    # 1. Run Benchmark
+    run_performance_benchmark()
+
+    # 2. Run Unittests
+    print("Running Unittests...")
+    unittest.main()