Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import json
import re
from typing import Dict, Any

# Dimension weights defined by the quality scoring specification.
# Keys must match the dimension names used by score_submission(); the
# values are expected to sum to 1.0 so the weighted score stays in [0, 1].
WEIGHTS = {
    'completeness': 0.30,        # structural depth and overall length
    'format_compliance': 0.20,   # adherence to detected-format conventions
    'coverage': 0.25,            # unique-vocabulary density
    'clarity': 0.15,             # blank-line pacing and line lengths
    'validity': 0.10             # balanced brackets/quotes, trailing spaces
}

def detect_format(content: str) -> str:
    """
    Infer the format of *content*.

    Returns one of: "json", "markdown", "code", "text". Valid JSON wins
    outright; otherwise markdown and code heuristics are scored and
    compared, with punctuation density breaking non-zero ties.
    """
    text = content.strip()
    if not text:
        return "text"

    # Anything that parses as JSON is classified as JSON immediately.
    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return "json"

    # (points, matched?) tables for the two competing heuristics.
    markdown_checks = (
        (3, re.search(r'^#+ .+', text, re.MULTILINE)),        # headers
        (2, re.search(r'\[.+\]\(.+\)', text)),                # links
        (1, re.search(r'\*\*.+\*\*', text)),                  # bold
        (1, re.search(r'^\s*[-*] .+', text, re.MULTILINE)),   # bullet lists
        (2, '```' in text),                                   # fenced code
    )
    code_checks = (
        (2, re.search(r'\b(def|class|function|import|export|const|let|var|return)\b', text)),
        (1, re.search(r'\{[^}]+\}', text)),                   # braced blocks
        (1, re.search(r'//|/\*|# noqa', text)),               # comments
        (1, re.search(r'\(.*\)[:\s]*\{?', text)),             # call/def syntax
        (2, re.search(r';\s*$', text, re.MULTILINE)),         # semicolon EOL
    )
    md_score = sum(points for points, hit in markdown_checks if hit)
    code_score = sum(points for points, hit in code_checks if hit)

    if md_score > code_score:
        return "markdown"
    if code_score > md_score:
        return "code"
    if md_score == 0:
        return "text"

    # Non-zero tie-breaker: dense punctuation usually indicates code.
    return "code" if len(re.findall(r'[{};()\[\]=]', text)) > 10 else "text"

def score_completeness(content: str, format_type: str) -> float:
    """
    Rate the submission's structural depth and length.

    The base score grows with character count (capped at 0.7); a
    format-specific structure bonus (capped at 0.3) is added on top.
    Returns a float in [0.0, 1.0].
    """
    if not content:
        return 0.0

    base = min(len(content) / 500.0, 0.7)
    bonus = 0.0

    if format_type == "json":
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            pass  # unparseable JSON earns no structure bonus
        else:
            if isinstance(data, dict):
                bonus = min(len(data) / 10.0, 0.3)
            elif isinstance(data, list):
                bonus = min(len(data) / 20.0, 0.3)
    elif format_type == "markdown":
        header_count = len(re.findall(r'^#+ ', content, re.MULTILINE))
        bonus = min(header_count / 5.0, 0.3)
    elif format_type == "code":
        line_count = content.count('\n') + 1
        bonus = min(line_count / 50.0, 0.3)
    else:
        word_count = len(content.split())
        bonus = min(word_count / 100.0, 0.3)

    return min(base + bonus, 1.0)

def score_format_compliance(content: str, format_type: str) -> float:
    """
    Validate adherence to the detected format's standard conventions.

    Args:
        content: The raw submission text.
        format_type: One of "json", "markdown", "code", "text"
            (as produced by detect_format).

    Returns:
        A float in [0.0, 1.0]. JSON is all-or-nothing (1.0 if it parses,
        0.2 otherwise); the other formats start at a 0.5 baseline and
        earn bonuses for idiomatic constructs.
    """
    if format_type == "json":
        try:
            json.loads(content)
            return 1.0
        except json.JSONDecodeError:
            return 0.2

    elif format_type == "markdown":
        score = 0.5
        # Bug fix: the previous pattern r'\n#+ ' required a preceding
        # newline, so a header on the very first line (the common case)
        # was never credited. Anchor with MULTILINE ^ instead, consistent
        # with detect_format().
        if re.search(r'^#+ ', content, re.MULTILINE): score += 0.2
        if re.search(r'^\s*[-*] .+', content, re.MULTILINE): score += 0.1
        if re.search(r'\[.+\]\(.+\)|\*\*.+\*\*', content): score += 0.2
        return min(score, 1.0)

    elif format_type == "code":
        score = 0.5
        # Indentation (4 spaces or a tab at line start) suggests real code.
        if re.search(r'^ {4}', content, re.MULTILINE) or re.search(r'^\t', content, re.MULTILINE):
            score += 0.3
        if re.search(r'(def|class|function)', content): score += 0.2
        return min(score, 1.0)

    # Plain text: reward sentence punctuation and paragraph breaks.
    score = 0.5
    if re.search(r'[.!?](?:\s+|$)', content): score += 0.3
    if '\n\n' in content: score += 0.2
    return min(score, 1.0)

def score_coverage(content: str, format_type: str) -> float:
    """
    Estimate topic coverage from unique-vocabulary density.

    NOTE(review): the bands are deliberately non-monotonic — a very high
    uniqueness ratio (> 0.8) scores 0.9, slightly below the 0.5–0.8 band's
    1.0, presumably to penalize disjointed word salads; confirm against
    the scoring specification. *format_type* is currently unused.
    """
    tokens = re.findall(r'\b\w+\b', content.lower())
    if not tokens:
        return 0.0

    density = len(set(tokens)) / len(tokens)

    for floor, band_score in ((0.8, 0.9), (0.5, 1.0), (0.3, 0.8), (0.1, 0.5)):
        if density > floor:
            return band_score
    return 0.3

def score_clarity(content: str, format_type: str) -> float:
    """
    Score readability from blank-line pacing and maximum line length.

    Starts at 0.6; a 10-30% blank-line ratio adds 0.4, while an overly
    sparse layout (> 40% blank) subtracts 0.2. Short maximum line length
    (< 120) adds 0.3; very long lines (> 300) subtract 0.3 except for
    JSON, where long lines are expected. Clamped to [0.0, 1.0].
    """
    lines = content.split('\n')
    if not lines:  # unreachable: str.split('\n') always yields >= 1 item
        return 0.0

    blank_count = sum(1 for ln in lines if not ln.strip())
    blank_ratio = blank_count / len(lines)

    result = 0.6
    if 0.1 <= blank_ratio <= 0.3:
        result += 0.4
    elif blank_ratio > 0.4:
        result -= 0.2

    longest = max(len(ln) for ln in lines)
    if longest < 120:
        result += 0.3
    elif longest > 300 and format_type != 'json':
        result -= 0.3

    return max(0.0, min(result, 1.0))

def score_validity(content: str, format_type: str) -> float:
"""
Detects syntax anomalies, unbalanced closures, and trailing spaces.
"""
score = 1.0

# Check for unclosed brackets or quotes
if content.count('(') != content.count(')'): score -= 0.3
if content.count('[') != content.count(']'): score -= 0.3
if content.count('{') != content.count('}'): score -= 0.3
if content.count('"') % 2 != 0: score -= 0.2

trailing_spaces = sum(1 for line in content.split('\n') if len(line) > 0 and line.endswith((' ', '\t')))
if trailing_spaces > 0:
score -= min(0.3, trailing_spaces * 0.05)

return max(0.0, min(score, 1.0))

def generate_nlp_feedback(dim: str, score: float, format_type: str) -> str:
    """
    Return actionable natural-language feedback for one scoring dimension.

    Picks the high/medium/low message for *dim* by score band
    (>= 0.8 / >= 0.5 / below). Unknown dimensions yield "".
    """
    # (high, medium, low) message triple per dimension.
    messages = {
        'completeness': (
            "Submission is comprehensive and well-structured.",
            "Meets basic length requirements; consider adding more detail.",
            "Submission is too brief or lacks expected structural elements.",
        ),
        'format_compliance': (
            f"High adherence to {format_type} conventions.",
            f"Moderate compliance with {format_type} standards; minor formatting issues detected.",
            f"Poor {format_type} formatting. Review standard syntax guidelines.",
        ),
        'coverage': (
            "Excellent vocabulary range denoting good topic coverage.",
            "Adequate concept spread, but somewhat repetitive.",
            "Highly repetitive content with limited vocabulary.",
        ),
        'clarity': (
            "Clear, readable structure with appropriate spacing.",
            "Generally readable; pacing or line lengths could be improved.",
            "Difficult to parse. Break up long lines and use consistent whitespace.",
        ),
        'validity': (
            "Logically sound with balanced syntax.",
            "Mostly valid; minor anomalies like trailing spaces found.",
            "Significant validity issues detected (e.g., unbalanced closures).",
        ),
    }

    if dim not in messages:
        return ""

    high, medium, low = messages[dim]
    if score >= 0.8:
        return high
    if score >= 0.5:
        return medium
    return low

def score_submission(content: str) -> Dict[str, Any]:
    """
    Evaluate a submission across all scoring dimensions.

    Detects the format, scores each dimension, combines them with WEIGHTS,
    and returns the specification-rubric dictionary: weighted score,
    letter grade, per-dimension scores, feedback lines (worst three
    dimensions first), pass flag, and the detected format.
    """
    fmt = detect_format(content)

    scorers = {
        'completeness': score_completeness,
        'format_compliance': score_format_compliance,
        'coverage': score_coverage,
        'clarity': score_clarity,
        'validity': score_validity,
    }
    dims = {name: scorer(content, fmt) for name, scorer in scorers.items()}

    total = round(sum(dims[name] * WEIGHTS[name] for name in dims), 4)

    # Map the weighted score onto descending letter-grade bands.
    grade = 'F'
    for cutoff, letter in ((0.9, 'S'), (0.8, 'A'), (0.7, 'B'), (0.6, 'C')):
        if total >= cutoff:
            grade = letter
            break

    passed = total >= 0.70

    feedback = [f"Detected format: {fmt.upper()}"]

    # The three lowest-scoring dimensions lead the report.
    weakest = sorted(dims.items(), key=lambda item: item[1])[:3]
    for name, value in weakest:
        feedback.append(f"{name.title()}: {generate_nlp_feedback(name, value, fmt)}")

    feedback.append(
        "Submission meets the required quality baseline."
        if passed
        else "Submission failed to meet the quality baseline (>= 0.70)."
    )

    return {
        "weighted_score": total,
        "quality_rating": grade,
        "scores": {name: round(value, 4) for name, value in dims.items()},
        "feedback": feedback,
        "pass_threshold": passed,
        "format_detected": fmt
    }
120 changes: 120 additions & 0 deletions test_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import time
import json
import unittest
from scoring import score_submission, detect_format

# --- BENCHMARK DATA ---
# One representative submission per detected format, repeated to build a
# 100+ item corpus for the performance benchmark.
sample_json = json.dumps({"key1": "value1", "key2": ["a", "b", "c"], "nested": {"a": 1, "b": 2}})
# Bug fix: these literals used "\\n" (a literal backslash + n), so the
# samples contained no real newlines and the multi-line scoring heuristics
# (headers, indentation, blank-line pacing) were never exercised.
sample_markdown = "# Title\n\nHere is some **bold text**.\n\n- item 1\n- item 2"
sample_code = "def add(a, b):\n    return a + b"
sample_text = "This is a plain text submission with a few standard sentences. It should be parsed as plain text and evaluated accordingly."
benchmark_submissions = [sample_json, sample_markdown, sample_code, sample_text] * 26  # 104 submissions

class TestQualityScoring(unittest.TestCase):
    """Unit tests for the quality-scoring pipeline."""

    def setUp(self):
        # 20 diverse test cases.
        # Bug fix: the literals previously used "\\n" (a literal backslash
        # followed by 'n'), so none of the samples contained real newlines
        # and the multi-line heuristics were never exercised. They now use
        # "\n".
        self.samples = [
            # JSON Cases (1-5)
            '{"user_id": 123, "name": "Alice", "active": true, "roles": ["admin", "editor"]}',
            '{"status": "error", "message": "unauthorized"}',  # short JSON
            json.dumps({"data": [{"id": i} for i in range(50)]}),  # large JSON
            '{"broken": "json", missing_quotes}',  # Invalid JSON string
            json.dumps({"key": "value" * 50}),  # Repetitive JSON

            # Markdown Cases (6-10)
            '# Great Post\n\nThis is a **bold** statement and a [link](http://example.com).',
            '## Section 1\n- item A\n- item B\n\n## Section 2\n- item C',
            '# Short',
            '# Guide\n\nHere is some code:\n```python\nprint("hello")\n```',
            '# Repetitive\n\nRepetitive text repetitive text repetitive text repetitive text repetitive text.',

            # Code Cases (11-15)
            'def calculate_score(data):\n    return sum(data.values())',
            'class User:\n    def __init__(self, name):\n        self.name = name',
            'import os\n\nprint(os.environ)\n',
            'function greet(name) {\n    console.log("Hello, " + name);\n}',
            '// simple comment code\nlet x = 10;\nif (x > 5) {\n    return true;\n}',

            # Text Cases (16-20)
            'The quick brown fox jumps over the lazy dog. This is a very standard sentence that contains unique words.',
            'A very short sentence.',
            'This text is extremely repetitive. ' * 20,
            'This paragraph explains the intricate details of quality scoring. It examines metrics like completeness, format compliance, clarity, coverage, and validity.',
            'Line 1\n\nLine 2\n\nLine 3\n\nLine 4\n\nLine 5\n\nLine 6'
        ]

    def test_format_detection(self):
        """Each format's canonical example maps to the expected label."""
        self.assertEqual(detect_format('{"a": 1}'), "json")
        self.assertEqual(detect_format('# Hello\nworld'), "markdown")
        self.assertEqual(detect_format('def foo():\n    pass\n'), "code")
        self.assertEqual(detect_format('Hello world'), "text")

    def test_all_samples(self):
        """Every sample yields a well-formed result with bounded scores."""
        for i, sample in enumerate(self.samples):
            res = score_submission(sample)

            # Check structure
            self.assertIn("weighted_score", res)
            self.assertIn("quality_rating", res)
            self.assertIn("scores", res)
            self.assertIn("feedback", res)
            self.assertIn("pass_threshold", res)

            # Check score bounds
            for dim, score in res['scores'].items():
                self.assertGreaterEqual(score, 0.0)
                self.assertLessEqual(score, 1.0)

            self.assertGreaterEqual(res['weighted_score'], 0.0)
            self.assertLessEqual(res['weighted_score'], 1.0)

            # NLP feedback should have correctly formatted lines
            self.assertTrue(any(":" in fb for fb in res['feedback']))

    def test_weights_sum_to_one(self):
        """The dimension weights must form a proper weighted average."""
        from scoring import WEIGHTS
        self.assertAlmostEqual(sum(WEIGHTS.values()), 1.0, places=4)

    def test_pass_threshold_logic(self):
        """pass_threshold mirrors the weighted_score >= 0.70 rule."""
        res_good = score_submission(self.samples[2])  # large JSON should pass
        res_bad = score_submission(self.samples[7])   # "# Short" should fail

        # Exact value depends on heuristics, but logic should align with weight >= 0.70
        self.assertEqual(res_bad['pass_threshold'], res_bad['weighted_score'] >= 0.70)
        self.assertEqual(res_good['pass_threshold'], res_good['weighted_score'] >= 0.70)


def run_performance_benchmark():
    """
    Run the 100+ submissions < 10s benchmark required by the bounty.

    Scores every entry in benchmark_submissions, prints the elapsed time
    and an example result payload. Returns nothing.
    """
    # Bug fix: these print calls used "\\n" (a literal backslash + n), so
    # they printed the two characters '\' 'n' instead of a newline.
    print("\n" + "=" * 50)
    print(f"Running performance benchmark on {len(benchmark_submissions)} submissions...")
    print("=" * 50)

    start_time = time.time()

    results = [score_submission(sub) for sub in benchmark_submissions]

    duration = time.time() - start_time
    print(f"\nProcessed {len(results)} submissions in {duration:.4f} seconds.")

    if duration < 10.0:
        print("✅ Performance requirement met (<10s).")
    else:
        print("❌ Performance requirement failed.")

    print("\nExample Output format (First JSON submission):")
    print(json.dumps(results[0], indent=2))
    print("\nDone.\n")

if __name__ == "__main__":
    # 1. Run Benchmark
    run_performance_benchmark()

    # 2. Run Unittests
    # NOTE: unittest.main() parses sys.argv and calls sys.exit() when the
    # test run finishes, so it must come last.
    print("Running Unittests...")
    unittest.main()