Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
d66209e
Fix typo detection double-counting in spam filter (#51)
NikolaPantel Dec 2, 2025
43599ca
contributing guidelines
anderdc Dec 2, 2025
d87937a
Merge branch 'test' into contributing
anderdc Dec 2, 2025
8a6030d
no table
anderdc Dec 2, 2025
4817145
Merge pull request #55 from entrius/contributing
anderdc Dec 2, 2025
445676d
Fix issue multiplier to use validated issues list (#50)
NikolaPantel Dec 3, 2025
e93c686
fix: prevent gaming issue multiplier (#53)
codomposer Dec 3, 2025
ce9d6c5
fix: update subnet repositories (#57)
codomposer Dec 3, 2025
be50add
Added new branch for BitMind (#59)
James-4u Dec 3, 2025
c091111
feat: adding sn23 repo in the master repo list (#60)
James-4u Dec 3, 2025
94b0695
Fix confusing recycle allocation logic in dynamic emissions (#54)
NikolaPantel Dec 4, 2025
749a9db
Fix: Skip '#' Comment Pattern for Preprocessor Languages (#56)
James-4u Dec 4, 2025
6be085a
PR scoring changes (#61)
LandynDev Dec 4, 2025
bd69d5d
Removed gittensor repository from being eligible for tagline boosts i…
LandynDev Dec 4, 2025
e3a128a
Require issues to be closed within 1 day of PR merged date (#63)
LandynDev Dec 4, 2025
3404664
update
Dec 4, 2025
e5ec391
update
Dec 5, 2025
991d5f6
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 5, 2025
ca34545
Update scoring.py
langverse2023 Dec 5, 2025
e5bd670
Update scoring.py
langverse2023 Dec 5, 2025
d430875
update
Dec 5, 2025
d9ab6e6
update
namkhanh20xx Dec 5, 2025
256c85a
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 8, 2025
69cd561
update
namkhanh20xx Dec 15, 2025
215d21c
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 15, 2025
163ddc5
update
namkhanh20xx Dec 15, 2025
0065321
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 16, 2025
891f3e1
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 17, 2025
50960ae
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 19, 2025
f0b7e57
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gittensor/classes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from dataclasses import dataclass, field

from datetime import datetime
from typing import DefaultDict, Dict, List, Optional, Set
from math import prod
Expand Down Expand Up @@ -192,7 +193,7 @@ def calculate_score_from_file_changes(self, programming_languages: Dict[str, flo
if file.file_extension in MITIGATED_EXTENSIONS:
total_changes_to_score = min(file.changes, MAX_LINES_SCORED_FOR_MITIGATED_EXT)

non_scoreable_lines = count_non_scoreable_lines(file.patch, total_changes_to_score, file.file_extension)
non_scoreable_lines = count_non_scoreable_lines(file.patch, total_changes_to_score, file.filename)
scored_changes = max(0, total_changes_to_score - non_scoreable_lines)

self.total_lines_scored += scored_changes
Expand Down
29 changes: 0 additions & 29 deletions gittensor/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,35 +64,6 @@
EXCESSIVE_PR_PENALTY_SLOPE = 0.50
EXCESSIVE_PR_MIN_MULTIPLIER = 0.00

COMMENT_PATTERNS = [
r'^\s*#', # Python, Ruby, Shell, etc.
r'^\s*//', # C, C++, Java, JavaScript, Go, Rust, etc.
r'^\s*/\*', # C-style multi-line start
r'^\s*\*', # C-style multi-line continuation
r'^\s*\*/', # C-style multi-line end
r'^\s*--', # SQL, Lua, Haskell
r'^\s*<!--', # HTML, XML
r'^\s*%', # LaTeX, MATLAB
r'^\s*;', # Lisp, Assembly
r'^\s*"""', # Python docstring
r"^\s*'''", # Python docstring
]

PREPROCESSOR_LANGUAGES = {
'c',
'h',
'cpp',
'cxx',
'cc',
'hpp',
'hxx',
'hh',
'h++',
'cs',
'rs',
'swift',
}

# =============================================================================
# Rewards & Emissions
# =============================================================================
Expand Down
2 changes: 1 addition & 1 deletion gittensor/validator/evaluation/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,4 +282,4 @@ def _is_valid_issue(issue: Issue, pr: PullRequest) -> bool:
bt.logging.warning(f"Skipping issue #{issue.number} - closed {days_diff:.1f}d from PR merge (max: {MAX_ISSUE_CLOSE_WINDOW_DAYS})")
return False

return True
return True
110 changes: 91 additions & 19 deletions gittensor/validator/utils/spam_detection.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import re
from typing import List, Optional
from typing import List, Optional, Set
from Levenshtein import distance, ratio
from pygments import lex
from pygments.lexers import get_lexer_for_filename, TextLexer
from pygments.token import Comment, String
from pygments.util import ClassNotFound
Comment on lines +4 to +7
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to include pygments as a dependency

from gittensor.constants import (
TYPO_MAX_DIST,
TYPO_MIN_SIM,
COMMENT_PATTERNS,
PREPROCESSOR_LANGUAGES,
)

def tokenize(text: str) -> List[str]:
Expand All @@ -27,22 +29,93 @@ def is_token_typo(old: str, new: str, max_dist=TYPO_MAX_DIST, min_sim=TYPO_MIN_S
return all(token_pair_typo(o, n, max_dist, min_sim)
for o, n in zip(old_tokens, new_tokens))

def is_comment_line(content: str, file_extension: Optional[str] = None) -> bool:
"""Check if line content matches a comment pattern. Skips '#' pattern for preprocessor languages (C, C++, Rust, etc.) to avoid false positives."""
patterns_to_check = COMMENT_PATTERNS
if file_extension and file_extension in PREPROCESSOR_LANGUAGES:
# Skip the '#' pattern (index 0) for languages where # is preprocessor directive
patterns_to_check = [p for p in COMMENT_PATTERNS if not p.startswith(r'^\s*#')]
def is_single_diff_line(line: str) -> bool:
    """Return True for a plain added/removed diff line.

    Matches lines like '+foo' or '-bar', but rejects doubled markers
    such as '++foo' or '--bar' (e.g. diff file headers), empty strings,
    and context lines that do not start with '+' or '-'.
    """
    if not line:
        return False
    marker = line[0]
    if marker not in "+-":
        return False
    # A lone marker counts; otherwise the marker must not be repeated.
    return len(line) == 1 or line[1] != marker

def get_comment_line_indices(lines: List[str], file_name: Optional[str] = None) -> Set[int]:
    """
    Analyzes all diff lines together to detect comments (including multi-line comments).

    Args:
        lines: Raw diff lines; each line may begin with '+', '-', or ' '.
        file_name: Used to pick a pygments lexer; falls back to TextLexer when
            absent or unrecognized.

    Returns:
        A set of indices (into ``lines``) for lines that contain ONLY comments
        or docstrings. Empty on input that cannot be tokenized.
    """
    if not lines:
        return set()

    # 1. Reconstruct the source code content by stripping diff markers (+, -, space).
    # Non-diff lines (headers like '@@', empty strings) become "" so that the
    # reconstructed text stays index-aligned with `lines`.
    clean_content_parts = []
    for line in lines:
        if len(line) > 0 and line[0] in "+- ":
            clean_content_parts.append(line[1:])
        else:
            clean_content_parts.append("")

    full_text = "\n".join(clean_content_parts)

    # 2. Determine the appropriate lexer for this filename.
    try:
        lexer = get_lexer_for_filename(file_name) if file_name else TextLexer()
    except ClassNotFound:
        lexer = TextLexer()

    # 3. Tokenize the entire text so multi-line constructs (e.g. /* ... */)
    # keep their context.
    # BUG FIX: pygments' lex() returns a lazy generator, so tokenization errors
    # are raised while iterating, not at the lex() call itself. The original
    # try/except around the bare call could never catch them. Materialize the
    # token stream inside the try so failures actually hit the fallback.
    try:
        tokens = list(lex(full_text, lexer))
    except Exception:
        return set()

    # 4. Map tokens back to line indices.
    n_lines = len(lines)
    line_has_code = [False] * n_lines
    line_has_comment = [False] * n_lines

    current_line_idx = 0

    for token_type, value in tokens:
        # A token might span multiple lines (e.g., block comments or multi-line strings).
        sub_lines = value.split('\n')

        for i, sub_line in enumerate(sub_lines):
            target_line_idx = current_line_idx + i

            if target_line_idx >= n_lines:
                break

            # Only non-whitespace token fragments classify a line.
            if sub_line.strip():
                if token_type in Comment or token_type in String.Doc:
                    line_has_comment[target_line_idx] = True
                else:
                    # Any other token (Keyword, Name, Operator) implies code.
                    line_has_code[target_line_idx] = True

        # Advance the line index by the number of newlines inside this token.
        current_line_idx += len(sub_lines) - 1

    # 5. Pure-comment lines: contain a comment token and no code token.
    return {i for i in range(n_lines) if line_has_comment[i] and not line_has_code[i]}

def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = None, file_extension: Optional[str] = None) -> int:
def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = None, file_name: Optional[str] = None) -> int:
"""Count lines that shouldn't contribute to the score (blank, comment, etc)."""
if not patch:
return 0

non_scoreable = 0
lines = patch.split("\n")

# Pre-calculate comment lines using context-aware lexing
comment_line_indices = get_comment_line_indices(lines, file_name)

scoreable_count = 0
skip_next = False # Track if next line should be skipped

Expand All @@ -56,8 +129,13 @@ def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = N

content = line[1:]

# Blank lines and comments
if content.strip() == "" or is_comment_line(content, file_extension):
# Blank lines
if content.strip() == "":
non_scoreable += 1
continue

# Check if the current line index was identified as a comment
if i in comment_line_indices:
non_scoreable += 1
continue

Expand All @@ -77,9 +155,3 @@ def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = N

return non_scoreable

def is_single_diff_line(line: str) -> bool:
"""True for +foo or -bar but False for ++foo, --bar, etc."""
if not line:
return False
char = line[0]
return char in "+-" and (len(line) == 1 or line[1] != char)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ debugpy==1.8.11
# For validator database storage (not required for validators to run)
pytz==2025.2
psycopg2-binary==2.9.10
Pygments==2.19.2
Comment thread
langverse2023 marked this conversation as resolved.