Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
d66209e
Fix typo detection double-counting in spam filter (#51)
NikolaPantel Dec 2, 2025
43599ca
contributing guidelines
anderdc Dec 2, 2025
d87937a
Merge branch 'test' into contributing
anderdc Dec 2, 2025
8a6030d
no table
anderdc Dec 2, 2025
4817145
Merge pull request #55 from entrius/contributing
anderdc Dec 2, 2025
445676d
Fix issue multiplier to use validated issues list (#50)
NikolaPantel Dec 3, 2025
e93c686
fix: prevent gaming issue multiplier (#53)
codomposer Dec 3, 2025
ce9d6c5
fix: update subnet repositories (#57)
codomposer Dec 3, 2025
be50add
Added new branch for BitMind (#59)
James-4u Dec 3, 2025
c091111
feat: adding sn23 repo in the master repo list (#60)
James-4u Dec 3, 2025
94b0695
Fix confusing recycle allocation logic in dynamic emissions (#54)
NikolaPantel Dec 4, 2025
749a9db
Fix: Skip '#' Comment Pattern for Preprocessor Languages (#56)
James-4u Dec 4, 2025
6be085a
PR scoring changes (#61)
LandynDev Dec 4, 2025
bd69d5d
Removed gittensor repository from being eligible for tagline boosts i…
LandynDev Dec 4, 2025
e3a128a
Require issues to be closed within 1 day of PR merged date (#63)
LandynDev Dec 4, 2025
3404664
update
Dec 4, 2025
e5ec391
update
Dec 5, 2025
991d5f6
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 5, 2025
ca34545
Update scoring.py
langverse2023 Dec 5, 2025
e5bd670
Update scoring.py
langverse2023 Dec 5, 2025
d430875
update
Dec 5, 2025
d9ab6e6
update
namkhanh20xx Dec 5, 2025
256c85a
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 8, 2025
69cd561
update
namkhanh20xx Dec 15, 2025
215d21c
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 15, 2025
163ddc5
update
namkhanh20xx Dec 15, 2025
0065321
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 16, 2025
891f3e1
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 17, 2025
50960ae
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 19, 2025
f0b7e57
Merge branch 'test' into kln/20251204182044
langverse2023 Dec 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gittensor/classes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from dataclasses import dataclass, field

from datetime import datetime
from typing import DefaultDict, Dict, List, Optional, Set
from math import prod
Expand Down Expand Up @@ -192,7 +193,7 @@ def calculate_score_from_file_changes(self, programming_languages: Dict[str, flo
if file.file_extension in MITIGATED_EXTENSIONS:
total_changes_to_score = min(file.changes, MAX_LINES_SCORED_FOR_MITIGATED_EXT)

non_scoreable_lines = count_non_scoreable_lines(file.patch, total_changes_to_score, file.file_extension)
non_scoreable_lines = count_non_scoreable_lines(file.patch, total_changes_to_score, file.filename)
scored_changes = max(0, total_changes_to_score - non_scoreable_lines)

self.total_lines_scored += scored_changes
Expand Down
29 changes: 0 additions & 29 deletions gittensor/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,35 +64,6 @@
EXCESSIVE_PR_PENALTY_SLOPE = 0.50
EXCESSIVE_PR_MIN_MULTIPLIER = 0.00

COMMENT_PATTERNS = [
r'^\s*#', # Python, Ruby, Shell, etc.
r'^\s*//', # C, C++, Java, JavaScript, Go, Rust, etc.
r'^\s*/\*', # C-style multi-line start
r'^\s*\*', # C-style multi-line continuation
r'^\s*\*/', # C-style multi-line end
r'^\s*--', # SQL, Lua, Haskell
r'^\s*<!--', # HTML, XML
r'^\s*%', # LaTeX, MATLAB
r'^\s*;', # Lisp, Assembly
r'^\s*"""', # Python docstring
r"^\s*'''", # Python docstring
]

PREPROCESSOR_LANGUAGES = {
'c',
'h',
'cpp',
'cxx',
'cc',
'hpp',
'hxx',
'hh',
'h++',
'cs',
'rs',
'swift',
}

# =============================================================================
# Rewards & Emissions
# =============================================================================
Expand Down
2 changes: 1 addition & 1 deletion gittensor/validator/evaluation/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,4 +282,4 @@ def _is_valid_issue(issue: Issue, pr: PullRequest) -> bool:
bt.logging.warning(f"Skipping issue #{issue.number} - closed {days_diff:.1f}d from PR merge (max: {MAX_ISSUE_CLOSE_WINDOW_DAYS})")
return False

return True
return True
110 changes: 91 additions & 19 deletions gittensor/validator/utils/spam_detection.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import re
from typing import List, Optional
from typing import List, Optional, Set
from Levenshtein import distance, ratio
from pygments import lex
from pygments.lexers import get_lexer_for_filename, TextLexer
from pygments.token import Comment, String
from pygments.util import ClassNotFound
Comment on lines +4 to +7
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to include pygments as a dependency

from gittensor.constants import (
TYPO_MAX_DIST,
TYPO_MIN_SIM,
COMMENT_PATTERNS,
PREPROCESSOR_LANGUAGES,
)

def tokenize(text: str) -> List[str]:
Expand All @@ -27,22 +29,93 @@ def is_token_typo(old: str, new: str, max_dist=TYPO_MAX_DIST, min_sim=TYPO_MIN_S
return all(token_pair_typo(o, n, max_dist, min_sim)
for o, n in zip(old_tokens, new_tokens))

def is_comment_line(content: str, file_extension: Optional[str] = None) -> bool:
"""Check if line content matches a comment pattern. Skips '#' pattern for preprocessor languages (C, C++, Rust, etc.) to avoid false positives."""
patterns_to_check = COMMENT_PATTERNS
if file_extension and file_extension in PREPROCESSOR_LANGUAGES:
# Skip the '#' pattern (index 0) for languages where # is preprocessor directive
patterns_to_check = [p for p in COMMENT_PATTERNS if not p.startswith(r'^\s*#')]
def is_single_diff_line(line: str) -> bool:
    """Return True for a plain added/removed diff line.

    Matches lines like '+foo' or '-bar', but rejects doubled markers
    such as '++foo' or '--bar' (e.g. diff file headers), empty strings,
    and context lines that do not start with '+' or '-'.
    """
    if not line:
        return False
    marker = line[0]
    if marker not in "+-":
        return False
    # A lone marker counts; otherwise the marker must not be repeated.
    return len(line) == 1 or line[1] != marker

def get_comment_line_indices(lines: List[str], file_name: Optional[str] = None) -> Set[int]:
    """
    Analyzes all diff lines together to detect comments (including multi-line comments).

    Args:
        lines: Raw diff lines; each line may begin with '+', '-', or ' '.
        file_name: Used to pick a pygments lexer; falls back to TextLexer when
            absent or unrecognized.

    Returns:
        A set of indices (into ``lines``) for lines that contain ONLY comments
        or docstrings. Empty on input that cannot be tokenized.
    """
    if not lines:
        return set()

    # 1. Reconstruct the source code content by stripping diff markers (+, -, space).
    # Non-diff lines (headers like '@@', empty strings) become "" so that the
    # reconstructed text stays index-aligned with `lines`.
    clean_content_parts = []
    for line in lines:
        if len(line) > 0 and line[0] in "+- ":
            clean_content_parts.append(line[1:])
        else:
            clean_content_parts.append("")

    full_text = "\n".join(clean_content_parts)

    # 2. Determine the appropriate lexer for this filename.
    try:
        lexer = get_lexer_for_filename(file_name) if file_name else TextLexer()
    except ClassNotFound:
        lexer = TextLexer()

    # 3. Tokenize the entire text so multi-line constructs (e.g. /* ... */)
    # keep their context.
    # BUG FIX: pygments' lex() returns a lazy generator, so tokenization errors
    # are raised while iterating, not at the lex() call itself. The original
    # try/except around the bare call could never catch them. Materialize the
    # token stream inside the try so failures actually hit the fallback.
    try:
        tokens = list(lex(full_text, lexer))
    except Exception:
        return set()

    # 4. Map tokens back to line indices.
    n_lines = len(lines)
    line_has_code = [False] * n_lines
    line_has_comment = [False] * n_lines

    current_line_idx = 0

    for token_type, value in tokens:
        # A token might span multiple lines (e.g., block comments or multi-line strings).
        sub_lines = value.split('\n')

        for i, sub_line in enumerate(sub_lines):
            target_line_idx = current_line_idx + i

            if target_line_idx >= n_lines:
                break

            # Only non-whitespace token fragments classify a line.
            if sub_line.strip():
                if token_type in Comment or token_type in String.Doc:
                    line_has_comment[target_line_idx] = True
                else:
                    # Any other token (Keyword, Name, Operator) implies code.
                    line_has_code[target_line_idx] = True

        # Advance the line index by the number of newlines inside this token.
        current_line_idx += len(sub_lines) - 1

    # 5. Pure-comment lines: contain a comment token and no code token.
    return {i for i in range(n_lines) if line_has_comment[i] and not line_has_code[i]}

def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = None, file_extension: Optional[str] = None) -> int:
def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = None, file_name: Optional[str] = None) -> int:
"""Count lines that shouldn't contribute to the score (blank, comment, etc)."""
if not patch:
return 0

non_scoreable = 0
lines = patch.split("\n")

# Pre-calculate comment lines using context-aware lexing
comment_line_indices = get_comment_line_indices(lines, file_name)

scoreable_count = 0
skip_next = False # Track if next line should be skipped

Expand All @@ -56,8 +129,13 @@ def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = N

content = line[1:]

# Blank lines and comments
if content.strip() == "" or is_comment_line(content, file_extension):
# Blank lines
if content.strip() == "":
non_scoreable += 1
continue

# Check if the current line index was identified as a comment
if i in comment_line_indices:
non_scoreable += 1
continue

Expand All @@ -77,9 +155,3 @@ def count_non_scoreable_lines(patch: str, max_scoreable_lines: Optional[int] = N

return non_scoreable

def is_single_diff_line(line: str) -> bool:
"""True for +foo or -bar but False for ++foo, --bar, etc."""
if not line:
return False
char = line[0]
return char in "+-" and (len(line) == 1 or line[1] != char)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ debugpy==1.8.11
# For validator database storage (not required for validators to run)
pytz==2025.2
psycopg2-binary==2.9.10
Pygments==2.19.2
Comment thread
langverse2023 marked this conversation as resolved.