Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Virtual environments
.venv/
venv/
env/

# Python cache
__pycache__/
*.py[cod]

# Pytest
.pytest_cache/

# Coverage
.coverage
htmlcov/

# Jupyter
.ipynb_checkpoints/

# OS files
.DS_Store
Thumbs.db

# IDE/editor
.vscode/
.idea/

# Build artifacts
build/
dist/
*.egg-info/

# Local test outputs
test_results.txt

# Temporary files
*.tmp
*.log
Empty file added tests/__init__.py
Empty file.
235 changes: 235 additions & 0 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
"""
fixtures.py
-----------
Realistic test fixtures for multilingual and trajectory tests.
"""

from __future__ import annotations

from typing import List

from training_setup_logs.trajectory.models import (
MessageRole,
ToolCall,
ToolCallStatus,
Trajectory,
TurnMessage,
)


# ---------------------------------------------------------------------------
# Multilingual query fixtures
# ---------------------------------------------------------------------------


# Mixed-script query pool. The first seven entries are all phrasings of a
# "what is the weather tomorrow" style question (Devanagari, romanized
# spellings, code-switched Hindi/English, plain English); the final three are
# off-topic controls.
WEATHER_QUERIES_MULTILINGUAL: List[str] = [
# Devanagari
"कल मौसम कैसा रहेगा",
# Transliterated variants (Latin script; note the differing spellings
# "mausam" / "mosam" / "mousam" of the same word)
"kal mausam kaisa rahega",
"kal mosam kaisa rahega",
"kal mousam kaisa hoga",
# Code-switched (Hindi and English mixed within one query)
"kal weather kaisa hai",
"मौसम tomorrow kaisa hoga",
# English
"what will the weather be like tomorrow",
# Unrelated (should NOT cluster with weather queries)
"मुझे एक अच्छा रेस्टोरेंट बताओ",
"recommend a restaurant near me",
"find me a good book to read",
]

# The four lists below are index-aligned: entry i in each list is roughly the
# same request (0: today's temperature, 1: rain tomorrow, 2: weather update),
# rendered in a different script / language mix.

# Hindi written in Devanagari script.
HINDI_QUERIES: List[str] = [
"आज का तापमान क्या है",
"कल बारिश होगी क्या",
"मुझे दिल्ली का मौसम बताओ",
]

# The same Hindi sentences romanized into Latin script.
TRANSLITERATED_QUERIES: List[str] = [
"aaj ka tapmaan kya hai",
"kal barish hogi kya",
"mujhe delhi ka mausam batao",
]

# Hindi sentences with English words substituted in (the last one also mixes
# Devanagari and Latin script in a single query).
CODE_SWITCHED_QUERIES: List[str] = [
"aaj temperature kitna hai",
"kal rain hogi kya Delhi mein",
"मुझे weather update do",
]

# Plain-English counterparts.
ENGLISH_QUERIES: List[str] = [
"what is the temperature today",
"will it rain tomorrow",
"give me the weather update for Delhi",
]


# ---------------------------------------------------------------------------
# Trajectory fixtures
# ---------------------------------------------------------------------------


def make_clean_trajectory(tid: str = "traj_clean_001") -> Trajectory:
    """Build the happy-path fixture: one question, one successful tool call.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        A trajectory with an English user question, two assistant messages,
        and a single ``weather_api`` call marked ``SUCCESS``.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="What is the weather in Delhi tomorrow?",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Let me check the weather for you.",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Tomorrow in Delhi: 32°C, partly cloudy.",
            language="en",
        ),
    ]
    weather_lookup = ToolCall(
        tool_name="weather_api",
        arguments={"city": "Delhi", "date": "tomorrow"},
        return_value={"temp": 32, "condition": "partly cloudy"},
        status=ToolCallStatus.SUCCESS,
        latency_ms=230.0,
    )
    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=[weather_lookup],
    )


def make_retry_trajectory(tid: str = "traj_retry_001") -> Trajectory:
    """Build a fixture whose first tool call fails and whose retry succeeds.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        A trajectory with a romanized-Hindi user query and two
        ``weather_api`` calls with identical arguments: a ``FAILURE``
        followed by a ``SUCCESS`` whose ``retry_of`` is 0.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="kal mausam kaisa rahega",
            language="hi-latn",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Let me check. One moment.",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="kal Delhi mein 30°C hoga.",
            language="hi-latn",
        ),
    ]

    failed_attempt = ToolCall(
        tool_name="weather_api",
        arguments={"city": "Delhi", "date": "tomorrow"},
        return_value=None,
        status=ToolCallStatus.FAILURE,
        latency_ms=40.0,
    )
    # Same request again; retry_of=0 matches the list position of the
    # failed attempt above.
    successful_retry = ToolCall(
        tool_name="weather_api",
        arguments={"city": "Delhi", "date": "tomorrow"},
        return_value={"temp": 30, "condition": "sunny"},
        status=ToolCallStatus.SUCCESS,
        latency_ms=280.0,
        retry_of=0,
    )

    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=[failed_attempt, successful_retry],
    )


def make_redundant_trajectory(tid: str = "traj_redundant_001") -> Trajectory:
    """Build a fixture in which the same tool request is issued twice.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        A two-turn English trajectory with two successful ``weather_api``
        calls that share tool name, arguments and return value.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="Weather in Mumbai?",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Mumbai: 28°C, humid.",
            language="en",
        ),
    ]

    # Identical call issued twice — the second one is redundant. Only the
    # recorded latency differs; each call gets its own dict objects.
    duplicate_calls = [
        ToolCall(
            tool_name="weather_api",
            arguments={"city": "Mumbai"},
            return_value={"temp": 28},
            status=ToolCallStatus.SUCCESS,
            latency_ms=elapsed_ms,
        )
        for elapsed_ms in (200.0, 190.0)
    ]

    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=duplicate_calls,
    )


def make_incomplete_trajectory(tid: str = "traj_incomplete_001") -> Trajectory:
    """Build a fixture that stops on a user message with no final answer.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        A trajectory whose last turn has the USER role and whose only tool
        call has ``MISSING_RETURN`` status and no return value.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="कल बारिश होगी क्या?",
            language="hi",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Let me check...",
            language="en",
        ),
        # The conversation ends here, on the user's side.
        TurnMessage(
            role=MessageRole.USER,
            content="जल्दी बताओ",
            language="hi",
        ),
    ]
    unanswered_call = ToolCall(
        tool_name="weather_api",
        arguments={"city": "unknown"},
        return_value=None,
        status=ToolCallStatus.MISSING_RETURN,
        latency_ms=5000.0,
    )
    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=[unanswered_call],
    )


def make_multilingual_recovery_trajectory(tid: str = "traj_ml_recovery_001") -> Trajectory:
    """Build a harder multilingual fixture that recovers via a fallback tool.

    The trajectory combines:
    - a code-switched user query,
    - an assistant clarification question answered by the user,
    - a failed ``rainfall_api`` call,
    - a successful ``weather_api`` call flagged as a fallback.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        The four-turn trajectory with the two tool calls described above.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="kal rain hogi kya Delhi mein?",
            language="hi-en-mixed",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="Which area of Delhi?",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.USER,
            content="South Delhi",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="South Delhi: moderate rain expected.",
            language="en",
        ),
    ]

    primary_failure = ToolCall(
        tool_name="rainfall_api",
        arguments={"city": "Delhi"},
        return_value=None,
        status=ToolCallStatus.FAILURE,
        latency_ms=45.0,
    )
    fallback_success = ToolCall(
        tool_name="weather_api",
        arguments={"city": "South Delhi", "date": "tomorrow"},
        return_value={"rain_prob": 0.75},
        status=ToolCallStatus.SUCCESS,
        latency_ms=310.0,
        is_fallback=True,
    )

    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=[primary_failure, fallback_success],
    )


def make_hallucinated_args_trajectory(tid: str = "traj_halluc_001") -> Trajectory:
    """Build a fixture whose tool arguments hold unfilled placeholder values.

    Args:
        tid: Identifier stored on the returned trajectory.

    Returns:
        A two-turn English trajectory with one ``flight_booking`` call that
        has ``HALLUCINATED`` status and no return value.
    """
    conversation = [
        TurnMessage(
            role=MessageRole.USER,
            content="Book me a flight to Goa",
            language="en",
        ),
        TurnMessage(
            role=MessageRole.ASSISTANT,
            content="I've booked a flight for you.",
            language="en",
        ),
    ]
    # "departure" and "passenger" carry template text rather than real data.
    placeholder_args = {
        "destination": "Goa",
        "departure": "<FILL_DATE>",
        "passenger": "TODO",
    }
    bogus_booking = ToolCall(
        tool_name="flight_booking",
        arguments=placeholder_args,
        return_value=None,
        status=ToolCallStatus.HALLUCINATED,
        latency_ms=120.0,
    )
    return Trajectory(
        trajectory_id=tid,
        turns=conversation,
        tool_calls=[bogus_booking],
    )


# One instance of every fixture trajectory, each built with its default id.
# Built once at import time, in the order the factories are listed.
ALL_TRAJECTORIES = [
    build()
    for build in (
        make_clean_trajectory,
        make_retry_trajectory,
        make_redundant_trajectory,
        make_incomplete_trajectory,
        make_multilingual_recovery_trajectory,
        make_hallucinated_args_trajectory,
    )
]
77 changes: 77 additions & 0 deletions tests/test_leakage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
test_leakage.py
---------------
Tests for train/eval split leakage detection.
"""

import pytest

from training_setup_logs.multilingual.leakage_detector import (
LeakageReport,
detect_leakage,
)


class TestLeakageDetector:
    """Checks for ``detect_leakage`` on small train/eval query splits."""

    def test_no_leakage_on_distinct_splits(self):
        """Splits with no shared queries report zero leaks and a zero rate."""
        train_queries = [
            "how do I apply for a passport",
            "best restaurants in Kolkata",
            "Python list comprehension tutorial",
        ]
        eval_queries = [
            "how to file income tax return",
            "train schedule from Delhi to Mumbai",
            "machine learning overfitting explained",
        ]
        result = detect_leakage(train_queries, eval_queries, skip_semantic=True)
        assert result.total_leaks == 0
        assert result.leak_rate == 0.0

    def test_exact_leak_detected(self):
        """A query present verbatim in both splits shows up in exact_leaks."""
        shared_query = "कल मौसम कैसा रहेगा"
        result = detect_leakage(
            [shared_query, "best hotel in Goa"],
            [shared_query, "completely different query"],
            skip_semantic=True,
        )
        assert len(result.exact_leaks) >= 1

    def test_transliteration_leak_detected(self):
        """Two romanized spellings of the same query count as a leak."""
        train_queries = ["kal mausam kaisa rahega"]
        eval_queries = ["kal mosam kaisa rahega"]  # spelling variant
        result = detect_leakage(train_queries, eval_queries, skip_semantic=True)
        # After canonicalization these should match
        assert result.total_leaks >= 1

    def test_empty_splits(self):
        """Two empty splits produce a report with no leaks."""
        result = detect_leakage([], [])
        assert result.total_leaks == 0

    def test_empty_train(self):
        """An empty train split cannot leak into a non-empty eval split."""
        result = detect_leakage([], ["some query"])
        assert result.total_leaks == 0

    def test_report_structure(self):
        """The dict form of the report exposes the expected keys."""
        result = detect_leakage(["hello world"], ["hello world"], skip_semantic=True)
        payload = result.to_dict()
        for expected_key in ("train_size", "eval_size", "leak_rate", "cross_split_leaks"):
            assert expected_key in payload
        assert isinstance(payload["cross_split_leaks"], list)

    def test_leak_rate_bounded(self):
        """leak_rate stays within [0, 1] for partially overlapping splits."""
        train_queries = ["q1", "q2", "q3"]
        eval_queries = ["q1", "q2", "q4"]
        result = detect_leakage(train_queries, eval_queries, skip_semantic=True)
        rate = result.leak_rate
        assert 0.0 <= rate <= 1.0

    def test_full_leak_detected_semantic(self):
        """
        Test the semantic (embedding-based) detection path.
        Uses near-identical text that TF-IDF char-ngram similarity will also catch.
        """
        base_query = "delhi weather tomorrow forecast rain"
        extended_query = "delhi weather tomorrow forecast rain sunny"  # superset
        result = detect_leakage(
            [base_query],
            [extended_query],
            semantic_threshold=0.50,
            skip_semantic=False,
        )
        assert result.total_leaks >= 1
Loading