Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.9","3.10", "3.11" ]
python-version: ["3.12", "3.13" ]

steps:
- name: Free up disk space at start
Expand Down Expand Up @@ -53,7 +53,7 @@ jobs:
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: |
poetry cache clear pypi --all -n > /dev/null
poetry install --with dev --all-extras --no-cache --quiet --no-interaction
poetry install --with dev --all-extras --no-cache --no-interaction
source ./.venv/bin/activate && pip uninstall -y pyspark && rm -rf ./.venv/lib/python${{ matrix.python-version }}/site-packages/pyspark*/
pip install pyspark==3.5.6

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/llm_tests_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.8", "3.9", "3.10" ]
python-version: [ "3.12", "3.13" ]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.9]
python-version: [3.12]
poetry-version: [2.1.3]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
Expand Down
2 changes: 1 addition & 1 deletion langtest/datahandler/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ def load_raw_data(self, standardize_columns: bool = False) -> List[Dict]:
parsed CSV file into list of dicts
"""

if type(self._file_path) == dict:
if isinstance(self._file_path, dict):
df = pd.read_csv(self._file_path["data_source"])

if self.task == "text-classification":
Expand Down
2 changes: 1 addition & 1 deletion langtest/langtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def configure(self, config: Union[HarnessConfig, dict, str]) -> HarnessConfig:
Returns:
dict: Loaded configuration.
"""
if type(config) == dict:
if isinstance(config, dict):
self._config = config
else:
with open(config, "r", encoding="utf-8") as yml:
Expand Down
35 changes: 16 additions & 19 deletions langtest/metrics/llm_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,25 +66,22 @@ def build_prompt(
f"""\n\nScore the student answer based on the following criteria:\n{eval_criteria}"""
)

prompt += dedent(
f"""
Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: {grade_list} here

{
("Grade the student answers based ONLY on their factual accuracy. Ignore differences"
" in punctuation and phrasing between the student answer and true answer. It is OK "
"if the student answer contains more or relevant information than the true answer, as"
" long as it does not contain any conflicting statements. Begin!")
}

QUESTION: {{query}}
STUDENT ANSWER: {{result}}
TRUE ANSWER: {{answer}}
GRADE:"""
prompt += (
"Example Format:\n"
"QUESTION: question here\n"
"STUDENT ANSWER: student's answer here\n"
"TRUE ANSWER: true answer here\n"
f"GRADE: {grade_list} here"
"\n\n"
"Grade the student answers based ONLY on their factual accuracy. Ignore differences"
" in punctuation and phrasing between the student answer and true answer. It is OK "
"if the student answer contains more or relevant information than the true answer, as"
" long as it does not contain any conflicting statements. Begin!"
"\n\n"
"QUESTION: {{query}}\n"
"STUDENT ANSWER: {{result}}\n"
"TRUE ANSWER: {{answer}}\n"
"GRADE:\n"
)
return prompt

Expand Down
4 changes: 2 additions & 2 deletions langtest/modelhandler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@


if "langchain" in INSTALLED_HUBS:
import langchain
import langchain_classic

LANGCHAIN_HUBS = {
(
RENAME_HUBS.get(hub.lower(), hub.lower())
if hub.lower() in RENAME_HUBS
else hub.lower()
): hub
for hub in langchain.llms.__all__
for hub in langchain_classic.llms.__all__
}
LANGCHAIN_HUBS["openrouter"] = "openrouter"

Expand Down
4 changes: 2 additions & 2 deletions langtest/modelhandler/llm_modelhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import os
from typing import Any, List, Type, Union, TypeVar
import langchain.llms as lc
import langchain_classic.llms as lc
import langchain.chat_models as chat_models
from langchain.chains.llm import LLMChain
from langchain_classic.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.exceptions import OutputParserException
Expand Down
6 changes: 3 additions & 3 deletions langtest/modelhandler/modelhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
}

if try_import_lib("langchain"):
import langchain
import langchain.llms
import langchain_classic
import langchain_classic.llms

LANGCHAIN_HUBS = {
(
RENAME_HUBS.get(hub.lower(), hub.lower())
if hub.lower() in RENAME_HUBS
else hub.lower()
): hub
for hub in langchain.llms.__all__
for hub in langchain_classic.llms.__all__
}
LANGCHAIN_HUBS["openrouter"] = "openrouter"
else:
Expand Down
15 changes: 11 additions & 4 deletions langtest/transform/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,8 @@ class BaseAccuracy(ABC):

TestConfig = TypedDict(
"TestConfig",
min_score=Union[Dict[str, float], float],
# min_score=Union[Dict[str, float], float],
{"min_score": Union[Dict[str, float], float]},
)

@classmethod
Expand Down Expand Up @@ -1029,9 +1030,15 @@ class LLMEval(BaseAccuracy):

TestConfig = TypedDict(
"TestConfig",
model=str,
hub=str,
min_score=float,
# model=str,
# hub=str,
# min_score=float,
{
"model": str,
"hub": str,
"model_parameters": dict,
"min_score": float,
},
)

@classmethod
Expand Down
4 changes: 2 additions & 2 deletions langtest/transform/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ async def async_run(
category_output = all_categories[each].run(
values, model_handler, progress_bar=tests, **kwargs
)
if type(category_output) == list:
if isinstance(category_output, list):
all_results.extend(category_output)
else:
all_results.append(category_output)
Expand Down Expand Up @@ -264,7 +264,7 @@ def run(
if len(test_name.split("-")) > 1:
test_name = "multiple_perturbations"
test_output = supported_tests[test_name].async_run(samples, model, **kwargs)
if type(test_output) == list:
if isinstance(test_output, list):
tasks.extend(test_output)
else:
tasks.append(test_output)
Expand Down
2 changes: 1 addition & 1 deletion langtest/transform/bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ class BaseBias(ABC):
]

# Config Hint for the bias tests
TestConfig = TypedDict("TestConfig", min_pass_rate=float)
TestConfig = TypedDict("TestConfig", {"min_pass_rate": float})

@abstractmethod
def transform(self, sample_list: List[Sample], *args, **kwargs) -> List[Sample]:
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ class BaseClinical(ABC):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# min_pass_rate=float,
{"min_pass_rate": float},
)

@staticmethod
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/disinformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ class DisinformationTestFactory(ITests):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# min_pass_rate=float,
{"min_pass_rate": float},
)

def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/factuality.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class FactualityTestFactory(ITests):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# min_pass_rate=float,
{"min_pass_rate": float},
)

def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:
Expand Down
80 changes: 30 additions & 50 deletions langtest/transform/fairness.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
from langtest.transform.base import ITests


class GenderedConfig(TypedDict):
    """Mapping of one float score per gender group.

    Serves as the dictionary alternative to a single float for the
    ``min_score`` / ``max_score`` entries of the gender-based fairness
    tests' ``TestConfig`` definitions.
    """

    # Keys are the three gender groups; each value is a float score
    # (presumably the threshold applied to that group -- confirm against
    # the tests that consume this config).
    male: float
    female: float
    unknown: float


class FairnessTestFactory(ITests):
"""
A class for performing fairness tests on a given dataset.
Expand Down Expand Up @@ -292,8 +298,12 @@ class BaseFairness(ABC):

TestConfig = TypedDict(
"TestConfig",
min_score=Union[float, Dict[str, float]],
max_score=Union[float, Dict[str, float]],
# min_score=Union[float, Dict[str, float]],
# max_score=Union[float, Dict[str, float]],
{
"min_score": Union[float, Dict[str, float]],
"max_score": Union[float, Dict[str, float]],
},
)

@staticmethod
Expand Down Expand Up @@ -361,16 +371,12 @@ class MinGenderF1Score(BaseFairness):

alias_name = ["min_gender_f1_score"]

min_score = TypedDict(
"min_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
min_score=Union[min_score, float],
# min_score=Union[min_score, float],
{
"min_score": Union[GenderedConfig, float],
},
)

@classmethod
Expand Down Expand Up @@ -476,16 +482,12 @@ class MaxGenderF1Score(BaseFairness):

alias_name = ["max_gender_f1_score"]

max_score = TypedDict(
"max_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
max_score=Union[max_score, float],
# max_score=Union[max_score, float],
{
"max_score": Union[GenderedConfig, float],
},
)

@classmethod
Expand Down Expand Up @@ -599,16 +601,12 @@ class MinGenderRougeScore(BaseFairness):
]
supported_tasks = ["question-answering", "summarization"]

min_score = TypedDict(
"min_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
min_score=Union[min_score, float],
# min_score=Union[min_score, float],
{
"min_score": Union[GenderedConfig, float],
},
)

@classmethod
Expand Down Expand Up @@ -720,16 +718,12 @@ class MaxGenderRougeScore(BaseFairness):
]
supported_tasks = ["question-answering", "summarization"]

max_score = TypedDict(
"max_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
max_score=Union[max_score, float],
# max_score=Union[max_score, float],
{
"max_score": Union[GenderedConfig, float],
},
)

@classmethod
Expand Down Expand Up @@ -835,16 +829,9 @@ class MinGenderLLMEval(BaseFairness):
supported_tasks = ["question-answering"]
eval_model = None

min_score = TypedDict(
"min_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
min_score=Union[min_score, float],
{"min_score": Union[GenderedConfig, float]},
)

@classmethod
Expand Down Expand Up @@ -983,16 +970,9 @@ class MaxGenderLLMEval(BaseFairness):
supported_tasks = ["question-answering"]
eval_model = None

max_score = TypedDict(
"max_score",
male=float,
female=float,
unknown=float,
)

TestConfig = TypedDict(
"TestConfig",
max_score=Union[max_score, float],
{"max_score": Union[GenderedConfig, float]},
)

@classmethod
Expand Down
5 changes: 4 additions & 1 deletion langtest/transform/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ class BaseGrammar(ABC):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# min_pass_rate=float,
{
"min_pass_rate": float,
},
)

@staticmethod
Expand Down
Loading
Loading