PacificAI · chakravarthik27 · Mar 23, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.9","3.10", "3.11" ]
+        python-version: ["3.12", "3.13" ]
 
     steps:
       - name: Free up disk space at start
@@ -53,7 +53,7 @@ jobs:
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: |
           poetry cache clear pypi --all -n > /dev/null
-          poetry install --with dev --all-extras --no-cache --quiet --no-interaction
+          poetry install --with dev --all-extras --no-cache --no-interaction
           source ./.venv/bin/activate && pip uninstall -y pyspark && rm -rf ./.venv/lib/python${{ matrix.python-version }}/site-packages/pyspark*/
           pip install pyspark==3.5.6
 

diff --git a/.github/workflows/llm_tests_build.yml b/.github/workflows/llm_tests_build.yml
@@ -10,7 +10,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.8", "3.9", "3.10" ]
+        python-version: [ "3.12", "3.13" ]
 
     steps:
       - uses: actions/checkout@v3

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: [3.12]
         poetry-version: [2.1.3]
         os: [ubuntu-latest]
     runs-on: ${{ matrix.os }}

diff --git a/langtest/datahandler/datasource.py b/langtest/datahandler/datasource.py
@@ -808,7 +808,7 @@ def load_raw_data(self, standardize_columns: bool = False) -> List[Dict]:
                 parsed CSV file into list of dicts
         """
 
-        if type(self._file_path) == dict:
+        if isinstance(self._file_path, dict):
             df = pd.read_csv(self._file_path["data_source"])
 
             if self.task == "text-classification":

diff --git a/langtest/langtest.py b/langtest/langtest.py
@@ -286,7 +286,7 @@ def configure(self, config: Union[HarnessConfig, dict, str]) -> HarnessConfig:
         Returns:
             dict: Loaded configuration.
         """
-        if type(config) == dict:
+        if isinstance(config, dict):
             self._config = config
         else:
             with open(config, "r", encoding="utf-8") as yml:

diff --git a/langtest/metrics/llm_eval.py b/langtest/metrics/llm_eval.py
@@ -66,25 +66,22 @@ def build_prompt(
                 f"""\n\nScore the student answer based on the following criteria:\n{eval_criteria}"""
             )
 
-        prompt += dedent(
-            f"""
-        Example Format:
-        QUESTION: question here
-        STUDENT ANSWER: student's answer here
-        TRUE ANSWER: true answer here
-        GRADE: {grade_list} here
-
-        {
-            ("Grade the student answers based ONLY on their factual accuracy. Ignore differences"
-             " in punctuation and phrasing between the student answer and true answer. It is OK "
-             "if the student answer contains more or relevant information than the true answer, as"
-             " long as it does not contain any conflicting statements. Begin!")
-        }
-
-        QUESTION: {{query}}
-        STUDENT ANSWER: {{result}}
-        TRUE ANSWER: {{answer}}
-        GRADE:"""
+        prompt += (
+            "\nExample Format:\n"  # leading newline separates this from the preceding text
+            "QUESTION: question here\n"
+            "STUDENT ANSWER: student's answer here\n"
+            "TRUE ANSWER: true answer here\n"
+            f"GRADE: {grade_list} here"
+            "\n\n"
+            "Grade the student answers based ONLY on their factual accuracy. Ignore differences"
+            " in punctuation and phrasing between the student answer and true answer. It is OK "
+            "if the student answer contains more or relevant information than the true answer, as"
+            " long as it does not contain any conflicting statements. Begin!"
+            "\n\n"
+            "QUESTION: {query}\n"  # plain (non-f) literals: single braces are kept verbatim
+            "STUDENT ANSWER: {result}\n"
+            "TRUE ANSWER: {answer}\n"
+            "GRADE:"  # no trailing newline, matching the previous prompt text
         )
         return prompt
 

diff --git a/langtest/modelhandler/__init__.py b/langtest/modelhandler/__init__.py
@@ -33,15 +33,15 @@
 
 
 if "langchain" in INSTALLED_HUBS:
-    import langchain
+    import langchain_classic
 
     LANGCHAIN_HUBS = {
         (
             RENAME_HUBS.get(hub.lower(), hub.lower())
             if hub.lower() in RENAME_HUBS
             else hub.lower()
         ): hub
-        for hub in langchain.llms.__all__
+        for hub in langchain_classic.llms.__all__
     }
     LANGCHAIN_HUBS["openrouter"] = "openrouter"
 

diff --git a/langtest/modelhandler/llm_modelhandler.py b/langtest/modelhandler/llm_modelhandler.py
@@ -3,9 +3,9 @@
 
 import os
 from typing import Any, List, Type, Union, TypeVar
-import langchain.llms as lc
+import langchain_classic.llms as lc
 import langchain.chat_models as chat_models
-from langchain.chains.llm import LLMChain
+from langchain_classic.chains.llm import LLMChain
 from langchain_core.prompts import PromptTemplate
 from langchain_core.language_models.base import BaseLanguageModel
 from langchain_core.exceptions import OutputParserException

diff --git a/langtest/modelhandler/modelhandler.py b/langtest/modelhandler/modelhandler.py
@@ -14,16 +14,16 @@
 }
 
 if try_import_lib("langchain"):
-    import langchain
-    import langchain.llms
+    import langchain_classic
+    import langchain_classic.llms
 
     LANGCHAIN_HUBS = {
         (
             RENAME_HUBS.get(hub.lower(), hub.lower())
             if hub.lower() in RENAME_HUBS
             else hub.lower()
         ): hub
-        for hub in langchain.llms.__all__
+        for hub in langchain_classic.llms.__all__
     }
     LANGCHAIN_HUBS["openrouter"] = "openrouter"
 else:

diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py
@@ -276,7 +276,8 @@ class BaseAccuracy(ABC):
 
     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[Dict[str, float], float],
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {"min_score": Union[Dict[str, float], float]},
     )
 
     @classmethod
@@ -1029,9 +1030,15 @@ class LLMEval(BaseAccuracy):
 
     TestConfig = TypedDict(
         "TestConfig",
-        model=str,
-        hub=str,
-        min_score=float,
+        # Dict form of TypedDict: the keyword-argument form
+        # (model=str, hub=str, ...) was deprecated in Python 3.11 and
+        # removed in Python 3.13.
+        {
+            "model": str,
+            "hub": str,
+            "model_parameters": dict,
+            "min_score": float,
+        },
     )
 
     @classmethod

diff --git a/langtest/transform/base.py b/langtest/transform/base.py
@@ -201,7 +201,7 @@ async def async_run(
             category_output = all_categories[each].run(
                 values, model_handler, progress_bar=tests, **kwargs
             )
-            if type(category_output) == list:
+            if isinstance(category_output, list):
                 all_results.extend(category_output)
             else:
                 all_results.append(category_output)
@@ -264,7 +264,7 @@ def run(
             if len(test_name.split("-")) > 1:
                 test_name = "multiple_perturbations"
             test_output = supported_tests[test_name].async_run(samples, model, **kwargs)
-            if type(test_output) == list:
+            if isinstance(test_output, list):
                 tasks.extend(test_output)
             else:
                 tasks.append(test_output)

diff --git a/langtest/transform/bias.py b/langtest/transform/bias.py
@@ -267,7 +267,7 @@ class BaseBias(ABC):
     ]
 
     # Config Hint for the bias tests
-    TestConfig = TypedDict("TestConfig", min_pass_rate=float)
+    TestConfig = TypedDict("TestConfig", {"min_pass_rate": float})
 
     @abstractmethod
     def transform(self, sample_list: List[Sample], *args, **kwargs) -> List[Sample]:

diff --git a/langtest/transform/clinical.py b/langtest/transform/clinical.py
@@ -119,7 +119,8 @@ class BaseClinical(ABC):
     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {"min_pass_rate": float},
     )
 
     @staticmethod

diff --git a/langtest/transform/disinformation.py b/langtest/transform/disinformation.py
@@ -17,7 +17,8 @@ class DisinformationTestFactory(ITests):
     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {"min_pass_rate": float},
     )
 
     def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:

diff --git a/langtest/transform/factuality.py b/langtest/transform/factuality.py
@@ -14,7 +14,8 @@ class FactualityTestFactory(ITests):
     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {"min_pass_rate": float},
     )
 
     def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:

diff --git a/langtest/transform/fairness.py b/langtest/transform/fairness.py
@@ -21,6 +21,12 @@
 from langtest.transform.base import ITests
 
 
+class GenderedConfig(TypedDict):
+    male: float
+    female: float
+    unknown: float
+
+
 class FairnessTestFactory(ITests):
     """
     A class for performing fairness tests on a given dataset.
@@ -292,8 +298,12 @@ class BaseFairness(ABC):
 
     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[float, Dict[str, float]],
-        max_score=Union[float, Dict[str, float]],
+        # Dict form of TypedDict: the keyword-argument form was deprecated
+        # in Python 3.11 and removed in Python 3.13.
+        {
+            "min_score": Union[float, Dict[str, float]],
+            "max_score": Union[float, Dict[str, float]],
+        },
     )
 
     @staticmethod
@@ -361,16 +371,12 @@ class MinGenderF1Score(BaseFairness):
 
     alias_name = ["min_gender_f1_score"]
 
-    min_score = TypedDict(
-        "min_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[min_score, float],
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {
+            "min_score": Union[GenderedConfig, float],
+        },
     )
 
     @classmethod
@@ -476,16 +482,12 @@ class MaxGenderF1Score(BaseFairness):
 
     alias_name = ["max_gender_f1_score"]
 
-    max_score = TypedDict(
-        "max_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        max_score=Union[max_score, float],
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {
+            "max_score": Union[GenderedConfig, float],
+        },
     )
 
     @classmethod
@@ -599,16 +601,12 @@ class MinGenderRougeScore(BaseFairness):
     ]
     supported_tasks = ["question-answering", "summarization"]
 
-    min_score = TypedDict(
-        "min_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[min_score, float],
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {
+            "min_score": Union[GenderedConfig, float],
+        },
     )
 
     @classmethod
@@ -720,16 +718,12 @@ class MaxGenderRougeScore(BaseFairness):
     ]
     supported_tasks = ["question-answering", "summarization"]
 
-    max_score = TypedDict(
-        "max_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        max_score=Union[max_score, float],
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {
+            "max_score": Union[GenderedConfig, float],
+        },
     )
 
     @classmethod
@@ -835,16 +829,9 @@ class MinGenderLLMEval(BaseFairness):
     supported_tasks = ["question-answering"]
     eval_model = None
 
-    min_score = TypedDict(
-        "min_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        min_score=Union[min_score, float],
+        {"min_score": Union[GenderedConfig, float]},
     )
 
     @classmethod
@@ -983,16 +970,9 @@ class MaxGenderLLMEval(BaseFairness):
     supported_tasks = ["question-answering"]
     eval_model = None
 
-    max_score = TypedDict(
-        "max_score",
-        male=float,
-        female=float,
-        unknown=float,
-    )
-
     TestConfig = TypedDict(
         "TestConfig",
-        max_score=Union[max_score, float],
+        {"max_score": Union[GenderedConfig, float]},
     )
 
     @classmethod

diff --git a/langtest/transform/grammar.py b/langtest/transform/grammar.py
@@ -130,7 +130,10 @@ class BaseGrammar(ABC):
     # TestConfig
     TestConfig = TypedDict(
         "TestConfig",
-        min_pass_rate=float,
+        # Dict form: TypedDict keyword-argument syntax was removed in Python 3.13.
+        {
+            "min_pass_rate": float,
+        },
     )
 
     @staticmethod