2 changes: 2 additions & 0 deletions doc/api.rst
@@ -545,6 +545,8 @@ API Reference
ScorerEvaluator
ScorerIdentifier
ScorerMetrics
ScorerMetricsEntry
ScorerMetricsRegistry
ScorerPromptValidator
SelfAskCategoryScorer
SelfAskGeneralFloatScaleScorer
21 changes: 14 additions & 7 deletions doc/code/scenarios/1_composite_scenario.ipynb
@@ -156,7 +156,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "19e072daec284cd097fd22f28449e859",
"model_id": "25ba83e56f4144b78a1eab03e08fdb01",
"version_major": 2,
"version_minor": 0
},
@@ -192,18 +192,25 @@
"\n",
"\u001b[1m 🎯 Target Information\u001b[0m\n",
"\u001b[36m • Target Type: OpenAIChatTarget\u001b[0m\n",
"\u001b[36m • Target Model: gpt-40\u001b[0m\n",
"\u001b[36m • Target Endpoint: https://pyrit-github-pipeline.openai.azure.com/openai/v1\u001b[0m\n",
"\u001b[36m • Target Model: gpt-4o-japan-nilfilter\u001b[0m\n",
"\u001b[36m • Target Endpoint: https://pyrit-japan-test.openai.azure.com/openai/v1\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[36m • Scorer Type: TrueFalseCompositeScorer\u001b[0m\n",
"\u001b[36m └─ Composite of 2 scorer(s):\u001b[0m\n",
"\u001b[36m • Scorer Type: FloatScaleThresholdScorer\u001b[0m\n",
"\u001b[36m └─ Wraps:\u001b[0m\n",
"\u001b[36m • Scorer Type: AzureContentFilterScorer\u001b[0m\n",
"\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n",
"\u001b[36m • Scorer Type: AzureContentFilterScorer\u001b[0m\n",
"\u001b[36m • Scorer Type: TrueFalseInverterScorer\u001b[0m\n",
"\u001b[36m └─ Wraps:\u001b[0m\n",
"\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n",
"\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n",
"\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n",
"\n",
"\u001b[1m 📉 Scorer Performance Metrics\u001b[0m\n",
"\u001b[32m • Accuracy: 58.72%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0285\u001b[0m\n",
"\u001b[36m • F1 Score: 0.1399\u001b[0m\n",
"\u001b[36m • Precision: 0.6667\u001b[0m\n",
"\u001b[36m • Recall: 0.0781\u001b[0m\n",
"\n",
"\u001b[1m\u001b[36m▼ Overall Statistics\u001b[0m\n",
"\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n",
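A quick arithmetic note on the scorer performance metrics shown above (not part of the diff): F1 is the harmonic mean of precision and recall, so a precision of 0.6667 paired with a recall of only 0.0781 still yields an F1 near 0.14. A minimal check using the rounded values from the summary:

```python
# Sanity check of the composite scorer's reported metrics, using the rounded values
# printed in the scenario summary above.
precision = 0.6667
recall = 0.0781

# F1 is the harmonic mean of precision and recall.
f1 = 2 * precision * recall / (precision + recall)
print(f"F1 = {f1:.4f}")  # ~0.1398, matching the reported 0.1399 up to input rounding
```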
8 changes: 2 additions & 6 deletions doc/code/scenarios/1_composite_scenario.py
@@ -5,15 +5,11 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.17.3
# kernelspec:
# display_name: pyrit-dev
# language: python
# name: python3
# jupytext_version: 1.17.2
# ---

# %% [markdown]
# # Foundry Scenario Example
# # 1. Foundry Scenario Example
#
# This notebook demonstrates how to use a composite strategy, the `FoundryStrategy`, to test a target with multiple
# attack strategies.
146 changes: 145 additions & 1 deletion doc/code/scoring/scorer_evals.ipynb
@@ -371,6 +371,150 @@
" print(\"Evaluation for harm category:\", harm_category)\n",
" print(asdict(metrics))"
]
},
{
"cell_type": "markdown",
"id": "17",
"metadata": {},
"source": [
"## Scorer Metrics Registry\n",
"\n",
"### Adding to registry\n",
"The `ScorerMetricsRegistry` stores official scoring evaluation metrics in JSONL format (one file for harm scoring, another for objective scoring) in the public repository. These are results from running scorer evaluations on official consolidated datasets curated by the PyRIT team. Each entry in the registry contains all the attributes that identify a scorer (see the `ScorerIdentifier` class for more information on what goes into scoring identification) as well as the metrics resulting from the evaluation. To run an evaluation that you'd like to save to the registry, set the `add_to_registry` flag to True. (Note: This should primarily be done by the PyRIT team)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from pyrit.common.path import DATASETS_PATH, SCORER_EVALS_TRUE_FALSE_PATH\n",
"\n",
"chat_target = OpenAIChatTarget(\n",
" endpoint=os.environ.get(\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT\"),\n",
" api_key=os.environ.get(\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY\"),\n",
" model_name=os.environ.get(\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL\"),\n",
" temperature=0.9,\n",
" )\n",
"true_false_classifier = SelfAskTrueFalseScorer(\n",
" true_false_question_path=DATASETS_PATH/\"score\"/\"true_false_question\"/\"task_achieved.yaml\", chat_target=chat_target\n",
")\n",
"\n",
"evaluator: ObjectiveScorerEvaluator = ScorerEvaluator.from_scorer(scorer=true_false_classifier) # type: ignore\n",
"official_dataset_path = SCORER_EVALS_TRUE_FALSE_PATH / \"CONSOLIDATED_true_false_objective_dataset.csv\"\n",
"metrics = await evaluator.run_evaluation_from_csv_async( # type:ignore\n",
" csv_path=official_dataset_path,\n",
" assistant_response_col_name=\"assistant_response\",\n",
" human_label_col_names=[\"normalized_score\"],\n",
" objective_or_harm_col_name=\"objective\",\n",
" assistant_response_data_type_col_name=\"data_type\",\n",
" num_scorer_trials=1,\n",
" add_to_registry=True\n",
")"
]
},
{
"cell_type": "markdown",
"id": "19",
"metadata": {},
"source": [
"### Retrieving metrics from registry\n",
"`ScorerMetricsRegistry` has a method (`get_metrics_registry_entries`) to filter entries on a variety of attributes, including hash, `Scorer` type, model information, as well as accuracy/mean standard error thresholds. You can use `print_summary` to display a brief summary of the scoring configuration and metrics."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"============================================================\n",
"Scorer Configuration:\n",
" Type: TrueFalseCompositeScorer\n",
" Version: 1\n",
" Dataset Version: 1.0\n",
" Sub-scorers: FloatScaleThresholdScorer, TrueFalseInverterScorer\n",
"\n",
"Metrics:\n",
" accuracy: 0.5872\n",
" accuracy_standard_error: 0.0285\n",
" f1_score: 0.1399\n",
" precision: 0.6667\n",
" recall: 0.0781\n",
"============================================================\n",
"============================================================\n",
"Scorer Configuration:\n",
" Type: SelfAskTrueFalseScorer\n",
" Version: 1\n",
" Model: gpt-4o-unsafe (OpenAIChatTarget)\n",
" Dataset Version: 1.0\n",
" System Prompt: sha256:894c040cb71ebe86\n",
"\n",
"Metrics:\n",
" accuracy: 0.8658\n",
" accuracy_standard_error: 0.0197\n",
" f1_score: 0.8305\n",
" precision: 0.9074\n",
" recall: 0.7656\n",
"============================================================\n"
]
}
],
"source": [
"from pyrit.score.scorer_evaluation.scorer_metrics_registry import ScorerMetricsRegistry\n",
"\n",
"registry = ScorerMetricsRegistry()\n",
"# get first 5 entries\n",
"entries = registry.get_metrics_registry_entries()[:5]\n",
"\n",
"for entry in entries:\n",
" entry.print_summary()"
]
},
{
"cell_type": "markdown",
"id": "21",
"metadata": {},
"source": [
"You can view identifier information and retrieve metrics (if they exist) for a specific `Scorer` configuration by calling methods on the `Scorer` object itself. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'__type__': 'SelfAskTrueFalseScorer', 'version': 1, 'system_prompt_template': 'sha256:894c040cb71ebe86', 'user_prompt_template': None, 'sub_identifier': None, 'model_info': {'__type__': 'OpenAIChatTarget', 'model_name': 'gpt-4o-unsafe', 'temperature': 0.9}, 'score_aggregator': 'OR_', 'scorer_specific_params': None, 'pyrit_version': '0.10.1.dev0', 'hash': 'fdab4c3571d3f46a40051d8b87e0b96e8b929918c7a084f555db70093416a993'}\n"
]
},
{
"data": {
"text/plain": [
"ObjectiveScorerMetrics(accuracy=0.8657718120805369, accuracy_standard_error=0.01974765141819736, f1_score=0.8305084745762712, precision=0.9074074074074074, recall=0.765625)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(true_false_classifier.get_identifier())\n",
"true_false_classifier.get_scorer_metrics_from_registry() #get metrics from registry via scorer object"
]
}
],
"metadata": {
@@ -384,7 +528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.11.9"
}
},
"nbformat": 4,
61 changes: 56 additions & 5 deletions doc/code/scoring/scorer_evals.py
@@ -5,11 +5,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.18.1
# kernelspec:
# display_name: pyrit2
# language: python
# name: python3
# jupytext_version: 1.17.2
# ---

# %% [markdown]
@@ -229,3 +225,58 @@

print("Evaluation for harm category:", harm_category)
print(asdict(metrics))

# %% [markdown]
# ## Scorer Metrics Registry
#
# ### Adding to registry
# The `ScorerMetricsRegistry` stores official scoring evaluation metrics in JSONL format (one file for harm scoring, another for objective scoring) in the public repository. These are results from running scorer evaluations on official consolidated datasets curated by the PyRIT team. Each entry in the registry contains all the attributes that identify a scorer (see the `ScorerIdentifier` class for more information on what goes into scorer identification) as well as the metrics resulting from the evaluation. To run an evaluation that you'd like to save to the registry, set the `add_to_registry` flag to `True`. (Note: this should primarily be done by the PyRIT team.)

# %%
import os

from pyrit.common.path import DATASETS_PATH, SCORER_EVALS_TRUE_FALSE_PATH

chat_target = OpenAIChatTarget(
    endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
    model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
    temperature=0.9,
)
true_false_classifier = SelfAskTrueFalseScorer(
    true_false_question_path=DATASETS_PATH / "score" / "true_false_question" / "task_achieved.yaml",
    chat_target=chat_target,
)

evaluator: ObjectiveScorerEvaluator = ScorerEvaluator.from_scorer(scorer=true_false_classifier) # type: ignore
official_dataset_path = SCORER_EVALS_TRUE_FALSE_PATH / "CONSOLIDATED_true_false_objective_dataset.csv"
metrics = await evaluator.run_evaluation_from_csv_async(  # type:ignore
    csv_path=official_dataset_path,
    assistant_response_col_name="assistant_response",
    human_label_col_names=["normalized_score"],
    objective_or_harm_col_name="objective",
    assistant_response_data_type_col_name="data_type",
    num_scorer_trials=1,
    add_to_registry=True,
)

# %% [markdown]
# ### Retrieving metrics from registry
# `ScorerMetricsRegistry` has a method (`get_metrics_registry_entries`) to filter entries on a variety of attributes, including hash, `Scorer` type, and model information, as well as accuracy and mean standard error thresholds. You can call `print_summary` on an entry to display a brief summary of its scoring configuration and metrics.

# %%
from pyrit.score.scorer_evaluation.scorer_metrics_registry import ScorerMetricsRegistry

registry = ScorerMetricsRegistry()
# get first 5 entries
entries = registry.get_metrics_registry_entries()[:5]

for entry in entries:
    entry.print_summary()

# %% [markdown]
# You can view identifier information and retrieve metrics (if they exist) for a specific `Scorer` configuration by calling methods on the `Scorer` object itself.

# %%
print(true_false_classifier.get_identifier())
true_false_classifier.get_scorer_metrics_from_registry() # get metrics from registry via scorer object
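Continuing from the `true_false_classifier` defined in the script above, a filtered registry lookup for one specific scorer configuration might look like the sketch below. It assumes `get_identifier()` returns a dict containing a "hash" key, as the printed identifier suggests, and it uses only the `hash` keyword filter that `get_metrics_registry_entries` is shown accepting elsewhere in this PR; other filter parameter names are not shown here and are therefore not used.

```python
# Minimal sketch under the assumptions noted above: fetch registry entries for the exact
# scorer configuration built earlier in this script, keyed by its identifier hash.
from pyrit.score.scorer_evaluation.scorer_metrics_registry import ScorerMetricsRegistry

registry = ScorerMetricsRegistry()
scorer_hash = true_false_classifier.get_identifier()["hash"]  # assumed dict with a "hash" key

matching_entries = registry.get_metrics_registry_entries(hash=scorer_hash)
for entry in matching_entries:
    entry.print_summary()  # same summary format as shown in the notebook output
```

This is presumably the same hash-based lookup that `get_scorer_metrics_from_registry()` performs on the scorer object.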
33 changes: 31 additions & 2 deletions pyrit/models/scenario_result.py
@@ -4,11 +4,15 @@
import logging
import uuid
from datetime import datetime, timezone
from typing import List, Literal, Optional
from typing import TYPE_CHECKING, List, Literal, Optional

import pyrit
from pyrit.models import AttackOutcome, AttackResult

if TYPE_CHECKING:
    from pyrit.score.scorer_evaluation.scorer_evaluator import ScorerMetrics
    from pyrit.score.scorer_evaluation.scorer_metrics_registry import RegistryType

logger = logging.getLogger(__name__)


@@ -66,7 +70,7 @@ def __init__(
        self.id = id if id is not None else uuid.uuid4()
        self.scenario_identifier = scenario_identifier
        self.objective_target_identifier = objective_target_identifier
        self.objective_scorer_identifier = objective_scorer_identifier
        self.objective_scorer_identifier = objective_scorer_identifier or {}
        self.scenario_run_state = scenario_run_state
        self.attack_results = attack_results
        self.labels = labels if labels is not None else {}
@@ -136,3 +140,28 @@ def objective_achieved_rate(self, *, atomic_attack_name: Optional[str] = None) -

        successful_results = sum(1 for result in all_results if result.outcome == AttackOutcome.SUCCESS)
        return int((successful_results / total_results) * 100)

    def get_scorer_evaluation_metrics(
        self, registry_type: Optional["RegistryType"] = None
    ) -> Optional["ScorerMetrics"]:
        """
        Get the evaluation metrics for the scenario's scorer from the scorer evaluation registry.
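
        Args:
            registry_type: Optional registry type used to narrow the lookup; passed through
                to the registry query as a filter.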

        Returns:
            ScorerMetrics: The evaluation metrics object, or None if not found.
        """
        # import here to avoid circular imports
        from pyrit.score.scorer_evaluation.scorer_metrics_registry import (
            ScorerMetricsRegistry,
        )

        # Use the stored hash directly for lookup (avoids needing to reconstruct ScorerIdentifier)
        scorer_hash = self.objective_scorer_identifier.get("hash")
        if not scorer_hash:
            return None

        registry = ScorerMetricsRegistry()
        entries = registry.get_metrics_registry_entries(registry_type=registry_type, hash=scorer_hash)
        if entries:
            return entries[0].metrics
        return None
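A hypothetical usage sketch for the new helper (not part of the diff); `scenario_result` below stands in for an instance of the class defined in this module, obtained from a completed scenario run:

```python
# Hypothetical usage: `scenario_result` is assumed to be an instance of the class above,
# produced by a completed scenario run whose scorer identifier includes a "hash" entry.
def print_scorer_metrics(scenario_result) -> None:
    metrics = scenario_result.get_scorer_evaluation_metrics()
    if metrics is None:
        print("No registry entry found for this scorer configuration.")
    else:
        print(metrics)  # e.g. ObjectiveScorerMetrics(accuracy=..., f1_score=..., ...)
```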
5 changes: 4 additions & 1 deletion pyrit/prompt_target/azure_blob_storage_target.py
@@ -76,7 +76,10 @@ def __init__(
        self._sas_token = sas_token
        self._client_async: AsyncContainerClient = None

        super().__init__(endpoint=self._container_url, max_requests_per_minute=max_requests_per_minute)
        super().__init__(
            endpoint=self._container_url,
            max_requests_per_minute=max_requests_per_minute,
        )

    async def _create_container_client_async(self) -> None:
        """
6 changes: 5 additions & 1 deletion pyrit/prompt_target/azure_ml_chat_target.py
@@ -86,7 +86,11 @@ def __init__(
        endpoint_value = default_values.get_required_value(
            env_var_name=self.endpoint_uri_environment_variable, passed_value=endpoint
        )
        PromptChatTarget.__init__(self, max_requests_per_minute=max_requests_per_minute, endpoint=endpoint_value)
        PromptChatTarget.__init__(
            self,
            max_requests_per_minute=max_requests_per_minute,
            endpoint=endpoint_value,
        )

        self._initialize_vars(endpoint=endpoint, api_key=api_key)

5 changes: 4 additions & 1 deletion pyrit/prompt_target/crucible_target.py
@@ -41,7 +41,10 @@
                minute before hitting a rate limit. The number of requests sent to the target
                will be capped at the value provided.
        """
        super().__init__(max_requests_per_minute=max_requests_per_minute, endpoint=endpoint)
        super().__init__(
            max_requests_per_minute=max_requests_per_minute,
            endpoint=endpoint,
        )

        self._api_key: str = default_values.get_required_value(
            env_var_name=self.API_KEY_ENVIRONMENT_VARIABLE, passed_value=api_key