
Commit 3d0c468

Add eval metrics
1 parent 9bd1ef6 commit 3d0c468

4 files changed

Lines changed: 242 additions & 14 deletions


.config/metrics-genai-v0.1.0.json

Lines changed: 119 additions & 0 deletions
@@ -286,6 +286,125 @@
           "filter": "['gen_ai.finish_reason.tool_calls'] == true"
         }
       }
+    },
+    {
+      "id": "genai_evaluation_protectedMaterial",
+      "displayName": "Protected material score",
+      "description": "Protected material score given by the Azure AI Content Safety API. The protected material detection APIs scan the output of large language models to identify and flag known protected material. See (https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/protected-material?tabs=text) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.protectedMaterial",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_hateUnfairness",
+      "displayName": "Hate and unfairness score",
+      "description": "Hate and unfairness score given by Azure AI evaluation. Hateful and unfair content refers to any language pertaining to hate toward or unfair representations of individuals and social groups along factors including but not limited to race, ethnicity, nationality, gender, sexual orientation, religion, immigration status, ability, personal appearance, and body size. Unfairness occurs when AI systems treat or represent social groups inequitably, creating or contributing to societal inequities. Safety evaluations annotate hate- and unfairness-related content using a 0-7 scale. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.hateunfairnessevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.hateUnfairness",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_sexual",
+      "displayName": "Sexual content score",
+      "description": "Score for sexual content given by Azure AI evaluation. The sexual content score ranges from 0 to 7. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.sexualevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.sexual",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_violence",
+      "displayName": "Violent content score",
+      "description": "Violence score given by Azure AI evaluation. The violence score ranges from 0 to 7. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.violenceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.violence",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_relevance",
+      "displayName": "Relevance score",
+      "description": "Relevance score given by Azure AI evaluation. The relevance measure assesses the ability of answers to capture the key points of the context. High relevance scores signify the AI system's understanding of the input and its capability to produce coherent and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Relevance scores range from 1 to 5. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.relevanceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.relevance",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_fluency",
+      "displayName": "Fluency score",
+      "description": "Fluency score given by Azure AI evaluation. The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic structures, and appropriate vocabulary usage, resulting in linguistically correct responses. The fluency score ranges from 1 to 5. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.fluencyevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.fluency",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_coherence",
+      "displayName": "Coherence score",
+      "description": "Coherence score given by Azure AI evaluation. The coherence measure assesses the ability of the language model to generate text that reads naturally, flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability and user-friendliness of a model's generated responses in real-world applications. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.coherenceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.coherence",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
     }
   ]
 }
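
Each added metric follows the same shape: an id, display metadata, a lifecycle, tags, a desired direction, and an Average definition keyed to a gen_ai.evaluation.* event name plus a score property. As a quick, hypothetical illustration (not part of this commit), a small check along these lines could verify that every entry carries those fields; the file path and the top-level "metrics" key are assumptions based only on what is visible in this diff:

# Hypothetical sanity check for .config/metrics-genai-v0.1.0.json; not part of this commit.
# It encodes only the structure visible in the diff above.
import json

REQUIRED_FIELDS = {"id", "displayName", "description", "lifecycle", "tags", "desiredDirection", "definition"}

def check_metric(entry: dict) -> list:
    """Return a list of problems found in a single metric definition."""
    problems = [f"missing field: {field}" for field in REQUIRED_FIELDS - entry.keys()]
    if entry.get("desiredDirection") not in ("Increase", "Decrease"):
        problems.append("desiredDirection should be Increase or Decrease")
    definition = entry.get("definition", {})
    if definition.get("kind") == "Average":
        event = definition.get("event", {})
        for key in ("eventName", "eventProperty"):
            if not event.get(key):
                problems.append(f"missing definition.event.{key}")
    return problems

if __name__ == "__main__":
    with open(".config/metrics-genai-v0.1.0.json") as config_file:
        config = json.load(config_file)
    # The name of the top-level array is not shown in the diff; "metrics" is a guess.
    for metric in config.get("metrics", []):
        for problem in check_metric(metric):
            print(f"{metric.get('id', '<no id>')}: {problem}")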

eval/setup-eval.py

Lines changed: 37 additions & 8 deletions
@@ -6,7 +6,7 @@
     EvaluationSchedule,
     RecurrenceTrigger,
 )
-from azure.ai.evaluation import CoherenceEvaluator
+from azure.ai.evaluation import CoherenceEvaluator, FluencyEvaluator, RelevanceEvaluator, ViolenceEvaluator, SexualEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator, ContentSafetyEvaluator
 
 # This sample includes the setup for an online evaluation schedule using the Azure AI Project SDK and Azure AI Evaluation SDK
 # The schedule is configured to run daily over the collected trace data while running two evaluators: CoherenceEvaluator and RelevanceEvaluator
@@ -28,7 +28,7 @@
 # You can modify it depending on your data schema
 # The KQL query must output these required columns: operation_ID, operation_ParentID, and gen_ai_response_id
 # You can choose which other columns to output as required by the evaluators you are using
-KUSTO_QUERY = "let gen_ai_spans=(dependencies | where isnotnull(customDimensions[\"gen_ai.system\"]) | extend response_id = tostring(customDimensions[\"gen_ai.response.id\"]) | project id, operation_Id, operation_ParentId, timestamp, response_id); let gen_ai_events=(traces | where message in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") or tostring(customDimensions[\"event.name\"]) in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") | project id= operation_ParentId, operation_Id, operation_ParentId, user_input = iff(message == \"gen_ai.user.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.user.message\", parse_json(iff(message == \"gen_ai.user.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), system = iff(message == \"gen_ai.system.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.system.message\", parse_json(iff(message == \"gen_ai.system.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), llm_response = iff(message == \"gen_ai.choice\", parse_json(tostring(parse_json(tostring(customDimensions[\"gen_ai.event.content\"])).message)).content, iff(tostring(customDimensions[\"event.name\"]) == \"gen_ai.choice\", parse_json(parse_json(message).message).content, \"\")) | summarize operation_ParentId = any(operation_ParentId), Input = maxif(user_input, user_input != \"\"), System = maxif(system, system != \"\"), Output = maxif(llm_response, llm_response != \"\") by operation_Id, id); gen_ai_spans | join kind=inner (gen_ai_events) on id, operation_Id | project Input, System, Output, operation_Id, operation_ParentId, gen_ai_response_id = response_id"
+KUSTO_QUERY = "let gen_ai_spans=(dependencies | where isnotnull(customDimensions[\"gen_ai.system\"]) | extend response_id = tostring(customDimensions[\"gen_ai.response.id\"]) | project id, operation_Id, operation_ParentId, timestamp, response_id); let gen_ai_events=(traces | where message in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") or tostring(customDimensions[\"event.name\"]) in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") | project id= operation_ParentId, operation_Id, operation_ParentId, user_input = iff(message == \"gen_ai.user.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.user.message\", parse_json(iff(message == \"gen_ai.user.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), system = iff(message == \"gen_ai.system.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.system.message\", parse_json(iff(message == \"gen_ai.system.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), llm_response = iff(message == \"gen_ai.choice\", parse_json(tostring(parse_json(tostring(customDimensions[\"gen_ai.event.content\"])).message)).content, iff(tostring(customDimensions[\"event.name\"]) == \"gen_ai.choice\", parse_json(parse_json(message).message).content, \"\")) | summarize operation_ParentId = any(operation_ParentId), Input = maxif(user_input, user_input != \"\"), System = maxif(system, system != \"\"), Output = maxif(llm_response, llm_response != \"\") by operation_Id, id); gen_ai_spans | join kind=inner (gen_ai_events) on id, operation_Id | project Input, System, Output, operation_Id, operation_ParentId, gen_ai_response_id = response_id | where gen_ai_response_id != \"\""
 
 
 
@@ -71,18 +71,47 @@
 # )
 
 # CoherenceEvaluator
-coherence_evaluator_config = EvaluatorConfiguration(
-    id=CoherenceEvaluator.id,
-    init_params={"model_config": model_config},
-    data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
-)
+def get_evaluator_config(evaluator_id, model_config):
+    return EvaluatorConfiguration(
+        id=evaluator_id,
+        init_params={"model_config": model_config},
+        data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
+    )
+
+def get_evaluator_config_safety(evaluator_id, azure_ai_project_scope):
+    return EvaluatorConfiguration(
+        id=evaluator_id,
+        init_params={"azure_ai_project": azure_ai_project_scope},
+        data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
+    )
+
+# coherence_evaluator_config = create_evaluator_config(
+#     CoherenceEvaluator,
+#     model_config,
+#     {"query": "${data.Input}", "response": "${data.Output}"}
+# )
+
+# fluency_evaluator_config = create_evaluator_config(
+#     FluencyEvaluator,
+#     model_config,
+#     {"query": "${data.Input}", "response": "${data.Output}"}
+# )
 
 # Frequency to run the schedule
 recurrence_trigger = RecurrenceTrigger(frequency="hour", interval=1)
 
+credential = DefaultAzureCredential()
+
 # Dictionary of evaluators
 evaluators = {
-    "coherence" : coherence_evaluator_config
+    "coherence" : get_evaluator_config(CoherenceEvaluator.id, model_config),
+    "fluency" : get_evaluator_config(FluencyEvaluator.id, model_config),
+    "relevance" : get_evaluator_config(RelevanceEvaluator.id, model_config),
+    "violence" : get_evaluator_config_safety(ViolenceEvaluator.id, project_client.scope),
+    "sexual" : get_evaluator_config_safety(SexualEvaluator.id, project_client.scope),
+    "hateUnfairness" : get_evaluator_config_safety(HateUnfairnessEvaluator.id, project_client.scope),
+    "protectedMaterial" : get_evaluator_config_safety(ProtectedMaterialEvaluator.id, project_client.scope),
+    "contentSafety" : get_evaluator_config_safety(ContentSafetyEvaluator.id, project_client.scope)
 }
 
 name = SAMPLE_NAME
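
The evaluators dictionary and the hourly recurrence_trigger above are the inputs to the online evaluation schedule this sample registers; that wiring sits outside the changed lines, so it does not appear in this diff. For orientation, here is a hedged sketch of how these pieces are typically assembled in the Azure AI Projects online-evaluation sample this file follows — ApplicationInsightsConfiguration, its arguments, and evaluations.create_or_replace_schedule are assumptions taken from that sample and may not match this repository exactly:

# Hedged sketch, not part of this commit; names below may differ from the actual file.
from azure.ai.projects.models import ApplicationInsightsConfiguration, EvaluationSchedule

# Point the schedule at the Application Insights data and the KQL query defined above.
app_insights_config = ApplicationInsightsConfiguration(
    resource_id=APP_INSIGHTS_RESOURCE_ID,  # hypothetical constant; the Application Insights resource ID
    query=KUSTO_QUERY,
    service_name=SERVICE_NAME,             # hypothetical constant identifying the monitored service
)

evaluation_schedule = EvaluationSchedule(
    data=app_insights_config,
    evaluators=evaluators,                 # quality + safety evaluator configurations built above
    trigger=recurrence_trigger,            # hourly RecurrenceTrigger defined above
    description="Online evaluation over collected GenAI trace data",
)

# Register (or update) the schedule on the AI project under the sample name.
project_client.evaluations.create_or_replace_schedule(name, evaluation_schedule)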

infra/la-summary-rules.yaml

Lines changed: 57 additions & 0 deletions
@@ -33,3 +33,60 @@ summaryRules:
       | extend Properties = bag_merge(newProperties, stop, tool_calls, content_filter, length)
       | extend Name = "gen_ai.otel.span"
       | project Name, TimeGenerated, ItemCount, Properties
+  - name: "Online-Experimentation-Assignment-Summary"
+    binSize: 20
+    destinationTable: OnlineExperimentAssignmentSummary_CL
+    description: "Summary rule definition for online experiment assignment summary."
+    query: |
+      AppEvents
+      | where Name == "FeatureEvaluation"
+      | where tostring(Properties.VariantAssignmentReason) == "Percentile"
+      | where tostring(Properties.TargetingId) != ""
+      | where tostring(Properties.Enabled) == "True"
+      | where tostring(Properties.AllocationId) != ""
+      | where toint(Properties.VariantAssignmentPercentage) < 100
+      | project
+          TimeGenerated,
+          ItemCount,
+          FeatureFlagReference = tostring(Properties.FeatureFlagReference),
+          FeatureName = tostring(Properties.FeatureName),
+          AllocationId = tostring(Properties.AllocationId),
+          Variant = tostring(Properties.Variant),
+          VariantAssignmentPercentage = toreal(Properties.VariantAssignmentPercentage),
+          IsControlVariant = tostring(Properties.DefaultWhenEnabled) == tostring(Properties.Variant)
+      | summarize
+          VariantAssignmentPercentage = take_any(VariantAssignmentPercentage),
+          IsControlVariant = take_any(IsControlVariant),
+          AssignmentEventCount = tolong(sum(ItemCount)),
+          FirstAssignmentTimestamp = min(TimeGenerated),
+          LastAssignmentTimestamp = max(TimeGenerated)
+        by FeatureFlagReference, FeatureName, AllocationId, Variant
+  - name: "Online-Experimentation-GenAI-eval"
+    binSize: 20
+    destinationTable: AppEvents_CL
+    description: "Summary rule definition for Azure AI evaluation-based GenAI metrics."
+    query: |
+      let operationId_to_Targeting_Id = AppDependencies
+      | where Properties has "TargetingId" and Properties['TargetingId'] != ""
+      | extend TargetingId = Properties['TargetingId']
+      | project TargetingId, OperationId
+      | summarize TargetingId=any(TargetingId) by OperationId;
+      AppTraces
+      | extend event_name_ = tostring(Properties.["event.name"])
+      | extend gen_ai_evaluation_score_ = todouble(Properties.["gen_ai.evaluation.score"])
+      | extend gen_ai_response_id_ = tostring(Properties.["gen_ai.response_id"])
+      | where gen_ai_response_id_ != ""
+      | where event_name_ startswith "gen_ai.evaluation."
+      | join kind=inner operationId_to_Targeting_Id on OperationId
+      | extend span_gen_ai_response_id_ = tostring(Properties.["gen_ai.response.id"])
+      | project
+          Name=event_name_,
+          TimeGenerated,
+          ItemCount,
+          Properties=pack("gen_ai.evaluation.score", gen_ai_evaluation_score_, "TargetingId", TargetingId, "gen_ai.response_id", gen_ai_response_id_, "OperationId", OperationId),
+          tool_calls = iff(Properties["gen_ai.response.finish_reasons"] has "tool_calls", bag_pack("gen_ai.response.finish_reason.tool_calls", 1), dynamic({})),
+          content_filter = iff(Properties["gen_ai.response.finish_reasons"] has "content_filter", bag_pack("gen_ai.response.finish_reason.content_filter", 1), dynamic({})),
+          length = iff(Properties["gen_ai.response.finish_reasons"] has "length", bag_pack("gen_ai.response.finish_reason.length", 1), dynamic({}))
+      | extend Properties = bag_merge(Properties, tool_calls, content_filter, length)
+      | extend Name = "gen_ai.otel.span"
+      | project Name, TimeGenerated, ItemCount, Properties

src/api/routes.py

Lines changed: 29 additions & 6 deletions
@@ -20,6 +20,7 @@
 from opentelemetry.baggage import set_baggage, get_baggage
 from opentelemetry.context import attach
 from featuremanagement import TargetingContext
+from azure.identity import DefaultAzureCredential
 
 router = fastapi.APIRouter()
 templates = Jinja2Templates(directory="api/templates")
@@ -107,7 +108,7 @@ async def chat_nostream_handler(
     model_deployment_name = globals["chat_model"]
     feature_manager = globals["feature_manager"]
 
-    targeting_id = chat_request.sessionState['sessionId'] or str(uuid.uuid4())
+    targeting_id = chat_request.sessionState.get('sessionId', str(uuid.uuid4()))
     attach(set_baggage("Microsoft.TargetingId", targeting_id))
 
     # figure out which prompty template to use (replace file to API)
@@ -130,19 +131,41 @@ async def chat_nostream_handler(
             model=model_deployment_name, messages=prompt_messages + messages, stream=False
         )
         track_event("RequestMade", targeting_id)
+        answer = response.choices[0].message.content
     except Exception as e:
         error = {"Error": str(e)}
-        track_event("ErrorLLM", targeting_id, error)
-
-    answer = response.choices[0].message.content
+        track_event("ErrorLLM", targeting_id, error)
+        return { "answer": str(e), "variant": variant }
+
+
+    # conversation = {}
+
+    # # initialize the evaluation client
+    # # optional parameter to configure sampling
+    # eval_client = await project.evaluation.get_evaluation_client(sampling_config=0.1)
 
+    # eval_config = {
+    #     # Required: built-in or custom evaluators
+    #     "evaluators" : ["fluency", "content-safety"],
+    #     # Optional: properties to log with the evaluation results
+    #     "additional_metadata": {
+    #         "prompt-variant": variant,
+    #         "targeting-id": targeting_id
+    #     }
+    # }
+
+    # # submit remote evaluation request, results will be sent to app insights
+    # eval_request = await eval_client.submit_request(conversation, eval_config)
+
     # eval_sampling = feature_manager.get_variant("eval_sampling", targeting_id)
     # if eval_sampling and eval_sampling.configuration == True:
-    # eval_input = { "conversation": { "messages": messages } }
+    #     eval_input = { "conversation": { "messages": messages } }
     #     project = globals["project"]
-    #asyncio.create_task(run_evals(eval_input, targeting_id, project.scope, DefaultAzureCredential()))
 
+    #     asyncio.create_task(run_evals(eval_input, targeting_id, project.scope, DefaultAzureCredential()))
+
     return { "answer": answer, "variant": variant }
+
 
 async def run_evals(eval_input, targeting_id, ai_project_scope, credential):
     run_eval(FluencyEvaluator, eval_input, targeting_id)
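Only the signature and first call of run_evals are visible in this hunk; the rest of its body lies outside the changed lines. As a hedged sketch (not this repository's implementation), a background helper with that signature could instantiate the azure.ai.evaluation evaluators directly and run them over the conversation payload built above — the constructor and call signatures follow the public azure.ai.evaluation API, while model_config, the evaluator selection, and the score handling are illustrative assumptions:

# Hedged sketch only; the real run_evals body is not shown in this diff.
import asyncio
from azure.ai.evaluation import FluencyEvaluator, ViolenceEvaluator

async def run_evals(eval_input, targeting_id, ai_project_scope, credential):
    # eval_input is the {"conversation": {"messages": [...]}} dict built in the handler above.
    # Quality evaluators take a model_config; safety evaluators take the AI project scope and a
    # credential (the same split as get_evaluator_config / get_evaluator_config_safety in
    # eval/setup-eval.py). model_config is assumed to be available in this module.
    evaluators = {
        "fluency": FluencyEvaluator(model_config),
        "violence": ViolenceEvaluator(credential=credential, azure_ai_project=ai_project_scope),
    }

    results = {}
    for name, evaluator in evaluators.items():
        try:
            # Evaluators are synchronous callables; run them off the event loop.
            results[name] = await asyncio.to_thread(evaluator, **eval_input)
        except Exception as e:
            # Evaluation failures should never break the chat path; reuse the app's telemetry helper.
            track_event("ErrorEval", targeting_id, {"Error": str(e), "evaluator": name})

    # Writing scores back to Application Insights as gen_ai.evaluation.* events (consumed by
    # infra/la-summary-rules.yaml and the metric definitions above) is intentionally left out here.
    return results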
0 commit comments
