
Commit 3d0c468

Add eval metrics
1 parent 9bd1ef6 commit 3d0c468

4 files changed

Lines changed: 242 additions & 14 deletions


.config/metrics-genai-v0.1.0.json

Lines changed: 119 additions & 0 deletions
@@ -286,6 +286,125 @@
           "filter": "['gen_ai.finish_reason.tool_calls'] == true"
         }
       }
+    },
+    {
+      "id": "genai_evaluation_protectedMaterial",
+      "displayName": "Protected material score",
+      "description": "Protected material score given by the Azure AI Content Safety API. The protected material detection APIs scan the output of large language models to identify and flag known protected material. See (https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/protected-material?tabs=text) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.protectedMaterial",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_hateUnfairness",
+      "displayName": "Hate and unfairness score",
+      "description": "Hate and unfairness score given by Azure AI evaluation. Hateful and unfair content refers to any language pertaining to hate toward or unfair representations of individuals and social groups along factors including but not limited to race, ethnicity, nationality, gender, sexual orientation, religion, immigration status, ability, personal appearance, and body size. Unfairness occurs when AI systems treat or represent social groups inequitably, creating or contributing to societal inequities. Safety evaluations annotate hate- and unfairness-related content using a 0-7 scale. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.hateunfairnessevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.hateUnfairness",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_sexual",
+      "displayName": "Sexual content score",
+      "description": "Score for sexual content given by Azure AI evaluation. The sexual content score ranges from 0 to 7. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.sexualevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.sexual",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_violence",
+      "displayName": "Violent content score",
+      "description": "Violence score given by Azure AI evaluation. The violence score ranges from 0 to 7. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.violenceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Decrease",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.violence",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_relevance",
+      "displayName": "Relevance score",
+      "description": "Relevance score given by Azure AI evaluation. The relevance measure assesses the ability of answers to capture the key points of the context. High relevance scores signify the AI system's understanding of the input and its capability to produce coherent and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Relevance scores range from 1 to 5. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.relevanceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.relevance",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_fluency",
+      "displayName": "Fluency score",
+      "description": "Fluency score given by Azure AI evaluation. The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic structures, and appropriate vocabulary usage, resulting in linguistically correct responses. The fluency score ranges from 1 to 5. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.fluencyevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.fluency",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
+    },
+    {
+      "id": "genai_evaluation_coherence",
+      "displayName": "Coherence score",
+      "description": "Coherence score given by Azure AI evaluation. The coherence measure assesses the ability of the language model to generate text that reads naturally, flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability and user-friendliness of a model's generated responses in real-world applications. See (https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.coherenceevaluator?view=azure-python) for more details.",
+      "lifecycle": "Active",
+      "tags": [
+        "GenAI", "azure_ai_evaluation"
+      ],
+      "desiredDirection": "Increase",
+      "definition": {
+        "kind": "Average",
+        "event": {
+          "eventName": "gen_ai.evaluation.coherence",
+          "eventProperty": "gen_ai.ai.evaluation.score"
+        }
+      }
     }
   ]
 }
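
Each added metric follows the same shape: an id, display metadata, a lifecycle, tags, a desired direction, and an Average definition keyed to a gen_ai.evaluation.* event name plus a score property. As a quick, hypothetical illustration (not part of this commit), a small check along these lines could verify that every entry carries those fields; the file path and the top-level "metrics" key are assumptions based only on what is visible in this diff:

# Hypothetical sanity check for .config/metrics-genai-v0.1.0.json; not part of this commit.
# It encodes only the structure visible in the diff above.
import json

REQUIRED_FIELDS = {"id", "displayName", "description", "lifecycle", "tags", "desiredDirection", "definition"}

def check_metric(entry: dict) -> list:
    """Return a list of problems found in a single metric definition."""
    problems = [f"missing field: {field}" for field in REQUIRED_FIELDS - entry.keys()]
    if entry.get("desiredDirection") not in ("Increase", "Decrease"):
        problems.append("desiredDirection should be Increase or Decrease")
    definition = entry.get("definition", {})
    if definition.get("kind") == "Average":
        event = definition.get("event", {})
        for key in ("eventName", "eventProperty"):
            if not event.get(key):
                problems.append(f"missing definition.event.{key}")
    return problems

if __name__ == "__main__":
    with open(".config/metrics-genai-v0.1.0.json") as config_file:
        config = json.load(config_file)
    # The name of the top-level array is not shown in the diff; "metrics" is a guess.
    for metric in config.get("metrics", []):
        for problem in check_metric(metric):
            print(f"{metric.get('id', '<no id>')}: {problem}")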

eval/setup-eval.py

Lines changed: 37 additions & 8 deletions
@@ -6,7 +6,7 @@
     EvaluationSchedule,
     RecurrenceTrigger,
 )
-from azure.ai.evaluation import CoherenceEvaluator
+from azure.ai.evaluation import CoherenceEvaluator, FluencyEvaluator, RelevanceEvaluator, ViolenceEvaluator, SexualEvaluator, HateUnfairnessEvaluator, ProtectedMaterialEvaluator, ContentSafetyEvaluator
 
 # This sample includes the setup for an online evaluation schedule using the Azure AI Project SDK and Azure AI Evaluation SDK
 # The schedule is configured to run daily over the collected trace data while running two evaluators: CoherenceEvaluator and RelevanceEvaluator
@@ -28,7 +28,7 @@
 # You can modify it depending on your data schema
 # The KQL query must output these required columns: operation_ID, operation_ParentID, and gen_ai_response_id
 # You can choose which other columns to output as required by the evaluators you are using
-KUSTO_QUERY = "let gen_ai_spans=(dependencies | where isnotnull(customDimensions[\"gen_ai.system\"]) | extend response_id = tostring(customDimensions[\"gen_ai.response.id\"]) | project id, operation_Id, operation_ParentId, timestamp, response_id); let gen_ai_events=(traces | where message in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") or tostring(customDimensions[\"event.name\"]) in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") | project id= operation_ParentId, operation_Id, operation_ParentId, user_input = iff(message == \"gen_ai.user.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.user.message\", parse_json(iff(message == \"gen_ai.user.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), system = iff(message == \"gen_ai.system.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.system.message\", parse_json(iff(message == \"gen_ai.system.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), llm_response = iff(message == \"gen_ai.choice\", parse_json(tostring(parse_json(tostring(customDimensions[\"gen_ai.event.content\"])).message)).content, iff(tostring(customDimensions[\"event.name\"]) == \"gen_ai.choice\", parse_json(parse_json(message).message).content, \"\")) | summarize operation_ParentId = any(operation_ParentId), Input = maxif(user_input, user_input != \"\"), System = maxif(system, system != \"\"), Output = maxif(llm_response, llm_response != \"\") by operation_Id, id); gen_ai_spans | join kind=inner (gen_ai_events) on id, operation_Id | project Input, System, Output, operation_Id, operation_ParentId, gen_ai_response_id = response_id"
+KUSTO_QUERY = "let gen_ai_spans=(dependencies | where isnotnull(customDimensions[\"gen_ai.system\"]) | extend response_id = tostring(customDimensions[\"gen_ai.response.id\"]) | project id, operation_Id, operation_ParentId, timestamp, response_id); let gen_ai_events=(traces | where message in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") or tostring(customDimensions[\"event.name\"]) in (\"gen_ai.choice\", \"gen_ai.user.message\", \"gen_ai.system.message\") | project id= operation_ParentId, operation_Id, operation_ParentId, user_input = iff(message == \"gen_ai.user.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.user.message\", parse_json(iff(message == \"gen_ai.user.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), system = iff(message == \"gen_ai.system.message\" or tostring(customDimensions[\"event.name\"]) == \"gen_ai.system.message\", parse_json(iff(message == \"gen_ai.system.message\", tostring(customDimensions[\"gen_ai.event.content\"]), message)).content, \"\"), llm_response = iff(message == \"gen_ai.choice\", parse_json(tostring(parse_json(tostring(customDimensions[\"gen_ai.event.content\"])).message)).content, iff(tostring(customDimensions[\"event.name\"]) == \"gen_ai.choice\", parse_json(parse_json(message).message).content, \"\")) | summarize operation_ParentId = any(operation_ParentId), Input = maxif(user_input, user_input != \"\"), System = maxif(system, system != \"\"), Output = maxif(llm_response, llm_response != \"\") by operation_Id, id); gen_ai_spans | join kind=inner (gen_ai_events) on id, operation_Id | project Input, System, Output, operation_Id, operation_ParentId, gen_ai_response_id = response_id | where gen_ai_response_id != \"\""
 
 
 
@@ -71,18 +71,47 @@
 # )
 
 # CoherenceEvaluator
-coherence_evaluator_config = EvaluatorConfiguration(
-    id=CoherenceEvaluator.id,
-    init_params={"model_config": model_config},
-    data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
-)
+def get_evaluator_config(evaluator_id, model_config):
+    return EvaluatorConfiguration(
+        id=evaluator_id,
+        init_params={"model_config": model_config},
+        data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
+    )
+
+def get_evaluator_config_safety(evaluator_id, azure_ai_project_scope):
+    return EvaluatorConfiguration(
+        id=evaluator_id,
+        init_params={"azure_ai_project": azure_ai_project_scope},
+        data_mapping={"query": "${data.Input}", "response": "${data.Output}"}
+    )
+
+# coherence_evaluator_config = create_evaluator_config(
+#     CoherenceEvaluator,
+#     model_config,
+#     {"query": "${data.Input}", "response": "${data.Output}"}
+# )
+
+# fluency_evaluator_config = create_evaluator_config(
+#     FluencyEvaluator,
+#     model_config,
+#     {"query": "${data.Input}", "response": "${data.Output}"}
+# )
 
 # Frequency to run the schedule
 recurrence_trigger = RecurrenceTrigger(frequency="hour", interval=1)
 
+credential = DefaultAzureCredential()
+
 # Dictionary of evaluators
 evaluators = {
-    "coherence" : coherence_evaluator_config
+    "coherence" : get_evaluator_config(CoherenceEvaluator.id, model_config),
+    "fluency" : get_evaluator_config(FluencyEvaluator.id, model_config),
+    "relevance" : get_evaluator_config(RelevanceEvaluator.id, model_config),
+    "violence" : get_evaluator_config_safety(ViolenceEvaluator.id, project_client.scope),
+    "sexual" : get_evaluator_config_safety(SexualEvaluator.id, project_client.scope),
+    "hateUnfairness" : get_evaluator_config_safety(HateUnfairnessEvaluator.id, project_client.scope),
+    "protectedMaterial" : get_evaluator_config_safety(ProtectedMaterialEvaluator.id, project_client.scope),
+    "contentSafety" : get_evaluator_config_safety(ContentSafetyEvaluator.id, project_client.scope)
 }
 
 name = SAMPLE_NAME
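
The evaluators dictionary and the hourly recurrence_trigger above are the inputs to the online evaluation schedule this sample registers; that wiring sits outside the changed lines, so it does not appear in this diff. For orientation, here is a hedged sketch of how these pieces are typically assembled in the Azure AI Projects online-evaluation sample this file follows — ApplicationInsightsConfiguration, its arguments, and evaluations.create_or_replace_schedule are assumptions taken from that sample and may not match this repository exactly:

# Hedged sketch, not part of this commit; names below may differ from the actual file.
from azure.ai.projects.models import ApplicationInsightsConfiguration, EvaluationSchedule

# Point the schedule at the Application Insights data and the KQL query defined above.
app_insights_config = ApplicationInsightsConfiguration(
    resource_id=APP_INSIGHTS_RESOURCE_ID,  # hypothetical constant; the Application Insights resource ID
    query=KUSTO_QUERY,
    service_name=SERVICE_NAME,             # hypothetical constant identifying the monitored service
)

evaluation_schedule = EvaluationSchedule(
    data=app_insights_config,
    evaluators=evaluators,                 # quality + safety evaluator configurations built above
    trigger=recurrence_trigger,            # hourly RecurrenceTrigger defined above
    description="Online evaluation over collected GenAI trace data",
)

# Register (or update) the schedule on the AI project under the sample name.
project_client.evaluations.create_or_replace_schedule(name, evaluation_schedule)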

infra/la-summary-rules.yaml

Lines changed: 57 additions & 0 deletions
@@ -33,3 +33,60 @@ summaryRules:
       | extend Properties = bag_merge(newProperties, stop, tool_calls, content_filter, length)
       | extend Name = "gen_ai.otel.span"
       | project Name, TimeGenerated, ItemCount, Properties
+  - name: "Online-Experimentation-Assignment-Summary"
+    binSize: 20
+    destinationTable: OnlineExperimentAssignmentSummary_CL
+    description: "Summary rule definition for online experiment assignment summary."
+    query: |
+      AppEvents
+      | where Name == "FeatureEvaluation"
+      | where tostring(Properties.VariantAssignmentReason) == "Percentile"
+      | where tostring(Properties.TargetingId) != ""
+      | where tostring(Properties.Enabled) == "True"
+      | where tostring(Properties.AllocationId) != ""
+      | where toint(Properties.VariantAssignmentPercentage) < 100
+      | project
+          TimeGenerated,
+          ItemCount,
+          FeatureFlagReference = tostring(Properties.FeatureFlagReference),
+          FeatureName = tostring(Properties.FeatureName),
+          AllocationId = tostring(Properties.AllocationId),
+          Variant = tostring(Properties.Variant),
+          VariantAssignmentPercentage = toreal(Properties.VariantAssignmentPercentage),
+          IsControlVariant = tostring(Properties.DefaultWhenEnabled) == tostring(Properties.Variant)
+      | summarize
+          VariantAssignmentPercentage = take_any(VariantAssignmentPercentage),
+          IsControlVariant = take_any(IsControlVariant),
+          AssignmentEventCount = tolong(sum(ItemCount)),
+          FirstAssignmentTimestamp = min(TimeGenerated),
+          LastAssignmentTimestamp = max(TimeGenerated)
+        by FeatureFlagReference, FeatureName, AllocationId, Variant
+  - name: "Online-Experimentation-GenAI-eval"
+    binSize: 20
+    destinationTable: AppEvents_CL
+    description: "Summary rule definition for Azure AI evaluation-based GenAI metrics."
+    query: |
+      let operationId_to_Targeting_Id = AppDependencies
+      | where Properties has "TargetingId" and Properties['TargetingId'] != ""
+      | extend TargetingId = Properties['TargetingId']
+      | project TargetingId, OperationId
+      | summarize TargetingId=any(TargetingId) by OperationId;
+      AppTraces
+      | extend event_name_ = tostring(Properties.["event.name"])
+      | extend gen_ai_evaluation_score_ = todouble(Properties.["gen_ai.evaluation.score"])
+      | extend gen_ai_response_id_ = tostring(Properties.["gen_ai.response_id"])
+      | where gen_ai_response_id_ != ""
+      | where event_name_ startswith "gen_ai.evaluation."
+      | join kind=inner operationId_to_Targeting_Id on OperationId
+      | extend span_gen_ai_response_id_ = tostring(Properties.["gen_ai.response.id"])
+      | project
+          Name=event_name_,
+          TimeGenerated,
+          ItemCount,
+          Properties=pack("gen_ai.evaluation.score", gen_ai_evaluation_score_, "TargetingId", TargetingId, "gen_ai.response_id", gen_ai_response_id_, "OperationId", OperationId),
+          tool_calls = iff(Properties["gen_ai.response.finish_reasons"] has "tool_calls", bag_pack("gen_ai.response.finish_reason.tool_calls", 1), dynamic({})),
+          content_filter = iff(Properties["gen_ai.response.finish_reasons"] has "content_filter", bag_pack("gen_ai.response.finish_reason.content_filter", 1), dynamic({})),
+          length = iff(Properties["gen_ai.response.finish_reasons"] has "length", bag_pack("gen_ai.response.finish_reason.length", 1), dynamic({}))
+      | extend Properties = bag_merge(Properties, tool_calls, content_filter, length)
+      | extend Name = "gen_ai.otel.span"
+      | project Name, TimeGenerated, ItemCount, Properties

src/api/routes.py

Lines changed: 29 additions & 6 deletions
@@ -20,6 +20,7 @@
 from opentelemetry.baggage import set_baggage, get_baggage
 from opentelemetry.context import attach
 from featuremanagement import TargetingContext
+from azure.identity import DefaultAzureCredential
 
 router = fastapi.APIRouter()
 templates = Jinja2Templates(directory="api/templates")
@@ -107,7 +108,7 @@ async def chat_nostream_handler(
     model_deployment_name = globals["chat_model"]
     feature_manager = globals["feature_manager"]
 
-    targeting_id = chat_request.sessionState['sessionId'] or str(uuid.uuid4())
+    targeting_id = chat_request.sessionState.get('sessionId', str(uuid.uuid4()))
     attach(set_baggage("Microsoft.TargetingId", targeting_id))
 
     # figure out which prompty template to use (replace file to API)
@@ -130,19 +131,41 @@ async def chat_nostream_handler(
             model=model_deployment_name, messages=prompt_messages + messages, stream=False
         )
         track_event("RequestMade", targeting_id)
+        answer = response.choices[0].message.content
     except Exception as e:
         error = {"Error": str(e)}
-        track_event("ErrorLLM", targeting_id, error)
-
-    answer = response.choices[0].message.content
+        track_event("ErrorLLM", targeting_id, error)
+        return { "answer": str(e), "variant": variant }
+
+
+    # conversation = {}
+
+    # # initialize the evaluation client
+    # # optional parameter to configure sampling
+    # eval_client = await project.evaluation.get_evaluation_client(sampling_config=0.1)
 
+    # eval_config = {
+    #     # Required: built-in or custom evaluators
+    #     "evaluators" : ["fluency", "content-safety"],
+    #     # Optional: properties to log with the evaluation results
+    #     "additional_metadata": {
+    #         "prompt-variant": variant,
+    #         "targeting-id": targeting_id
+    #     }
+    # }
+
+    # # submit remote evaluation request, results will be sent to app insights
+    # eval_request = await eval_client.submit_request(conversation, eval_config)
+
     # eval_sampling = feature_manager.get_variant("eval_sampling", targeting_id)
     # if eval_sampling and eval_sampling.configuration == True:
-    # eval_input = { "conversation": { "messages": messages } }
+    #     eval_input = { "conversation": { "messages": messages } }
     #     project = globals["project"]
-    #asyncio.create_task(run_evals(eval_input, targeting_id, project.scope, DefaultAzureCredential()))
 
+    #     asyncio.create_task(run_evals(eval_input, targeting_id, project.scope, DefaultAzureCredential()))
+
     return { "answer": answer, "variant": variant }
+
 
 async def run_evals(eval_input, targeting_id, ai_project_scope, credential):
     run_eval(FluencyEvaluator, eval_input, targeting_id)
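Only the signature and first call of run_evals are visible in this hunk; the rest of its body lies outside the changed lines. As a hedged sketch (not this repository's implementation), a background helper with that signature could instantiate the azure.ai.evaluation evaluators directly and run them over the conversation payload built above — the constructor and call signatures follow the public azure.ai.evaluation API, while model_config, the evaluator selection, and the score handling are illustrative assumptions:

# Hedged sketch only; the real run_evals body is not shown in this diff.
import asyncio
from azure.ai.evaluation import FluencyEvaluator, ViolenceEvaluator

async def run_evals(eval_input, targeting_id, ai_project_scope, credential):
    # eval_input is the {"conversation": {"messages": [...]}} dict built in the handler above.
    # Quality evaluators take a model_config; safety evaluators take the AI project scope and a
    # credential (the same split as get_evaluator_config / get_evaluator_config_safety in
    # eval/setup-eval.py). model_config is assumed to be available in this module.
    evaluators = {
        "fluency": FluencyEvaluator(model_config),
        "violence": ViolenceEvaluator(credential=credential, azure_ai_project=ai_project_scope),
    }

    results = {}
    for name, evaluator in evaluators.items():
        try:
            # Evaluators are synchronous callables; run them off the event loop.
            results[name] = await asyncio.to_thread(evaluator, **eval_input)
        except Exception as e:
            # Evaluation failures should never break the chat path; reuse the app's telemetry helper.
            track_event("ErrorEval", targeting_id, {"Error": str(e), "evaluator": name})

    # Writing scores back to Application Insights as gen_ai.evaluation.* events (consumed by
    # infra/la-summary-rules.yaml and the metric definitions above) is intentionally left out here.
    return results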
0 commit comments
