
Commit 39a347f

vertex-sdk-bot authored and copybara-github committed
feat: Add PromptTemplateData to support context and history columns when creating Evaluation run from dataframe
PiperOrigin-RevId: 871483777
1 parent 89d5723 commit 39a347f

4 files changed

Lines changed: 219 additions & 105 deletions
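
For orientation, a minimal usage sketch of what this commit enables, adapted from the replay test below. The client construction, `types`, `GCS_DEST`, and `GENERAL_QUALITY_METRIC` are placeholders assumed to be set up as in that test module; this is an illustration, not a verified end-to-end example.

import pandas as pd

# "context" and "conversation_history" are the newly supported columns; they are
# carried into the evaluation items as PromptTemplateData values.
input_df = pd.DataFrame(
    {
        "prompt": ["prompt1", "prompt2"],
        "reference": ["reference1", "reference2"],
        "response": ["response1", "response2"],
        "context": ["context1", "context2"],
        "conversation_history": ["history1", "history2"],
    }
)

# `client`, `types`, GCS_DEST and GENERAL_QUALITY_METRIC are assumed to be defined
# as in tests/unit/vertexai/genai/replays/test_create_evaluation_run.py.
evaluation_run = client.evals.create_evaluation_run(
    name="test9",
    display_name="test9",
    dataset=types.EvaluationDataset(
        candidate_name="candidate_1",
        eval_dataset_df=input_df,
    ),
    dest=GCS_DEST,
    metrics=[GENERAL_QUALITY_METRIC],
)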


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 72 additions & 1 deletion
@@ -223,7 +223,9 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of UUID generation mismatch.
+# # Test fails in replay mode because of UUID generation mismatch.
+# import pandas as pd
+
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -319,6 +321,75 @@ def test_create_eval_run_with_inference_configs(client):
 #     assert evaluation_run.error is None


+# def test_create_eval_run_data_source_evaluation_dataset_with_prompt_template_data(
+#     client,
+# ):
+#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset and inference_configs."""
+#     input_df = pd.DataFrame(
+#         {
+#             "prompt": ["prompt1", "prompt2"],
+#             "reference": ["reference1", "reference2"],
+#             "response": ["response1", "response2"],
+#             "context": ["context1", "context2"],
+#             "conversation_history": ["history1", "history2"],
+#         }
+#     )
+#     evaluation_run = client.evals.create_evaluation_run(
+#         name="test9",
+#         display_name="test9",
+#         dataset=types.EvaluationDataset(
+#             candidate_name="candidate_1",
+#             eval_dataset_df=input_df,
+#         ),
+#         dest=GCS_DEST,
+#         metrics=[GENERAL_QUALITY_METRIC],
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.display_name == "test9"
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+#     # Check evaluation set
+#     assert evaluation_run.data_source.evaluation_set
+#     eval_set = client.evals.get_evaluation_set(
+#         name=evaluation_run.data_source.evaluation_set
+#     )
+#     assert len(eval_set.evaluation_items) == 2
+#     # Check evaluation items
+#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
+#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "prompt"
+#             ]
+#             == genai_types.Content(
+#                 parts=[genai_types.Part(text=input_df.iloc[i]["prompt"])],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "context"
+#             ]
+#             == genai_types.Content(
+#                 parts=[genai_types.Part(text=input_df.iloc[i]["context"])],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.prompt.prompt_template_data.values[
+#                 "conversation_history"
+#             ]
+#             == genai_types.Content(
+#                 parts=[genai_types.Part(text=input_df.iloc[i]["conversation_history"])],
+#                 role="user",
+#             )
+#         )
+#         assert (
+#             eval_item.evaluation_request.candidate_responses[0].text
+#             == input_df.iloc[i]["response"]
+#         )
+#     assert evaluation_run.error is None
 pytest_plugins = ("pytest_asyncio",)


vertexai/_genai/_evals_common.py

Lines changed: 105 additions & 6 deletions
@@ -258,6 +258,78 @@ def _extract_contents_for_inference(
     return request_dict_or_raw_text


+def _resolve_dataset(
+    dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
+    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+) -> types.EvaluationRunDataSource:
+    """Resolves dataset for the evaluation run."""
+    if isinstance(dataset, types.EvaluationDataset):
+        candidate_name = _get_candidate_name(dataset, agent_info_pydantic)
+        eval_set = _create_evaluation_set_from_dataframe(
+            self._api_client, dest, dataset.eval_dataset_df, candidate_name
+        )
+        dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name)
+    return dataset
+
+
+def _resolve_inference_configs(
+    inference_configs: Optional[
+        dict[str, types.EvaluationRunInferenceConfigOrDict]
+    ] = None,
+    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]:
+    """Resolves inference configs for the evaluation run."""
+    if agent_info_pydantic and agent_info_pydantic.name:
+        inference_configs = {}
+        inference_configs[agent_info_pydantic.name] = (
+            types.EvaluationRunInferenceConfig(
+                agent_config=types.EvaluationRunAgentConfig(
+                    developer_instruction=genai_types.Content(
+                        parts=[
+                            genai_types.Part(text=agent_info_pydantic.instruction)
+                        ]
+                    ),
+                    tools=agent_info_pydantic.tool_declarations,
+                )
+            )
+        )
+    return inference_configs
+
+
+def _add_evaluation_run_labels(
+    labels: Optional[dict[str, str]] = None,
+    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+):
+    """Adds labels to the evaluation run."""
+    labels = labels or {}
+    if agent_info_pydantic and agent_info_pydantic.agent_resource_name:
+        labels["vertex-ai-evaluation-agent-engine-id"] = (
+            agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[
+                -1
+            ]
+        )
+    return labels
+
+
+def _get_candidate_name(
+    dataset: types.EvaluationDataset,
+    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+) -> Optional[str]:
+    """Internal helper to get candidate name."""
+    if agent_info_pydantic is not None and (
+        dataset.candidate_name
+        and agent_info_pydantic
+        and agent_info_pydantic.name
+        and dataset.candidate_name != agent_info_pydantic.name
+    ):
+        logger.warning(
+            "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+        )
+    elif dataset.candidate_name is None and agent_info_pydantic:
+        return agent_info_pydantic.name
+    return dataset.candidate_name or None
+
+
 def _execute_inference_concurrently(
     api_client: BaseApiClient,
     prompt_dataset: pd.DataFrame,
@@ -1858,6 +1930,9 @@ def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]:
             result[key] = value
         elif isinstance(value, (list, tuple)):
             result[key] = [_object_to_dict(item) for item in value]
+        # Add recursive handling for dictionaries
+        elif isinstance(value, dict):
+            result[key] = {k: _object_to_dict(v) for k, v in value.items()}
         elif isinstance(value, bytes):
             result[key] = base64.b64encode(value).decode("utf-8")
         elif hasattr(value, "__dict__"):  # Nested object
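
The new dict branch recurses into mapping values the same way the existing list/tuple branch does. A simplified, standalone sketch of that recursion pattern (object_to_dict_stub is a hypothetical stand-in, not the SDK's _object_to_dict, which handles additional cases):

import base64

def object_to_dict_stub(obj):
    # Recurse into sequences, mappings, and plain objects; encode bytes as base64.
    if isinstance(obj, (list, tuple)):
        return [object_to_dict_stub(item) for item in obj]
    if isinstance(obj, dict):
        return {k: object_to_dict_stub(v) for k, v in obj.items()}
    if isinstance(obj, bytes):
        return base64.b64encode(obj).decode("utf-8")
    if hasattr(obj, "__dict__"):
        return {k: object_to_dict_stub(v) for k, v in vars(obj).items()}
    return obj

print(object_to_dict_stub({"outer": {"inner": b"raw-bytes"}}))
# -> {'outer': {'inner': 'cmF3LWJ5dGVz'}}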
@@ -1871,9 +1946,15 @@ def _create_evaluation_set_from_dataframe(
     api_client: BaseApiClient,
     gcs_dest_prefix: str,
     eval_df: pd.DataFrame,
-    candidate_name: Optional[str] = None,
 ) -> Union[types.EvaluationSet, Any]:
     """Converts a dataframe to an EvaluationSet."""
+    if dataset.eval_dataset_df is None:
+        raise ValueError(
+            "EvaluationDataset must have eval_dataset_df populated."
+        )
+    candidate_name = _evals_common._get_candidate_name(
+        dataset, agent_info_pydantic
+    )
     eval_item_requests = []
     for _, row in eval_df.iterrows():
         intermediate_events = []
@@ -1885,13 +1966,31 @@ def _create_evaluation_set_from_dataframe(
             for event in row[_evals_constant.INTERMEDIATE_EVENTS]:
                 if CONTENT in event:
                     intermediate_events.append(event[CONTENT])
+        if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
+            values = {}
+            if _evals_constant.CONTEXT in row:
+                values[_evals_constant.CONTEXT] = genai_types.Content(
+                    parts=[genai_types.Part(text=row[_evals_constant.CONTEXT])],
+                    role=_evals_constant.USER_AUTHOR,
+                )
+            if _evals_constant.HISTORY in row:
+                values[_evals_constant.HISTORY] = genai_types.Content(
+                    parts=[genai_types.Part(text=row[_evals_constant.HISTORY])],
+                    role=_evals_constant.USER_AUTHOR,
+                )
+            if _evals_constant.PROMPT in row:
+                values[_evals_constant.PROMPT] = genai_types.Content(
+                    parts=[genai_types.Part(text=row[_evals_constant.PROMPT])],
+                    role=_evals_constant.USER_AUTHOR,
+                )
+            prompt = types.EvaluationPrompt(
+                prompt_template_data=types.PromptTemplateData(values=values)
+            )
+        elif _evals_constant.PROMPT in row:
+            prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
         eval_item_requests.append(
             types.EvaluationItemRequest(
-                prompt=(
-                    types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
-                    if _evals_constant.PROMPT in row
-                    else None
-                ),
+                prompt=prompt if prompt else None,
                 golden_response=(
                     types.CandidateResponse(text=row[_evals_constant.REFERENCE])
                     if _evals_constant.REFERENCE in row
vertexai/_genai/_evals_constant.py

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,7 @@
 CONTENT = "content"
 PARTS = "parts"
 USER_AUTHOR = "user"
+HISTORY = "conversation_history"

 COMMON_DATASET_COLUMNS = frozenset(
     {
@@ -61,5 +62,6 @@
         REFERENCE,
         SESSION_INPUT,
         CONTEXT,
+        HISTORY,
     }
 )
