diff --git a/aobench/Makefile b/aobench/Makefile
index e130ae80..8172b16e 100644
--- a/aobench/Makefile
+++ b/aobench/Makefile
@@ -3,6 +3,11 @@ clean:
 	rm -rf ./scenario-server/__pycache__
 	rm -rf ./scenario-server/src/scenario_server/__pycache__
+	rm -rf ./scenario-server/src/scenario_server/handlers/__pycache__
+	rm -rf ./scenario-server/src/scenario_server/handlers/aob/__pycache__
+	rm -rf ./scenario-server/src/scenario_server/handlers/aob_iot/__pycache__
+	rm -rf ./scenario-server/src/scenario_server/handlers/aob_tsfm/__pycache__
+	rm -rf ./scenario-server/src/scenario_server/handlers/aob_workorders/__pycache__
 	rm -rf ./src/scenario-server/__pycache__
 
 
 
diff --git a/aobench/pyproject.toml b/aobench/pyproject.toml
index a141ce91..daffea04 100644
--- a/aobench/pyproject.toml
+++ b/aobench/pyproject.toml
@@ -5,6 +5,7 @@ description = "Asset operations benchmarking"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "huggingface-hub>=0.35.3",
     "scenario-client",
     "scenario-server",
 ]
diff --git a/aobench/scenario-server/src/scenario_server/app.py b/aobench/scenario-server/src/scenario_server/app.py
index 7fd3a4b5..7d8657c4 100644
--- a/aobench/scenario-server/src/scenario_server/app.py
+++ b/aobench/scenario-server/src/scenario_server/app.py
@@ -14,6 +14,10 @@
     scenario_types,
     set_tracking_uri,
 )
+from scenario_server.handlers.aob.aob import AOBScenarios
+from scenario_server.handlers.aob_iot.aob_iot import AOBIoTScenarios
+from scenario_server.handlers.aob_tsfm.aob_tsfm import AOBTSFMScenarios
+from scenario_server.handlers.aob_workorders.aob_workorders import AOBWorkOrderScenarios
 
 
 logger: logging.Logger = logging.getLogger("scenario-server")
@@ -52,7 +56,14 @@ def get_app(
     register_scenario_handlers(handlers=handlers)
 
     if include_default_handlers:
-        register_scenario_handlers(handlers=[])
+        register_scenario_handlers(
+            handlers=[
+                AOBScenarios,
+                AOBIoTScenarios,
+                AOBTSFMScenarios,
+                AOBWorkOrderScenarios,
+            ]
+        )
 
     app = Litestar(
         debug=True,
diff --git a/aobench/scenario-server/src/scenario_server/entities.py b/aobench/scenario-server/src/scenario_server/entities.py
index 0482016c..2138dfd7 100644
--- a/aobench/scenario-server/src/scenario_server/entities.py
+++ b/aobench/scenario-server/src/scenario_server/entities.py
@@ -12,6 +12,7 @@ class ScenarioType:
 class Scenario:
     id: str
     query: str
+    metadata: dict
 
 
 @dataclass
diff --git a/aobench/scenario-server/src/scenario_server/handlers/aob/aob.py b/aobench/scenario-server/src/scenario_server/handlers/aob/aob.py
new file mode 100644
index 00000000..5e9025ba
--- /dev/null
+++ b/aobench/scenario-server/src/scenario_server/handlers/aob/aob.py
@@ -0,0 +1,156 @@
+import json
+import logging
+
+from huggingface_hub import hf_hub_download
+from scenario_server.entities import (
+    Scenario,
+    ScenarioType,
+    SubmissionAnswer,
+    SubmissionScore,
+)
+from scenario_server.grading import evaluation_agent
+from scenario_server.handlers.scenario_handler import ScenarioHandler
+
+logger: logging.Logger = logging.getLogger("scenario-server")
+
+HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
+HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"
+
+
+class AOBScenarios(ScenarioHandler):
+    id = "d3bec9b0-59b4-4a2f-9497-28cb1eed1c80"
+    title = "Asset Operations Bench - General"
+    description = "Human-authored evaluation prompts for industrial asset agents."
+
+    def __init__(self):
+        self.scenario_data = dict()
+        try:
+            cache: str = hf_hub_download(
+                repo_id=HUGGINGFACE_REPO,
+                filename=HUGGINGFACE_DATA,
+                repo_type="dataset",
+            )
+
+            with open(cache, "r") as f:
+                scenario_data = [json.loads(line) for line in f]
+
+            for sd in scenario_data:
+                if ("type" in sd and sd["type"].lower() == "") or "type" not in sd:
+                    self.scenario_data[str(sd["id"])] = sd
+
+        except Exception as e:
+            logger.error(f"failed to init AOBScenarios: {e=}")
+
+    def _grade_answer(self, entry_id, answer) -> SubmissionScore:
+        try:
+            unwrap = json.loads(answer)
+
+            c = self.scenario_data[entry_id]["characteristic_form"]
+            q = self.scenario_data[entry_id]["text"]
+            r = unwrap["result"]
+            t = unwrap["trace"]
+
+            result, details = evaluation_agent(
+                actual=r,
+                charactistic=c,
+                query=q,
+                trace=t,
+            )
+
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=result,
+                details=details,
+            )
+        except Exception as e:
+            logger.error(f"failed to grade {entry_id=} : {e=}")
+            logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=False,
+                details=[{"error": f"failed to grade scenario id: {entry_id}"}],
+            )
+
+    def scenario_type(self) -> ScenarioType:
+        return ScenarioType(id=self.id, title=self.title, description=self.description)
+
+    def fetch_scenarios(self) -> list[Scenario]:
+        scenarios = []
+
+        for k, v in self.scenario_data.items():
+            try:
+                metadata = dict()
+
+                if "type" in v:
+                    metadata["type"] = v["type"]
+
+                if "category" in v:
+                    metadata["category"] = v["category"]
+
+                scenarios.append(
+                    Scenario(
+                        id=str(k),
+                        query=v["text"],
+                        metadata=metadata,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"failed to process {k}, {v} : {e=}")
+
+        return scenarios
+
+    async def grade_responses(
+        self, submission: list[SubmissionAnswer]
+    ) -> list[SubmissionScore]:
+
+        grade = []
+        for entry in submission:
+            try:
+                entry_id: str = entry.scenario_id
+            except Exception as e:
+                logger.error(f"missing scenario id: {entry=}")
+                continue
+
+            if entry_id not in self.scenario_data:
+                grade.append(
+                    SubmissionScore(
+                        scenario_id=entry_id,
+                        correct=False,
+                        details=[{"error": f"unknown scenario id: {entry_id}"}],
+                    )
+                )
+                continue
+
+            g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
+            grade.append(g)
+
+        return grade
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    aobs = AOBScenarios()
+    submission = [
+        SubmissionAnswer(
+            scenario_id="Q.S5",
+            answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
+        ),
+        SubmissionAnswer(
+            scenario_id="501",
+            answer="",
+        ),
+        SubmissionAnswer(
+            scenario_id="501",
+            answer=json.dumps(
+                {
+                    "trace": "query database for iot data",
+                    "result": [],
+                }
+            ),
+        ),
+    ]
+    grade: list[SubmissionScore] = asyncio.run(
+        aobs.grade_responses(submission=submission)
+    )
+    print(f"{grade=}")
diff --git a/aobench/scenario-server/src/scenario_server/handlers/aob_iot/aob_iot.py b/aobench/scenario-server/src/scenario_server/handlers/aob_iot/aob_iot.py
new file mode 100644
index 00000000..121abd15
--- /dev/null
+++ b/aobench/scenario-server/src/scenario_server/handlers/aob_iot/aob_iot.py
@@ -0,0 +1,153 @@
+import json
+import logging
+
+from huggingface_hub import hf_hub_download
+from scenario_server.entities import (
+    Scenario,
+    ScenarioType,
+    SubmissionAnswer,
+    SubmissionScore,
+)
+from scenario_server.grading import evaluation_agent
+from scenario_server.handlers.scenario_handler import ScenarioHandler
+
+logger: logging.Logger = logging.getLogger("scenario-server")
+
+HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
+HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"
+
+
+class AOBIoTScenarios(ScenarioHandler):
+    id = "b3aa206a-f7dc-43c9-a1f4-dcf984417487"
+    title = "Asset Operations Bench - IoT"
+    description = "Human-authored evaluation prompts for industrial asset agents."
+
+    def __init__(self):
+        self.scenario_data = dict()
+        try:
+            cache: str = hf_hub_download(
+                repo_id=HUGGINGFACE_REPO,
+                filename=HUGGINGFACE_DATA,
+                repo_type="dataset",
+            )
+
+            with open(cache, "r") as f:
+                scenario_data = [json.loads(line) for line in f]
+
+            for sd in scenario_data:
+                if "type" in sd and sd["type"].lower() == "iot":
+                    self.scenario_data[str(sd["id"])] = sd
+
+        except Exception as e:
+            logger.error(f"failed to init AOBIoTScenarios: {e=}")
+
+    def _grade_answer(self, entry_id, answer) -> SubmissionScore:
+        try:
+            unwrap = json.loads(answer)
+
+            c = self.scenario_data[entry_id]["characteristic_form"]
+            q = self.scenario_data[entry_id]["text"]
+            r = unwrap["result"]
+            t = unwrap["trace"]
+
+            result, details = evaluation_agent(
+                actual=r,
+                charactistic=c,
+                query=q,
+                trace=t,
+            )
+
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=result,
+                details=details,
+            )
+        except Exception as e:
+            logger.error(f"failed to grade {entry_id=} : {e=}")
+            logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=False,
+                details=[{"error": f"failed to grade scenario id: {entry_id}"}],
+            )
+
+    def scenario_type(self) -> ScenarioType:
+        return ScenarioType(id=self.id, title=self.title, description=self.description)
+
+    def fetch_scenarios(self) -> list[Scenario]:
+        scenarios = []
+
+        for k, v in self.scenario_data.items():
+            try:
+                metadata = dict()
+
+                if "category" in v:
+                    metadata["category"] = v["category"]
+
+                scenarios.append(
+                    Scenario(
+                        id=str(k),
+                        query=v["text"],
+                        metadata=metadata,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"failed to process {k}, {v} : {e=}")
+
+        return scenarios
+
+    async def grade_responses(
+        self, submission: list[SubmissionAnswer]
+    ) -> list[SubmissionScore]:
+
+        grade = []
+        for entry in submission:
+            try:
+                entry_id: str = entry.scenario_id
+            except Exception as e:
+                logger.error(f"missing scenario id: {entry=}")
+                continue
+
+            if entry_id not in self.scenario_data:
+                grade.append(
+                    SubmissionScore(
+                        scenario_id=entry_id,
+                        correct=False,
+                        details=[{"error": f"unknown scenario id: {entry_id}"}],
+                    )
+                )
+                continue
+
+            g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
+            grade.append(g)
+
+        return grade
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    aobs = AOBIoTScenarios()
+    submission: list[SubmissionAnswer] = [
+        SubmissionAnswer(
+            scenario_id="Q.S5",
+            answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
+        ),
+        SubmissionAnswer(
+            scenario_id="2",
+            answer="",
+        ),
+        SubmissionAnswer(
+            scenario_id="2",
+            answer=json.dumps(
+                {
+                    "trace": "query database for iot sites",
+                    "result": ["Downtown", "Uptown"],
+                }
+            ),
+        ),
+    ]
+    grade: list[SubmissionScore] = asyncio.run(
+        aobs.grade_responses(submission=submission)
+    )
+    print(f"{grade=}")
diff --git a/aobench/scenario-server/src/scenario_server/handlers/aob_tsfm/aob_tsfm.py b/aobench/scenario-server/src/scenario_server/handlers/aob_tsfm/aob_tsfm.py
new file mode 100644
index 00000000..81762d07
--- /dev/null
+++ b/aobench/scenario-server/src/scenario_server/handlers/aob_tsfm/aob_tsfm.py
@@ -0,0 +1,153 @@
+import json
+import logging
+
+from huggingface_hub import hf_hub_download
+from scenario_server.entities import (
+    Scenario,
+    ScenarioType,
+    SubmissionAnswer,
+    SubmissionScore,
+)
+from scenario_server.grading import evaluation_agent
+from scenario_server.handlers.scenario_handler import ScenarioHandler
+
+logger: logging.Logger = logging.getLogger("scenario-server")
+
+HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
+HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"
+
+
+class AOBTSFMScenarios(ScenarioHandler):
+    id = "13aab653-66fe-4fe6-84d8-89f1b18eede3"
+    title = "Asset Operations Bench - TSFM"
+    description = "Human-authored evaluation prompts for industrial asset agents."
+
+    def __init__(self):
+        self.scenario_data = dict()
+        try:
+            cache: str = hf_hub_download(
+                repo_id=HUGGINGFACE_REPO,
+                filename=HUGGINGFACE_DATA,
+                repo_type="dataset",
+            )
+
+            with open(cache, "r") as f:
+                scenario_data = [json.loads(line) for line in f]
+
+            for sd in scenario_data:
+                if "type" in sd and sd["type"].lower() == "tsfm":
+                    self.scenario_data[str(sd["id"])] = sd
+
+        except Exception as e:
+            logger.error(f"failed to init AOBTSFMScenarios: {e=}")
+
+    def _grade_answer(self, entry_id, answer) -> SubmissionScore:
+        try:
+            unwrap = json.loads(answer)
+
+            c = self.scenario_data[entry_id]["characteristic_form"]
+            q = self.scenario_data[entry_id]["text"]
+            r = unwrap["result"]
+            t = unwrap["trace"]
+
+            result, details = evaluation_agent(
+                actual=r,
+                charactistic=c,
+                query=q,
+                trace=t,
+            )
+
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=result,
+                details=details,
+            )
+        except Exception as e:
+            logger.error(f"failed to grade {entry_id=} : {e=}")
+            logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=False,
+                details=[{"error": f"failed to grade scenario id: {entry_id}"}],
+            )
+
+    def scenario_type(self) -> ScenarioType:
+        return ScenarioType(id=self.id, title=self.title, description=self.description)
+
+    def fetch_scenarios(self) -> list[Scenario]:
+        scenarios = []
+
+        for k, v in self.scenario_data.items():
+            try:
+                metadata = dict()
+
+                if "category" in v:
+                    metadata["category"] = v["category"]
+
+                scenarios.append(
+                    Scenario(
+                        id=str(k),
+                        query=v["text"],
+                        metadata=metadata,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"failed to process {k}, {v} : {e=}")
+
+        return scenarios
+
+    async def grade_responses(
+        self, submission: list[SubmissionAnswer]
+    ) -> list[SubmissionScore]:
+
+        grade = []
+        for entry in submission:
+            try:
+                entry_id: str = entry.scenario_id
+            except Exception as e:
+                logger.error(f"missing scenario id: {entry=}")
+                continue
+
+            if entry_id not in self.scenario_data:
+                grade.append(
+                    SubmissionScore(
+                        scenario_id=entry_id,
+                        correct=False,
+                        details=[{"error": f"unknown scenario id: {entry_id}"}],
+                    )
+                )
+                continue
+
+            g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
+            grade.append(g)
+
+        return grade
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    aobs = AOBTSFMScenarios()
+    submission: list[SubmissionAnswer] = [
+        SubmissionAnswer(
+            scenario_id="Q.S5",
+            answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
+        ),
+        SubmissionAnswer(
+            scenario_id="201",
+            answer="",
+        ),
+        SubmissionAnswer(
+            scenario_id="201",
+            answer=json.dumps(
+                {
+                    "trace": "Fetch csv data. Use 1 percent of data to fine tune model.",
+                    "result": ["saved_model_file=/tmp/model"],
+                }
+            ),
+        ),
+    ]
+    grade: list[SubmissionScore] = asyncio.run(
+        aobs.grade_responses(submission=submission)
+    )
+    print(f"{grade=}")
diff --git a/aobench/scenario-server/src/scenario_server/handlers/aob_workorders/aob_workorders.py b/aobench/scenario-server/src/scenario_server/handlers/aob_workorders/aob_workorders.py
new file mode 100644
index 00000000..e9ef3cc1
--- /dev/null
+++ b/aobench/scenario-server/src/scenario_server/handlers/aob_workorders/aob_workorders.py
@@ -0,0 +1,153 @@
+import json
+import logging
+
+from huggingface_hub import hf_hub_download
+from scenario_server.entities import (
+    Scenario,
+    ScenarioType,
+    SubmissionAnswer,
+    SubmissionScore,
+)
+from scenario_server.grading import evaluation_agent
+from scenario_server.handlers.scenario_handler import ScenarioHandler
+
+logger: logging.Logger = logging.getLogger("scenario-server")
+
+HUGGINGFACE_REPO = "ibm-research/AssetOpsBench"
+HUGGINGFACE_DATA = "data/scenarios/all_utterance.jsonl"
+
+
+class AOBWorkOrderScenarios(ScenarioHandler):
+    id = "4021467f-363b-41d2-8c62-f6aa738b01b7"
+    title = "Asset Operations Bench - Workorders"
+    description = "Human-authored evaluation prompts for industrial asset agents."
+
+    def __init__(self):
+        self.scenario_data = dict()
+        try:
+            cache: str = hf_hub_download(
+                repo_id=HUGGINGFACE_REPO,
+                filename=HUGGINGFACE_DATA,
+                repo_type="dataset",
+            )
+
+            with open(cache, "r") as f:
+                scenario_data = [json.loads(line) for line in f]
+
+            for sd in scenario_data:
+                if "type" in sd and sd["type"].lower() == "workorder":
+                    self.scenario_data[str(sd["id"])] = sd
+
+        except Exception as e:
+            logger.error(f"failed to init AOBWorkOrderScenarios: {e=}")
+
+    def _grade_answer(self, entry_id, answer) -> SubmissionScore:
+        try:
+            unwrap = json.loads(answer)
+
+            c = self.scenario_data[entry_id]["characteristic_form"]
+            q = self.scenario_data[entry_id]["text"]
+            r = unwrap["result"]
+            t = unwrap["trace"]
+
+            result, details = evaluation_agent(
+                actual=r,
+                charactistic=c,
+                query=q,
+                trace=t,
+            )
+
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=result,
+                details=details,
+            )
+        except Exception as e:
+            logger.error(f"failed to grade {entry_id=} : {e=}")
+            logger.debug(f"{entry_id=} / {answer=} / {self.scenario_data[entry_id]}")
+            return SubmissionScore(
+                scenario_id=entry_id,
+                correct=False,
+                details=[{"error": f"failed to grade scenario id: {entry_id}"}],
+            )
+
+    def scenario_type(self) -> ScenarioType:
+        return ScenarioType(id=self.id, title=self.title, description=self.description)
+
+    def fetch_scenarios(self) -> list[Scenario]:
+        scenarios = []
+
+        for k, v in self.scenario_data.items():
+            try:
+                metadata = dict()
+
+                if "category" in v:
+                    metadata["category"] = v["category"]
+
+                scenarios.append(
+                    Scenario(
+                        id=str(k),
+                        query=v["text"],
+                        metadata=metadata,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"failed to process {k}, {v} : {e=}")
+
+        return scenarios
+
+    async def grade_responses(
+        self, submission: list[SubmissionAnswer]
+    ) -> list[SubmissionScore]:
+
+        grade = []
+        for entry in submission:
+            try:
+                entry_id: str = entry.scenario_id
+            except Exception as e:
+                logger.error(f"missing scenario id: {entry=}")
+                continue
+
+            if entry_id not in self.scenario_data:
+                grade.append(
+                    SubmissionScore(
+                        scenario_id=entry_id,
+                        correct=False,
+                        details=[{"error": f"unknown scenario id: {entry_id}"}],
+                    )
+                )
+                continue
+
+            g: SubmissionScore = self._grade_answer(entry_id, entry.answer)
+            grade.append(g)
+
+        return grade
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    aobs = AOBWorkOrderScenarios()
+    submission: list[SubmissionAnswer] = [
+        SubmissionAnswer(
+            scenario_id="Q.S5",
+            answer='[{"scenario_id": "Q.S5.0", "answer": ""}]',
+        ),
+        SubmissionAnswer(
+            scenario_id="401",
+            answer="",
+        ),
+        SubmissionAnswer(
+            scenario_id="401",
+            answer=json.dumps(
+                {
+                    "trace": "find workorders for equipment labeled as CWC04013 in 2017",
+                    "result": [],
+                }
+            ),
+        ),
+    ]
+    grade: list[SubmissionScore] = asyncio.run(
+        aobs.grade_responses(submission=submission)
+    )
+    print(f"{grade=}")
diff --git a/aobench/uv.lock b/aobench/uv.lock
index afac833a..a2ae7970 100644
--- a/aobench/uv.lock
+++ b/aobench/uv.lock
@@ -128,12 +128,14 @@ name = "assetopsbench"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "huggingface-hub" },
     { name = "scenario-client" },
     { name = "scenario-server" },
 ]
 
 [package.metadata]
 requires-dist = [
+    { name = "huggingface-hub", specifier = ">=0.35.3" },
     { name = "scenario-client", editable = "scenario-client" },
     { name = "scenario-server", editable = "scenario-server" },
 ]