-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscore.py
More file actions
228 lines (190 loc) · 9.81 KB
/
score.py
File metadata and controls
228 lines (190 loc) · 9.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import os
import json
import argparse
from dataclasses import dataclass
# Thunderbird task-id translation table ("from our keys to theirs"): each key
# is a task id as it appears in the human-task files, each value is the id
# under which the same task is looked up in the result trajectories.
THUNDERBIRD_MAPPING = {
    "a10b69e1-6034-4a2b-93e1-571d45194f75": "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
    "dd84e895-72fd-4023-a336-97689ded257c": "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5",
    "9bc3cc16-074a-45ac-9bdc-b2a362e1daf3": "06fe7178-4491-4589-810f-2e2bc9502122",
    "15c3b339-88f7-4a86-ab16-e71c58dcb01e": "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
    "f201fbc3-44e6-46fc-bcaa-432f9815454c": "030eeff7-b492-4218-b312-701ec99ee0cc",
    "3f49d2cc-f400-4e7d-90cc-9b18e401cc31": "480bcfea-d68f-4aaa-a0a9-2589ef319381",
    "3f28fe4f-5d9d-4994-a456-efd78cfae1a3": "6766f2b8-8a72-417f-a9e5-56fcaa735837",
    "9b7bc335-06b5-4cd3-9119-1a649c478509": "35253b65-1c19-4304-8aa4-6884b8218fc0",
    "a1af9f1c-50d5-4bc3-a51e-4d9b425ff638": "99146c54-4f37-4ab8-9327-5f3291665e1e",
    "7b1e1ff9-bb85-49be-b01d-d6424be18cd0": "12086550-11c0-466b-b367-1d9e75b3910e",
    "10a730d5-d414-4b40-b479-684bed1ae522": "94760984-3ff5-41ee-8347-cf1af709fea0",
    "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397": "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
    "08c73485-7c6d-4681-999d-919f5c32dcfa": "c9e7eaf2-b1a1-4efc-a982-721972fa9f02",
    "d38192b0-17dc-4e1d-99c3-786d0117de77": "d088f539-cab4-4f9a-ac92-9999fc3a656e",
    "5203d847-2572-4150-912a-03f062254390": "e1e75309-3ddb-4d09-92ec-de869c928143",
}
# Directory names that are recognised as applications when scanning the human
# tasks and the result tree; any other directory entry is ignored.
APPLICATIONS = {
    "libreoffice_calc",
    "libreoffice_writer",
    "gimp",
    "libreoffice_impress",
    "vlc",
    "chrome",
    "vs_code",
    "thunderbird",
    "os",
    "multi_apps",
}

# Directory that is scanned for the human ground-truth task files.
HUMAN_PATH = "./"

# Placeholder application name given to UI-TARS tasks, which are keyed by
# their query text instead of an (application, id) pair.
EMPTY = "EMPTY"  # Used for UITARS
@dataclass(frozen=True)
class TaskMeta:
    """Identifier of a task: the application it belongs to plus its task id.

    The dataclass is frozen, which makes instances hashable so they can be
    used as dictionary keys. UI-TARS tasks use the ``EMPTY`` placeholder as
    their application and carry the query text in ``task_id``.
    """

    application: str
    task_id: str

    def __str__(self):
        """Return ``application/task_id``, or a truncated id for ``EMPTY`` tasks."""
        # Compare by value, not identity: `is` on strings only works when both
        # sides happen to be the very same object.
        if self.application != EMPTY:
            return f"{self.application}/{self.task_id}"
        # No application to show; cap the (potentially long) id at 100 chars.
        return f"{self.task_id[:100]}..."
@dataclass
class humanTask:
    """Human ground truth for one task, stored at two granularities."""

    # Ground-truth actions listed individually (the "single-action" entry of
    # the task file); its length is the expected step count in single mode.
    single: list[str]
    # The "grouped-action" entry of the task file; its length is the expected
    # step count in grouped mode.
    grouped: list[list[str]]
@dataclass
class TaskTrajectory:
    """Summary of one recorded agent run on a task."""

    # Number of steps recorded for the run.
    num_steps: int
    # Result reported for the run, as a float.
    task_result: float
@dataclass
class TaskScore:
    """Score components computed for a single task."""

    # Reward component (WES+).
    wes_plus: float
    # Penalty component (WES-).
    wes_minus: float
    # Combined score.
    wes: float
def compute_task_score(task_meta: TaskMeta, human_task: humanTask, task_trajectory: TaskTrajectory, max_steps: int, single: bool) -> TaskScore:
    """Score one trajectory against the human ground truth of the same task.

    Args:
        task_meta: Identifier of the task; only used in the warning message.
        human_task: Human ground truth that supplies the expected step count.
        task_trajectory: Step count and task result of the run being scored.
        max_steps: Step budget that normalises the penalty term; must be
            non-zero.
        single: When true the expected step count is the number of single
            actions, otherwise the number of grouped actions.

    Returns:
        A ``TaskScore`` where ``wes_plus = result * expected / taken``,
        ``wes_minus = -((1 - result) * taken / max_steps)`` and ``wes`` is
        their sum. ``wes_plus`` is ``0.0`` for a trajectory with no steps.
    """
    steps_taken = task_trajectory.num_steps
    result = task_trajectory.task_result
    expected_steps = len(human_task.single) if single else len(human_task.grouped)

    # Only warn in grouped mode when a (near-)successful run was shorter than
    # the human reference.
    if steps_taken < expected_steps and result > 0.9 and not single:
        print(f"WARN: Task '{task_meta}' successfully completed in {steps_taken} instead of expected {expected_steps}")

    if steps_taken > 0:
        wes_plus = result * expected_steps / steps_taken
    else:
        # An empty trajectory would divide by zero; it earns no reward.
        wes_plus = 0.0
    wes_minus = - ((1 - result) * steps_taken / max_steps)
    return TaskScore(wes_plus, wes_minus, wes_plus + wes_minus)
def get_human_tasks(human_path: str, use_query: bool = False) -> dict[TaskMeta, humanTask]:
    """
    Load human tasks from the specified path.

    Every sub-directory of ``human_path`` whose name is in ``APPLICATIONS`` is
    scanned for ``*.json`` task files. A file that is not valid JSON or lacks
    one of the required entries is reported and skipped.

    Args:
        human_path: Directory containing one sub-directory per application.
        use_query: When true (UI-TARS), tasks are keyed by their
            ``"instruction"`` text under the ``EMPTY`` application instead of
            by ``(application, "id")``.

    Returns:
        A mapping from ``TaskMeta`` to the task's ``humanTask`` ground truth.
    """
    human_tasks = {}
    for application in os.listdir(human_path):
        if application not in APPLICATIONS:
            continue  # skip for now
        app_path = os.path.join(human_path, application)
        if not os.path.isdir(app_path):
            continue
        for task_file in os.listdir(app_path):
            if not task_file.endswith(".json"):
                continue
            try:
                with open(os.path.join(app_path, task_file), 'r', encoding='utf-8') as f:
                    task_data = json.load(f)
                task_id = task_data["instruction"] if use_query else task_data["id"]  # for UITARS
                ground_truth = task_data["human-ground-truth"]
                single = ground_truth["single-action"]
                grouped = ground_truth["grouped-action"]
            except (json.JSONDecodeError, KeyError, TypeError) as e:
                # Malformed or incomplete task file: report it and move on.
                print(f"Skipping task {task_file} due to exception: {e}")
                continue
            # UI-TARS tasks carry no application, only the query text.
            key_application = EMPTY if use_query else application
            human_tasks[TaskMeta(key_application, task_id)] = humanTask(single, grouped)
    return human_tasks
def _count_jsonl_steps(file_path: str) -> int:
    """Return the number of JSON records in a ``.jsonl`` trajectory file."""
    num_steps = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json.loads(line)  # still fail loudly on a malformed step record
            num_steps += 1
    return num_steps


def _read_task_result(result_file_path: str) -> float:
    """Parse the first line of a result file: a float, ``True`` or ``False``."""
    with open(result_file_path, 'r', encoding='utf-8') as f:
        top_line = f.readline().strip()
    try:
        return float(top_line)
    except ValueError as err:
        if top_line == "True":
            return 1.0
        if top_line == "False":
            return 0.0
        raise ValueError(f"Unrecognised task result {top_line!r} in {result_file_path}") from err


def get_task_trajectories(result_path: str) -> tuple[dict[TaskMeta, TaskTrajectory], int]:
    """
    Load task trajectories from the specified result path.

    The expected layout is ``result_path/<application>/<task id>/`` where the
    task folder holds a ``*.jsonl`` trajectory (one step per line) and a
    ``result.txt`` whose first line is the task result. Applications not in
    ``APPLICATIONS`` are ignored. A task whose ``result.txt`` is missing is
    reported and left out, but its step count still feeds the maximum.

    Args:
        result_path: Root directory of the results.

    Returns:
        A tuple of the trajectories keyed by ``TaskMeta`` and the largest step
        count seen in any trajectory file.

    Raises:
        ValueError: If a ``result.txt`` first line is neither a float nor
            ``True``/``False``, or a trajectory line is not valid JSON.
    """
    max_steps_empirical = 0
    task_trajectories = {}
    for application in os.listdir(result_path):
        if application not in APPLICATIONS:
            continue
        app_path = os.path.join(result_path, application)
        if not os.path.isdir(app_path):
            continue
        for task_folder in os.listdir(app_path):
            full_task_folder_path = os.path.join(app_path, task_folder)
            if not os.path.isdir(full_task_folder_path):
                continue
            task_id = task_folder  # the folder name is the task id
            task_result_file_path = os.path.join(full_task_folder_path, 'result.txt')
            for file_name in os.listdir(full_task_folder_path):
                if not file_name.endswith(".jsonl"):
                    continue
                num_steps = _count_jsonl_steps(os.path.join(full_task_folder_path, file_name))
                max_steps_empirical = max(max_steps_empirical, num_steps)
                if not os.path.exists(task_result_file_path):
                    print(f"Result file not found for {task_id} in {full_task_folder_path}")
                    continue
                task_result = _read_task_result(task_result_file_path)
                task_trajectories[TaskMeta(application, task_id)] = TaskTrajectory(num_steps, task_result)
    return task_trajectories, max_steps_empirical
def get_task_trajectories_uitars(result_path: str) -> tuple[dict[TaskMeta, TaskTrajectory], int]:
    """Load UI-TARS trajectories, which are keyed by query text.

    Each sub-directory of ``result_path`` is one task run. Its ``data.json``
    holds the list of steps (the score is read from the last step's first
    ``inner_step``) and its ``meta.json`` holds the ``"query"`` used as task
    id. The two files are opened by name, so the result does not depend on the
    order in which the directory happens to be listed, and values from a
    previous folder can never leak into the current one.

    Args:
        result_path: Directory containing one sub-directory per task run.

    Returns:
        A tuple of the trajectories keyed by ``TaskMeta(EMPTY, query)`` and
        the largest step count seen in any ``data.json``.
    """
    max_steps_empirical = 0
    task_trajectories = {}
    for task_folder in os.listdir(result_path):
        full_task_folder_path = os.path.join(result_path, task_folder)
        if not os.path.isdir(full_task_folder_path):
            continue
        data_path = os.path.join(full_task_folder_path, "data.json")
        meta_path = os.path.join(full_task_folder_path, "meta.json")
        has_meta = os.path.isfile(meta_path)

        if not os.path.isfile(data_path):
            if has_meta:
                print(f"data.json not found in {full_task_folder_path}, skipping.")
            continue
        with open(data_path, 'r', encoding='utf-8') as f:
            task_data = json.load(f)
        num_steps = len(task_data)
        # The step count feeds the maximum even when the run cannot be keyed.
        max_steps_empirical = max(max_steps_empirical, num_steps)

        if not has_meta:
            continue  # no query to key the run by
        if not task_data:
            print(f"data.json is empty in {full_task_folder_path}, skipping.")
            continue
        task_result = task_data[-1]["inner_step"][0]["score"]
        with open(meta_path, 'r', encoding='utf-8') as f:
            query = json.load(f)["query"]
        task_trajectories[TaskMeta(EMPTY, query)] = TaskTrajectory(num_steps, task_result)
    return task_trajectories, max_steps_empirical
def get_and_display_aggregated_scores(task_scores: list[TaskScore], num_tasks: int):
    """Average the WES+ and WES- components over ``num_tasks`` tasks.

    The divisor is ``num_tasks`` rather than ``len(task_scores)``, so tasks
    that have no entry in ``task_scores`` contribute zero to both averages.
    Despite its name, this function prints nothing; it only returns values.

    Args:
        task_scores: Per-task scores to aggregate.
        num_tasks: Number of tasks to average over.

    Returns:
        ``(avg_wes_plus, avg_wes_minus)``; ``(0.0, 0.0)`` when ``num_tasks``
        is zero, since there is nothing to average.
    """
    if num_tasks == 0:
        return 0.0, 0.0
    total_wes_plus = sum(score.wes_plus for score in task_scores)
    total_wes_minus = sum(score.wes_minus for score in task_scores)
    return total_wes_plus / num_tasks, total_wes_minus / num_tasks
def main(result_path, max_steps_scoring, is_uitars):
    """Score the trajectories under ``result_path`` and print the results.

    Args:
        result_path: Root directory of the trajectories to score.
        max_steps_scoring: Lower bound for the step budget used in scoring;
            the largest step count found in the trajectories is used instead
            when it is bigger.
        is_uitars: Load human tasks and trajectories in the UI-TARS layout,
            where tasks are keyed by query text.
    """
    human_tasks: dict[TaskMeta, humanTask] = get_human_tasks(HUMAN_PATH, is_uitars)
    num_human_tasks = len(human_tasks)
    print(f"Found {num_human_tasks} human tasks")

    print("Getting task trajectory")
    if is_uitars:
        task_trajectories, max_steps_empirical = get_task_trajectories_uitars(result_path)
    else:
        task_trajectories, max_steps_empirical = get_task_trajectories(result_path)
    print(f"Found {len(task_trajectories)} trajectories.")

    max_steps = max(max_steps_scoring, max_steps_empirical)

    single_scores: list[TaskScore] = []
    grouped_scores: list[TaskScore] = []
    osworld_result = 0
    skipped_tasks = 0
    for task_meta, human_task in human_tasks.items():
        # Thunderbird results may be stored under a different task id; retry
        # the lookup with the translated id before giving up.
        if (
            task_meta not in task_trajectories
            and task_meta.application == "thunderbird"
            and task_meta.task_id in THUNDERBIRD_MAPPING
        ):
            task_meta = TaskMeta("thunderbird", THUNDERBIRD_MAPPING[task_meta.task_id])

        trajectory = task_trajectories.get(task_meta)
        if trajectory is None:
            # Skipped tasks add nothing to the sums but still count in the
            # divisor below, i.e. they score 0.
            print(f"Task '{task_meta}' not found in trajectories, skipping with score of 0.")
            skipped_tasks += 1
            continue

        single_scores.append(compute_task_score(task_meta, human_task, trajectory, max_steps, single=True))
        grouped_scores.append(compute_task_score(task_meta, human_task, trajectory, max_steps, single=False))
        osworld_result += trajectory.task_result

    print("\n=== RESULTS ===")
    print(f"Skipped {skipped_tasks} tasks that were not found in trajectories.")
    if num_human_tasks == 0:
        # Every figure below is an average over the human tasks.
        print("No human tasks found; cannot compute aggregate scores.")
        return
    print(f"Original OSWorld Result: {osworld_result/num_human_tasks*100:.2f}% (on {num_human_tasks} human tasks)")
    single_action_wes_plus, wes_minus = get_and_display_aggregated_scores(single_scores, num_human_tasks)
    # Only WES+ is taken from the grouped scores; WES- is reported once, from
    # the single-action scores.
    grouped_action_wes_plus, _ = get_and_display_aggregated_scores(grouped_scores, num_human_tasks)
    print(f"Single Action WES+: {single_action_wes_plus*100:.2f}")
    print(f"Grouped Action WES+: {grouped_action_wes_plus*100:.2f}")
    print(f"WES-: {wes_minus:.2f}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the scoring.
    parser = argparse.ArgumentParser(description="Score trajectories on OSWorld-Human")
    # Required: without it result_path would be None and the loaders would
    # fail with an obscure TypeError when joining paths.
    parser.add_argument("--result-path", type=str, required=True, help="Path to the full result directory.")
    parser.add_argument("--max-steps-scoring", type=int, default=15, help="Minimum step budget used when scoring (default: 15).")
    parser.add_argument("--uitars", action='store_true', help="UI-TARS trajectories have a different structure.")
    args = parser.parse_args()
    main(args.result_path, args.max_steps_scoring, args.uitars)