-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_virtual.py
More file actions
465 lines (402 loc) · 21.7 KB
/
run_virtual.py
File metadata and controls
465 lines (402 loc) · 21.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
"""
Mobile-Agent-v3 runner script for virtual GUI environment
Uses Gemini3-generated virtual Android interfaces instead of real devices
"""
from collections.abc import Sequence
import os
from dataclasses import dataclass
from typing import Any
from absl import app
from absl import flags
from absl import logging
from agent_env import checkpointer as checkpointer_lib
from agent_env import registry
from agent_env import suite_utils
from agent_env.agents import base_agent
from agent_env.agents import infer_ma3
from agent_env.agents import mobile_agent_v3
from agent_env.agents import gui_owl
from agent_env.env import interface
# Import virtual environment adapter
from virtual_env_adapter import VirtualEnvAdapter
# ==================== Simplified Task Class ====================
@dataclass
class SimpleTask:
"""Simplified task class to bypass agent_env task registry"""
name: str
goal: str
complexity: float = 1.0 # Complexity coefficient (numeric)
app_names: tuple = ()
start_on_home_screen: bool = True # Whether to start from home screen
params: dict = None # Task parameters (including seed, etc.)
def __post_init__(self):
"""Post-initialization processing, set default params"""
if self.params is None:
self.params = {
'seed': 0, # Default seed
}
@property
def template(self):
return self.goal
def initialize_task(self, env: Any) -> None:
"""Initialize task (empty implementation)"""
pass
def is_successful(self, env: Any) -> float:
"""Check if task is successful (directly return 1.0 for success in virtual environment)"""
return 1.0
def tear_down(self, env: Any) -> None:
"""Clean up task (empty implementation)"""
pass
# Raise the absl log threshold to WARNING and set the gRPC verbosity/trace
# environment variables for this process.
logging.set_verbosity(logging.WARNING)
os.environ.update({'GRPC_VERBOSITY': 'ERROR', 'GRPC_TRACE': 'none'})
# ==================== Gemini3 Virtual Environment Configuration ====================
# These flags are read when the virtual environment adapter is constructed.
_USE_VIRTUAL_ENV = flags.DEFINE_boolean(
    name='use_virtual_env',
    default=True,
    help='Whether to use virtual Gemini3 environment instead of real Android device.',
)
_GEMINI_API_KEY = flags.DEFINE_string(
    name='gemini_api_key',
    default='',
    help='API key for Gemini3 image generation (via OpenRouter).',
)
_GEMINI_BASE_URL = flags.DEFINE_string(
    name='gemini_base_url',
    default='https://openrouter.ai/api/v1',
    help='Base URL for Gemini3 API.',
)
_GEMINI_MODEL = flags.DEFINE_string(
    name='gemini_model',
    default='google/gemini-2.0-flash-exp:free',
    help='Gemini model name for image generation.',
)
_INITIAL_IMAGE_PATH = flags.DEFINE_string(
    name='initial_image_path',
    default='start_Android.png',
    help='Path to initial Android screenshot image.',
)
# Parsed by _parse_resolution into a (width, height) pair.
_RESOLUTION = flags.DEFINE_string(
    name='resolution',
    default='1080x2400',
    help='Virtual screen resolution (WIDTHxHEIGHT).',
)
# ==================== Agent LLM Configuration ====================
# Connection settings handed to the agent's LLM wrapper; all default to ''.
_MODEL = flags.DEFINE_string(
    name='model',
    default='',
    help='Your model name for the agent.',
)
_API_KEY = flags.DEFINE_string(
    name='api_key',
    default='',
    help='Your API key for the agent LLM.',
)
_BASE_URL = flags.DEFINE_string(
    name='base_url',
    default='',
    help='Your base URL for the agent LLM.',
)
# ==================== Task Configuration ====================
# Which task family to draw from; restricted to the registry's known families.
_SUITE_FAMILY = flags.DEFINE_enum(
    name='suite_family',
    default=registry.TaskRegistry.ANDROID_WORLD_FAMILY,
    enum_values=[
        registry.TaskRegistry.ANDROID_WORLD_FAMILY,
        registry.TaskRegistry.MINIWOB_FAMILY_SUBSET,
        registry.TaskRegistry.MINIWOB_FAMILY,
        registry.TaskRegistry.ANDROID_FAMILY,
        registry.TaskRegistry.INFORMATION_RETRIEVAL_FAMILY,
    ],
    help='Task family.',
)
# Task names to run; an empty list means no explicit selection was given.
_TASKS = flags.DEFINE_list(
    name='tasks',
    default=[],
    help='Specific tasks to run.',
)
_N_TASK_COMBINATIONS = flags.DEFINE_integer(
    name='n_task_combinations',
    default=1,
    help='Number of task parameter combinations to sample.',
)
_TASK_RANDOM_SEED = flags.DEFINE_integer(
    name='task_random_seed',
    default=None,
    help='Random seed for task parameter sampling.',
)
_FIXED_TASK_SEED = flags.DEFINE_boolean(
    name='fixed_task_seed',
    default=False,
    help='Whether to use the same task seed across combinations.',
)
# ==================== Agent Configuration ====================
# Selects which agent implementation _get_agent builds.
_AGENT_NAME = flags.DEFINE_string(
    name='agent_name',
    default='mobile_agent_v3',
    help='Agent name.',
)
# ==================== Output Configuration ====================
# Base path under which a run directory is created when no checkpoint
# directory is given explicitly.
_OUTPUT_PATH = flags.DEFINE_string(
    name='output_path',
    default='results',
    help='Output path for checkpoints.',
)
# When non-empty, used as the checkpoint directory as-is.
_CHECKPOINT_DIR = flags.DEFINE_string(
    name='checkpoint_dir',
    default='',
    help='Checkpoint directory.',
)
_TRAJ_OUTPUT_PATH = flags.DEFINE_string(
    name='traj_output_path',
    default='traj_output',
    help='Trajectory output path.',
)
# MiniWoB Configuration
# Value assigned to the agent's transition_pause for MiniWoB task families.
_MINIWOB_TRANSITION_PAUSE = 0.2
# Extra guideline text given to agents that support task guidelines when the
# task family is a MiniWoB one.
_MINIWOB_ADDITIONAL_GUIDELINES = [
    'This task is running in a mock app, you must stay in this app and'
    + ' DO NOT use the `navigate_home` action.',
]
def _parse_resolution(resolution_str: str) -> tuple[int, int]:
    """Parse a ``WIDTHxHEIGHT`` string into a ``(width, height)`` tuple.

    Args:
        resolution_str: Resolution such as ``'1080x2400'``. The ``x``
            separator is matched case-insensitively and whitespace around
            the string or around either number is ignored.

    Returns:
        The width and height as positive integers.

    Raises:
        ValueError: If the string is not exactly two ``x``-separated
            integers, or if either dimension is not positive.
    """
    parts = resolution_str.strip().lower().split('x')
    if len(parts) != 2:
        raise ValueError(f'Invalid resolution format: {resolution_str}. Expected WIDTHxHEIGHT')
    try:
        width, height = (int(part) for part in parts)
    except ValueError as err:
        # Re-raise with the full resolution string so the message points at
        # the flag value rather than at a bare int() failure.
        raise ValueError(
            f'Invalid resolution format: {resolution_str}. Expected WIDTHxHEIGHT'
        ) from err
    if width <= 0 or height <= 0:
        raise ValueError(
            f'Invalid resolution: {resolution_str}. Width and height must be positive'
        )
    return (width, height)
def _create_virtual_env() -> VirtualEnvAdapter:
    """Build the virtual environment adapter from the Gemini-related flags.

    Prints a banner, parses the resolution flag, constructs the
    ``VirtualEnvAdapter`` and then prints a short summary of the settings.

    Returns:
        The newly constructed adapter.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('Creating Gemini3 Virtual Android Environment')
    print(rule)

    screen_size = _parse_resolution(_RESOLUTION.value)
    virtual_env = VirtualEnvAdapter(
        gemini_api_key=_GEMINI_API_KEY.value,
        gemini_base_url=_GEMINI_BASE_URL.value,
        gemini_model=_GEMINI_MODEL.value,
        initial_task="",  # Task description will be provided at reset
        initial_image_path=_INITIAL_IMAGE_PATH.value,
        resolution=screen_size,
    )

    summary_lines = (
        '[OK] Virtual environment created successfully',
        f' - Gemini Model: {_GEMINI_MODEL.value}',
        f' - Resolution: {screen_size}',
        f' - Initial Image: {_INITIAL_IMAGE_PATH.value}',
        rule + '\n',
    )
    for line in summary_lines:
        print(line)
    return virtual_env
def _get_agent(
    env: interface.AsyncEnv,
    family: str | None = None,
) -> base_agent.EnvironmentInteractingAgent:
    """Create the agent selected by the ``agent_name`` flag.

    Args:
        env: Environment the agent will interact with.
        family: Task family name. When it starts with ``'miniwob'`` and the
            agent qualifies (see below), the MiniWoB guidelines are applied.

    Returns:
        The constructed agent, with its ``name`` set to the flag value.

    Raises:
        ValueError: If the ``agent_name`` flag is not a supported agent.
    """
    print('\n' + '='*60)
    print('Initializing Agent')
    print('='*60)

    agent_name = _AGENT_NAME.value
    # Reject unknown names explicitly instead of relying on the truthiness of
    # the constructed agent object, and before building the LLM wrapper.
    if agent_name not in ('gui_owl', 'mobile_agent_v3'):
        raise ValueError(f'Unknown agent: {agent_name}')

    llm = infer_ma3.GUIOwlWrapper(_API_KEY.value, _BASE_URL.value, _MODEL.value)
    if agent_name == 'gui_owl':
        agent = gui_owl.GUIOwl(
            env,
            llm,
            "abs_resized",
            api_key=None,
            url=None,
            output_path=_TRAJ_OUTPUT_PATH.value,
        )
    else:
        agent = mobile_agent_v3.MobileAgentV3_M3A(
            env,
            llm,
            output_path=_TRAJ_OUTPUT_PATH.value,
        )

    # Set special guidelines for MiniWoB tasks. This check uses the name the
    # agent class assigned itself, so it must run before the name is
    # overwritten with the flag value below.
    if (
        agent.name in ['M3A', 'T3A', 'SeeAct']
        and family
        and family.startswith('miniwob')
        and hasattr(agent, 'set_task_guidelines')
    ):
        agent.set_task_guidelines(_MINIWOB_ADDITIONAL_GUIDELINES)
    agent.name = agent_name

    print('[OK] Agent created successfully')
    print(f' - Agent Name: {agent_name}')
    print(f' - LLM Model: {_MODEL.value}')
    print('='*60 + '\n')
    return agent
# Social task goal description mapping (English descriptions with specific app
# names), keyed by task name. A task requested via --tasks that appears here is
# run as a SimpleTask instead of going through the agent_env task registry.
_SOCIAL_TASK_GOALS = {
    # Message Category
    "ForwardMultipleMessagesToGroup": 'Open Telegram, forward the 3 most recent messages about project progress from chat with "Sarah" to group "Dev Team"',
    "QuoteReplyWithMention": 'Open WhatsApp, quote "Mike"\'s message in the group chat and reply "@Mike I have some additional suggestions about the technical solution you mentioned"',
    "ScheduledMessageFollowup": 'Open Telegram, send message to "Emma" asking "Is the meeting material ready?", if no reply within 5 minutes, send a reminder',
    "MultiStepConversation": 'Open WhatsApp, confirm meeting time with "David": first ask "Are you free tomorrow afternoon?", then send specific time suggestion based on reply',
    "CrossChatReference": 'Open Telegram, in group "Product Discussion" mention "Confirmed requirements with Lisa in private chat, everyone can start design now"',
    # Contact Category
    "BatchEditContactTags": 'Open WhatsApp, add tags "Important" and "Priority Reply" to all contacts with note name containing "Client"',
    "MergeAndCleanDuplicates": 'Open Contacts app, find contacts with similar names (e.g. "John Smith" and "John Smith-Company"), merge them after confirmation',
    "SmartContactRecommendation": 'Open WhatsApp, check system recommendations "People you may know", select and add 2 people met through mutual group chats',
    "ContextBasedBlocking": 'Open Telegram, block all non-friend contacts from group "Marketing Promotion"',
    "CreateDynamicContactList": 'Open Contacts app, create smart list "Project Members" including all members from groups "Project A" and "Project B"',
    # Group Category
    "ConditionalMemberAddition": 'Open Telegram, check if "Jack" is in group "Tech Sharing", if not add him and mention "Welcome to tech discussion group"',
    "BatchMemberManagement": 'Open WhatsApp, find members who haven\'t spoken for over 6 months in group "Alumni", move them to new group "Inactive Members"',
    "GroupReorganization": 'Open Telegram, split group "Temp Discussion (50 people)" by department into 3 small groups: "Dev Team", "Design Team", "Operations Team", and notify original group members',
    "ThemeGroupCustomization": 'Open WhatsApp, set theme avatar for group "Anniversary Planning", group announcement "30 days until event", and pin important notification messages',
    "SmartGroupCleanup": 'Open Telegram, exit all groups with no messages for over 3 months and less than 5 members, save chat history before exiting',
    # Media Category
    "CuratedPhotoSharing": 'Open WhatsApp, find activity photos "Rachel" sent last week from chat history, download and add text "Wonderful moments" then share to group "Colleague Gathering"',
    "SequentialMediaStory": 'Open Telegram, send 5 photos in sequence to group "Travel Log", each with different captions, telling a complete travel story',
    "VideoWithTimestamp": 'Open WhatsApp, record 30-second video message "Hi everyone, this is weekly work summary", send to group "Team Weekly Report" and pin it',
    "VoiceMessageSeries": 'Open Telegram, send 3 consecutive voice messages to "Boss" explaining project progress: 1) Current status 2) Problems encountered 3) Next week plan',
    "LocationBasedMeetup": 'Open WhatsApp, send location to group "Lunch Squad" with message "I am at this restaurant, 3 seats available" and set reminder to auto-delete after 30 minutes',
    "ContactCardSharing": 'Open Telegram, check "Kevin"\'s profile, determine which project group he suits, then send his contact card to that group with recommendation reason',
    # Call Category
    "ScheduledCallSetup": 'Open WhatsApp, send message to "Linda" scheduling "Phone call to discuss proposal at 3pm today", initiate voice call at scheduled time, send reminder 5 minutes before',
    "VideoCallWithScreenShare": 'Open Telegram, initiate video call with "Mike", share design diagrams recently discussed in group chat during call',
    "ProgressiveGroupCall": 'Open WhatsApp, in group "Emergency Meeting" first mention 3 core members asking if convenient, after confirmation initiate group voice call and invite other members',
    "MultiTaskVideoMeeting": 'Open Telegram, during group video call, simultaneously share meeting link and documents in group text window, and record key decisions',
    "CallWithFollowup": 'Open WhatsApp, make voice call to "Nancy" to discuss contract details, immediately send summary in chat after call "3 points confirmed in phone call..."',
    "SmartCallRouting": 'Open Telegram, check "Peter"\'s online status, if showing "busy" send text message, if "online" directly initiate voice call',
    # Notification Category
    "ContextualNotificationRule": 'Open WhatsApp settings, set smart notifications for group "Work Group": all notifications during work hours (9am-6pm), only mentions during non-work hours',
    "BatchMuteManagement": 'Open Telegram settings, set all groups containing keywords "advertising" or "promotion" to permanently mute, but do not exit',
    "DynamicPinning": 'Open WhatsApp, check 3 conversations with most messages in last 24 hours, pin them, and unpin other conversations',
    "SmartArchiving": 'Open Telegram, auto-archive all conversations with no new messages for over 1 month and not pinned, keep conversations with important contacts',
    "GranularPrivacyControl": 'Open WhatsApp settings, set privacy rules: show online status to "Family" group, show last seen to "Colleagues" group, hide from others',
    "SecureConversationMode": 'Open Telegram, enable end-to-end encryption with "Legal", set messages to self-destruct (24 hours), and disable screenshots and forwarding',
    # Emoji Category
    "ContextualEmojiReaction": 'Open WhatsApp, find "Rachel"\'s celebration message in group chat, add 🎉 reaction to that message, and reply "Congratulations!"',
    "BatchReactionToThread": 'Open Telegram, add 👍 reaction to all 5 consecutive project update messages from "Sam" in group, indicating read and acknowledged',
    "AnimatedResponseSequence": 'Open WhatsApp, send "Tina" a set of 3 consecutive GIF animations expressing "thinking → eureka moment → excitement" process',
    "SmartReactionSuggestion": 'Open Telegram, check recent announcement message in group "Team", if over 50% of people added reactions, also add the most commonly used reaction',
    "CustomEmojiCreation": 'Open WhatsApp, select one photo from recent chat images, crop and create as custom emoji, then use it for the first time in group',
    "EmojiSentimentAnalysis": 'Open Telegram, check last 20 messages with "Victor", select appropriate emoji to reply current message based on conversation atmosphere',
    # File Category
    "InChatDocumentEdit": 'Open WhatsApp, find document "Wendy" sent last week in chat, directly open and edit within chat, add comments and send back',
    "FileCollectionAndOrganization": 'Open Telegram, collect all files sent by members in group "Project Documents" in the past week, create folder to categorize and save to chat favorites',
    "CollaborativeFileReview": 'Open WhatsApp, download proposal document from "Yara", reply with modification comments for each chapter (at least 3) within chat',
    "BatchFileForwarding": 'Open Telegram, select all PDF files from private chat with "Xavier", batch forward to group "Study Group" with note "Weekly learning materials"',
    "CloudFileSharing": 'Open WhatsApp, create online collaborative document in group "Design Team", mention all members to edit quarterly plan together',
    "SmartFileSuggestion": 'Open Telegram, check "Zack"\'s inquiry "Where is Q3 report?", find related file from message history and send again',
    # Status Category
    "ScheduledStatusPost": 'Open WhatsApp Status, create status "Project successfully launched! Thanks team", mention 5 team members, set visible only to "Colleagues" group',
    "StatusEngagementChain": 'Open WeChat Moments, check "Amy"\'s latest status, if work-related comment professional advice, if life sharing like and repost to own status',
    "InteractiveStatusPoll": 'Open WhatsApp Status, post status with poll "Where to go for team building next week? Options: Hiking/Escape Room/Dinner", mention all members of group "Team"',
    "CommentThreadEngagement": 'Open WeChat Moments, under "Ben"\'s popular status find "Cathy"\'s comment, reply to her comment to start discussion',
    "StatusAnalyticsAndCleanup": 'Open WhatsApp Status, check all statuses posted in last 30 days, delete those with less than 10 views, keep those with more interactions',
    "DynamicAudienceStatus": 'Open WeChat Moments, when posting status auto-set visible range based on content: work-related → colleagues visible, life sharing → all visible, private → only me',
    # History Category
    "ContextSearchAndReply": 'Open WhatsApp, search keyword "meeting" in chat with "Diana", find most recent meeting time, then reply "According to our chat history, meeting is scheduled at..."',
    "CrossChatInfoRetrieval": 'Open Telegram, global search "contract" to find 3 related conversations, integrate key information and send to "Legal" to inquire about progress',
    "TimelineReconstruction": 'Open WhatsApp, review past 3 months of chat with "Ethan", summarize 3 key project milestones and create memo',
    "SelectiveHistoryExport": 'Open Telegram, filter all messages containing "decision" from chat with "Fiona", export as document and share to group "Management"',
    "SmartHistoryCleanup": 'Open WhatsApp, delete all stickers and GIFs in conversation with "George" (keep text and important images) to slim down chat',
    "ConversationBookmarking": 'Open Telegram, add bookmark tags to important messages (at least 5) in group "Product Requirements", create index for easy future reference',
}


def _build_social_suite(
    task_names: Sequence[str],
    n_task_combinations: int,
) -> dict[str, list[SimpleTask]]:
    """Build a ``{task_name: [SimpleTask, ...]}`` suite for social-app tasks.

    Args:
        task_names: Requested task names. Names without an entry in
            ``_SOCIAL_TASK_GOALS`` are skipped with a printed warning.
        n_task_combinations: Number of ``SimpleTask`` instances to create for
            each recognised task name.

    Returns:
        A dict mapping each recognised task name to its list of instances.
    """
    suite: dict[str, list[SimpleTask]] = {}
    for task_name in task_names:
        if task_name not in _SOCIAL_TASK_GOALS:
            print(f"Warning: Goal description for task {task_name} not found, skipping")
            continue
        # NOTE(review): app_names is the same fixed tuple for every task even
        # though many goals name WhatsApp or Contacts - confirm this is intended.
        suite[task_name] = [
            SimpleTask(
                name=task_name,  # Use original task name
                goal=_SOCIAL_TASK_GOALS[task_name],
                complexity=1.5,  # Medium complexity (numeric format)
                app_names=("WeChat", "QQ", "Telegram"),
            )
            for _ in range(n_task_combinations)
        ]
    return suite


def _main(argv: Sequence[str]) -> None:
    """Run the configured agent on the configured tasks in the virtual environment.

    Steps: print a run summary, create the virtual environment, build the
    task suite (the simplified social-task suite when any requested task is
    in ``_SOCIAL_TASK_GOALS``, otherwise a standard agent_env suite), create
    the agent, choose the checkpoint directory and run the suite. The
    environment is closed on every exit path once it has been created.

    Args:
        argv: Positional command-line arguments; unused.

    Raises:
        ValueError: If the ``use_virtual_env`` flag is False.
    """
    del argv  # Unused

    print('\n' + '='*70)
    print(' '*15 + 'Mobile-Agent-v3 Virtual GUI Environment')
    print('='*70)
    print(f'Agent: {_AGENT_NAME.value}')
    print(f'Agent LLM: {_MODEL.value}')
    print(f'Virtual Environment: Gemini3 ({_GEMINI_MODEL.value})')
    print(f'Task Family: {_SUITE_FAMILY.value}')
    print(f'Output Path: {_TRAJ_OUTPUT_PATH.value}')
    print('='*70 + '\n')

    # Create virtual environment (the only mode this script supports).
    if not _USE_VIRTUAL_ENV.value:
        raise ValueError('This script requires use_virtual_env=True')
    env = _create_virtual_env()

    # Everything after environment creation runs under try/finally so the
    # environment is closed even when suite, agent or checkpoint setup fails.
    try:
        # Create task suite
        print('Creating task suite...')
        n_task_combinations = _N_TASK_COMBINATIONS.value
        task_registry = registry.TaskRegistry()

        # Check if using social tasks
        task_names = _TASKS.value if _TASKS.value else []
        if isinstance(task_names, str):
            task_names = [t.strip() for t in task_names.split(',')]
        use_social_tasks = any(
            task_name in _SOCIAL_TASK_GOALS for task_name in task_names
        )

        if use_social_tasks:
            # Create simplified social tasks (using dict format, compatible
            # with agent.get_task_name)
            print('Social app tasks detected, using simplified task mode...')
            suite = _build_social_suite(task_names, n_task_combinations)
            total_tasks = sum(len(instances) for instances in suite.values())
            print(f'[OK] Social task suite created successfully, total {total_tasks} tasks\n')
        else:
            # Use standard agent_env tasks
            suite = suite_utils.create_suite(
                task_registry.get_registry(family=_SUITE_FAMILY.value),
                n_task_combinations=n_task_combinations,
                seed=_TASK_RANDOM_SEED.value,
                tasks=_TASKS.value,
                use_identical_params=_FIXED_TASK_SEED.value,
            )
            suite.suite_family = _SUITE_FAMILY.value
            print(f'[OK] Task suite created successfully, total {len(suite)} tasks\n')

        # Create Agent
        agent = _get_agent(env, _SUITE_FAMILY.value)
        agent.get_task_name(suite)

        # Set special parameters for MiniWoB
        if _SUITE_FAMILY.value.startswith('miniwob'):
            agent.transition_pause = _MINIWOB_TRANSITION_PAUSE
        else:
            agent.transition_pause = None

        # Set checkpoint directory
        if _CHECKPOINT_DIR.value:
            checkpoint_dir = _CHECKPOINT_DIR.value
        else:
            checkpoint_dir = checkpointer_lib.create_run_directory(_OUTPUT_PATH.value)
        print(f'Checkpoint directory: {checkpoint_dir}\n')

        # Custom process function to disable evaluation statistics output
        def process_episodes_silent(episodes, print_summary=False):
            """Process episodes without printing evaluation statistics.

            ``print_summary`` is accepted for signature compatibility but
            deliberately ignored; the summary is always suppressed.
            """
            return suite_utils.process_episodes(episodes, print_summary=False)

        # Run evaluation
        print('='*70)
        print(' '*25 + 'Starting Task Execution')
        print('='*70 + '\n')
        try:
            suite_utils.run(
                suite,
                agent,
                checkpointer=checkpointer_lib.IncrementalCheckpointer(checkpoint_dir),
                demo_mode=False,
                process_episodes_fn=process_episodes_silent,
            )
            print('\n' + '='*70)
            print(' '*25 + 'Execution Complete')
            print('='*70)
            print(f'Results saved to: {checkpoint_dir}')
            print(f'Trajectories saved to: {_TRAJ_OUTPUT_PATH.value}')
            print('='*70 + '\n')
        except KeyboardInterrupt:
            print('\n\nUser interrupted, cleaning up...')
        except Exception as e:
            # Top-level boundary: report the failure with its traceback and
            # fall through to cleanup instead of re-raising.
            print(f'\n\nError: {e}')
            import traceback
            traceback.print_exc()
    finally:
        env.close()
        print('Environment closed')
# Script entry point: run _main through absl's app.run when executed directly.
if __name__ == '__main__':
    app.run(_main)