diff --git a/rcg/README.md b/rcg/README.md index f49d29bd..d0869369 100644 --- a/rcg/README.md +++ b/rcg/README.md @@ -6,7 +6,7 @@ LLM-based robot control interface for RCareWorld simulation environment. ```bash # Install dependencies -pip install gradio pillow openai +pip install gradio==5.* pillow openai python-dotenv # Set API key (if using OpenAI) export OPENAI_API_KEY="your-key" diff --git a/rcg/gradio_ui.py b/rcg/gradio_ui.py index 65fdde15..8f17ee8e 100644 --- a/rcg/gradio_ui.py +++ b/rcg/gradio_ui.py @@ -81,7 +81,7 @@ def camera_loop(): _latest_frame = np.array(image) # ~10 FPS = ~0.1 second delay - time.sleep(0.1) + # time.sleep(0.01) except Exception as e: print(f"[Camera Feed Error] {e}") @@ -112,7 +112,7 @@ def get_latest_frame(): # ============================================================================ -# Control Button Functions +# Movement Control Button Functions # ============================================================================ def move_up() -> str: @@ -158,7 +158,7 @@ def move_left() -> str: try: with _unity_lock: # Thread safety _global_robot.IKTargetDoMove( - position=[-MOVEMENT_DISTANCE, 0, 0], + position=[MOVEMENT_DISTANCE, 0, 0], duration=1.0, speed_based=False, relative=True @@ -177,7 +177,7 @@ def move_right() -> str: try: with _unity_lock: # Thread safety _global_robot.IKTargetDoMove( - position=[MOVEMENT_DISTANCE, 0, 0], + position=[-MOVEMENT_DISTANCE, 0, 0], duration=1.0, speed_based=False, relative=True @@ -216,6 +216,110 @@ def release_action() -> str: traceback.print_exc() return f"✗ Error: {str(e)}" +# ============================================================================ +# Camera Control Button Functions +# ============================================================================ + +def move_camera_up() -> str: + """Move camera up by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[0, MOVEMENT_DISTANCE, 0], + duration=1.0, + speed_based=False, + 
relative=True + ) + return f"✓ Moved camera up {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + + +def move_camera_down() -> str: + """Move camera down by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[0, -MOVEMENT_DISTANCE, 0], + duration=1.0, + speed_based=False, + relative=True + ) + return f"✓ Moved camera down {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + + +def move_camera_left() -> str: + """Move camera left by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[MOVEMENT_DISTANCE, 0, 0], + duration=1.0, + speed_based=False, + relative=True + ) + return f"✓ Moved camera left {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + + +def move_camera_right() -> str: + """Move camera right by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[-MOVEMENT_DISTANCE, 0, 0], + duration=1.0, + speed_based=False, + relative=True + ) + return f"✓ Moved camera right {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + +def move_camera_forward() -> str: + """Move camera forward by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[0, 0, -MOVEMENT_DISTANCE], + duration=1.0, + speed_based=False, + relative=True + ) + return f"✓ Moved camera forward {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + + +def move_camera_back() -> str: + """Move camera back by configured distance.""" + try: + with _unity_lock: # Thread safety + _global_camera.DoMove( + position=[0, 0, 
MOVEMENT_DISTANCE], + duration=1.0, + speed_based=False, + relative=True + ) + return f"✓ Moved camera back {int(MOVEMENT_DISTANCE*100)}cm" + except Exception as e: + import traceback + traceback.print_exc() + return f"✗ Error: {str(e)}" + # ============================================================================ # Chat Interface Functions @@ -245,13 +349,13 @@ def process_chat_message(message: str, history: List[dict]) -> Tuple[List[dict], # Add function call info if available if result.get("function_called"): - func_result = result.get("function_result", {}) - func_name = result["function_called"] + for i, func_name in enumerate(result.get("function_called")): + func_result = result.get("function_result")[i] - if func_result.get("success"): - response_parts.append(f"**[Function: {func_name}]** ✓ {func_result.get('message', 'Done')}") - else: - response_parts.append(f"**[Function: {func_name}]** ✗ {func_result.get('message', 'Failed')}") + if func_result.get("success"): + response_parts.append(f"**[Function: {func_name}]** ✓ {func_result.get('message', 'Done')}") + else: + response_parts.append(f"**[Function: {func_name}]** ✗ {func_result.get('message', 'Failed')}") # Add LLM response response_parts.append(result["llm_response"]) @@ -323,12 +427,42 @@ def create_interface() -> gr.Blocks: # Title gr.Markdown("# RCareGen") - # Top: Camera Feed + # Top: Camera Feed and controls with gr.Row(): - camera_feed = gr.Image( - label="Robot Camera Feed (~10 FPS)", - type="numpy" - ) + with gr.Column(scale=1): + with gr.Group(): + gr.Markdown(f"**Camera Controls** ({int(MOVEMENT_DISTANCE*100)}cm increments)") + + # 3x3 Grid for directional controls with equal-width columns + # Row 1: Empty, Up, Empty + with gr.Row(): + gr.HTML("
") # Empty spacer + camera_up = gr.Button("⬆️ Up", variant="primary", elem_classes=["direction-btn"]) + gr.HTML("
") # Empty spacer + + # Row 2: Left, Empty, Right + with gr.Row(): + camera_left = gr.Button("⬅️ Left", variant="primary", elem_classes=["direction-btn"]) + gr.HTML("") # Empty spacer + camera_right = gr.Button("➡️ Right", variant="primary", elem_classes=["direction-btn"]) + + # Row 3: Empty, Down, Empty + with gr.Row(): + gr.HTML("") # Empty spacer + camera_down = gr.Button("⬇️ Down", variant="primary", elem_classes=["direction-btn"]) + gr.HTML("") # Empty spacer + + # Row 4: Forward, Back + with gr.Row(): + camera_forward = gr.Button("Forward", variant="primary", elem_classes=["direction-btn"]) + camera_back = gr.Button("Back", variant="primary", elem_classes=["direction-btn"]) + + # Right Panel: Camera Feed + with gr.Column(scale=2): + camera_feed = gr.Image( + label="Robot Camera Feed (~10 FPS)", + type="numpy" + ) # Timer for periodic camera updates (Gradio 4.0 style) timer = gr.Timer(value=0.1, active=True) @@ -346,29 +480,20 @@ def create_interface() -> gr.Blocks: # 3x3 Grid for directional controls with equal-width columns # Row 1: Empty, Up, Empty with gr.Row(): - with gr.Column(scale=1): gr.HTML("") # Empty spacer - with gr.Column(scale=1): btn_up = gr.Button("⬆️ Up", variant="primary", elem_classes=["direction-btn"]) - with gr.Column(scale=1): gr.HTML("") # Empty spacer # Row 2: Left, Empty, Right with gr.Row(): - with gr.Column(scale=1): btn_left = gr.Button("⬅️ Left", variant="primary", elem_classes=["direction-btn"]) - with gr.Column(scale=1): gr.HTML("") # Empty spacer - with gr.Column(scale=1): btn_right = gr.Button("➡️ Right", variant="primary", elem_classes=["direction-btn"]) # Row 3: Empty, Down, Empty with gr.Row(): - with gr.Column(scale=1): gr.HTML("") # Empty spacer - with gr.Column(scale=1): btn_down = gr.Button("⬇️ Down", variant="primary", elem_classes=["direction-btn"]) - with gr.Column(scale=1): gr.HTML("") # Empty spacer with gr.Group(): @@ -435,6 +560,14 @@ def create_interface() -> gr.Blocks: outputs=camera_feed ) + # Camera movement 
button handlers + camera_up.click(fn=move_camera_up, inputs=None, outputs=btn_status) + camera_down.click(fn=move_camera_down, inputs=None, outputs=btn_status) + camera_left.click(fn=move_camera_left, inputs=None, outputs=btn_status) + camera_right.click(fn=move_camera_right, inputs=None, outputs=btn_status) + camera_forward.click(fn=move_camera_forward, inputs=None, outputs=btn_status) + camera_back.click(fn=move_camera_back, inputs=None, outputs=btn_status) + # Movement button handlers btn_up.click(fn=move_up, inputs=None, outputs=btn_status) btn_down.click(fn=move_down, inputs=None, outputs=btn_status) diff --git a/rcg/llm.py b/rcg/llm.py index 806f0cb8..e0e6f8ac 100644 --- a/rcg/llm.py +++ b/rcg/llm.py @@ -8,7 +8,7 @@ # Import prompts from prompt.py from rcg.prompt import ( SYSTEM_PROMPT, - FUNCTION_SCHEMAS, + TOOL_SCHEMAS, get_error_message, get_success_message ) @@ -22,6 +22,9 @@ OpenAI = None print("[Warning] OpenAI package not installed. Install with: pip install openai") +# Load environment variables +from dotenv import load_dotenv +load_dotenv() # ============================================================================ # Configuration @@ -29,19 +32,18 @@ class LLMConfig: """Configuration for LLM API.""" - # OpenAI API settings - # WARNING: Hardcoded API key for debugging only. Should use environment variables in production. 
- API_KEY = "sk-" + # OpenAI API settings (using custom Qwen3 API endpoint) + API_KEY = os.getenv("OPENAI_API_KEY", "") # NOTE: BASE_URL typically ends with /v1 - BASE_URL = " - + BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") + # Use models supporting function calling - MODEL = "sqz-qwen3-235b" + MODEL = os.getenv("OPENAI_MODEL", "gpt-5.1") # Temperature and other params TEMPERATURE = 0.7 - MAX_TOKENS = None # None = no limit + MAX_TOKENS = 2048 # max completion tokens (None = no limit) # Verbose logging VERBOSE = True @@ -764,59 +766,76 @@ def process_command(self, user_input: str) -> Dict[str, Any]: response = client.chat.completions.create( model=LLMConfig.MODEL, messages=self.conversation_history, - functions=FUNCTION_SCHEMAS, - function_call="auto", - temperature=LLMConfig.TEMPERATURE + tools=TOOL_SCHEMAS, + tool_choice="auto", + parallel_tool_calls=True, + temperature=LLMConfig.TEMPERATURE, + max_completion_tokens=LLMConfig.MAX_TOKENS ) message = response.choices[0].message # Check for function call - if hasattr(message, 'function_call') and message.function_call: - function_name = message.function_call.name - function_args = json.loads(message.function_call.arguments) - - # Log function call - if self.enable_logging: - self._write_log(f"FUNCTION CALL: {function_name}") - self._write_log(f"Arguments: {json.dumps(function_args, indent=2, ensure_ascii=False)}") - self._write_log("") + if hasattr(message, 'tool_calls') and message.tool_calls: + function_names = [] + function_args_list = [] + function_results = [] + + for call in message.tool_calls: + function_name = call.function.name + function_names.append(function_name) + function_args = json.loads(call.function.arguments) + function_args_list.append(function_args) + + # Log function call + if self.enable_logging: + self._write_log(f"FUNCTION CALL: {function_name}") + self._write_log(f"Arguments: {json.dumps(function_args, indent=2, ensure_ascii=False)}") + self._write_log("") - if 
LLMConfig.SHOW_FUNCTION_CALLS: - print(f"\n[LLM] Calling function: {function_name}") - print(f"[LLM] Arguments: {json.dumps(function_args, indent=2)}") + if LLMConfig.SHOW_FUNCTION_CALLS: + print(f"\n[LLM] Calling function: {function_name}") + print(f"[LLM] Arguments: {json.dumps(function_args, indent=2)}") - # Execute function - function_result = execute_function(function_name, function_args) + # Execute function + function_result = execute_function(function_name, function_args) + function_results.append(function_result if function_result else {}) - # Log function result - if self.enable_logging: - self._write_log(f"FUNCTION RESULT:") - self._write_log(f" Success: {function_result.get('success', False)}") - self._write_log(f" Message: {function_result.get('message', 'N/A')}") - if function_result.get('data'): - data_str = json.dumps(function_result['data'], indent=2, ensure_ascii=False) - self._write_log(f" Data: {data_str}") - self._write_log("") - - # Add to history - self.conversation_history.append({ - "role": "assistant", - "content": None, - "function_call": {"name": function_name, "arguments": json.dumps(function_args)} - }) - - self.conversation_history.append({ - "role": "function", - "name": function_name, - "content": json.dumps(function_result) - }) + # Log function result + if self.enable_logging: + self._write_log(f"FUNCTION RESULT:") + self._write_log(f" Success: {function_result.get('success', False)}") + self._write_log(f" Message: {function_result.get('message', 'N/A')}") + if function_result.get('data'): + data_str = json.dumps(function_result['data'], indent=2, ensure_ascii=False) + self._write_log(f" Data: {data_str}") + self._write_log("") + + # Add to history + self.conversation_history.append({ + "role": "assistant", + "content": None, + "tool_calls": [{ + "function": {"name": function_name, "arguments": json.dumps(function_args)}, + "type": call.type, + "id": call.id + }] + }) + + self.conversation_history.append({ + "tool_call_id": 
call.id, + "role": "tool", + "type": "function_tool_output", + "name": function_name, + "content": json.dumps(function_result) + }) # Get final response final_response = client.chat.completions.create( model=LLMConfig.MODEL, messages=self.conversation_history, - temperature=LLMConfig.TEMPERATURE + temperature=LLMConfig.TEMPERATURE, + max_completion_tokens=LLMConfig.MAX_TOKENS ) final_message = final_response.choices[0].message.content @@ -830,95 +849,26 @@ def process_command(self, user_input: str) -> Dict[str, Any]: return { "success": True, - "function_called": function_name, - "function_args": function_args, - "function_result": function_result, + "function_called": function_names, + "function_args": function_args_list, + "function_result": function_results, "llm_response": final_message } else: - # No standard function call - try manual parsing - assistant_message = message.content - - # Try to parse manual function call from text - parsed = self._parse_manual_function_call(assistant_message) - - if parsed: - # Found manual function call! 
- function_name, function_args = parsed - - # Log manual function call - if self.enable_logging: - self._write_log(f"MANUAL FUNCTION CALL DETECTED: {function_name}") - self._write_log(f"Arguments: {json.dumps(function_args, indent=2, ensure_ascii=False)}") - self._write_log("") - - if LLMConfig.SHOW_FUNCTION_CALLS: - print(f"\n[LLM] Manual function call: {function_name}") - print(f"[LLM] Arguments: {json.dumps(function_args, indent=2)}") + # No function call at all + self.conversation_history.append({"role": "assistant", "content": message.content}) - # Execute function - function_result = execute_function(function_name, function_args) - - # Log function result - if self.enable_logging: - self._write_log(f"FUNCTION RESULT:") - self._write_log(f" Success: {function_result.get('success', False)}") - self._write_log(f" Message: {function_result.get('message', 'N/A')}") - if function_result.get('data'): - data_str = json.dumps(function_result['data'], indent=2, ensure_ascii=False) - self._write_log(f" Data: {data_str}") - self._write_log("") - - # Add to history - self.conversation_history.append({"role": "assistant", "content": assistant_message}) - - # Create user message with function result - result_message = f"Function {function_name} returned: {json.dumps(function_result)}" - self.conversation_history.append({"role": "user", "content": result_message}) - - # Get final response - final_response = client.chat.completions.create( - model=LLMConfig.MODEL, - messages=self.conversation_history, - temperature=LLMConfig.TEMPERATURE - ) - - final_message = final_response.choices[0].message.content - - # Remove tags from final message - import re - final_message = re.sub(r'.*?', '', final_message, flags=re.DOTALL).strip() - - self.conversation_history.append({"role": "assistant", "content": final_message}) - - # Log LLM response - if self.enable_logging: - self._write_log(f"LLM RESPONSE:") - self._write_log(final_message) - self._write_log("") - - return { - "success": 
True, - "function_called": function_name, - "function_args": function_args, - "function_result": function_result, - "llm_response": final_message - } - else: - # No function call at all - self.conversation_history.append({"role": "assistant", "content": assistant_message}) - - # Log LLM response - if self.enable_logging: - self._write_log(f"LLM RESPONSE (no function call):") - self._write_log(assistant_message) - self._write_log("") + # Log LLM response + if self.enable_logging: + self._write_log(f"LLM RESPONSE (no function call):") + self._write_log(message.content) + self._write_log("") - return { - "success": True, - "function_called": None, - "llm_response": assistant_message - } + return { + "success": True, + "function_called": None, + "llm_response": message.content + } except Exception as e: return { diff --git a/rcg/main.py b/rcg/main.py index ffc8be9c..4faec524 100644 --- a/rcg/main.py +++ b/rcg/main.py @@ -10,6 +10,7 @@ import sys import argparse from pathlib import Path +from dotenv import load_dotenv # Add project root to path _PROJECT_ROOT = Path(__file__).parent.parent @@ -157,6 +158,9 @@ def parse_args(): def main(): """Main entry point.""" + # Load environment variables + load_dotenv() + # Parse arguments args = parse_args() use_gradio = not args.no_gradio diff --git a/rcg/prompt.py b/rcg/prompt.py index 2fe39aad..6f29897e 100644 --- a/rcg/prompt.py +++ b/rcg/prompt.py @@ -15,7 +15,7 @@ # System Prompt # ============================================================================ -SYSTEM_PROMPT = """You control a Kinova Gen3 robotic arm in Unity. Be concise and direct. +SYSTEM_PROMPT_FULL = """You control a Kinova Gen3 robotic arm in Unity. Be concise and direct. ## ⚠️ CRITICAL: Coordinate System Unity uses: **X = left/right, Y = UP/DOWN (vertical), Z = forward/back** @@ -125,6 +125,20 @@ 6. **NEVER use Z-axis for up/down movement! Always use Y-axis!** """ +SYSTEM_PROMPT = """You control a Kinova Gen3 robotic arm in Unity. Be concise and direct. 
+Think step by step about what functions you need to call to fully complete the user's request. +Ensure that you are calling all functions necessary to achieve the desired outcome. + +## ⚠️ CRITICAL: Coordinate System +Unity uses: **X = left/right, Y = UP/DOWN (vertical), Z = forward/back** +- Move UP → increase Y (y > 0) +- Move DOWN → decrease Y (y < 0) +- Move LEFT → decrease X (x < 0) +- Move RIGHT → increase X (x > 0) +- Move FORWARD → increase Z (z > 0) +- Move BACKWARD → decrease Z (z < 0) +""" + # ============================================================================ # Function Schemas for OpenAI Function Calling @@ -243,6 +257,8 @@ } ] +TOOL_SCHEMAS = [{"type": "function", "function": f} for f in FUNCTION_SCHEMAS] + # ============================================================================ # User Prompt Templates