diff --git a/README.md b/README.md index c38824759..41d731dfc 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,12 @@ pip install -e . # Or using uv (faster) uv pip install -e . +# For development with testing dependencies (pytest, etc.) +pip install -e ".[dev]" + +# Or using uv +uv pip install -e ".[dev]" + # Run server locally without Docker uv run server --host 0.0.0.0 --port 8000 ``` diff --git a/examples/local_coding_env.py b/examples/local_coding_env.py index e88dcb352..6509d7caf 100644 --- a/examples/local_coding_env.py +++ b/examples/local_coding_env.py @@ -67,6 +67,7 @@ def main(): print(f" {i}. Code: {code.replace(chr(10), '\\n')[:50]}...") print(f" → stdout: {result.observation.stdout.strip()}") print(f" → exit_code: {result.observation.exit_code}") + print(f" → reward: {result.reward}") if result.observation.stderr: print(f" → stderr: {result.observation.stderr}") @@ -84,6 +85,8 @@ def main(): print(f" {i}. {description}") print(f" Code: {code.replace(chr(10), '\\n')[:40]}...") print(f" → exit_code: {result.observation.exit_code}") + print(f" → reward: {result.reward}") + if result.observation.stderr: # Truncate long error messages error_msg = result.observation.stderr[:100] @@ -116,6 +119,7 @@ def main(): except Exception as e: print(f"\n❌ Test failed: {e}") import traceback + traceback.print_exc() return False diff --git a/pyproject.toml b/pyproject.toml index 37d7400a2..dd943be25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "openenv" -version = "0.1.1" +version = "0.1.2" description = "A unified framework for reinforcement learning environments" readme = "README.md" requires-python = ">=3.10" @@ -26,6 +26,11 @@ dependencies = [ "tomli-w>=1.2.0" ] +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", +] + [project.scripts] openenv = "openenv_cli.__main__:main" diff --git a/src/core/containers/runtime/providers.py b/src/core/containers/runtime/providers.py index 
a8022ddca..529710899 100644 --- a/src/core/containers/runtime/providers.py +++ b/src/core/containers/runtime/providers.py @@ -192,23 +192,42 @@ def stop_container(self) -> None: import subprocess try: - # Stop container + # Try graceful stop first (Docker waits 5 seconds before SIGKILL) + # Subprocess timeout is 15 seconds to allow Docker's grace period subprocess.run( - ["docker", "stop", self._container_id], + ["docker", "stop", "--time=5", self._container_id], capture_output=True, check=True, - timeout=10, + timeout=15, ) + except subprocess.TimeoutExpired: + # Graceful stop timed out, force kill the container + print(f"Warning: Container {self._container_id} did not stop gracefully, forcing kill...") + try: + subprocess.run( + ["docker", "kill", self._container_id], + capture_output=True, + check=True, + timeout=5, + ) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + # Container might already be stopped + pass + except subprocess.CalledProcessError: + # Container might already be stopped + pass - # Remove container + # Always try to remove the container + try: subprocess.run( - ["docker", "rm", self._container_id], + ["docker", "rm", "-f", self._container_id], capture_output=True, check=True, timeout=10, ) - except subprocess.CalledProcessError: - # Container might already be stopped/removed + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + # Container might already be removed or removal failed + # Use -f flag to force removal even if still running pass finally: self._container_id = None diff --git a/src/envs/coding_env/README.md b/src/envs/coding_env/README.md index b99921b8e..aec93982c 100644 --- a/src/envs/coding_env/README.md +++ b/src/envs/coding_env/README.md @@ -15,13 +15,112 @@ tags: A Python code execution environment that runs arbitrary Python code and returns results. Perfect for testing code execution infrastructure and demonstrating environment usage patterns. 
-## Quick Start +## Installation & Usage -The simplest way to use the Coding environment is through the `CodingEnv` class: +The Coding Environment supports two usage modes: +### Mode 1: In-Repository Development (Recommended for Contributors) + +Use this mode when developing or contributing to OpenEnv. + +**Setup:** +```bash +# 1. Clone the repository +git clone https://github.com/facebookresearch/OpenEnv.git +cd OpenEnv + +# 2. Install in development mode +pip install -e . + +# 3. Build the Docker image (from repo root) +docker build -t coding-env:latest -f src/envs/coding_env/server/Dockerfile . + +# 4. Run the example +python ./examples/local_coding_env.py +``` + +**Code example:** ```python +# Use in-repo import paths from envs.coding_env import CodeAction, CodingEnv +try: + # Create environment from Docker image + coding_env = CodingEnv.from_docker_image("coding-env:latest") + + # Execute Python code + result = coding_env.step(CodeAction(code="print('Hello, World!')")) + print(f"stdout: {result.observation.stdout.strip()}") + print(f"exit_code: {result.observation.exit_code}") +finally: + coding_env.close() +``` + +### Mode 2: Standalone Package (For End Users) + +Use this mode when using coding_env as a standalone package. + +**Setup:** +```bash +# 1. Install openenv-core (once available on PyPI) +pip install openenv-core + +# 2. Install coding_env package +pip install openenv-coding_env + +# 3. 
Use the same Docker image as in-repo mode +# The client and server communicate over HTTP, so the Docker build mode doesn't matter for testing +# You can use the in-repo built image: coding-env:latest +``` + +**Code example:** +```python +# Use standalone import paths +from coding_env import CodeAction, CodingEnv + +try: + # Connect to the same Docker image built in in-repo mode + coding_env = CodingEnv.from_docker_image("coding-env:latest") + result = coding_env.step(CodeAction(code="print('Hello, World!')")) + print(f"stdout: {result.observation.stdout.strip()}") +finally: + coding_env.close() +``` + +## Quick Start Example + +**In-repo mode:** +```bash +# From OpenEnv repo root, after pip install -e . +python ./examples/local_coding_env.py +``` + +**Standalone mode:** + +For standalone testing, use a separate test script (the repo example uses in-repo imports only): + +```python +# save as test_standalone.py +from coding_env import CodeAction, CodingEnv + +try: + # Uses the same Docker image as in-repo mode + client = CodingEnv.from_docker_image("coding-env:latest") + result = client.step(CodeAction(code="print('Hello from standalone!')")) + print(f"stdout: {result.observation.stdout.strip()}") +finally: + client.close() +``` + +**Note:** The client (your Python code) and server (Docker container) are independent. The standalone client can connect to the in-repo Docker image because they communicate over HTTP. + +### Manual Usage Example + +Once set up (either mode), the usage is identical: + +```python +from coding_env import CodeAction, CodingEnv # or: from envs.coding_env import ... + try: # Create environment from Docker image coding_env = CodingEnv.from_docker_image("coding-env:latest") @@ -48,21 +147,12 @@ finally: coding_env.close() ``` -That's it! 
The `CodingEnv.from_docker_image()` method handles: +The `CodingEnv.from_docker_image()` method handles: - Starting the Docker container - Waiting for the server to be ready - Connecting to the environment - Container cleanup when you call `close()` -## Building the Docker Image - -Before using the environment, you need to build the Docker image: - -```bash -# From project root -docker build -t coding-env:latest -f src/envs/coding_env/server/Dockerfile . -``` - ## Environment Details ### Action @@ -88,10 +178,14 @@ docker build -t coding-env:latest -f src/envs/coding_env/server/Dockerfile . If you already have a Coding environment server running, you can connect directly: ```python +# In-repo mode from envs.coding_env import CodingEnv +# OR standalone mode +from coding_env import CodingEnv + # Connect to existing server -coding_env = CodingEnv(base_url="") +coding_env = CodingEnv(base_url="http://localhost:8000") # Use as normal result = coding_env.reset() @@ -100,34 +194,60 @@ result = coding_env.step(CodeAction(code="print('Hello!')")) Note: When connecting to an existing server, `coding_env.close()` will NOT stop the server. -## Development & Testing +## Docker Build Options + +The Dockerfile supports two build modes: + +```bash +# In-repo build (default) - from OpenEnv repo root +docker build -t coding-env:latest -f src/envs/coding_env/server/Dockerfile . + +# Standalone build - from coding_env package directory (for distribution only) +docker build -t coding-env:standalone -f server/Dockerfile --build-arg BUILD_MODE=standalone . +``` + +**When to use each mode:** + +- **In-repo mode (default)**: For development and testing (works with both client modes) +- **Standalone mode**: Only needed when distributing the Docker image without the full OpenEnv repo -### Running the Full Example +**Important:** For local testing, the in-repo Docker image works with both in-repo and standalone clients. The client and server communicate over HTTP, so they're independent. 
The BUILD_MODE distinction is primarily for distribution/packaging purposes. + +## Development & Testing -Run the complete example that demonstrates the full workflow: +### Running Tests ```bash -python3 src/envs/coding_env/client/example_usage.py +# From repo root +pytest tests/envs/test_python_codeact_reset.py ``` -This example shows: -- Creating an environment from a Docker image -- Resetting and executing code through the environment -- Automatic cleanup with `close()` +### Building Packages Locally + +```bash +# Build openenv-core +cd src +python -m build -w + +# Build coding_env +cd envs/coding_env +python -m build -w +``` ## Project Structure ``` coding_env/ -├── README.md # This file -├── models.py # Action, Observation, and State models -├── client/ -│ ├── coding_env_client.py # CodingEnv client implementation -│ └── example_usage.py # Usage examples +├── README.md # This file +├── pyproject.toml # Package configuration +├── __init__.py # Package exports +├── models.py # Action, Observation, and State models +├── client.py # CodingEnv client implementation └── server/ ├── python_codeact_env.py # Core environment logic + ├── python_executor.py # Code execution wrapper ├── app.py # FastAPI application ├── transforms.py # Observation transforms - ├── Dockerfile # Container image definition + ├── Dockerfile # Container image (dual-mode) └── README.md # Server-specific documentation ``` diff --git a/src/envs/coding_env/client.py b/src/envs/coding_env/client.py index d65c5152e..4b7e40e28 100644 --- a/src/envs/coding_env/client.py +++ b/src/envs/coding_env/client.py @@ -13,11 +13,18 @@ from __future__ import annotations -from openenv_core.client_types import StepResult - -from openenv_core.http_env_client import HTTPEnvClient - -from coding_env.models import CodeAction, CodeObservation, CodeState +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.client_types import StepResult + from 
openenv_core.http_env_client import HTTPEnvClient +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.client_types import StepResult + from core.http_env_client import HTTPEnvClient + +# Use relative imports for sibling modules - works in both modes +from .models import CodeAction, CodeObservation, CodeState class CodingEnv(HTTPEnvClient[CodeAction, CodeObservation]): diff --git a/src/envs/coding_env/models.py b/src/envs/coding_env/models.py index a92c2560e..6f6330212 100644 --- a/src/envs/coding_env/models.py +++ b/src/envs/coding_env/models.py @@ -8,7 +8,13 @@ from dataclasses import dataclass -from openenv_core.env_server.interfaces import Action, Observation, State +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.env_server.types import Action, Observation, State +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.env_server.types import Action, Observation, State @dataclass diff --git a/src/envs/coding_env/pyproject.toml b/src/envs/coding_env/pyproject.toml index 06b70f2ba..c35f6f3a5 100644 --- a/src/envs/coding_env/pyproject.toml +++ b/src/envs/coding_env/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "openenv-coding_env" -version = "0.1.0" +version = "0.1.1" description = "Coding Environment for OpenEnv" requires-python = ">=3.10" dependencies = [ diff --git a/src/envs/coding_env/server/Dockerfile b/src/envs/coding_env/server/Dockerfile index cef367db9..efb5cadf2 100644 --- a/src/envs/coding_env/server/Dockerfile +++ b/src/envs/coding_env/server/Dockerfile @@ -1,26 +1,73 @@ -# Base image +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Dockerfile for Coding Environment +# Supports both in-repo and standalone builds +# +# In-repo build (from repo root): +# docker build -t coding-env:latest -f src/envs/coding_env/server/Dockerfile . +# +# Standalone build (from coding_env directory with openenv-core on PyPI): +# docker build -t coding-env:latest -f server/Dockerfile --build-arg BUILD_MODE=standalone . + FROM python:3.11-slim +# Build argument to control mode +ARG BUILD_MODE=in-repo + # Set working directory -WORKDIR /app/env +WORKDIR /app # Install system dependencies RUN apt-get update && apt-get install -y \ git \ + curl \ && rm -rf /var/lib/apt/lists/* -# Copy environment files -COPY . . +# Copy files based on build mode +# For in-repo: copy entire src/ directory from repo root +# For standalone: copy current directory (coding_env package) +COPY ${BUILD_MODE:+src/} ./ -# Install Python dependencies -RUN pip install --no-cache-dir -e . +# Install dependencies +RUN if [ "$BUILD_MODE" = "in-repo" ]; then \ + # In-repo: install core dependencies directly \ + pip install --no-cache-dir \ + 'fastapi>=0.104.0' \ + 'pydantic>=2.0.0' \ + 'uvicorn[standard]>=0.24.0' \ + 'requests>=2.25.0' \ + 'smolagents>=1.22.0,<2'; \ + else \ + # Standalone: install from pyproject.toml (includes openenv-core from PyPI) \ + pip install --no-cache-dir -e .; \ + fi + +# Convert ARG to ENV so it's available at runtime +ENV BUILD_MODE=${BUILD_MODE} +ENV PYTHONUNBUFFERED=1 +ENV ENABLE_WEB_INTERFACE=true + +# Set PYTHONPATH based on build mode (evaluated at build time) +RUN if [ "$BUILD_MODE" = "in-repo" ]; then \ + echo "export PYTHONPATH=/app/src" >> /etc/environment; \ + fi # Expose port EXPOSE 8000 -# Set environment variables -ENV PYTHONUNBUFFERED=1 -ENV ENABLE_WEB_INTERFACE=true +# Health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 -# Run the server -CMD ["python", "-m", "uvicorn", "coding_env.server.app:app", "--host", 
"0.0.0.0", "--port", "8000"] +# Run the server with correct module path based on build mode +# In-repo: envs.coding_env.server.app:app (with PYTHONPATH=/app/src) +# Standalone: coding_env.server.app:app (with package installed) +CMD if [ "$BUILD_MODE" = "in-repo" ]; then \ + export PYTHONPATH=/app/src && uvicorn envs.coding_env.server.app:app --host 0.0.0.0 --port 8000; \ + else \ + uvicorn coding_env.server.app:app --host 0.0.0.0 --port 8000; \ + fi diff --git a/src/envs/coding_env/server/app.py b/src/envs/coding_env/server/app.py index 1a5edf7cb..6bfbf962a 100644 --- a/src/envs/coding_env/server/app.py +++ b/src/envs/coding_env/server/app.py @@ -21,10 +21,23 @@ python -m envs.coding_env.server.app """ -from openenv_core.env_server import create_app - -from coding_env.models import CodeAction, CodeObservation -from coding_env.server.python_codeact_env import PythonCodeActEnv +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.env_server import create_app +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.env_server import create_app + +# Use relative/absolute imports that work in both modes +try: + # Standalone mode + from coding_env.models import CodeAction, CodeObservation + from coding_env.server.python_codeact_env import PythonCodeActEnv +except ImportError: + # In-repo mode + from envs.coding_env.models import CodeAction, CodeObservation + from envs.coding_env.server.python_codeact_env import PythonCodeActEnv # Create the environment instance env = PythonCodeActEnv() diff --git a/src/envs/coding_env/server/python_codeact_env.py b/src/envs/coding_env/server/python_codeact_env.py index ecc93d9fe..3e744a5ee 100644 --- a/src/envs/coding_env/server/python_codeact_env.py +++ b/src/envs/coding_env/server/python_codeact_env.py @@ -13,11 +13,27 @@ import uuid -from openenv_core.env_server.interfaces import Action, Environment, Observation -from 
coding_env.server.python_executor import PyExecutor +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.env_server.interfaces import Action, Environment, Observation +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.env_server.interfaces import Action, Environment, Observation -from coding_env.models import CodeAction, CodeObservation, CodeState -from .transforms import create_safe_coding_transform +# Use relative/absolute imports that work in both modes +try: + from coding_env.models import CodeAction, CodeObservation, CodeState + + # Standalone mode + from coding_env.server.python_executor import PyExecutor + from coding_env.server.transforms import create_safe_coding_transform +except ImportError: + from envs.coding_env.models import CodeAction, CodeObservation, CodeState + + # In-repo mode + from envs.coding_env.server.python_executor import PyExecutor + from envs.coding_env.server.transforms import create_safe_coding_transform class PythonCodeActEnv(Environment): @@ -105,6 +121,7 @@ def step(self, action: Action) -> Observation: stdout=result.stdout, stderr=result.stderr, exit_code=result.exit_code, + metadata={"last_code": action.code}, # Add code to metadata for transforms ) return self._apply_transform(observation) diff --git a/src/envs/coding_env/server/python_executor.py b/src/envs/coding_env/server/python_executor.py index 17b6ecc13..19024f0da 100644 --- a/src/envs/coding_env/server/python_executor.py +++ b/src/envs/coding_env/server/python_executor.py @@ -27,7 +27,13 @@ from smolagents import LocalPythonExecutor -from openenv_core.env_server.types import CodeExecResult +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.env_server.types import CodeExecResult +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.env_server.types import 
CodeExecResult logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) diff --git a/src/envs/coding_env/server/transforms.py b/src/envs/coding_env/server/transforms.py index ee5a1c4b0..39d2e5a80 100644 --- a/src/envs/coding_env/server/transforms.py +++ b/src/envs/coding_env/server/transforms.py @@ -9,11 +9,25 @@ import ast import re -from openenv_core.env_server.base_transforms import CompositeTransform -from openenv_core.env_server.interfaces import Transform -from openenv_core.env_server.types import Observation - -from coding_env.models import CodeObservation +# Support both standalone and in-repo imports +try: + # Standalone imports (when installed from pip) + from openenv_core.env_server.base_transforms import CompositeTransform + from openenv_core.env_server.interfaces import Transform + from openenv_core.env_server.types import Observation +except ImportError: + # In-repo imports (when running from OpenEnv repository) + from core.env_server.base_transforms import CompositeTransform + from core.env_server.interfaces import Transform + from core.env_server.types import Observation + +# Use relative/absolute imports that work in both modes +try: + # Standalone mode + from coding_env.models import CodeObservation +except ImportError: + # In-repo mode + from envs.coding_env.models import CodeObservation class CodeSafetyTransform(Transform): diff --git a/tests/envs/test_python_codeact_rewards.py b/tests/envs/test_python_codeact_rewards.py new file mode 100644 index 000000000..0a5bd811e --- /dev/null +++ b/tests/envs/test_python_codeact_rewards.py @@ -0,0 +1,270 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +"""Test that PythonCodeActEnv properly computes rewards via transform pipeline.""" + +import sys +from pathlib import Path + +import pytest + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + +from envs.coding_env.models import CodeAction +from envs.coding_env.server.python_codeact_env import PythonCodeActEnv + + +# ============================================================================ +# Fixtures +# ============================================================================ + + +@pytest.fixture +def env(): + """Provides a fresh PythonCodeActEnv for each test.""" + environment = PythonCodeActEnv() + environment.reset() + return environment + + +@pytest.fixture +def env_with_variable(env): + """Environment with a variable already defined.""" + env.step(CodeAction(code="test_var = 42")) + return env + + +# ============================================================================ +# Parametrized Tests - Reward Computation +# ============================================================================ + + +@pytest.mark.parametrize( + "code,expected_reward,expected_exit_code,description", + [ + # Safe + concise code + ("x = 5", 0.1, 0, "safe + concise"), + ("print('Hello')", 0.1, 0, "safe + concise print"), + ("y = 10 + 5", 0.1, 0, "safe + concise calculation"), + # Safe + verbose code (>100 chars, no concise bonus) + ("x = " + " + ".join(str(i) for i in range(50)), 0.0, 0, "safe + verbose"), + # Dangerous + concise (-1.0 safety + 0.1 concise = -0.9) + # NOTE: These actually fail at execution, so exit_code=1 + ("import os", -0.9, 1, "dangerous + concise"), + ("eval('1+1')", -0.9, 1, "dangerous eval"), + ("exec('x=1')", -0.9, 1, "dangerous exec"), + ("with open('f.txt') as f: pass", -0.9, 1, "dangerous open"), + # Dangerous + verbose (-1.0 safety, no concise bonus) + ("import os\n" + "x = 1\n" * 50, -1.0, 1, "dangerous + verbose"), + # Syntax error + concise (0.0 safe - 0.2 syntax + 0.1 concise = -0.1) + 
("print('unclosed", -0.1, 1, "syntax error + concise"), + # Syntax error + verbose (0.0 safe - 0.2 syntax = -0.2) + ( + "x = " + " + ".join(str(i) for i in range(50)) + "\nprint('unclosed", + -0.2, + 1, + "syntax error + verbose", + ), + ], + ids=lambda x: ( + x if isinstance(x, str) and len(x) < 20 else None + ), # Short string params (<20 chars) are used verbatim in test IDs; pytest auto-generates the rest +) +def test_reward_computation( + env, code, expected_reward, expected_exit_code, description +): + """Test reward computation for various code patterns. + + Parametrized test covering: + - Safe code (concise and verbose) + - Dangerous patterns (import os, eval, exec, open) + - Syntax errors + - Combinations of safety and quality transforms + + Uses pytest.approx() for all float comparisons since rewards are computed + via floating point addition in the transform pipeline (transforms.py line 101). + """ + action = CodeAction(code=code) + obs = env.step(action) + + assert obs.reward == pytest.approx( + expected_reward, rel=1e-9 + ), f"{description}: expected reward {expected_reward}, got {obs.reward}" + assert ( + obs.exit_code == expected_exit_code + ), f"{description}: expected exit_code {expected_exit_code}, got {obs.exit_code}" + + +# ============================================================================ +# Metadata Tests +# ============================================================================ + + +def test_metadata_contains_last_code(env): + """Test that step() includes executed code in observation metadata. + + This is CRITICAL for the transform pipeline to evaluate code and assign rewards. + Without metadata["last_code"], transforms cannot access the code and rewards + will always be None. 
+ """ + code = "print('Hello, World!')" + action = CodeAction(code=code) + obs = env.step(action) + + assert ( + "last_code" in obs.metadata + ), "metadata must contain 'last_code' for transform pipeline to evaluate code" + assert ( + obs.metadata["last_code"] == code + ), f"metadata['last_code'] should be '{code}', got '{obs.metadata.get('last_code')}'" + + +@pytest.mark.parametrize( + "code,should_have_violation", + [ + ("import os", True), + ("eval('1+1')", True), + ("open('file.txt')", True), + ("print('safe')", False), + ("x = 1 + 2", False), + ], +) +def test_metadata_safety_violations(env, code, should_have_violation): + """Test that metadata correctly tracks safety violations.""" + action = CodeAction(code=code) + obs = env.step(action) + + assert "last_code" in obs.metadata + assert obs.metadata["last_code"] == code + + if should_have_violation: + assert ( + "safety_violation" in obs.metadata + ), f"Code '{code}' should have safety_violation in metadata" + else: + assert ( + "safety_violation" not in obs.metadata + ), f"Code '{code}' should NOT have safety_violation in metadata" + + +# ============================================================================ +# Consistency and State Tests +# ============================================================================ + + +def test_reward_not_none_for_safe_code(env): + """Test that safe code always receives a non-None reward.""" + action = CodeAction(code="print('Hello')") + obs = env.step(action) + + assert obs.reward is not None, "Safe code should receive a reward (not None)" + assert obs.exit_code == 0, "Safe code should execute successfully" + + +def test_reward_consistency_across_steps(env): + """Test that rewards are computed consistently across multiple steps.""" + for i in range(5): + action = CodeAction(code=f"x = {i}") + obs = env.step(action) + + assert obs.reward is not None, f"Step {i}: Reward should not be None" + assert obs.reward == pytest.approx( + 0.1, rel=1e-9 + ), f"Step {i}: Should 
get consistent 0.1 reward, got {obs.reward}" + + +def test_reset_preserves_transform_functionality(env): + """Test that reset() doesn't break reward computation.""" + # First episode + action1 = CodeAction(code="x = 1") + obs1 = env.step(action1) + assert obs1.reward == pytest.approx(0.1, rel=1e-9) + + # Reset and start new episode + env.reset() + action2 = CodeAction(code="y = 2") + obs2 = env.step(action2) + assert obs2.reward == pytest.approx( + 0.1, rel=1e-9 + ), "Reward computation should work after reset" + + +# ============================================================================ +# Fixture Composition Tests +# ============================================================================ + + +def test_using_composed_fixture(env_with_variable): + """Test using an environment that builds on base fixture.""" + action = CodeAction(code="print(test_var)") + obs = env_with_variable.step(action) + + assert obs.exit_code == 0 + assert "42" in obs.stdout + assert obs.reward == pytest.approx(0.1, rel=1e-9) + + +@pytest.mark.parametrize( + "code,expected_output", + [ + ("print(test_var)", "42"), + ("print(test_var * 2)", "84"), + ("print(test_var + 8)", "50"), + ], +) +def test_fixture_with_parametrization(env_with_variable, code, expected_output): + """Test combining fixtures with parametrization.""" + action = CodeAction(code=code) + obs = env_with_variable.step(action) + + assert obs.exit_code == 0 + assert expected_output in obs.stdout + assert obs.reward == pytest.approx(0.1, rel=1e-9) + + +# ============================================================================ +# Edge Cases and Special Patterns +# ============================================================================ + + +@pytest.mark.parametrize( + "dangerous_pattern", + [ + "import os", + "import subprocess", + "eval('x')", + "exec('x=1')", + "__import__('os')", + "open('file.txt')", + ], +) +def test_all_dangerous_patterns_detected(env, dangerous_pattern): + """Test that all dangerous 
patterns are correctly detected and penalized.""" + action = CodeAction(code=dangerous_pattern) + obs = env.step(action) + + # Concise dangerous code gets -0.9 (-1.0 safety + 0.1 concise) + assert obs.reward == pytest.approx( + -0.9, rel=1e-9 + ), f"Pattern '{dangerous_pattern}' should get -0.9 reward, got {obs.reward}" + assert "safety_violation" in obs.metadata + + +def test_multiline_code_with_mixed_patterns(env): + """Test code with both safe and dangerous patterns (dangerous wins).""" + code = """ +x = 5 +y = 10 +import os +z = x + y +""" + action = CodeAction(code=code) + obs = env.step(action) + + # Should be flagged as dangerous even with safe code mixed in + assert obs.reward < 0, "Code with dangerous import should have negative reward" + assert "safety_violation" in obs.metadata