9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -82,6 +82,12 @@ jobs:
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

+      - name: Install OpenEnv for integration tests
+        run: |
+          # Install OpenEnv core and echo environment
+          uv pip install "openenv-core"
+          uv pip install "openenv-echo-env @ git+https://github.com/meta-pytorch/OpenEnv.git#subdirectory=src/envs/echo_env"
+
Contributor:
Hmm, I would prefer not to install more dependencies in ci.yml like this, as the tests are getting quite slow.

Contributor:
Can we create a separate test job that runs in parallel, specifically for the tests you want to add?
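A minimal sketch of such a parallel job, for illustration only (the job name, runner, and checkout/uv setup steps are assumptions, not part of this PR; the install commands and test path come from the diff):

  # Hypothetical companion job: runs the OpenEnv integration tests in
  # parallel with the core test job, keeping the extra dependencies out
  # of the main test path.
  openenv-integration:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Assumes the repo's usual uv setup; shown here via the setup-uv action
      - uses: astral-sh/setup-uv@v3
      - name: Install OpenEnv dependencies
        run: |
          uv pip install "openenv-core"
          uv pip install "openenv-echo-env @ git+https://github.com/meta-pytorch/OpenEnv.git#subdirectory=src/envs/echo_env"
      - name: Run OpenEnv integration tests
        run: |
          pytest tests/pytest/test_openenv_echo_hub.py -v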

      - name: Run Core Tests with pytest-xdist
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -109,6 +115,9 @@ jobs:
            --ignore=tests/remote_server/test_remote_fireworks.py \
            --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
            --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
+            --ignore=tests/pytest/test_openenv_browsergym_basic.py \
+            --ignore=tests/pytest/test_openenv_browsergym_eval.py \
+            --ignore=tests/pytest/test_openenv_textarena_docker.py \
            --ignore=eval_protocol/benchmarks/ \
            --ignore=eval_protocol/quickstart/ \
            --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
8 changes: 6 additions & 2 deletions tests/pytest/test_openenv_browsergym_basic.py
@@ -12,6 +12,12 @@
# Skip these integration-heavy tests on CI runners by default
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")

+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+

@pytest.mark.integration
def test_openenv_browsergym_basic():
@@ -43,8 +49,6 @@ def test_openenv_browsergym_basic():

    # Construct the processor with a trivial action_parser; the model output will still be generated
    # but we parse to a safe noop action to minimize flakiness for the environment step.
-    from envs.browsergym_env import BrowserGymAction, BrowserGymEnv  # type: ignore
-
    processor = OpenEnvRolloutProcessor(
        env_factory=None,
        prompt_builder=lambda obs, step, history: "Do nothing",
6 changes: 6 additions & 0 deletions tests/pytest/test_openenv_browsergym_eval.py
@@ -10,6 +10,12 @@
# Skip these integration-heavy tests on CI runners by default
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")

+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+

def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
"""
44 changes: 12 additions & 32 deletions tests/pytest/test_openenv_echo_hub.py
@@ -8,13 +8,8 @@
from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
import pytest

-
-# Preferred import when using the monolithic `openenv` package
-from envs.echo_env import EchoEnv  # type: ignore
-
-
-# Skip these integration-heavy tests on CI runners by default
-pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
+# Import OpenEnv Echo environment
+from envs.echo_env import EchoEnv, EchoAction  # type: ignore
shreymodi1 marked this conversation as resolved.


def echo_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -39,23 +34,10 @@ def action_parser(response_text: str):
"""
Convert raw model response to EchoAction.
"""
try:
from envs.echo_env import EchoAction # type: ignore
except Exception:
pytest.skip("OpenEnv (openenv.envs.echo_env) is not installed; skipping Echo hub test.")
raise
text = response_text.strip() if isinstance(response_text, str) else ""
return EchoAction(message=text or "hello")


-# try:
-#     from envs.echo_env import EchoEnv  # type: ignore
-
-#     _HAS_ECHO = True
-# except Exception:
-#     _HAS_ECHO = False
-

# Inline test data
ECHO_INLINE_DATA: List[Dict[str, Any]] = [
{"id": "echo-1", "prompt": "hello"},
@@ -76,18 +58,16 @@ def action_parser(response_text: str):
    num_runs=1,
    max_concurrent_rollouts=2,
    mode="pointwise",
-    rollout_processor=(
-        OpenEnvRolloutProcessor(
-            # Use HF Hub to launch the environment container automatically
-            env_client_cls=EchoEnv,  # type: ignore
-            hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
-            # Simple prompt+parser above
-            prompt_builder=prompt_builder,
-            action_parser=action_parser,
-            # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
-            timeout_ms=5000,
-            num_generations=1,
-        )
+    rollout_processor=OpenEnvRolloutProcessor(
+        # Use HF Hub to launch the environment container automatically
+        env_client_cls=EchoEnv,
+        hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
+        # Simple prompt+parser above
+        prompt_builder=prompt_builder,
+        action_parser=action_parser,
+        # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
+        timeout_ms=5000,
+        num_generations=1,
+    ),
)
def test_openenv_echo_hub(row: EvaluationRow) -> EvaluationRow: