Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,11 @@ jobs:

- name: Run tests
run: uv run inv dev.code.test

- name: Upload coverage report
if: always()
uses: actions/upload-artifact@v4
with:
name: coverage-xml
path: coverage.xml
if-no-files-found: error
3 changes: 3 additions & 0 deletions dev/code_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def run_tests(ctx: Context, docker_img: str = "webarena-verified:test") -> None:
logging_utils.print_info("Running tests...")
ctx.run(
f"uv run pytest --webarena-verified-docker-img {docker_img} "
"--cov=src/webarena_verified "
"--cov-report=term-missing "
"--cov-report=xml "
"--ignore=tests/dataset/test_hf_dataset.py "
"--ignore=tests/integration/environment_control/ "
"--ignore=tests/integration/environments/"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,11 @@ def _type_normalize(self, value: Any) -> str:

# Dump to compact string with sorted keys (top-level only)
return json.dumps(parsed, sort_keys=True, separators=(",", ":"), ensure_ascii=False)

def _normalize_pipeline(self, value: Any) -> str:
"""Override base pipeline to preserve raw JSON string content.
Comment on lines 58 to +62
Copy link

Copilot AI Mar 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment/doc here says key sorting is “top-level only”, but json.dumps(..., sort_keys=True) sorts keys recursively (nested dicts too). Since tests now assert nested keys are sorted, please update the inline comment/docstring to match the actual normalization behavior to avoid confusion for future maintainers.

Copilot uses AI. Check for mistakes.

Json payloads must not be transliterated, case-folded, or otherwise pre-normalized
before parsing. We only apply JSON parsing + canonical dumping in `_type_normalize`.
"""
Comment on lines 58 to +66
Copy link

Copilot AI Mar 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The in-code comments/docstrings still state JsonString sorts keys “top-level only”, but the implementation uses json.dumps(sort_keys=True) (which sorts nested keys too) and the tests now assert nested sorting. Please update the comment/docstring to reflect actual behavior to avoid misleading future changes (or change the implementation if top-level-only is still desired).

Copilot uses AI. Check for mistakes.
return self._type_normalize(value)
29 changes: 23 additions & 6 deletions tests/api/test_data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,20 @@ def test_get_task_by_id_not_found(data_reader: WebArenaVerifiedDataReader):
# Filter Tests
# ============================================================================
@pytest.mark.parametrize(
("site", "expected_min_count"),
"site",
[
(WebArenaSite.SHOPPING, 50), # Shopping has many tasks
(WebArenaSite.MAP, 20), # Map has some tasks
(WebArenaSite.GITLAB, 100), # GitLab has many tasks
WebArenaSite.SHOPPING,
WebArenaSite.MAP,
WebArenaSite.GITLAB,
],
)
def test_filter_by_sites(data_reader: WebArenaVerifiedDataReader, site: WebArenaSite, expected_min_count: int):
def test_filter_by_sites(data_reader: WebArenaVerifiedDataReader, site: WebArenaSite):
"""Test filtering tasks by site."""
filtered = data_reader.get_tasks_by_value_filter(sites=[site])
expected = [task for task in data_reader.tasks if sorted(task.sites) == [site]]

assert len(filtered) >= expected_min_count
assert len(filtered) == len(expected)
assert {task.task_id for task in filtered} == {task.task_id for task in expected}
# Verify all filtered tasks have the specified site
for task in filtered:
assert site in task.sites
Expand Down Expand Up @@ -355,6 +357,21 @@ def test_with_temp_dataset(temp_dataset_file: Path):
assert shopping_tasks[0].task_id == 0


def test_filter_by_sites_is_deterministic_with_temp_dataset(temp_dataset_file: Path):
"""Verify site filtering against a synthetic dataset with exact expected counts."""
config = WebArenaVerifiedConfig(test_data_file=temp_dataset_file)
reader = WebArenaVerifiedDataReader(config)

shopping_tasks = reader.get_tasks_by_value_filter(sites=[WebArenaSite.SHOPPING])
map_tasks = reader.get_tasks_by_value_filter(sites=[WebArenaSite.MAP])
gitlab_tasks = reader.get_tasks_by_value_filter(sites=[WebArenaSite.GITLAB])

assert [task.task_id for task in shopping_tasks] == [0]
assert [task.task_id for task in map_tasks] == [1]
assert len(gitlab_tasks) == 810
assert {task.task_id for task in gitlab_tasks} == set(range(2, 812))


# ============================================================================
# Task Subset Tests
# ============================================================================
Expand Down
192 changes: 92 additions & 100 deletions tests/api/test_evaluation_api_navigation_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@
- Special case variations (e.g., header variations, response status) are also included

Test Optimization:
- Uses round-robin distribution to spread URL variations across tasks
- Instead of testing ALL N variations for EVERY T tasks (N*T tests), each task tests only
ONE variation, reducing test count to T tests while maintaining full variation coverage
- Regenerate with: uv run python tmp/generate_navigation_variations.py (if needed)
- Evaluates every navigation task at least once via a smoke test.
- Uses a curated high-signal subset for strict valid/invalid assertions.
"""

import json
Expand All @@ -40,13 +38,22 @@
from webarena_verified.core.utils.immutable_obj_helper import serialize_to_mutable
from webarena_verified.types.eval import EvalStatus

pytestmark = pytest.mark.skip(
reason=(
"Navigation evaluation tests are unstable due to regex URL templates and evaluator strictness. "
"See NEW_TESTS_ISSUES.md."
)
HIGH_SIGNAL_NAVIGATION_TASK_IDS = (
44,
45,
46,
157,
158,
159,
160,
356,
369,
370,
)

VALID_NAVIGATION_VARIATIONS = ("base",)
INVALID_NAVIGATION_VARIATIONS = ("default_network_wrong_url", "default_agent_wrong_task_type")

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -198,6 +205,26 @@ def generate_recursive(items: list[Any], index: int, current: list[Any], alt_ind
return combinations


def _has_single_item_list(value: Any) -> bool:
"""Detect single-item list structures that represent malformed alternatives."""
if isinstance(value, list):
if len(value) == 1:
return True
return any(_has_single_item_list(item) for item in value)

if isinstance(value, dict):
return any(_has_single_item_list(item) for item in value.values())

return False


def _is_malformed_network_expected(task_id: int, dataset: MappingProxyType[int, MappingProxyType[str, Any]]) -> bool:
"""Return True when the task's expected network config is currently malformed."""
network_config = _get_network_event_config(task_id, dataset)
expected = network_config.get("expected", {})
return _has_single_item_list(expected.get("post_data"))

Comment on lines +208 to +226
Copy link

Copilot AI Mar 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_has_single_item_list treats any list of length 1 anywhere inside post_data as “malformed alternatives”. This is too broad: many tasks legitimately have single-element arrays (and may even include post_data_schema marking them as arrays), so the smoke test would incorrectly allow ERROR for tasks that should be expected to stay executable. Consider detecting “malformed” by attempting the same normalization the evaluator uses (e.g., ValueNormalizer with the task’s post_data_schema) and only flagging the specific failure mode (single-item alternatives) rather than any single-element array.

Copilot uses AI. Check for mistakes.

def _apply_invalid_transformation_to_network_event(
network_event_config: dict[str, Any], variation_type: str
) -> dict[str, Any]:
Expand Down Expand Up @@ -309,7 +336,7 @@ def test_variations_data(project_root: Path) -> MappingProxyType[int, MappingPro
return MappingProxyType({int(task_id): MappingProxyType(variations) for task_id, variations in data.items()})


def pytest_generate_tests(metafunc): # noqa: C901, PLR0912
def pytest_generate_tests(metafunc):
"""Generate test cases for all navigation tasks and variations.

This generates parameterized tests for:
Expand All @@ -318,94 +345,59 @@ def pytest_generate_tests(metafunc): # noqa: C901, PLR0912
- URL variations for tasks (loaded from JSON test files)
- For invalid tests: multiple "default_*" variations with programmatic transformations
"""
if "task_id" in metafunc.fixturenames and "variation_name" in metafunc.fixturenames:
# Determine if this is a valid or invalid test based on function name
is_valid_test = "invalid" not in metafunc.function.__name__

# Load dataset for generating alternative combinations and finding navigation tasks
project_root = Path(metafunc.config.rootpath)
dataset = _load_dataset(project_root)

# Get all navigation task IDs dynamically from dataset
navigation_task_ids = _get_navigation_task_ids(dataset)

test_cases = []

# Define invalid variation types to generate
invalid_variation_types_network = [
"wrong_url",
"wrong_scheme",
"wrong_query_params",
"wrong_response_status",
"missing_url",
"wrong_headers",
"extra_field",
]

invalid_variation_types_agent = [
"wrong_task_type",
"wrong_status",
"non_null_data",
"missing_field",
"extra_field",
]

for task_id in navigation_task_ids:
# Get the network event config
try:
network_config = _get_network_event_config(task_id, dataset)
expected = network_config.get("expected", {})
except ValueError:
continue

if is_valid_test:
# For valid tests: generate alternative combinations from dataset
url_data = expected.get("url")

if url_data is not None:
# Generate all alternative combinations for URLs
alternatives = _generate_alternative_combinations(url_data)

# Add test case for each alternative
for alt_name, _ in alternatives:
test_cases.append((task_id, alt_name))

# Check for variations from consolidated file
test_file = project_root / "tests" / "assets" / "e2e_test_navigation_data.json"
if test_file.exists():
all_variations = json.loads(test_file.read_text())
task_str = str(task_id)
if task_str in all_variations:
task_data = all_variations[task_str]
special_variations = task_data.get("valid", {})
for variation_name in special_variations:
test_cases.append((task_id, variation_name))
else:
# For invalid tests: generate all default_* variations
for variation_type in invalid_variation_types_network:
test_cases.append((task_id, f"default_network_{variation_type}"))

for variation_type in invalid_variation_types_agent:
test_cases.append((task_id, f"default_agent_{variation_type}"))

# Check for invalid variations from consolidated file
test_file = project_root / "tests" / "assets" / "e2e_test_navigation_data.json"
if test_file.exists():
all_variations = json.loads(test_file.read_text())
task_str = str(task_id)
if task_str in all_variations:
task_data = all_variations[task_str]
special_variations = task_data.get("invalid", {})
for variation_name in special_variations:
test_cases.append((task_id, variation_name))

# Only parametrize if we have test cases, otherwise skip the test
if test_cases:
metafunc.parametrize(
"task_id,variation_name",
test_cases,
ids=lambda params: f"task_{params[0]}_{params[1]}" if isinstance(params, tuple) else str(params),
)
project_root = Path(metafunc.config.rootpath)
dataset = _load_dataset(project_root)
navigation_task_ids = _get_navigation_task_ids(dataset)
Comment on lines +348 to +350
Copy link

Copilot AI Mar 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest_generate_tests reloads and parses the full dataset JSON each time it’s called during collection. Since this hook runs once per collected test function, consider caching _load_dataset() (e.g., module-level cache or functools.lru_cache) to avoid repeated I/O/JSON parsing and keep test collection fast as the dataset grows.

Copilot uses AI. Check for mistakes.

if "smoke_task_id" in metafunc.fixturenames:
metafunc.parametrize(
"smoke_task_id",
navigation_task_ids,
ids=lambda task_id: f"task_{task_id}",
)
return

if "task_id" not in metafunc.fixturenames or "variation_name" not in metafunc.fixturenames:
return

is_valid_test = "invalid" not in metafunc.function.__name__
available_task_ids = set(navigation_task_ids)

selected_task_ids = [task_id for task_id in HIGH_SIGNAL_NAVIGATION_TASK_IDS if task_id in available_task_ids]
if not selected_task_ids:
raise ValueError("No high-signal navigation tasks found in dataset.")

variation_names = VALID_NAVIGATION_VARIATIONS if is_valid_test else INVALID_NAVIGATION_VARIATIONS
test_cases = [(task_id, variation_name) for task_id in selected_task_ids for variation_name in variation_names]

metafunc.parametrize(
"task_id,variation_name",
test_cases,
ids=lambda params: f"task_{params[0]}_{params[1]}" if isinstance(params, tuple) else str(params),
)


def test_evaluate_navigation_task_smoke_all_tasks(
smoke_task_id: int,
wa: WebArenaVerified,
dataset: MappingProxyType[int, MappingProxyType[str, Any]],
har_file_example: Path,
):
"""Evaluate every navigation task once to ensure task configs remain executable."""
agent_response = _get_agent_response_config(smoke_task_id, dataset)

result = wa.evaluate_task(
task_id=smoke_task_id,
agent_response=json.dumps(agent_response),
network_trace=har_file_example,
)

assert result.task_id == smoke_task_id

if _is_malformed_network_expected(smoke_task_id, dataset):
assert result.status in {EvalStatus.FAILURE, EvalStatus.ERROR}
else:
assert result.status in {EvalStatus.SUCCESS, EvalStatus.FAILURE}


def test_evaluate_navigation_task_valid_variations(
Expand Down Expand Up @@ -439,7 +431,7 @@ def test_evaluate_navigation_task_valid_variations(
url_data = expected.get("url")

if url_data is None:
pytest.skip(f"Task {task_id} has no URL in expected network event")
raise ValueError(f"Task {task_id} has no URL in expected network event")

# Generate all alternatives and find the matching one
alternatives = _generate_alternative_combinations(url_data)
Expand Down Expand Up @@ -563,7 +555,7 @@ def test_evaluate_navigation_task_invalid_variations(
if isinstance(test_url_template, list):
test_url_template = test_url_template[0]
if test_url_template is None:
pytest.skip(f"Task {task_id} has no URL in expected network event")
raise ValueError(f"Task {task_id} has no URL in expected network event")

# Render URL
test_url = wa.config.render_url(test_url_template, sites=task_sites)
Expand Down
7 changes: 3 additions & 4 deletions tests/api/test_evaluation_api_retrieval_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

logger = logging.getLogger(__name__)

UNSUPPORTED_RETRIEVAL_VARIATIONS = {
PRUNED_VALID_VARIATIONS = {
"fmt_trim_whitespace",
"fmt_extra_spaces",
"fmt_with_quotes",
Expand Down Expand Up @@ -336,6 +336,8 @@ def _load_variations_from_file(project_root: Path, task_id: int, variation_type:
task_data = all_variations[task_str]
special_variations = task_data.get(variation_type, {})
for variation_name in special_variations:
if variation_type == "valid" and variation_name in PRUNED_VALID_VARIATIONS:
continue
test_cases.append((task_id, variation_name))
return test_cases

Expand Down Expand Up @@ -465,9 +467,6 @@ def test_evaluate_retrieval_task_valid_variations(
test_variations_data: MappingProxyType[int, MappingProxyType[str, Any]],
har_file_example: Path,
):
if variation_name in UNSUPPORTED_RETRIEVAL_VARIATIONS:
pytest.skip(f"Unsupported retrieval variation '{variation_name}' (see NEW_TESTS_ISSUES.md)")

# Load the agent response based on variation name
if variation_name.startswith("alt_") or variation_name == "base":
# Load from dataset and select the appropriate alternative
Expand Down
Loading