From 7a4b4fa0ea2538457679e1c4a2dd5c5f67ca00b5 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 13:27:46 -0500 Subject: [PATCH 01/14] Agent initial impl Signed-off-by: Mike Knepper --- .../src/data_designer_nemo/context.py | 12 +++- .../fileset_filesystem_provider.py | 63 +++++++++++++++++++ .../src/data_designer_nemo/seed.py | 55 +++++++++++++++- .../unsupported_features.py | 5 +- .../unit/test_fileset_filesystem_provider.py | 46 ++++++++++++++ .../unit/test_remote_filesystem_seeds.py | 53 ++++++++++++++++ 6 files changed, 230 insertions(+), 4 deletions(-) create mode 100644 packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py create mode 100644 packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py create mode 100644 packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py diff --git a/packages/data_designer_nemo/src/data_designer_nemo/context.py b/packages/data_designer_nemo/src/data_designer_nemo/context.py index 0d8eb699ab..b751864dea 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/context.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/context.py @@ -23,6 +23,7 @@ ) from data_designer_nemo.errors import NDDError from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader +from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider from data_designer_nemo.model_provider import ( make_local_first_model_provider_registry, make_model_provider_registry, @@ -113,6 +114,7 @@ class RemoteDataDesignerContext: def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str): self._sdk = sdk self._workspace = workspace + self._validated_filesystem_roots: set[str] = set() def get_secret_resolver(self) -> SecretResolver: return NMPSecretResolver(self._sdk, self._workspace) @@ -130,7 +132,8 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: except NDDError as e: errors.append(e) try: - 
await validate_seed(config, self._workspace, sdk) + if validated_root := await validate_seed(config, self._workspace, sdk): + self._validated_filesystem_roots.add(validated_root) except NDDError as e: errors.append(e) try: @@ -141,9 +144,16 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: return errors def get_seed_readers(self) -> list[SeedReader]: + provider = FilesetFileSystemProvider( + self._sdk, + workspace=self._workspace, + validated_roots=self._validated_filesystem_roots, + ) return [ HuggingFaceSeedReader(), FilesetFileSeedReader(self._sdk), + DirectorySeedReader(fs_provider=provider), + FileContentsSeedReader(fs_provider=provider), ] def get_person_reader(self) -> PersonReader: diff --git a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py new file mode 100644 index 0000000000..e9a4b9e43e --- /dev/null +++ b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from pathlib import PurePosixPath + +from data_designer.engine.resources.seed_reader import ( + SeedReaderConfigError, + SeedReaderError, + SeedReaderFileSystemContext, +) +from data_designer_nemo.sdk_translation import async_to_sync_sdk +from fsspec.implementations.dirfs import DirFileSystem +from nemo_platform import AsyncNeMoPlatform, NeMoPlatform +from nemo_platform.filesets import FilesetFileSystem, FilesetPathError, build_fileset_ref, parse_fileset_ref + + +class FilesetFileSystemProvider: + """Filesystem provider that roots directory-style seed readers in a fileset.""" + + def __init__( + self, + sdk: NeMoPlatform | AsyncNeMoPlatform, + *, + workspace: str, + validated_roots: set[str] | None = None, + ) -> None: + if isinstance(sdk, AsyncNeMoPlatform): + sdk = async_to_sync_sdk(sdk) + self._sdk = sdk + self._workspace = workspace + self._validated_roots = set() if validated_roots is None else validated_roots + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + root = self._canonical_root(runtime_path) + rooted_fs = DirFileSystem(path=root, fs=FilesetFileSystem(self._sdk)) + return SeedReaderFileSystemContext(fs=rooted_fs, root_path=PurePosixPath(root)) + + def ensure_root_exists(self, *, runtime_path: str) -> None: + workspace, fileset, fragment = self._parse(runtime_path) + root = build_fileset_ref(fragment, workspace=workspace, fileset=fileset) + if root in self._validated_roots: + return + + fs = FilesetFileSystem(self._sdk) + if fs.exists(root): + self._validated_roots.add(root) + return + + fileset_root = build_fileset_ref("", workspace=workspace, fileset=fileset) + fully_qualified_fileset_name = f"{workspace}/{fileset}" + if not fs.exists(fileset_root): + raise SeedReaderConfigError(f"🛑 Fileset {fully_qualified_fileset_name!r} not found.") + raise SeedReaderConfigError(f"🛑 Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}.") + + def _canonical_root(self, 
runtime_path: str) -> str: + workspace, fileset, fragment = self._parse(runtime_path) + return build_fileset_ref(fragment, workspace=workspace, fileset=fileset) + + def _parse(self, runtime_path: str) -> tuple[str, str, str]: + try: + return parse_fileset_ref(runtime_path, workspace_fallback=self._workspace) + except FilesetPathError as error: + raise SeedReaderError(f"🛑 Invalid fileset seed source path {runtime_path!r}: {error}") from error diff --git a/packages/data_designer_nemo/src/data_designer_nemo/seed.py b/packages/data_designer_nemo/src/data_designer_nemo/seed.py index 91baf47b70..ebfb5e3bbc 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/seed.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/seed.py @@ -9,6 +9,7 @@ from data_designer_nemo.fileset_file_seed_source import FilesetFileSeedSource from data_designer_nemo.secret_resolver import validate_secret from nemo_platform import AsyncNeMoPlatform, NotFoundError, PermissionDeniedError +from nemo_platform.filesets import FilesetPathError, build_fileset_ref, parse_fileset_ref logger = logging.getLogger(__name__) @@ -17,7 +18,7 @@ def get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None: return dd_config.seed_config.source if dd_config.seed_config else None -async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: AsyncNeMoPlatform) -> None: +async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: AsyncNeMoPlatform) -> str | None: if (seed_source := get_seed_source(dd_config)) is None: return None @@ -38,6 +39,58 @@ async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: A raise NDDInternalError( f"An unexpected error occurred while retrieving fileset {fileset_name!r} in workspace {workspace!r}: {e}" ) from e + return None + + if isinstance(seed_source, dd.DirectorySeedSource | dd.FileContentsSeedSource): + return await _validate_filesystem_seed_source(seed_source.path, workspace, sdk) + + return 
None + + +async def _validate_filesystem_seed_source(path: str, request_workspace: str, sdk: AsyncNeMoPlatform) -> str: + try: + workspace, fileset_name, fragment = parse_fileset_ref(path, workspace_fallback=request_workspace) + except FilesetPathError as e: + raise NDDInvalidConfigError( + f"The fileset reference in seed source path {path!r} is formatted incorrectly" + ) from e + + try: + await sdk.files.filesets.retrieve(name=fileset_name, workspace=workspace) + except NotFoundError as e: + raise NDDInvalidConfigError(f"Could not find fileset {fileset_name!r} in workspace {workspace!r}") from e + except PermissionDeniedError as e: + raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e + except Exception as e: + logger.exception("Error retrieving fileset", extra={"fileset_name": fileset_name, "workspace": workspace}) + raise NDDInternalError( + f"An unexpected error occurred while retrieving fileset {fileset_name!r} in workspace {workspace!r}: {e}" + ) from e + + canonical_root = build_fileset_ref(fragment, workspace=workspace, fileset=fileset_name) + if not fragment: + return canonical_root + + fully_qualified_fileset_name = f"{workspace}/{fileset_name}" + try: + response = await sdk.files.list(remote_path=fragment, fileset=fileset_name, workspace=workspace) + except NotFoundError as e: + raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}") from e + except PermissionDeniedError as e: + raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e + except Exception as e: + logger.exception( + "Error listing fileset path", + extra={"fileset_name": fileset_name, "workspace": workspace, "fragment": fragment}, + ) + raise NDDInternalError( + f"An unexpected error occurred while listing path {fragment!r} in fileset {fully_qualified_fileset_name!r}: {e}" + ) from e + + if not response.data: + raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset 
{fully_qualified_fileset_name!r}") + + return canonical_root def _parse_seed_source_path(path: str, request_workspace: str) -> tuple[str, str]: diff --git a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py b/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py index 013b399f5d..b16af8849e 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py @@ -6,10 +6,11 @@ import data_designer.config as dd from data_designer_nemo.errors import NDDInvalidConfigError -_SUPPORTED_SEED_TYPES = {"hf", "nmp"} +_SUPPORTED_SEED_TYPES = {"directory", "file_contents", "hf", "nmp"} _UNSUPPORTED_SEED_TYPES_MESSAGE = ( "The NeMo Platform Data Designer service only supports seed data from HuggingFace " - "(seed_type=hf) or the Files service (seed_type=nmp)." + "or the NeMo Platform Files service (FilesetFile, Directory, or FileContents seed sources " + "referencing fileset paths). Upload your data to the Files service, adjust your config, and try again." ) _DATAFRAME_SEED_TYPE = "df" _DATAFRAME_SEED_TYPE_MESSAGE = ( diff --git a/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py new file mode 100644 index 0000000000..6e64bcecca --- /dev/null +++ b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock, patch + +import pytest +from data_designer.engine.resources.seed_reader import SeedReaderConfigError +from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider + + +def test_create_context_roots_reader_in_canonical_fileset_ref() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.async_impl = True + fs_class.return_value.asynchronous = False + context = FilesetFileSystemProvider(sdk, workspace="default").create_context(runtime_path="docs#corpus") + + fs_class.assert_called_once_with(sdk) + assert str(context.root_path) == "default/docs#corpus" + + +def test_ensure_root_exists_skips_validated_roots() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + FilesetFileSystemProvider( + sdk, + workspace="default", + validated_roots={"default/docs#corpus"}, + ).ensure_root_exists(runtime_path="docs#corpus") + + fs_class.assert_not_called() + + +def test_ensure_root_exists_reports_missing_fileset_path() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.exists.side_effect = [False, True] + provider = FilesetFileSystemProvider(sdk, workspace="default") + + with pytest.raises(SeedReaderConfigError, match="Path 'corpus' not found in fileset 'default/docs'"): + provider.ensure_root_exists(runtime_path="docs#corpus") + + assert fs_class.return_value.exists.call_count == 2 diff --git a/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py new file mode 100644 index 0000000000..aebd19dc8d --- /dev/null +++ b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import Any +from unittest.mock import AsyncMock, Mock + +import data_designer.config as dd +import pytest +from data_designer.engine.resources.seed_reader import DirectorySeedReader, FileContentsSeedReader +from data_designer_nemo.context import RemoteDataDesignerContext +from data_designer_nemo.seed import validate_seed +from data_designer_nemo.unsupported_features import validate_seed_config_for_execution_context +from nemo_platform import AsyncNeMoPlatform + + +def _make_config(source: Any) -> dd.DataDesignerConfig: + builder = dd.DataDesignerConfigBuilder() + builder.with_seed_dataset(source) + return builder.build() + + +def test_remote_seed_type_validation_allows_filesystem_seed_sources() -> None: + validate_seed_config_for_execution_context( + _make_config(dd.DirectorySeedSource(path="workspace/docs#corpus")), + is_local=False, + ) + validate_seed_config_for_execution_context( + _make_config(dd.FileContentsSeedSource(path="workspace/docs#corpus")), + is_local=False, + ) + + +def test_remote_context_includes_filesystem_seed_readers() -> None: + readers = RemoteDataDesignerContext(Mock(), "default").get_seed_readers() + + assert any(isinstance(reader, DirectorySeedReader) for reader in readers) + assert any(isinstance(reader, FileContentsSeedReader) for reader in readers) + + +@pytest.mark.asyncio +async def test_validate_seed_returns_canonical_validated_filesystem_root() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + config = _make_config(dd.FileContentsSeedSource(path="docs#corpus", file_pattern="*.md")) + + validated_root = await validate_seed(config, "default", sdk) + + assert validated_root == "default/docs#corpus" + sdk.files.filesets.retrieve.assert_awaited_once_with(name="docs", 
workspace="default") + sdk.files.list.assert_awaited_once_with(remote_path="corpus", fileset="docs", workspace="default") From 6bcd72d88c0111d4f3da354df5f28b46b84c506a Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 15:11:56 -0500 Subject: [PATCH 02/14] Specify provider in test ModelConfigs Signed-off-by: Mike Knepper --- packages/data_designer_nemo/tests/unit/test_model_configs.py | 1 + plugins/nemo-data-designer/tests/unit/test_sdk_resources.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/data_designer_nemo/tests/unit/test_model_configs.py b/packages/data_designer_nemo/tests/unit/test_model_configs.py index 4a00ecb02a..1b0cf502f7 100644 --- a/packages/data_designer_nemo/tests/unit/test_model_configs.py +++ b/packages/data_designer_nemo/tests/unit/test_model_configs.py @@ -11,6 +11,7 @@ def _make_model_config(alias: str) -> dd.ModelConfig: return dd.ModelConfig( alias=alias, model="nvidia/nemotron-3", + provider="default/nvidia", ) diff --git a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py index c07ea9b5ba..0afcad4027 100644 --- a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py +++ b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py @@ -71,7 +71,9 @@ def async_resource(async_platform: AsyncNeMoPlatform) -> AsyncDataDesignerResour @pytest.fixture def config_builder() -> dd.DataDesignerConfigBuilder: - builder = dd.DataDesignerConfigBuilder(model_configs=[dd.ModelConfig(alias="text", model="model")]) + builder = dd.DataDesignerConfigBuilder( + model_configs=[dd.ModelConfig(alias="text", model="model", provider="default/nvidia")] + ) builder.add_column( column_config=dd.SamplerColumnConfig( name="foo", From d9e8fcf11c309e9e98a6af819c31ddbf1e64f1b8 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 15:12:43 -0500 Subject: [PATCH 03/14] Drop stale unit test cases since more seed source types are now 
supported Signed-off-by: Mike Knepper --- .../nemo-data-designer/tests/unit/test_sdk_resources.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py index 0afcad4027..4ff0bf4dd7 100644 --- a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py +++ b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py @@ -211,7 +211,7 @@ def test_preview_collector_propagates_error_frame_message( # --------------------------------------------------------------------------- -@pytest.mark.parametrize("seed_kind", ["df", "local", "directory", "file_contents"]) +@pytest.mark.parametrize("seed_kind", ["df", "local"]) def test_preview_rejects_local_only_seed_sources_before_sending_request( resource: DataDesignerResource, config_builder: dd.DataDesignerConfigBuilder, @@ -219,10 +219,9 @@ def test_preview_rejects_local_only_seed_sources_before_sending_request( tmp_path, ) -> None: """The validation gate inside ``_get_config_for_api_call`` rejects seed sources that - only make sense locally (DataFrame, LocalFile, Directory, FileContents), so the SDK - fails fast with a typed error instead of letting the server emit a 422 round-trip - later. Patching ``_preview`` to raise an AssertionError catches any regression where - the request is sent anyway. + only make sense locally (DataFrame, LocalFile), so the SDK fails fast with a typed + error instead of letting the server emit a 422 round-trip later. Patching ``_preview`` + to raise an AssertionError catches any regression where the request is sent anyway. 
""" if seed_kind == "df": seed_source = dd.DataFrameSeedSource(df=pd.DataFrame(data={"foo": [1, 2, 3]})) From 72167ebcf1ab829133015d3cc415e1352a2a3677 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 15:13:51 -0500 Subject: [PATCH 04/14] Drop test rejecting provider-less ModelConfig Signed-off-by: Mike Knepper --- .../tests/integration/test_remote_validation_errors.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py b/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py index 1b6cd1a059..6bf6a80f45 100644 --- a/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py +++ b/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py @@ -52,16 +52,6 @@ def test_unknown_provider_in_request() -> None: _assert_error(dd_client, builder, ["Cannot access provider", unknown_provider]) -def test_model_config_without_explicit_provider_is_rejected() -> None: - alias = "no-provider-specified" - bad_model_config = dd.ModelConfig(alias=alias, model="some-model") - builder = _builder_with_llm_column(bad_model_config) - - with u.make_mock_client_context() as client_context: - dd_client = u.make_dd_client(client_context) - _assert_error(dd_client, builder, ["does not have an explicit provider defined", alias]) - - def test_malformed_provider_reference_is_rejected() -> None: alias = "too-many-slashes" malformed_provider_name = "foo/bar/baz" From a39f5849d1a80ca1fdeb2a9ebb86210759bae03b Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 15:15:26 -0500 Subject: [PATCH 05/14] Drop guard against provider=None, no longer possible in library Signed-off-by: Mike Knepper --- .../src/data_designer_nemo/model_provider.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py 
b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py index 380bd045c6..1fd76f52db 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py @@ -74,12 +74,6 @@ async def make_local_first_model_provider_registry( if len(model_configs) == 0: return None - missing_providers = [model_config for model_config in model_configs if model_config.provider is None] - if len(missing_providers) > 0: - raise NDDInvalidConfigError( - f"Error: following model configs do not have an explicit provider defined: {missing_providers}" - ) - logger.info("Building model provider registry. First checking locally-defined providers.") local_registry = _make_local_model_provider_registry() From 454e271fcbee1ce553767c8803448685677e48e1 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 15:16:53 -0500 Subject: [PATCH 06/14] Drop another none-provider test Signed-off-by: Mike Knepper --- .../tests/unit/test_model_provider.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/plugins/nemo-data-designer/tests/unit/test_model_provider.py b/plugins/nemo-data-designer/tests/unit/test_model_provider.py index 4a0b8449eb..928fd8d063 100644 --- a/plugins/nemo-data-designer/tests/unit/test_model_provider.py +++ b/plugins/nemo-data-designer/tests/unit/test_model_provider.py @@ -1,41 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - -import data_designer.config as dd import nemo_data_designer_plugin.testing.utils as u import pytest -from data_designer_nemo.errors import NDDInvalidConfigError from data_designer_nemo.model_provider import ( - make_local_first_model_provider_registry, make_model_provider_registry, make_null_registry, ) -@pytest.mark.asyncio -async def test_local_first_provider_cannot_be_none() -> None: - """When a local-first registry build sees a missing provider, it must fail fast - *before* hitting the local-provider lookup helper. - """ - bad_model_configs = [dd.ModelConfig(alias="no-provider-specified", model="some-model")] - - with u.make_mock_client_context() as client_context: - with ( - patch("data_designer_nemo.model_provider.get_default_providers") as default_lookup, - pytest.raises(NDDInvalidConfigError) as exc_info, - ): - await make_local_first_model_provider_registry( - bad_model_configs, - sdk=client_context.async_sdk, - default_workspace=u.WORKSPACE_NAME, - ) - - default_lookup.assert_not_called() - assert "explicit provider defined" in str(exc_info.value) - - @pytest.mark.asyncio async def test_no_model_configs_returns_none() -> None: """``make_model_provider_registry`` returns None for an empty model-config list, From efea238fd8ce91cb0cc12c2437323b52a0d4083f Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 16:33:00 -0500 Subject: [PATCH 07/14] No more default field on ModelProviderRegistry Signed-off-by: Mike Knepper --- .../data_designer_nemo/src/data_designer_nemo/model_provider.py | 1 - plugins/nemo-data-designer/tests/unit/test_model_provider.py | 1 - 2 files changed, 2 deletions(-) diff --git a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py index 1fd76f52db..3671c6b57a 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py +++ 
b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py @@ -54,7 +54,6 @@ def make_null_registry() -> ModelProviderRegistry: # is semantically valid. The library requires a non-empty ModelProviderRegistry, so in this scenario # we can provide this dummy null registry. return ModelProviderRegistry( - default=_NO_OP, providers=[make_noop_provider()], ) diff --git a/plugins/nemo-data-designer/tests/unit/test_model_provider.py b/plugins/nemo-data-designer/tests/unit/test_model_provider.py index 928fd8d063..e27c3fb479 100644 --- a/plugins/nemo-data-designer/tests/unit/test_model_provider.py +++ b/plugins/nemo-data-designer/tests/unit/test_model_provider.py @@ -28,4 +28,3 @@ def test_null_registry() -> None: registry = make_null_registry() assert len(registry.providers) == 1 - assert registry.default == "no-op" From 5f30bae7a5bcd3a55977faf9b396e5cf8444812b Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 16:34:13 -0500 Subject: [PATCH 08/14] Update test assertion to match modified error message content Signed-off-by: Mike Knepper --- .../nemo-data-designer/tests/integration/test_validate_sdk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py b/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py index 2735bc12db..e23a4a4bb7 100644 --- a/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py +++ b/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py @@ -171,7 +171,7 @@ async def test_sdk_validate_method_aggregates_df_seed_with_other_remote_errors() assert any("Tool configs" in m for m in messages) # Remote rejects everything outside the {hf, nmp} whitelist; the message # mentions both supported types rather than calling out "df" specifically. 
- assert any("seed_type=hf" in m and "seed_type=nmp" in m for m in messages) + assert any("seed sources" in m and "Files service" in m for m in messages) async def test_sdk_validate_method_rejects_df_seed_for_local() -> None: From fce257a391d536ade2cdb8092f5955f2c26a1632 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 16:34:27 -0500 Subject: [PATCH 09/14] One more missing provider on a ModelConfig Signed-off-by: Mike Knepper --- .../nemo-data-designer/tests/unit/test_preview_function.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plugins/nemo-data-designer/tests/unit/test_preview_function.py b/plugins/nemo-data-designer/tests/unit/test_preview_function.py index ae3d5f0a56..ed4f59c6cd 100644 --- a/plugins/nemo-data-designer/tests/unit/test_preview_function.py +++ b/plugins/nemo-data-designer/tests/unit/test_preview_function.py @@ -24,7 +24,9 @@ def _config() -> dd.DataDesignerConfig: - builder = dd.DataDesignerConfigBuilder(model_configs=[dd.ModelConfig(alias="text", model="model")]) + builder = dd.DataDesignerConfigBuilder( + model_configs=[dd.ModelConfig(alias="text", model="model", provider="default/nvidia")] + ) builder.add_column( column_config=dd.SamplerColumnConfig( name="foo", From 5093808e1b6b3e3a73cfc2a1ad4995f1cd7da607 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 09:28:06 -0500 Subject: [PATCH 10/14] Impl _FilesetDirFileSystem and some integration tests for the new seed sources Signed-off-by: Mike Knepper --- .../fileset_filesystem_provider.py | 30 ++++++- .../testing/utils.py | 11 ++- .../integration/test_preview_remote_sdk.py | 80 +++++++++++++++++++ 3 files changed, 118 insertions(+), 3 deletions(-) diff --git a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py index e9a4b9e43e..1ad088335a 100644 --- 
a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py @@ -14,6 +14,34 @@ from nemo_platform.filesets import FilesetFileSystem, FilesetPathError, build_fileset_ref, parse_fileset_ref +class _FilesetDirFileSystem(DirFileSystem): + """DirFileSystem that handles FilesetFileSystem's '#' path separator. + + FilesetFileSystem returns paths using '#' to separate the fileset name from + the file path (e.g. "ws/fs#data.parquet"). Standard DirFileSystem._relpath + builds its strip-prefix with '/' (e.g. "ws/fs/"), so the startswith check + fails for fileset-root paths. For subdirectory roots (e.g. "ws/fs#subdir"), + files use '/' after '#' and the standard logic already works; the '#' branch + below is a no-op in that case. + + All methods besides _relpath are inherited from DirFileSystem unchanged, so + this remains a complete AbstractFileSystem implementation. + """ + + def _relpath(self, path: str | list) -> str | list: # type: ignore[override] + if isinstance(path, list): + return [self._relpath(p) for p in path] + if not self.path: + return path + if path == self.path: + return "" + for sep in ("#", "/"): + prefix = self.path + sep + if path.startswith(prefix): + return path[len(prefix) :] + raise AssertionError(f"Path {path!r} does not start with root {self.path!r}") + + class FilesetFileSystemProvider: """Filesystem provider that roots directory-style seed readers in a fileset.""" @@ -32,7 +60,7 @@ def __init__( def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: root = self._canonical_root(runtime_path) - rooted_fs = DirFileSystem(path=root, fs=FilesetFileSystem(self._sdk)) + rooted_fs = _FilesetDirFileSystem(path=root, fs=FilesetFileSystem(self._sdk)) return SeedReaderFileSystemContext(fs=rooted_fs, root_path=PurePosixPath(root)) def ensure_root_exists(self, *, runtime_path: str) -> None: diff --git 
a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py index 7315380a95..95b48b4e36 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py @@ -144,6 +144,10 @@ def _dd_service_factory() -> NemoServiceAdapter: "data_designer_nemo.person_reader.async_to_sync_sdk", return_value=client_context.sdk, ), + patch( + "data_designer_nemo.fileset_filesystem_provider.async_to_sync_sdk", + return_value=client_context.sdk, + ), ): yield client_context @@ -175,7 +179,10 @@ def setup_mock_secret(client_context: ClientContext) -> Generator[None]: @contextmanager -def setup_mock_file(client_context: ClientContext) -> Generator[None]: +def setup_mock_file( + client_context: ClientContext, + remote_path: str | None = None, +) -> Generator[None]: client_context.sdk.files.filesets.create( name=FILESET_NAME, workspace=client_context.sdk.workspace or WORKSPACE_NAME, @@ -186,7 +193,7 @@ def setup_mock_file(client_context: ClientContext) -> Generator[None]: fileset=FILESET_NAME, workspace=client_context.sdk.workspace or WORKSPACE_NAME, local_path=tmpfile.name, - remote_path=FILE_PATH, + remote_path=remote_path or FILE_PATH, ) yield diff --git a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py index 72a2c4ebec..8921a9419c 100644 --- a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py +++ b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py @@ -2,8 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import logging +import tempfile from collections.abc import Iterator from contextlib import contextmanager +from pathlib import Path import data_designer.config as dd import nemo_data_designer_plugin.testing.utils as u @@ -102,6 +104,84 @@ def 
test_fileset_file_seed_dataset_plugin() -> None: assert set(preview_results.dataset["full_name"].values) == u.FULL_NAMES +def test_directory_seed_dataset_fileset_root() -> None: + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.DirectorySeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig(name="expr", expr="{{ source_kind }} :: {{ relative_path }}") + ) + + with ( + u.make_mock_client_context() as client_context, + u.setup_mock_file(client_context), + ): + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == {f"directory_file :: {u.FILE_PATH}"} + + +def test_directory_seed_dataset_fileset_subdir() -> None: + subdir = "some/subdir" + + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.DirectorySeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}#{subdir}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig( + name="expr", + expr="{{ source_kind }} :: {{ relative_path }}", + ) + ) + + with ( + u.make_mock_client_context() as client_context, + u.setup_mock_file(client_context, remote_path=f"{subdir}/{u.FILE_PATH}"), + ): + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == {f"directory_file :: {u.FILE_PATH}"} + + +def test_file_contents_seed_dataset() -> None: + subdir = "some/subdir" + + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.FileContentsSeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}#{subdir}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig( + name="expr", + 
expr="{{ file_name }} :: {{ content }}", + ) + ) + + with ( + u.make_mock_client_context() as client_context, + tempfile.TemporaryDirectory() as tmpdir, + ): + client_context.sdk.files.filesets.create(name=u.FILESET_NAME, workspace=u.WORKSPACE_NAME) + for filename in ["abc.txt", "xyz.txt"]: + filepath = Path(tmpdir) / filename + filepath.write_text(f"This is {filename}") + client_context.sdk.files.upload( + fileset=u.FILESET_NAME, + workspace=u.WORKSPACE_NAME, + local_path=tmpdir, + remote_path=subdir, + ) + + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == { + "abc.txt :: This is abc.txt", + "xyz.txt :: This is xyz.txt", + } + + def test_nemotron_personas_dataset() -> None: builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) builder.add_column( From 5d6d4953cdb33271e9ab0620126c2bed1f373caf Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 11:59:11 -0500 Subject: [PATCH 11/14] Dedupe some seed validation logic Signed-off-by: Mike Knepper --- .../src/data_designer_nemo/seed.py | 44 ++++--------------- 1 file changed, 9 insertions(+), 35 deletions(-) diff --git a/packages/data_designer_nemo/src/data_designer_nemo/seed.py b/packages/data_designer_nemo/src/data_designer_nemo/seed.py index ebfb5e3bbc..d8d5a4861f 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/seed.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/seed.py @@ -26,33 +26,20 @@ async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: A await validate_secret(sdk, token, workspace) return None - if isinstance(seed_source, FilesetFileSeedSource): - workspace, fileset_name = _parse_seed_source_path(seed_source.path, workspace) - try: - await sdk.files.filesets.retrieve(name=fileset_name, workspace=workspace) - except NotFoundError as e: - raise 
NDDInvalidConfigError(f"Could not find fileset {fileset_name!r} in workspace {workspace!r}") from e - except PermissionDeniedError as e: - raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e - except Exception as e: - logger.exception("Error retrieving fileset", extra={"fileset_name": fileset_name, "workspace": workspace}) - raise NDDInternalError( - f"An unexpected error occurred while retrieving fileset {fileset_name!r} in workspace {workspace!r}: {e}" - ) from e - return None - - if isinstance(seed_source, dd.DirectorySeedSource | dd.FileContentsSeedSource): - return await _validate_filesystem_seed_source(seed_source.path, workspace, sdk) - - return None + if isinstance(seed_source, FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource): + return await _validate_seed_from_files_service(seed_source, workspace, sdk) -async def _validate_filesystem_seed_source(path: str, request_workspace: str, sdk: AsyncNeMoPlatform) -> str: +async def _validate_seed_from_files_service( + seed_source: FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource, + workspace: str, + sdk: AsyncNeMoPlatform, +) -> str | None: try: - workspace, fileset_name, fragment = parse_fileset_ref(path, workspace_fallback=request_workspace) + workspace, fileset_name, fragment = parse_fileset_ref(seed_source.path, workspace_fallback=workspace) except FilesetPathError as e: raise NDDInvalidConfigError( - f"The fileset reference in seed source path {path!r} is formatted incorrectly" + f"The fileset reference in seed source path {seed_source.path!r} is formatted incorrectly" ) from e try: @@ -91,16 +78,3 @@ async def _validate_filesystem_seed_source(path: str, request_workspace: str, sd raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}") return canonical_root - - -def _parse_seed_source_path(path: str, request_workspace: str) -> tuple[str, str]: - provided_fileset = path.split("#")[0] - 
match provided_fileset.split("/"): - case [name]: - return request_workspace, name - case [workspace, name]: - return workspace, name - case _: - raise NDDInvalidConfigError( - f"The fileset reference {provided_fileset!r} in seed source path is formatted incorrectly" - ) From d1dca012b875ceb4889c9052cf6df28f65e3bfe3 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 12:04:24 -0500 Subject: [PATCH 12/14] Style Signed-off-by: Mike Knepper --- .../tests/integration/test_preview_remote_sdk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py index 8921a9419c..05e5762869 100644 --- a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py +++ b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py @@ -108,7 +108,10 @@ def test_directory_seed_dataset_fileset_root() -> None: builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) builder.with_seed_dataset(dd.DirectorySeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}")) builder.add_column( - column_config=dd.ExpressionColumnConfig(name="expr", expr="{{ source_kind }} :: {{ relative_path }}") + column_config=dd.ExpressionColumnConfig( + name="expr", + expr="{{ source_kind }} :: {{ relative_path }}", + ) ) with ( From fa1a5f81c0f55aa76971f733db5ca2e74d3d2ad0 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 14:42:12 -0500 Subject: [PATCH 13/14] Add hybrid fs provider Signed-off-by: Mike Knepper --- .../src/data_designer_nemo/context.py | 11 +++- .../fileset_filesystem_provider.py | 43 +++++++++++++- .../unit/test_fileset_filesystem_provider.py | 56 ++++++++++++++++++- 3 files changed, 104 insertions(+), 6 deletions(-) diff --git a/packages/data_designer_nemo/src/data_designer_nemo/context.py b/packages/data_designer_nemo/src/data_designer_nemo/context.py index 
b751864dea..64a0ad6fea 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/context.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/context.py @@ -23,7 +23,7 @@ ) from data_designer_nemo.errors import NDDError from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader -from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider +from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider from data_designer_nemo.model_provider import ( make_local_first_model_provider_registry, make_model_provider_registry, @@ -84,12 +84,17 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: return errors def get_seed_readers(self) -> list[SeedReader]: + # Directory- and FileContents-style seeds may reference either a local + # directory or a NeMo Platform fileset in local mode. The engine only + # accepts one provider per reader, so we inject a hybrid provider that + # resolves each seed path against local disk first, then a fileset. + fs_provider = HybridFileSystemProvider(self._sdk, workspace=self._workspace) return [ HuggingFaceSeedReader(), LocalFileSeedReader(), DataFrameSeedReader(), - DirectorySeedReader(), - FileContentsSeedReader(), + DirectorySeedReader(fs_provider=fs_provider), + FileContentsSeedReader(fs_provider=fs_provider), AgentRolloutSeedReader(), FilesetFileSeedReader(self._sdk), ] diff --git a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py index 1ad088335a..3edc622e13 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py @@ -1,9 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from pathlib import PurePosixPath +from pathlib import Path, PurePosixPath from data_designer.engine.resources.seed_reader import ( + FileSystemProvider, + LocalFileSystemProvider, SeedReaderConfigError, SeedReaderError, SeedReaderFileSystemContext, @@ -89,3 +91,42 @@ def _parse(self, runtime_path: str) -> tuple[str, str, str]: return parse_fileset_ref(runtime_path, workspace_fallback=self._workspace) except FilesetPathError as error: raise SeedReaderError(f"🛑 Invalid fileset seed source path {runtime_path!r}: {error}") from error + + +class HybridFileSystemProvider: + """Filesystem provider that resolves a seed path against local disk first, then a fileset. + + In local mode a directory-style seed source may point at either a directory on + the local filesystem or a NeMo Platform fileset, and the engine only lets us + inject a single provider per seed reader. We route per path: if the path + resolves to an existing local directory we serve it from disk, otherwise we + treat it as a fileset reference. This mirrors the local-first model-provider + resolution strategy (locally-defined providers first, Inference Gateway as the + fallback). 
+ """ + + def __init__( + self, + sdk: NeMoPlatform | AsyncNeMoPlatform, + *, + workspace: str, + validated_roots: set[str] | None = None, + ) -> None: + self._local = LocalFileSystemProvider() + self._fileset = FilesetFileSystemProvider(sdk, workspace=workspace, validated_roots=validated_roots) + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + return self._route(runtime_path).create_context(runtime_path=runtime_path) + + def ensure_root_exists(self, *, runtime_path: str) -> None: + self._route(runtime_path).ensure_root_exists(runtime_path=runtime_path) + + def _route(self, runtime_path: str) -> FileSystemProvider: + return self._local if _is_local_directory(runtime_path) else self._fileset + + +def _is_local_directory(runtime_path: str) -> bool: + try: + return Path(runtime_path).expanduser().is_dir() + except (OSError, ValueError, RuntimeError): + return False diff --git a/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py index 6e64bcecca..cb05e35d21 100644 --- a/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py +++ b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py @@ -1,11 +1,17 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path from unittest.mock import Mock, patch import pytest -from data_designer.engine.resources.seed_reader import SeedReaderConfigError -from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider +from data_designer.engine.resources.seed_reader import ( + DirectorySeedReader, + FileContentsSeedReader, + SeedReaderConfigError, +) +from data_designer_nemo.context import LocalDataDesignerContext +from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider def test_create_context_roots_reader_in_canonical_fileset_ref() -> None: @@ -44,3 +50,49 @@ def test_ensure_root_exists_reports_missing_fileset_path() -> None: provider.ensure_root_exists(runtime_path="docs#corpus") assert fs_class.return_value.exists.call_count == 2 + + +def test_hybrid_routes_existing_local_directory_to_disk(tmp_path: Path) -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + context = provider.create_context(runtime_path=str(tmp_path)) + provider.ensure_root_exists(runtime_path=str(tmp_path)) + + assert context.root_path == tmp_path.resolve() + fs_class.assert_not_called() + + +def test_hybrid_routes_non_local_path_to_fileset() -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.async_impl = True + fs_class.return_value.asynchronous = False + context = provider.create_context(runtime_path="docs#corpus") + + fs_class.assert_called_once_with(sdk) + assert str(context.root_path) == "default/docs#corpus" + + +def test_hybrid_ensure_root_exists_validates_fileset_for_non_local_path() -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with 
patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.exists.side_effect = [False, True] + + with pytest.raises(SeedReaderConfigError, match="Path 'corpus' not found in fileset 'default/docs'"): + provider.ensure_root_exists(runtime_path="docs#corpus") + + assert fs_class.return_value.exists.call_count == 2 + + +def test_local_context_wires_hybrid_provider_into_filesystem_readers() -> None: + readers = LocalDataDesignerContext(Mock(), "default").get_seed_readers() + + fs_readers = [r for r in readers if isinstance(r, DirectorySeedReader | FileContentsSeedReader)] + assert len(fs_readers) == 2 + assert all(isinstance(r._fs_provider, HybridFileSystemProvider) for r in fs_readers) From 35e125381307a1f0606ea37b32410c8255dcb8a3 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Wed, 24 Jun 2026 10:18:10 -0500 Subject: [PATCH 14/14] Local validation plus some refactoring Signed-off-by: Mike Knepper --- .../src/data_designer_nemo/context.py | 34 +++--- .../fileset_filesystem_provider.py | 12 ++- .../src/data_designer_nemo/seed.py | 102 ++++++++++++++++-- .../src/data_designer_nemo/tool_configs.py | 10 ++ .../unsupported_features.py | 99 ----------------- .../tests/unit/test_local_filesystem_seeds.py | 88 +++++++++++++++ .../unit/test_remote_filesystem_seeds.py | 26 +---- .../functions/_types.py | 4 +- .../nemo_data_designer_plugin/jobs/spec.py | 2 +- .../sdk/resources.py | 30 +----- .../tests/unit/test_context.py | 11 +- .../tests/unit/test_sdk_resources.py | 39 ------- 12 files changed, 238 insertions(+), 219 deletions(-) create mode 100644 packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py delete mode 100644 packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py create mode 100644 packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py diff --git a/packages/data_designer_nemo/src/data_designer_nemo/context.py 
b/packages/data_designer_nemo/src/data_designer_nemo/context.py index 64a0ad6fea..fee11c0245 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/context.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/context.py @@ -23,7 +23,10 @@ ) from data_designer_nemo.errors import NDDError from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader -from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider +from data_designer_nemo.fileset_filesystem_provider import ( + FilesetFileSystemProvider, + HybridFileSystemProvider, +) from data_designer_nemo.model_provider import ( make_local_first_model_provider_registry, make_model_provider_registry, @@ -34,10 +37,7 @@ from data_designer_nemo.sdk_translation import sync_to_async_sdk from data_designer_nemo.secret_resolver import NMPSecretResolver from data_designer_nemo.seed import validate_seed -from data_designer_nemo.unsupported_features import ( - validate_no_tool_configs, - validate_seed_config_for_execution_context, -) +from data_designer_nemo.tool_configs import validate_no_tool_configs from nemo_platform import AsyncNeMoPlatform, NeMoPlatform @@ -65,6 +65,7 @@ class LocalDataDesignerContext: def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str): self._sdk = sdk self._workspace = workspace + self._validated_filesystem_roots: set[str] = set() def get_secret_resolver(self) -> SecretResolver: return CompositeResolver( @@ -76,11 +77,15 @@ def get_secret_resolver(self) -> SecretResolver: ) async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: + sdk = self._async_sdk() errors: list[NDDError] = [] + try: - validate_seed_config_for_execution_context(config, is_local=True) + if validated_root := await validate_seed(config, self._workspace, sdk, is_local=True): + self._validated_filesystem_roots.add(validated_root) except NDDError as e: errors.append(e) + return errors def get_seed_readers(self) -> 
list[SeedReader]: @@ -88,7 +93,9 @@ def get_seed_readers(self) -> list[SeedReader]: # directory or a NeMo Platform fileset in local mode. The engine only # accepts one provider per reader, so we inject a hybrid provider that # resolves each seed path against local disk first, then a fileset. - fs_provider = HybridFileSystemProvider(self._sdk, workspace=self._workspace) + fs_provider = HybridFileSystemProvider( + self._sdk, workspace=self._workspace, validated_roots=self._validated_filesystem_roots + ) return [ HuggingFaceSeedReader(), LocalFileSeedReader(), @@ -114,6 +121,11 @@ async def get_model_providers(self, model_configs: list[dd.ModelConfig]) -> list return [make_noop_provider()] + def _async_sdk(self) -> AsyncNeMoPlatform: + if isinstance(self._sdk, NeMoPlatform): + return sync_to_async_sdk(self._sdk) + return self._sdk + class RemoteDataDesignerContext: def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str): @@ -132,15 +144,13 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: validate_no_tool_configs(config) except NDDError as e: errors.append(e) + try: - validate_seed_config_for_execution_context(config, is_local=False) - except NDDError as e: - errors.append(e) - try: - if validated_root := await validate_seed(config, self._workspace, sdk): + if validated_root := await validate_seed(config, self._workspace, sdk, is_local=False): self._validated_filesystem_roots.add(validated_root) except NDDError as e: errors.append(e) + try: await ensure_nemotron_personas_filesets(config, sdk) except NDDError as e: diff --git a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py index 3edc622e13..a171df85d8 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py @@ -30,7 +30,7 @@ class 
_FilesetDirFileSystem(DirFileSystem): this remains a complete AbstractFileSystem implementation. """ - def _relpath(self, path: str | list) -> str | list: # type: ignore[override] + def _relpath(self, path: str | list) -> str | list: if isinstance(path, list): return [self._relpath(p) for p in path] if not self.path: @@ -122,10 +122,16 @@ def ensure_root_exists(self, *, runtime_path: str) -> None: self._route(runtime_path).ensure_root_exists(runtime_path=runtime_path) def _route(self, runtime_path: str) -> FileSystemProvider: - return self._local if _is_local_directory(runtime_path) else self._fileset + return self._local if is_local_directory(runtime_path) else self._fileset -def _is_local_directory(runtime_path: str) -> bool: +def is_local_directory(runtime_path: str) -> bool: + """Whether a seed path resolves to an existing directory on the local filesystem. + + Shared by ``HybridFileSystemProvider`` routing and local-mode seed validation so + that eager validation and read-time routing always agree on which backend serves + a given path. 
+ """ try: return Path(runtime_path).expanduser().is_dir() except (OSError, ValueError, RuntimeError): diff --git a/packages/data_designer_nemo/src/data_designer_nemo/seed.py b/packages/data_designer_nemo/src/data_designer_nemo/seed.py index d8d5a4861f..14ebdae242 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/seed.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/seed.py @@ -2,30 +2,58 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any import data_designer.config as dd from data_designer.config.seed_source import SeedSource from data_designer_nemo.errors import NDDInternalError, NDDInvalidConfigError from data_designer_nemo.fileset_file_seed_source import FilesetFileSeedSource +from data_designer_nemo.fileset_filesystem_provider import is_local_directory from data_designer_nemo.secret_resolver import validate_secret from nemo_platform import AsyncNeMoPlatform, NotFoundError, PermissionDeniedError from nemo_platform.filesets import FilesetPathError, build_fileset_ref, parse_fileset_ref logger = logging.getLogger(__name__) - -def get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None: - return dd_config.seed_config.source if dd_config.seed_config else None +_SUPPORTED_SEED_TYPES = {"directory", "file_contents", "hf", "nmp"} +_UNSUPPORTED_SEED_TYPES_MESSAGE = ( + "The NeMo Platform Data Designer service only supports seed data from HuggingFace " + "or the NeMo Platform Files service (FilesetFile, Directory, or FileContents seed sources " + "referencing fileset paths). Upload your data to the Files service, adjust your config, and try again." +) +_DATAFRAME_SEED_TYPE = "df" +_DATAFRAME_SEED_TYPE_MESSAGE = ( + "Dataframe seed sources (seed_type=df) are not supported on the NeMo Platform. Use a serializable seed source such as a local file, directory, HuggingFace, or the Files service."
+) -async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: AsyncNeMoPlatform) -> str | None: - if (seed_source := get_seed_source(dd_config)) is None: +async def validate_seed( + dd_config: dd.DataDesignerConfig, + workspace: str, + sdk: AsyncNeMoPlatform, + is_local: bool, +) -> str | None: + if (seed_source := _get_seed_source(dd_config)) is None: return None - if isinstance(seed_source, dd.HuggingFaceSeedSource) and (token := seed_source.token) is not None: - await validate_secret(sdk, token, workspace) + _validate_seed_type_for_execution_context( + seed_source.seed_type, + is_local=is_local, + ) + + if isinstance(seed_source, dd.HuggingFaceSeedSource): + # In local execution context, a HF seed source token will always "resolve" + # because the composite secret resolver includes a plaintext resolver. + # In remote execution context, a HF seed source token must be a reference + # to a NeMo Platform secret (if provided). + if not is_local and (token := seed_source.token) is not None: + await validate_secret(sdk, token, workspace) return None + if is_local and isinstance(seed_source, dd.DirectorySeedSource | dd.FileContentsSeedSource): + if is_local_directory(seed_source.path): + return None + if isinstance(seed_source, FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource): return await _validate_seed_from_files_service(seed_source, workspace, sdk) @@ -78,3 +106,63 @@ async def _validate_seed_from_files_service( raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}") return canonical_root + + +def validate_seed_source_for_execution_context(data: Any, *, is_local: bool) -> None: + """Raises if a raw request seed source is unsupported for the execution context. + + This function is used in Pydantic validators defined on the preview and job request models, + both of which carry a `config: dd.DataDesignerConfig` field.
+ + This function is used in "before"-style Pydantic validators, where the data argument is typed + as Any. We run in the before context to preempt less-useful error messages from the DD library: + - missing dataframe field (we don't serialize dataframes over the wire) + - file does not exist (the client's local fs != the service's local fs) + + The validators using this function only care about preventing unsupported seed types. All the + other standard Pydantic validation will get applied by FastAPI parsing the request; this does + not bypass that. So, we can safely ignore all Exceptions (most commonly KeyError, on requests + that don't include a seed_config at all) and index our way straight to the deeply nested field + we care about for this particular validation. + + Per the Pydantic v2 contract, "before"-mode validators may raise ``ValueError``, + ``AssertionError``, or ``PydanticCustomError`` — anything else (including our + ``NDDInvalidConfigError``) propagates raw out of ``model_validate`` and is not wrapped in + ``pydantic.ValidationError``. That breaks ``except ValidationError`` clauses in CLI / framework + code that turn validation problems into clean user-facing messages. To keep those code paths + working *and* keep ``NDDInvalidConfigError`` as the canonical error class for non-Pydantic + callers, we translate at this boundary: catch the plugin's error class and re-raise as a + ``ValueError`` carrying the same message. 
+ """ + seed_type = _get_raw_seed_type(data) + if seed_type is None: + return + + try: + _validate_seed_type_for_execution_context(seed_type, is_local=is_local) + except NDDInvalidConfigError as exc: + raise ValueError(str(exc)) from exc + + +def _validate_seed_type_for_execution_context(seed_type: str, *, is_local: bool) -> None: + """Raises if a seed source type is unsupported in this execution context.""" + if is_local: + if seed_type == _DATAFRAME_SEED_TYPE: + raise NDDInvalidConfigError(_DATAFRAME_SEED_TYPE_MESSAGE) + return + + if seed_type not in _SUPPORTED_SEED_TYPES: + raise NDDInvalidConfigError(_UNSUPPORTED_SEED_TYPES_MESSAGE) + + +def _get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None: + return dd_config.seed_config.source if dd_config.seed_config else None + + +def _get_raw_seed_type(data: Any) -> str | None: + try: + seed_type = data["config"]["seed_config"]["source"]["seed_type"] + except Exception: + return None + + return seed_type if isinstance(seed_type, str) else None diff --git a/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py b/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py new file mode 100644 index 0000000000..a2edd7d43a --- /dev/null +++ b/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import data_designer.config as dd +from data_designer_nemo.errors import NDDInvalidConfigError + + +def validate_no_tool_configs(config: dd.DataDesignerConfig) -> None: + if config.tool_configs and len(config.tool_configs) > 0: + raise NDDInvalidConfigError("Tool configs are not supported in the NeMo Platform Data Designer service.") diff --git a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py b/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py deleted file mode 100644 index b16af8849e..0000000000 --- a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any - -import data_designer.config as dd -from data_designer_nemo.errors import NDDInvalidConfigError - -_SUPPORTED_SEED_TYPES = {"directory", "file_contents", "hf", "nmp"} -_UNSUPPORTED_SEED_TYPES_MESSAGE = ( - "The NeMo Platform Data Designer service only supports seed data from HuggingFace " - "or the NeMo Platform Files service (FilesetFile, Directory, or FileContents seed sources " - "referencing fileset paths). Upload your data to the Files service, adjust your config, and try again." -) -_DATAFRAME_SEED_TYPE = "df" -_DATAFRAME_SEED_TYPE_MESSAGE = ( - "Dataframe seed sources (seed_type=df) are not supported on the NeMo Platform. " - "Use a serializable seed source such as a local file, directory, HuggingFace, or the Files service." 
-) - - -def validate_no_tool_configs(config: dd.DataDesignerConfig) -> None: - if config.tool_configs and len(config.tool_configs) > 0: - raise NDDInvalidConfigError("Tool configs are not supported in the NeMo Platform Data Designer service.") - - -def validate_remote_seed_type(seed_type: str) -> None: - """Raises if a seed source type is unsupported for remote execution.""" - _validate_seed_type_for_execution_context(seed_type, is_local=False) - - -def validate_seed_config_for_execution_context(config: dd.DataDesignerConfig, *, is_local: bool) -> None: - """Raises if a parsed config uses a seed source unsupported in this execution context.""" - seed_type = _get_config_seed_type(config) - if seed_type is not None: - _validate_seed_type_for_execution_context(seed_type, is_local=is_local) - - -def validate_seed_source_for_execution_context(data: Any, *, is_local: bool) -> None: - """Raises if a raw request seed source is unsupported for the execution context. - - This function is used in Pydantic validators defined on the preview and job request models, - both of which carry a `config: dd.DataDesignerConfig` field. - - This function is used in "before"-style Pydantic validators, where the data argument is typed - as Any. We run in the before context to preempt less-useful error messages from the DD library: - - missing dataframe field (we don't serialize dataframes over the wire) - - file does not exist (the client's local fs != the service's local fs) - - The validators using this function only care about preventing unsupported seed types. All the - other standard Pydantic validation will get applied by FastAPI parsing the request; this does - not bypass that. So, we can safely ignore all Exceptions (most commonly KeyError, on requests - that don't include a seed_config at all) and index our way straight to the deeply nested field - we care about for this particular validation. 
- - Per the Pydantic v2 contract, "before"-mode validators may raise ``ValueError``, - ``AssertionError``, or ``PydanticCustomError`` — anything else (including our - ``NDDInvalidConfigError``) propagates raw out of ``model_validate`` and is not wrapped in - ``pydantic.ValidationError``. That breaks ``except ValidationError`` clauses in CLI / framework - code that turn validation problems into clean user-facing messages. To keep those code paths - working *and* keep ``NDDInvalidConfigError`` as the canonical error class for non-Pydantic - callers, we translate at this boundary: catch the plugin's error class and re-raise as a - ``ValueError`` carrying the same message. - """ - seed_type = _get_raw_seed_type(data) - if seed_type is None: - return - - try: - _validate_seed_type_for_execution_context(seed_type, is_local=is_local) - except NDDInvalidConfigError as exc: - raise ValueError(str(exc)) from exc - - -def _validate_seed_type_for_execution_context(seed_type: str, *, is_local: bool) -> None: - """Raises if a seed source type is unsupported in this execution context.""" - if is_local: - if seed_type == _DATAFRAME_SEED_TYPE: - raise NDDInvalidConfigError(_DATAFRAME_SEED_TYPE_MESSAGE) - return - - if seed_type not in _SUPPORTED_SEED_TYPES: - raise NDDInvalidConfigError(_UNSUPPORTED_SEED_TYPES_MESSAGE) - - -def _get_config_seed_type(config: dd.DataDesignerConfig) -> str | None: - if config.seed_config is None: - return None - - return config.seed_config.source.seed_type - - -def _get_raw_seed_type(data: Any) -> str | None: - try: - seed_type = data["config"]["seed_config"]["source"]["seed_type"] - except Exception: - return None - - return seed_type if isinstance(seed_type, str) else None diff --git a/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py b/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py new file mode 100644 index 0000000000..6a9fb270c5 --- /dev/null +++ 
b/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, Mock + +import data_designer.config as dd +import pytest +from data_designer_nemo.context import LocalDataDesignerContext +from data_designer_nemo.errors import NDDInvalidConfigError +from data_designer_nemo.seed import validate_seed +from nemo_platform import AsyncNeMoPlatform, NotFoundError + + +def _make_config(source: Any) -> dd.DataDesignerConfig: + builder = dd.DataDesignerConfigBuilder() + builder.with_seed_dataset(source) + return builder.build() + + +@pytest.mark.asyncio +async def test_local_validate_seed_passes_existing_local_directory_without_sdk(tmp_path: Path) -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + + validated_root = await validate_seed( + _make_config(dd.DirectorySeedSource(path=str(tmp_path))), "default", sdk, is_local=True + ) + + assert validated_root is None + sdk.files.filesets.retrieve.assert_not_called() + sdk.files.list.assert_not_called() + + +@pytest.mark.asyncio +async def test_local_validate_seed_validates_fileset_for_non_local_path() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + + validated_root = await validate_seed( + _make_config(dd.DirectorySeedSource(path="docs#corpus")), "default", sdk, is_local=True + ) + + assert validated_root == "default/docs#corpus" + sdk.files.filesets.retrieve.assert_awaited_once_with(name="docs", workspace="default") + + +@pytest.mark.asyncio +async def test_local_validate_seed_reports_missing_fileset() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = 
AsyncMock(side_effect=NotFoundError("missing", response=Mock(), body=None)) + + with pytest.raises(NDDInvalidConfigError, match="Could not find fileset"): + await validate_seed( + _make_config(dd.DirectorySeedSource(path="does-not-exist#corpus")), "default", sdk, is_local=True + ) + + +@pytest.mark.asyncio +async def test_local_validate_seed_skips_huggingface_secret_resolution() -> None: + # Remote mode resolves the HF token against the Files/secret service; local mode must not, + # since the token may be a plaintext value or an environment variable. + sdk = AsyncMock(spec=AsyncNeMoPlatform) + + validated_root = await validate_seed( + _make_config(dd.HuggingFaceSeedSource(path="org/dataset", token="hf_local_token")), + "default", + sdk, + is_local=True, + ) + + assert validated_root is None + + +@pytest.mark.asyncio +async def test_local_context_validate_caches_fileset_root() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + ctx = LocalDataDesignerContext(sdk, "default") + + errors = await ctx.validate(_make_config(dd.DirectorySeedSource(path="docs#corpus"))) + + assert errors == [] + assert "default/docs#corpus" in ctx._validated_filesystem_roots diff --git a/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py index aebd19dc8d..5c07493f07 100644 --- a/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py +++ b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py @@ -3,7 +3,6 @@ from __future__ import annotations -from typing import Any from unittest.mock import AsyncMock, Mock import data_designer.config as dd @@ -11,27 +10,9 @@ from data_designer.engine.resources.seed_reader import DirectorySeedReader, FileContentsSeedReader from data_designer_nemo.context import RemoteDataDesignerContext from 
data_designer_nemo.seed import validate_seed -from data_designer_nemo.unsupported_features import validate_seed_config_for_execution_context from nemo_platform import AsyncNeMoPlatform -def _make_config(source: Any) -> dd.DataDesignerConfig: - builder = dd.DataDesignerConfigBuilder() - builder.with_seed_dataset(source) - return builder.build() - - -def test_remote_seed_type_validation_allows_filesystem_seed_sources() -> None: - validate_seed_config_for_execution_context( - _make_config(dd.DirectorySeedSource(path="workspace/docs#corpus")), - is_local=False, - ) - validate_seed_config_for_execution_context( - _make_config(dd.FileContentsSeedSource(path="workspace/docs#corpus")), - is_local=False, - ) - - def test_remote_context_includes_filesystem_seed_readers() -> None: readers = RemoteDataDesignerContext(Mock(), "default").get_seed_readers() @@ -44,9 +25,12 @@ async def test_validate_seed_returns_canonical_validated_filesystem_root() -> No sdk = AsyncMock(spec=AsyncNeMoPlatform) sdk.files.filesets.retrieve = AsyncMock() sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) - config = _make_config(dd.FileContentsSeedSource(path="docs#corpus", file_pattern="*.md")) - validated_root = await validate_seed(config, "default", sdk) + builder = dd.DataDesignerConfigBuilder() + builder.with_seed_dataset(dd.FileContentsSeedSource(path="docs#corpus", file_pattern="*.md")) + config = builder.build() + + validated_root = await validate_seed(config, "default", sdk, is_local=False) assert validated_root == "default/docs#corpus" sdk.files.filesets.retrieve.assert_awaited_once_with(name="docs", workspace="default") diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py index 0fa799b0b3..6233a9561c 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py +++ 
b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py @@ -8,9 +8,7 @@ import data_designer.config as dd from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.dataset_metadata import DatasetMetadata -from data_designer_nemo.unsupported_features import ( - validate_seed_source_for_execution_context, -) +from data_designer_nemo.seed import validate_seed_source_for_execution_context from nemo_platform_plugin.functions.frames import Done, Error, Heartbeat from pydantic import BaseModel, Field, ValidationInfo, model_validator diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py index 0dc77ecdaa..a33e05ffe1 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py @@ -4,7 +4,7 @@ from typing import Any import data_designer.config as dd -from data_designer_nemo.unsupported_features import validate_seed_source_for_execution_context +from data_designer_nemo.seed import validate_seed_source_for_execution_context from pydantic import BaseModel, ValidationInfo, model_validator diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py index c217eca6ba..4d4a61ed19 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py @@ -16,8 +16,6 @@ from data_designer.config.preview_results import PreviewResults from data_designer.config.utils.info import InterfaceInfo from data_designer.logging import RandomEmoji -from data_designer_nemo.errors import NDDInvalidConfigError -from data_designer_nemo.unsupported_features import validate_remote_seed_type from nemo_data_designer_plugin.functions._types 
import ( AnalysisFrame, DatasetFrame, @@ -247,7 +245,7 @@ def preview( Returns: An object containing the preview dataset and tools for inspecting the results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = PreviewSpec(config=config, num_records=num_records) with _PreviewFrameCollector() as message_collector: @@ -305,7 +303,7 @@ def create( Returns: An object with methods for querying the job's status and results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = DataDesignerJobConfig(config=config, num_records=num_records) try: resp = self._client().post( @@ -384,13 +382,6 @@ def validate( A :class:`ValidationReport` whose ``ok`` property is true iff every requested context validated cleanly. """ - # Don't apply the eager ``_get_config_for_api_call`` rejection that - # ``preview`` / ``create`` use — the validate pass is *meant* to - # surface unsupported-seed errors as part of its report, alongside - # any other problems. Short-circuiting on the first eager check would - # break aggregation and would also reject ``df``-seed configs that - # are surfaced cleanly with a helpful message by the validate pass - # itself (see ``_validate_seed_type_for_execution_context``). resolved_workspace = workspace or self._platform.workspace or "default" return validate_config_sync( config_builder, @@ -426,7 +417,7 @@ async def preview( Returns: An object containing the preview dataset and tools for inspecting the results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = PreviewSpec(config=config, num_records=num_records) with _PreviewFrameCollector() as message_collector: @@ -484,7 +475,7 @@ async def create( Returns: An object with methods for querying the job's status and results. 
""" - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = DataDesignerJobConfig(config=config, num_records=num_records) try: resp = await self._client().post( @@ -547,8 +538,6 @@ async def validate( workspace: str | None = None, ) -> ValidationReport: """Async equivalent of :meth:`DataDesignerResource.validate`.""" - # See the sync ``DataDesignerResource.validate`` docstring for why we - # bypass ``_get_config_for_api_call`` here. resolved_workspace = workspace or self._platform.workspace or "default" return await validate_config( config_builder, @@ -558,17 +547,6 @@ async def validate( ) -def _get_config_for_api_call(config_builder: dd.DataDesignerConfigBuilder) -> dd.DataDesignerConfig: - """Build the config and reject unsupported local-only seed source types.""" - - if (seed_config := config_builder.get_seed_config()) is not None: - try: - validate_remote_seed_type(seed_config.source.seed_type) - except NDDInvalidConfigError as exc: - raise DataDesignerConfigValidationError(str(exc)) from exc - return config_builder.build() - - def _get_error(e: BaseException) -> DataDesignerClientError: if isinstance(e, httpx.HTTPStatusError): status_code, detail = extract_http_error_info(e) diff --git a/plugins/nemo-data-designer/tests/unit/test_context.py b/plugins/nemo-data-designer/tests/unit/test_context.py index 418519e985..f618cfe569 100644 --- a/plugins/nemo-data-designer/tests/unit/test_context.py +++ b/plugins/nemo-data-designer/tests/unit/test_context.py @@ -82,17 +82,13 @@ def validate_tools(validated_config: dd.DataDesignerConfig) -> None: assert validated_config is config calls.append("tools") - def validate_seed_type(validated_config: dd.DataDesignerConfig, *, is_local: bool) -> None: - assert validated_config is config - assert is_local is False - calls.append("seed-type") - async def validate_seed( - validated_config: dd.DataDesignerConfig, workspace: str, async_sdk: AsyncNeMoPlatform + validated_config: 
dd.DataDesignerConfig, workspace: str, async_sdk: AsyncNeMoPlatform, is_local: bool ) -> None: assert validated_config is config assert workspace == u.WORKSPACE_NAME assert async_sdk is sdk + assert not is_local calls.append("seed") async def validate_personas(validated_config: dd.DataDesignerConfig, async_sdk: AsyncNeMoPlatform) -> None: @@ -101,14 +97,13 @@ async def validate_personas(validated_config: dd.DataDesignerConfig, async_sdk: calls.append("personas") monkeypatch.setattr("data_designer_nemo.context.validate_no_tool_configs", validate_tools) - monkeypatch.setattr("data_designer_nemo.context.validate_seed_config_for_execution_context", validate_seed_type) monkeypatch.setattr("data_designer_nemo.context.validate_seed", validate_seed) monkeypatch.setattr("data_designer_nemo.context.ensure_nemotron_personas_filesets", validate_personas) errors = await RemoteDataDesignerContext(sdk, u.WORKSPACE_NAME).validate(config) assert errors == [] - assert calls == ["tools", "seed-type", "seed", "personas"] + assert calls == ["tools", "seed", "personas"] async def test_remote_validate_rejects_unsupported_seed_config() -> None: diff --git a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py index 4ff0bf4dd7..160da9cbe0 100644 --- a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py +++ b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py @@ -206,45 +206,6 @@ def test_preview_collector_propagates_error_frame_message( resource.preview(config_builder) -# --------------------------------------------------------------------------- -# Client-side seed-source validation gate (_get_config_for_api_call) -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize("seed_kind", ["df", "local"]) -def test_preview_rejects_local_only_seed_sources_before_sending_request( - resource: DataDesignerResource, - config_builder: dd.DataDesignerConfigBuilder, 
- seed_kind: str, - tmp_path, -) -> None: - """The validation gate inside ``_get_config_for_api_call`` rejects seed sources that - only make sense locally (DataFrame, LocalFile), so the SDK fails fast with a typed - error instead of letting the server emit a 422 round-trip later. Patching ``_preview`` - to raise an AssertionError catches any regression where the request is sent anyway. - """ - if seed_kind == "df": - seed_source = dd.DataFrameSeedSource(df=pd.DataFrame(data={"foo": [1, 2, 3]})) - elif seed_kind == "local": - seed_file = tmp_path / "seed.parquet" - _make_basic_dataset().to_parquet(seed_file) - seed_source = dd.LocalFileSeedSource(path=str(seed_file)) - elif seed_kind == "directory": - seed_source = dd.DirectorySeedSource(path=str(tmp_path)) - else: - seed_source = dd.FileContentsSeedSource(path=str(tmp_path)) - - config_builder.with_seed_dataset(seed_source) - - with ( - patch.object(resource, "_preview", side_effect=AssertionError("preview request should not be sent")), - pytest.raises(DataDesignerConfigValidationError) as exc_info, - ): - resource.preview(config_builder) - - assert "only supports seed data" in str(exc_info.value) - - # --------------------------------------------------------------------------- # Default model surfaces # ---------------------------------------------------------------------------