diff --git a/packages/data_designer_nemo/src/data_designer_nemo/context.py b/packages/data_designer_nemo/src/data_designer_nemo/context.py index 0d8eb699ab..fee11c0245 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/context.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/context.py @@ -23,6 +23,10 @@ ) from data_designer_nemo.errors import NDDError from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader +from data_designer_nemo.fileset_filesystem_provider import ( + FilesetFileSystemProvider, + HybridFileSystemProvider, +) from data_designer_nemo.model_provider import ( make_local_first_model_provider_registry, make_model_provider_registry, @@ -33,10 +37,7 @@ from data_designer_nemo.sdk_translation import sync_to_async_sdk from data_designer_nemo.secret_resolver import NMPSecretResolver from data_designer_nemo.seed import validate_seed -from data_designer_nemo.unsupported_features import ( - validate_no_tool_configs, - validate_seed_config_for_execution_context, -) +from data_designer_nemo.tool_configs import validate_no_tool_configs from nemo_platform import AsyncNeMoPlatform, NeMoPlatform @@ -64,6 +65,7 @@ class LocalDataDesignerContext: def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str): self._sdk = sdk self._workspace = workspace + self._validated_filesystem_roots: set[str] = set() def get_secret_resolver(self) -> SecretResolver: return CompositeResolver( @@ -75,20 +77,31 @@ def get_secret_resolver(self) -> SecretResolver: ) async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: + sdk = self._async_sdk() errors: list[NDDError] = [] + try: - validate_seed_config_for_execution_context(config, is_local=True) + if validated_root := await validate_seed(config, self._workspace, sdk, is_local=True): + self._validated_filesystem_roots.add(validated_root) except NDDError as e: errors.append(e) + return errors def get_seed_readers(self) -> list[SeedReader]: + # Directory- and 
FileContents-style seeds may reference either a local + # directory or a NeMo Platform fileset in local mode. The engine only + # accepts one provider per reader, so we inject a hybrid provider that + # resolves each seed path against local disk first, then a fileset. + fs_provider = HybridFileSystemProvider( + self._sdk, workspace=self._workspace, validated_roots=self._validated_filesystem_roots + ) return [ HuggingFaceSeedReader(), LocalFileSeedReader(), DataFrameSeedReader(), - DirectorySeedReader(), - FileContentsSeedReader(), + DirectorySeedReader(fs_provider=fs_provider), + FileContentsSeedReader(fs_provider=fs_provider), AgentRolloutSeedReader(), FilesetFileSeedReader(self._sdk), ] @@ -108,11 +121,17 @@ async def get_model_providers(self, model_configs: list[dd.ModelConfig]) -> list return [make_noop_provider()] + def _async_sdk(self) -> AsyncNeMoPlatform: + if isinstance(self._sdk, NeMoPlatform): + return sync_to_async_sdk(self._sdk) + return self._sdk + class RemoteDataDesignerContext: def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str): self._sdk = sdk self._workspace = workspace + self._validated_filesystem_roots: set[str] = set() def get_secret_resolver(self) -> SecretResolver: return NMPSecretResolver(self._sdk, self._workspace) @@ -125,14 +144,13 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: validate_no_tool_configs(config) except NDDError as e: errors.append(e) + try: - validate_seed_config_for_execution_context(config, is_local=False) - except NDDError as e: - errors.append(e) - try: - await validate_seed(config, self._workspace, sdk) + if validated_root := await validate_seed(config, self._workspace, sdk, is_local=False): + self._validated_filesystem_roots.add(validated_root) except NDDError as e: errors.append(e) + try: await ensure_nemotron_personas_filesets(config, sdk) except NDDError as e: @@ -141,9 +159,16 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]: return 
errors def get_seed_readers(self) -> list[SeedReader]: + provider = FilesetFileSystemProvider( + self._sdk, + workspace=self._workspace, + validated_roots=self._validated_filesystem_roots, + ) return [ HuggingFaceSeedReader(), FilesetFileSeedReader(self._sdk), + DirectorySeedReader(fs_provider=provider), + FileContentsSeedReader(fs_provider=provider), ] def get_person_reader(self) -> PersonReader: diff --git a/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py new file mode 100644 index 0000000000..a171df85d8 --- /dev/null +++ b/packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path, PurePosixPath + +from data_designer.engine.resources.seed_reader import ( + FileSystemProvider, + LocalFileSystemProvider, + SeedReaderConfigError, + SeedReaderError, + SeedReaderFileSystemContext, +) +from data_designer_nemo.sdk_translation import async_to_sync_sdk +from fsspec.implementations.dirfs import DirFileSystem +from nemo_platform import AsyncNeMoPlatform, NeMoPlatform +from nemo_platform.filesets import FilesetFileSystem, FilesetPathError, build_fileset_ref, parse_fileset_ref + + +class _FilesetDirFileSystem(DirFileSystem): + """DirFileSystem that handles FilesetFileSystem's '#' path separator. + + FilesetFileSystem returns paths using '#' to separate the fileset name from + the file path (e.g. "ws/fs#data.parquet"). Standard DirFileSystem._relpath + builds its strip-prefix with '/' (e.g. "ws/fs/"), so the startswith check + fails for fileset-root paths. For subdirectory roots (e.g. "ws/fs#subdir"), + files use '/' after '#' and the standard logic already works; the '#' branch + below is a no-op in that case. 
+ + All methods besides _relpath are inherited from DirFileSystem unchanged, so + this remains a complete AbstractFileSystem implementation. + """ + + def _relpath(self, path: str | list) -> str | list: + if isinstance(path, list): + return [self._relpath(p) for p in path] + if not self.path: + return path + if path == self.path: + return "" + for sep in ("#", "/"): + prefix = self.path + sep + if path.startswith(prefix): + return path[len(prefix) :] + raise AssertionError(f"Path {path!r} does not start with root {self.path!r}") + + +class FilesetFileSystemProvider: + """Filesystem provider that roots directory-style seed readers in a fileset.""" + + def __init__( + self, + sdk: NeMoPlatform | AsyncNeMoPlatform, + *, + workspace: str, + validated_roots: set[str] | None = None, + ) -> None: + if isinstance(sdk, AsyncNeMoPlatform): + sdk = async_to_sync_sdk(sdk) + self._sdk = sdk + self._workspace = workspace + self._validated_roots = set() if validated_roots is None else validated_roots + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + root = self._canonical_root(runtime_path) + rooted_fs = _FilesetDirFileSystem(path=root, fs=FilesetFileSystem(self._sdk)) + return SeedReaderFileSystemContext(fs=rooted_fs, root_path=PurePosixPath(root)) + + def ensure_root_exists(self, *, runtime_path: str) -> None: + workspace, fileset, fragment = self._parse(runtime_path) + root = build_fileset_ref(fragment, workspace=workspace, fileset=fileset) + if root in self._validated_roots: + return + + fs = FilesetFileSystem(self._sdk) + if fs.exists(root): + self._validated_roots.add(root) + return + + fileset_root = build_fileset_ref("", workspace=workspace, fileset=fileset) + fully_qualified_fileset_name = f"{workspace}/{fileset}" + if not fs.exists(fileset_root): + raise SeedReaderConfigError(f"🛑 Fileset {fully_qualified_fileset_name!r} not found.") + raise SeedReaderConfigError(f"🛑 Path {fragment!r} not found in fileset 
{fully_qualified_fileset_name!r}.") + + def _canonical_root(self, runtime_path: str) -> str: + workspace, fileset, fragment = self._parse(runtime_path) + return build_fileset_ref(fragment, workspace=workspace, fileset=fileset) + + def _parse(self, runtime_path: str) -> tuple[str, str, str]: + try: + return parse_fileset_ref(runtime_path, workspace_fallback=self._workspace) + except FilesetPathError as error: + raise SeedReaderError(f"🛑 Invalid fileset seed source path {runtime_path!r}: {error}") from error + + +class HybridFileSystemProvider: + """Filesystem provider that resolves a seed path against local disk first, then a fileset. + + In local mode a directory-style seed source may point at either a directory on + the local filesystem or a NeMo Platform fileset, and the engine only lets us + inject a single provider per seed reader. We route per path: if the path + resolves to an existing local directory we serve it from disk, otherwise we + treat it as a fileset reference. This mirrors the local-first model-provider + resolution strategy (locally-defined providers first, Inference Gateway as the + fallback). 
+ """ + + def __init__( + self, + sdk: NeMoPlatform | AsyncNeMoPlatform, + *, + workspace: str, + validated_roots: set[str] | None = None, + ) -> None: + self._local = LocalFileSystemProvider() + self._fileset = FilesetFileSystemProvider(sdk, workspace=workspace, validated_roots=validated_roots) + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + return self._route(runtime_path).create_context(runtime_path=runtime_path) + + def ensure_root_exists(self, *, runtime_path: str) -> None: + self._route(runtime_path).ensure_root_exists(runtime_path=runtime_path) + + def _route(self, runtime_path: str) -> FileSystemProvider: + return self._local if is_local_directory(runtime_path) else self._fileset + + +def is_local_directory(runtime_path: str) -> bool: + """Whether a seed path resolves to an existing directory on the local filesystem. + + Shared by ``HybridFileSystemProvider`` routing and local-mode seed validation so + that eager validation and read-time routing always agree on which backend serves + a given path. + """ + try: + return Path(runtime_path).expanduser().is_dir() + except (OSError, ValueError, RuntimeError): + return False diff --git a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py index 380bd045c6..3671c6b57a 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/model_provider.py @@ -54,7 +54,6 @@ def make_null_registry() -> ModelProviderRegistry: # is semantically valid. The library requires a non-empty ModelProviderRegistry, so in this scenario # we can provide this dummy null registry. 
return ModelProviderRegistry( - default=_NO_OP, providers=[make_noop_provider()], ) @@ -74,12 +73,6 @@ async def make_local_first_model_provider_registry( if len(model_configs) == 0: return None - missing_providers = [model_config for model_config in model_configs if model_config.provider is None] - if len(missing_providers) > 0: - raise NDDInvalidConfigError( - f"Error: following model configs do not have an explicit provider defined: {missing_providers}" - ) - logger.info("Building model provider registry. First checking locally-defined providers.") local_registry = _make_local_model_provider_registry() diff --git a/packages/data_designer_nemo/src/data_designer_nemo/seed.py b/packages/data_designer_nemo/src/data_designer_nemo/seed.py index 91baf47b70..14ebdae242 100644 --- a/packages/data_designer_nemo/src/data_designer_nemo/seed.py +++ b/packages/data_designer_nemo/src/data_designer_nemo/seed.py @@ -2,52 +2,167 @@ # SPDX-License-Identifier: Apache-2.0 import logging +from typing import Any import data_designer.config as dd from data_designer.config.seed_source import SeedSource from data_designer_nemo.errors import NDDInternalError, NDDInvalidConfigError from data_designer_nemo.fileset_file_seed_source import FilesetFileSeedSource +from data_designer_nemo.fileset_filesystem_provider import is_local_directory from data_designer_nemo.secret_resolver import validate_secret from nemo_platform import AsyncNeMoPlatform, NotFoundError, PermissionDeniedError +from nemo_platform.filesets import FilesetPathError, build_fileset_ref, parse_fileset_ref logger = logging.getLogger(__name__) +_SUPPORTED_SEED_TYPES = {"directory", "file_contents", "hf", "nmp"} +_UNSUPPORTED_SEED_TYPES_MESSAGE = ( + "The NeMo Platform Data Designer service only supports seed data from HuggingFace " + "or the NeMo Platform Files service (FilesetFile, Directory, or FileContents seed sources " + "referencing fileset paths). Upload your data to the Files service, adjust your config, and try again." 
+) +_DATAFRAME_SEED_TYPE = "df" +_DATAFRAME_SEED_TYPE_MESSAGE = ( + "Dataframe seed sources (seed_type=df) are not supported on the NeMo Platform. TODO: more detail here!" +) -def get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None: - return dd_config.seed_config.source if dd_config.seed_config else None +async def validate_seed( + dd_config: dd.DataDesignerConfig, + workspace: str, + sdk: AsyncNeMoPlatform, + is_local: bool, +) -> str | None: + if (seed_source := _get_seed_source(dd_config)) is None: + return None + + _validate_seed_type_for_execution_context( + seed_source.seed_type, + is_local=is_local, + ) -async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: AsyncNeMoPlatform) -> None: - if (seed_source := get_seed_source(dd_config)) is None: + if isinstance(seed_source, dd.HuggingFaceSeedSource): + # In local execution context, a HF seed source token will always "resolve" + # because the composite secret resolver includes a plaintext resolver. + # In remote execution context, a HF seed source token must be a reference + # to a NeMo Platform secret (if provided). 
+ if not is_local and (token := seed_source.token) is not None: + await validate_secret(sdk, token, workspace) return None - if isinstance(seed_source, dd.HuggingFaceSeedSource) and (token := seed_source.token) is not None: - await validate_secret(sdk, token, workspace) + if is_local and isinstance(seed_source, dd.DirectorySeedSource | dd.FileContentsSeedSource): + if is_local_directory(seed_source.path): + return None + + if isinstance(seed_source, FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource): + return await _validate_seed_from_files_service(seed_source, workspace, sdk) + + +async def _validate_seed_from_files_service( + seed_source: FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource, + workspace: str, + sdk: AsyncNeMoPlatform, +) -> str | None: + try: + workspace, fileset_name, fragment = parse_fileset_ref(seed_source.path, workspace_fallback=workspace) + except FilesetPathError as e: + raise NDDInvalidConfigError( + f"The fileset reference in seed source path {seed_source.path!r} is formatted incorrectly" + ) from e + + try: + await sdk.files.filesets.retrieve(name=fileset_name, workspace=workspace) + except NotFoundError as e: + raise NDDInvalidConfigError(f"Could not find fileset {fileset_name!r} in workspace {workspace!r}") from e + except PermissionDeniedError as e: + raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e + except Exception as e: + logger.exception("Error retrieving fileset", extra={"fileset_name": fileset_name, "workspace": workspace}) + raise NDDInternalError( + f"An unexpected error occurred while retrieving fileset {fileset_name!r} in workspace {workspace!r}: {e}" + ) from e + + canonical_root = build_fileset_ref(fragment, workspace=workspace, fileset=fileset_name) + if not fragment: + return canonical_root + + fully_qualified_fileset_name = f"{workspace}/{fileset_name}" + try: + response = await sdk.files.list(remote_path=fragment, fileset=fileset_name, 
workspace=workspace) + except NotFoundError as e: + raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}") from e + except PermissionDeniedError as e: + raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e + except Exception as e: + logger.exception( + "Error listing fileset path", + extra={"fileset_name": fileset_name, "workspace": workspace, "fragment": fragment}, + ) + raise NDDInternalError( + f"An unexpected error occurred while listing path {fragment!r} in fileset {fully_qualified_fileset_name!r}: {e}" + ) from e + + if not response.data: + raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}") + + return canonical_root + + +def validate_seed_source_for_execution_context(data: Any, *, is_local: bool) -> None: + """Raises if a raw request seed source is unsupported for the execution context. + + This function is used in Pydantic validators defined on the preview and job request models, + both of which carry a `config: dd.DataDesignerConfig` field. + + This function is used in "before"-style Pydantic validators, where the data argument is typed + as Any. We run in the before context to preempt less-useful error messages from the DD library: + - missing dataframe field (we don't serialize dataframes over the wire) + - file does not exist (the client's local fs != the service's local fs) + + The validators using this function only care about preventing unsupported seed types. All the + other standard Pydantic validation will get applied by FastAPI parsing the request; this does + not bypass that. So, we can safely ignore all Exceptions (most commonly KeyError, on requests + that don't include a seed_config at all) and index our way straight to the deeply nested field + we care about for this particular validation. 
+ + Per the Pydantic v2 contract, "before"-mode validators may raise ``ValueError``, + ``AssertionError``, or ``PydanticCustomError`` — anything else (including our + ``NDDInvalidConfigError``) propagates raw out of ``model_validate`` and is not wrapped in + ``pydantic.ValidationError``. That breaks ``except ValidationError`` clauses in CLI / framework + code that turn validation problems into clean user-facing messages. To keep those code paths + working *and* keep ``NDDInvalidConfigError`` as the canonical error class for non-Pydantic + callers, we translate at this boundary: catch the plugin's error class and re-raise as a + ``ValueError`` carrying the same message. + """ + seed_type = _get_raw_seed_type(data) + if seed_type is None: + return + + try: + _validate_seed_type_for_execution_context(seed_type, is_local=is_local) + except NDDInvalidConfigError as exc: + raise ValueError(str(exc)) from exc + + +def _validate_seed_type_for_execution_context(seed_type: str, *, is_local: bool) -> None: + """Raises if a seed source type is unsupported in this execution context.""" + if is_local: + if seed_type == _DATAFRAME_SEED_TYPE: + raise NDDInvalidConfigError(_DATAFRAME_SEED_TYPE_MESSAGE) + return + + if seed_type not in _SUPPORTED_SEED_TYPES: + raise NDDInvalidConfigError(_UNSUPPORTED_SEED_TYPES_MESSAGE) + + +def _get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None: + return dd_config.seed_config.source if dd_config.seed_config else None + + +def _get_raw_seed_type(data: Any) -> str | None: + try: + seed_type = data["config"]["seed_config"]["source"]["seed_type"] + except Exception: return None - if isinstance(seed_source, FilesetFileSeedSource): - workspace, fileset_name = _parse_seed_source_path(seed_source.path, workspace) - try: - await sdk.files.filesets.retrieve(name=fileset_name, workspace=workspace) - except NotFoundError as e: - raise NDDInvalidConfigError(f"Could not find fileset {fileset_name!r} in workspace {workspace!r}") from e - 
except PermissionDeniedError as e: - raise NDDInvalidConfigError(f"Access denied to workspace {workspace!r}") from e - except Exception as e: - logger.exception("Error retrieving fileset", extra={"fileset_name": fileset_name, "workspace": workspace}) - raise NDDInternalError( - f"An unexpected error occurred while retrieving fileset {fileset_name!r} in workspace {workspace!r}: {e}" - ) from e - - -def _parse_seed_source_path(path: str, request_workspace: str) -> tuple[str, str]: - provided_fileset = path.split("#")[0] - match provided_fileset.split("/"): - case [name]: - return request_workspace, name - case [workspace, name]: - return workspace, name - case _: - raise NDDInvalidConfigError( - f"The fileset reference {provided_fileset!r} in seed source path is formatted incorrectly" - ) + return seed_type if isinstance(seed_type, str) else None diff --git a/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py b/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py new file mode 100644 index 0000000000..a2edd7d43a --- /dev/null +++ b/packages/data_designer_nemo/src/data_designer_nemo/tool_configs.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import data_designer.config as dd +from data_designer_nemo.errors import NDDInvalidConfigError + + +def validate_no_tool_configs(config: dd.DataDesignerConfig) -> None: + if config.tool_configs and len(config.tool_configs) > 0: + raise NDDInvalidConfigError("Tool configs are not supported in the NeMo Platform Data Designer service.") diff --git a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py b/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py deleted file mode 100644 index 013b399f5d..0000000000 --- a/packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any - -import data_designer.config as dd -from data_designer_nemo.errors import NDDInvalidConfigError - -_SUPPORTED_SEED_TYPES = {"hf", "nmp"} -_UNSUPPORTED_SEED_TYPES_MESSAGE = ( - "The NeMo Platform Data Designer service only supports seed data from HuggingFace " - "(seed_type=hf) or the Files service (seed_type=nmp)." -) -_DATAFRAME_SEED_TYPE = "df" -_DATAFRAME_SEED_TYPE_MESSAGE = ( - "Dataframe seed sources (seed_type=df) are not supported on the NeMo Platform. " - "Use a serializable seed source such as a local file, directory, HuggingFace, or the Files service." 
-) - - -def validate_no_tool_configs(config: dd.DataDesignerConfig) -> None: - if config.tool_configs and len(config.tool_configs) > 0: - raise NDDInvalidConfigError("Tool configs are not supported in the NeMo Platform Data Designer service.") - - -def validate_remote_seed_type(seed_type: str) -> None: - """Raises if a seed source type is unsupported for remote execution.""" - _validate_seed_type_for_execution_context(seed_type, is_local=False) - - -def validate_seed_config_for_execution_context(config: dd.DataDesignerConfig, *, is_local: bool) -> None: - """Raises if a parsed config uses a seed source unsupported in this execution context.""" - seed_type = _get_config_seed_type(config) - if seed_type is not None: - _validate_seed_type_for_execution_context(seed_type, is_local=is_local) - - -def validate_seed_source_for_execution_context(data: Any, *, is_local: bool) -> None: - """Raises if a raw request seed source is unsupported for the execution context. - - This function is used in Pydantic validators defined on the preview and job request models, - both of which carry a `config: dd.DataDesignerConfig` field. - - This function is used in "before"-style Pydantic validators, where the data argument is typed - as Any. We run in the before context to preempt less-useful error messages from the DD library: - - missing dataframe field (we don't serialize dataframes over the wire) - - file does not exist (the client's local fs != the service's local fs) - - The validators using this function only care about preventing unsupported seed types. All the - other standard Pydantic validation will get applied by FastAPI parsing the request; this does - not bypass that. So, we can safely ignore all Exceptions (most commonly KeyError, on requests - that don't include a seed_config at all) and index our way straight to the deeply nested field - we care about for this particular validation. 
- - Per the Pydantic v2 contract, "before"-mode validators may raise ``ValueError``, - ``AssertionError``, or ``PydanticCustomError`` — anything else (including our - ``NDDInvalidConfigError``) propagates raw out of ``model_validate`` and is not wrapped in - ``pydantic.ValidationError``. That breaks ``except ValidationError`` clauses in CLI / framework - code that turn validation problems into clean user-facing messages. To keep those code paths - working *and* keep ``NDDInvalidConfigError`` as the canonical error class for non-Pydantic - callers, we translate at this boundary: catch the plugin's error class and re-raise as a - ``ValueError`` carrying the same message. - """ - seed_type = _get_raw_seed_type(data) - if seed_type is None: - return - - try: - _validate_seed_type_for_execution_context(seed_type, is_local=is_local) - except NDDInvalidConfigError as exc: - raise ValueError(str(exc)) from exc - - -def _validate_seed_type_for_execution_context(seed_type: str, *, is_local: bool) -> None: - """Raises if a seed source type is unsupported in this execution context.""" - if is_local: - if seed_type == _DATAFRAME_SEED_TYPE: - raise NDDInvalidConfigError(_DATAFRAME_SEED_TYPE_MESSAGE) - return - - if seed_type not in _SUPPORTED_SEED_TYPES: - raise NDDInvalidConfigError(_UNSUPPORTED_SEED_TYPES_MESSAGE) - - -def _get_config_seed_type(config: dd.DataDesignerConfig) -> str | None: - if config.seed_config is None: - return None - - return config.seed_config.source.seed_type - - -def _get_raw_seed_type(data: Any) -> str | None: - try: - seed_type = data["config"]["seed_config"]["source"]["seed_type"] - except Exception: - return None - - return seed_type if isinstance(seed_type, str) else None diff --git a/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py new file mode 100644 index 0000000000..cb05e35d21 --- /dev/null +++ 
b/packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +from data_designer.engine.resources.seed_reader import ( + DirectorySeedReader, + FileContentsSeedReader, + SeedReaderConfigError, +) +from data_designer_nemo.context import LocalDataDesignerContext +from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider + + +def test_create_context_roots_reader_in_canonical_fileset_ref() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.async_impl = True + fs_class.return_value.asynchronous = False + context = FilesetFileSystemProvider(sdk, workspace="default").create_context(runtime_path="docs#corpus") + + fs_class.assert_called_once_with(sdk) + assert str(context.root_path) == "default/docs#corpus" + + +def test_ensure_root_exists_skips_validated_roots() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + FilesetFileSystemProvider( + sdk, + workspace="default", + validated_roots={"default/docs#corpus"}, + ).ensure_root_exists(runtime_path="docs#corpus") + + fs_class.assert_not_called() + + +def test_ensure_root_exists_reports_missing_fileset_path() -> None: + sdk = Mock() + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.exists.side_effect = [False, True] + provider = FilesetFileSystemProvider(sdk, workspace="default") + + with pytest.raises(SeedReaderConfigError, match="Path 'corpus' not found in fileset 'default/docs'"): + provider.ensure_root_exists(runtime_path="docs#corpus") + + assert fs_class.return_value.exists.call_count == 
2 + + +def test_hybrid_routes_existing_local_directory_to_disk(tmp_path: Path) -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + context = provider.create_context(runtime_path=str(tmp_path)) + provider.ensure_root_exists(runtime_path=str(tmp_path)) + + assert context.root_path == tmp_path.resolve() + fs_class.assert_not_called() + + +def test_hybrid_routes_non_local_path_to_fileset() -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.async_impl = True + fs_class.return_value.asynchronous = False + context = provider.create_context(runtime_path="docs#corpus") + + fs_class.assert_called_once_with(sdk) + assert str(context.root_path) == "default/docs#corpus" + + +def test_hybrid_ensure_root_exists_validates_fileset_for_non_local_path() -> None: + sdk = Mock() + provider = HybridFileSystemProvider(sdk, workspace="default") + + with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fs_class: + fs_class.return_value.exists.side_effect = [False, True] + + with pytest.raises(SeedReaderConfigError, match="Path 'corpus' not found in fileset 'default/docs'"): + provider.ensure_root_exists(runtime_path="docs#corpus") + + assert fs_class.return_value.exists.call_count == 2 + + +def test_local_context_wires_hybrid_provider_into_filesystem_readers() -> None: + readers = LocalDataDesignerContext(Mock(), "default").get_seed_readers() + + fs_readers = [r for r in readers if isinstance(r, DirectorySeedReader | FileContentsSeedReader)] + assert len(fs_readers) == 2 + assert all(isinstance(r._fs_provider, HybridFileSystemProvider) for r in fs_readers) diff --git a/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py 
b/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py new file mode 100644 index 0000000000..6a9fb270c5 --- /dev/null +++ b/packages/data_designer_nemo/tests/unit/test_local_filesystem_seeds.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, Mock + +import data_designer.config as dd +import pytest +from data_designer_nemo.context import LocalDataDesignerContext +from data_designer_nemo.errors import NDDInvalidConfigError +from data_designer_nemo.seed import validate_seed +from nemo_platform import AsyncNeMoPlatform, NotFoundError + + +def _make_config(source: Any) -> dd.DataDesignerConfig: + builder = dd.DataDesignerConfigBuilder() + builder.with_seed_dataset(source) + return builder.build() + + +@pytest.mark.asyncio +async def test_local_validate_seed_passes_existing_local_directory_without_sdk(tmp_path: Path) -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + + validated_root = await validate_seed( + _make_config(dd.DirectorySeedSource(path=str(tmp_path))), "default", sdk, is_local=True + ) + + assert validated_root is None + sdk.files.filesets.retrieve.assert_not_called() + sdk.files.list.assert_not_called() + + +@pytest.mark.asyncio +async def test_local_validate_seed_validates_fileset_for_non_local_path() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + + validated_root = await validate_seed( + _make_config(dd.DirectorySeedSource(path="docs#corpus")), "default", sdk, is_local=True + ) + + assert validated_root == "default/docs#corpus" + sdk.files.filesets.retrieve.assert_awaited_once_with(name="docs", workspace="default") + + +@pytest.mark.asyncio +async def 
test_local_validate_seed_reports_missing_fileset() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock(side_effect=NotFoundError("missing", response=Mock(), body=None)) + + with pytest.raises(NDDInvalidConfigError, match="Could not find fileset"): + await validate_seed( + _make_config(dd.DirectorySeedSource(path="does-not-exist#corpus")), "default", sdk, is_local=True + ) + + +@pytest.mark.asyncio +async def test_local_validate_seed_skips_huggingface_secret_resolution() -> None: + # Remote mode resolves the HF token against the Files/secret service; local mode must not, + # since the token may be a plaintext value or an environment variable. + sdk = AsyncMock(spec=AsyncNeMoPlatform) + + validated_root = await validate_seed( + _make_config(dd.HuggingFaceSeedSource(path="org/dataset", token="hf_local_token")), + "default", + sdk, + is_local=True, + ) + + assert validated_root is None + + +@pytest.mark.asyncio +async def test_local_context_validate_caches_fileset_root() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + ctx = LocalDataDesignerContext(sdk, "default") + + errors = await ctx.validate(_make_config(dd.DirectorySeedSource(path="docs#corpus"))) + + assert errors == [] + assert "default/docs#corpus" in ctx._validated_filesystem_roots diff --git a/packages/data_designer_nemo/tests/unit/test_model_configs.py b/packages/data_designer_nemo/tests/unit/test_model_configs.py index 4a00ecb02a..1b0cf502f7 100644 --- a/packages/data_designer_nemo/tests/unit/test_model_configs.py +++ b/packages/data_designer_nemo/tests/unit/test_model_configs.py @@ -11,6 +11,7 @@ def _make_model_config(alias: str) -> dd.ModelConfig: return dd.ModelConfig( alias=alias, model="nvidia/nemotron-3", + provider="default/nvidia", ) diff --git a/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py 
b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py new file mode 100644 index 0000000000..5c07493f07 --- /dev/null +++ b/packages/data_designer_nemo/tests/unit/test_remote_filesystem_seeds.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from unittest.mock import AsyncMock, Mock + +import data_designer.config as dd +import pytest +from data_designer.engine.resources.seed_reader import DirectorySeedReader, FileContentsSeedReader +from data_designer_nemo.context import RemoteDataDesignerContext +from data_designer_nemo.seed import validate_seed +from nemo_platform import AsyncNeMoPlatform + + +def test_remote_context_includes_filesystem_seed_readers() -> None: + readers = RemoteDataDesignerContext(Mock(), "default").get_seed_readers() + + assert any(isinstance(reader, DirectorySeedReader) for reader in readers) + assert any(isinstance(reader, FileContentsSeedReader) for reader in readers) + + +@pytest.mark.asyncio +async def test_validate_seed_returns_canonical_validated_filesystem_root() -> None: + sdk = AsyncMock(spec=AsyncNeMoPlatform) + sdk.files.filesets.retrieve = AsyncMock() + sdk.files.list = AsyncMock(return_value=Mock(data=[Mock(path="corpus/a.md")])) + + builder = dd.DataDesignerConfigBuilder() + builder.with_seed_dataset(dd.FileContentsSeedSource(path="docs#corpus", file_pattern="*.md")) + config = builder.build() + + validated_root = await validate_seed(config, "default", sdk, is_local=False) + + assert validated_root == "default/docs#corpus" + sdk.files.filesets.retrieve.assert_awaited_once_with(name="docs", workspace="default") + sdk.files.list.assert_awaited_once_with(remote_path="corpus", fileset="docs", workspace="default") diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py 
b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py index 0fa799b0b3..6233a9561c 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/functions/_types.py @@ -8,9 +8,7 @@ import data_designer.config as dd from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.dataset_metadata import DatasetMetadata -from data_designer_nemo.unsupported_features import ( - validate_seed_source_for_execution_context, -) +from data_designer_nemo.seed import validate_seed_source_for_execution_context from nemo_platform_plugin.functions.frames import Done, Error, Heartbeat from pydantic import BaseModel, Field, ValidationInfo, model_validator diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py index 0dc77ecdaa..a33e05ffe1 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/spec.py @@ -4,7 +4,7 @@ from typing import Any import data_designer.config as dd -from data_designer_nemo.unsupported_features import validate_seed_source_for_execution_context +from data_designer_nemo.seed import validate_seed_source_for_execution_context from pydantic import BaseModel, ValidationInfo, model_validator diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py index c217eca6ba..4d4a61ed19 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/sdk/resources.py @@ -16,8 +16,6 @@ from data_designer.config.preview_results import PreviewResults from data_designer.config.utils.info import InterfaceInfo from data_designer.logging import 
RandomEmoji -from data_designer_nemo.errors import NDDInvalidConfigError -from data_designer_nemo.unsupported_features import validate_remote_seed_type from nemo_data_designer_plugin.functions._types import ( AnalysisFrame, DatasetFrame, @@ -247,7 +245,7 @@ def preview( Returns: An object containing the preview dataset and tools for inspecting the results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = PreviewSpec(config=config, num_records=num_records) with _PreviewFrameCollector() as message_collector: @@ -305,7 +303,7 @@ def create( Returns: An object with methods for querying the job's status and results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = DataDesignerJobConfig(config=config, num_records=num_records) try: resp = self._client().post( @@ -384,13 +382,6 @@ def validate( A :class:`ValidationReport` whose ``ok`` property is true iff every requested context validated cleanly. """ - # Don't apply the eager ``_get_config_for_api_call`` rejection that - # ``preview`` / ``create`` use — the validate pass is *meant* to - # surface unsupported-seed errors as part of its report, alongside - # any other problems. Short-circuiting on the first eager check would - # break aggregation and would also reject ``df``-seed configs that - # are surfaced cleanly with a helpful message by the validate pass - # itself (see ``_validate_seed_type_for_execution_context``). resolved_workspace = workspace or self._platform.workspace or "default" return validate_config_sync( config_builder, @@ -426,7 +417,7 @@ async def preview( Returns: An object containing the preview dataset and tools for inspecting the results. 
""" - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = PreviewSpec(config=config, num_records=num_records) with _PreviewFrameCollector() as message_collector: @@ -484,7 +475,7 @@ async def create( Returns: An object with methods for querying the job's status and results. """ - config = _get_config_for_api_call(config_builder) + config = config_builder.build() request = DataDesignerJobConfig(config=config, num_records=num_records) try: resp = await self._client().post( @@ -547,8 +538,6 @@ async def validate( workspace: str | None = None, ) -> ValidationReport: """Async equivalent of :meth:`DataDesignerResource.validate`.""" - # See the sync ``DataDesignerResource.validate`` docstring for why we - # bypass ``_get_config_for_api_call`` here. resolved_workspace = workspace or self._platform.workspace or "default" return await validate_config( config_builder, @@ -558,17 +547,6 @@ async def validate( ) -def _get_config_for_api_call(config_builder: dd.DataDesignerConfigBuilder) -> dd.DataDesignerConfig: - """Build the config and reject unsupported local-only seed source types.""" - - if (seed_config := config_builder.get_seed_config()) is not None: - try: - validate_remote_seed_type(seed_config.source.seed_type) - except NDDInvalidConfigError as exc: - raise DataDesignerConfigValidationError(str(exc)) from exc - return config_builder.build() - - def _get_error(e: BaseException) -> DataDesignerClientError: if isinstance(e, httpx.HTTPStatusError): status_code, detail = extract_http_error_info(e) diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py index 7315380a95..95b48b4e36 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/testing/utils.py @@ -144,6 +144,10 @@ def _dd_service_factory() -> NemoServiceAdapter: 
"data_designer_nemo.person_reader.async_to_sync_sdk", return_value=client_context.sdk, ), + patch( + "data_designer_nemo.fileset_filesystem_provider.async_to_sync_sdk", + return_value=client_context.sdk, + ), ): yield client_context @@ -175,7 +179,10 @@ def setup_mock_secret(client_context: ClientContext) -> Generator[None]: @contextmanager -def setup_mock_file(client_context: ClientContext) -> Generator[None]: +def setup_mock_file( + client_context: ClientContext, + remote_path: str | None = None, +) -> Generator[None]: client_context.sdk.files.filesets.create( name=FILESET_NAME, workspace=client_context.sdk.workspace or WORKSPACE_NAME, @@ -186,7 +193,7 @@ def setup_mock_file(client_context: ClientContext) -> Generator[None]: fileset=FILESET_NAME, workspace=client_context.sdk.workspace or WORKSPACE_NAME, local_path=tmpfile.name, - remote_path=FILE_PATH, + remote_path=remote_path or FILE_PATH, ) yield diff --git a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py index 72a2c4ebec..05e5762869 100644 --- a/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py +++ b/plugins/nemo-data-designer/tests/integration/test_preview_remote_sdk.py @@ -2,8 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import logging +import tempfile from collections.abc import Iterator from contextlib import contextmanager +from pathlib import Path import data_designer.config as dd import nemo_data_designer_plugin.testing.utils as u @@ -102,6 +104,87 @@ def test_fileset_file_seed_dataset_plugin() -> None: assert set(preview_results.dataset["full_name"].values) == u.FULL_NAMES +def test_directory_seed_dataset_fileset_root() -> None: + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.DirectorySeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig( + name="expr", 
+ expr="{{ source_kind }} :: {{ relative_path }}", + ) + ) + + with ( + u.make_mock_client_context() as client_context, + u.setup_mock_file(client_context), + ): + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == {f"directory_file :: {u.FILE_PATH}"} + + +def test_directory_seed_dataset_fileset_subdir() -> None: + subdir = "some/subdir" + + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.DirectorySeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}#{subdir}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig( + name="expr", + expr="{{ source_kind }} :: {{ relative_path }}", + ) + ) + + with ( + u.make_mock_client_context() as client_context, + u.setup_mock_file(client_context, remote_path=f"{subdir}/{u.FILE_PATH}"), + ): + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == {f"directory_file :: {u.FILE_PATH}"} + + +def test_file_contents_seed_dataset() -> None: + subdir = "some/subdir" + + builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) + builder.with_seed_dataset(dd.FileContentsSeedSource(path=f"{u.WORKSPACE_NAME}/{u.FILESET_NAME}#{subdir}")) + builder.add_column( + column_config=dd.ExpressionColumnConfig( + name="expr", + expr="{{ file_name }} :: {{ content }}", + ) + ) + + with ( + u.make_mock_client_context() as client_context, + tempfile.TemporaryDirectory() as tmpdir, + ): + client_context.sdk.files.filesets.create(name=u.FILESET_NAME, workspace=u.WORKSPACE_NAME) + for filename in ["abc.txt", "xyz.txt"]: + filepath = Path(tmpdir) / filename + filepath.write_text(f"This is {filename}") + client_context.sdk.files.upload( + 
fileset=u.FILESET_NAME, + workspace=u.WORKSPACE_NAME, + local_path=tmpdir, + remote_path=subdir, + ) + + dd_client = u.make_dd_client(client_context) + preview_results = dd_client.preview(builder, num_records=3) + + assert preview_results.dataset is not None + assert set(preview_results.dataset["expr"].values) == { + "abc.txt :: This is abc.txt", + "xyz.txt :: This is xyz.txt", + } + + def test_nemotron_personas_dataset() -> None: builder = dd.DataDesignerConfigBuilder(model_configs=[u.make_model_config()]) builder.add_column( diff --git a/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py b/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py index 1b6cd1a059..6bf6a80f45 100644 --- a/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py +++ b/plugins/nemo-data-designer/tests/integration/test_remote_validation_errors.py @@ -52,16 +52,6 @@ def test_unknown_provider_in_request() -> None: _assert_error(dd_client, builder, ["Cannot access provider", unknown_provider]) -def test_model_config_without_explicit_provider_is_rejected() -> None: - alias = "no-provider-specified" - bad_model_config = dd.ModelConfig(alias=alias, model="some-model") - builder = _builder_with_llm_column(bad_model_config) - - with u.make_mock_client_context() as client_context: - dd_client = u.make_dd_client(client_context) - _assert_error(dd_client, builder, ["does not have an explicit provider defined", alias]) - - def test_malformed_provider_reference_is_rejected() -> None: alias = "too-many-slashes" malformed_provider_name = "foo/bar/baz" diff --git a/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py b/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py index 2735bc12db..e23a4a4bb7 100644 --- a/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py +++ b/plugins/nemo-data-designer/tests/integration/test_validate_sdk.py @@ -171,7 +171,7 @@ async def 
test_sdk_validate_method_aggregates_df_seed_with_other_remote_errors() assert any("Tool configs" in m for m in messages) - # Remote rejects everything outside the {hf, nmp} whitelist; the message - # mentions both supported types rather than calling out "df" specifically. - assert any("seed_type=hf" in m and "seed_type=nmp" in m for m in messages) + # Remote reports the rejected seed in generic terms: the message refers to + # "seed sources" and the "Files service" rather than calling out "df" specifically. + assert any("seed sources" in m and "Files service" in m for m in messages) async def test_sdk_validate_method_rejects_df_seed_for_local() -> None: diff --git a/plugins/nemo-data-designer/tests/unit/test_context.py b/plugins/nemo-data-designer/tests/unit/test_context.py index 418519e985..f618cfe569 100644 --- a/plugins/nemo-data-designer/tests/unit/test_context.py +++ b/plugins/nemo-data-designer/tests/unit/test_context.py @@ -82,17 +82,13 @@ def validate_tools(validated_config: dd.DataDesignerConfig) -> None: assert validated_config is config calls.append("tools") - def validate_seed_type(validated_config: dd.DataDesignerConfig, *, is_local: bool) -> None: - assert validated_config is config - assert is_local is False - calls.append("seed-type") - async def validate_seed( - validated_config: dd.DataDesignerConfig, workspace: str, async_sdk: AsyncNeMoPlatform + validated_config: dd.DataDesignerConfig, workspace: str, async_sdk: AsyncNeMoPlatform, is_local: bool ) -> None: assert validated_config is config assert workspace == u.WORKSPACE_NAME assert async_sdk is sdk + assert not is_local calls.append("seed") async def validate_personas(validated_config: dd.DataDesignerConfig, async_sdk: AsyncNeMoPlatform) -> None: @@ -101,14 +97,13 @@ async def validate_personas(validated_config: dd.DataDesignerConfig, async_sdk: calls.append("personas") monkeypatch.setattr("data_designer_nemo.context.validate_no_tool_configs", validate_tools) - monkeypatch.setattr("data_designer_nemo.context.validate_seed_config_for_execution_context", validate_seed_type) monkeypatch.setattr("data_designer_nemo.context.validate_seed", validate_seed)
monkeypatch.setattr("data_designer_nemo.context.ensure_nemotron_personas_filesets", validate_personas) errors = await RemoteDataDesignerContext(sdk, u.WORKSPACE_NAME).validate(config) assert errors == [] - assert calls == ["tools", "seed-type", "seed", "personas"] + assert calls == ["tools", "seed", "personas"] async def test_remote_validate_rejects_unsupported_seed_config() -> None: diff --git a/plugins/nemo-data-designer/tests/unit/test_model_provider.py b/plugins/nemo-data-designer/tests/unit/test_model_provider.py index 4a0b8449eb..e27c3fb479 100644 --- a/plugins/nemo-data-designer/tests/unit/test_model_provider.py +++ b/plugins/nemo-data-designer/tests/unit/test_model_provider.py @@ -1,41 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - -import data_designer.config as dd import nemo_data_designer_plugin.testing.utils as u import pytest -from data_designer_nemo.errors import NDDInvalidConfigError from data_designer_nemo.model_provider import ( - make_local_first_model_provider_registry, make_model_provider_registry, make_null_registry, ) -@pytest.mark.asyncio -async def test_local_first_provider_cannot_be_none() -> None: - """When a local-first registry build sees a missing provider, it must fail fast - *before* hitting the local-provider lookup helper. 
- """ - bad_model_configs = [dd.ModelConfig(alias="no-provider-specified", model="some-model")] - - with u.make_mock_client_context() as client_context: - with ( - patch("data_designer_nemo.model_provider.get_default_providers") as default_lookup, - pytest.raises(NDDInvalidConfigError) as exc_info, - ): - await make_local_first_model_provider_registry( - bad_model_configs, - sdk=client_context.async_sdk, - default_workspace=u.WORKSPACE_NAME, - ) - - default_lookup.assert_not_called() - assert "explicit provider defined" in str(exc_info.value) - - @pytest.mark.asyncio async def test_no_model_configs_returns_none() -> None: """``make_model_provider_registry`` returns None for an empty model-config list, @@ -55,4 +28,3 @@ def test_null_registry() -> None: registry = make_null_registry() assert len(registry.providers) == 1 - assert registry.default == "no-op" diff --git a/plugins/nemo-data-designer/tests/unit/test_preview_function.py b/plugins/nemo-data-designer/tests/unit/test_preview_function.py index ae3d5f0a56..ed4f59c6cd 100644 --- a/plugins/nemo-data-designer/tests/unit/test_preview_function.py +++ b/plugins/nemo-data-designer/tests/unit/test_preview_function.py @@ -24,7 +24,9 @@ def _config() -> dd.DataDesignerConfig: - builder = dd.DataDesignerConfigBuilder(model_configs=[dd.ModelConfig(alias="text", model="model")]) + builder = dd.DataDesignerConfigBuilder( + model_configs=[dd.ModelConfig(alias="text", model="model", provider="default/nvidia")] + ) builder.add_column( column_config=dd.SamplerColumnConfig( name="foo", diff --git a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py index c07ea9b5ba..160da9cbe0 100644 --- a/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py +++ b/plugins/nemo-data-designer/tests/unit/test_sdk_resources.py @@ -71,7 +71,9 @@ def async_resource(async_platform: AsyncNeMoPlatform) -> AsyncDataDesignerResour @pytest.fixture def config_builder() -> 
dd.DataDesignerConfigBuilder: - builder = dd.DataDesignerConfigBuilder(model_configs=[dd.ModelConfig(alias="text", model="model")]) + builder = dd.DataDesignerConfigBuilder( + model_configs=[dd.ModelConfig(alias="text", model="model", provider="default/nvidia")] + ) builder.add_column( column_config=dd.SamplerColumnConfig( name="foo", @@ -204,46 +206,6 @@ def test_preview_collector_propagates_error_frame_message( resource.preview(config_builder) -# --------------------------------------------------------------------------- -# Client-side seed-source validation gate (_get_config_for_api_call) -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize("seed_kind", ["df", "local", "directory", "file_contents"]) -def test_preview_rejects_local_only_seed_sources_before_sending_request( - resource: DataDesignerResource, - config_builder: dd.DataDesignerConfigBuilder, - seed_kind: str, - tmp_path, -) -> None: - """The validation gate inside ``_get_config_for_api_call`` rejects seed sources that - only make sense locally (DataFrame, LocalFile, Directory, FileContents), so the SDK - fails fast with a typed error instead of letting the server emit a 422 round-trip - later. Patching ``_preview`` to raise an AssertionError catches any regression where - the request is sent anyway. 
- """ - if seed_kind == "df": - seed_source = dd.DataFrameSeedSource(df=pd.DataFrame(data={"foo": [1, 2, 3]})) - elif seed_kind == "local": - seed_file = tmp_path / "seed.parquet" - _make_basic_dataset().to_parquet(seed_file) - seed_source = dd.LocalFileSeedSource(path=str(seed_file)) - elif seed_kind == "directory": - seed_source = dd.DirectorySeedSource(path=str(tmp_path)) - else: - seed_source = dd.FileContentsSeedSource(path=str(tmp_path)) - - config_builder.with_seed_dataset(seed_source) - - with ( - patch.object(resource, "_preview", side_effect=AssertionError("preview request should not be sent")), - pytest.raises(DataDesignerConfigValidationError) as exc_info, - ): - resource.preview(config_builder) - - assert "only supports seed data" in str(exc_info.value) - - # --------------------------------------------------------------------------- # Default model surfaces # ---------------------------------------------------------------------------