diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index 12a857ac4e..56d323ce68 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -1,9 +1,7 @@ """E2E tests for platform jobs. -These tests submit jobs with CPUExecutionProviderSpec (container + command). -The container image is omitted so that: -- On subprocess mode, the cpu→subprocess translation discards it anyway. -- On Kubernetes/Docker, the execution profile's default_task_image is used. +These tests submit jobs with SubprocessExecutionProviderSpec (host command). +The e2e test environment runs against the subprocess backend. Ported from Platform-Deploy e2e/test_jobs.py, adapted for the SDK's TypedDict param types and filtered to tests that work without Docker. @@ -56,10 +54,9 @@ def test_basic_platform_job_lifecycle(sdk: NeMoPlatform, workspace: str): { "name": "echo-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["echo", "Hello from e2e test!"], - }, + "command": ["echo", "Hello from e2e test!"], }, }, ], @@ -99,10 +96,9 @@ def test_job_logs_across_multiple_batches(sdk: NeMoPlatform, workspace: str): { "name": "multi-log-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", log_command], - }, + "command": ["sh", "-c", log_command], }, }, ], @@ -138,10 +134,9 @@ def test_job_config_is_readable(sdk: NeMoPlatform, workspace: str): { "name": "config-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "echo 'Step config:'; cat $NEMO_JOB_STEP_CONFIG_FILE_PATH;"], - }, + "command": ["sh", "-c", "echo 'Step config:'; cat $NEMO_JOB_STEP_CONFIG_FILE_PATH;"], }, "config": { "message": "Hello from job config!", @@ -172,27 +167,25 @@ def test_job_passing_data_between_steps(sdk: NeMoPlatform, workspace: str): { "name": "generate-data-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Data from first step' > $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", - ], - }, + "command": [ + "sh", + "-c", + "echo 'Data from first step' > $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", + ], }, }, { "name": "consume-data-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Consuming data:'; cat $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", - ], - }, + "command": [ + "sh", + "-c", + "echo 'Consuming data:'; cat $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", + ], }, }, ], @@ -228,10 +221,9 @@ def test_job_using_secret_environment_variable(sdk: NeMoPlatform, workspace: str { "name": "secret-envvar-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], - }, + "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], }, "environment": [ { @@ -276,10 +268,9 @@ def test_job_with_expected_failure(sdk: NeMoPlatform, workspace: str): { "name": "failing-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "echo 'This step will fail'; exit 1;"], - }, + "command": ["sh", "-c", "echo 'This step will fail'; exit 1;"], }, }, ], @@ -305,10 +296,9 @@ def test_job_cancel_immediately(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 60"], - }, + "command": ["sh", "-c", "sleep 60"], }, }, ], @@ -334,10 +324,9 @@ def test_job_cancel_once_active(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -374,10 +363,9 @@ def test_job_pause_resume(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-resume", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -415,10 +403,9 @@ def test_job_pause_and_cancel(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-cancel", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -451,29 +438,27 @@ def test_job_using_additional_volume(sdk: NeMoPlatform, workspace: str): { "name": "write-data", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Hello, World!' > /mnt/additional_storage/shared_data.txt; " - "echo 'Successfully wrote data to persistent storage';", - ], - }, + "command": [ + "sh", + "-c", + "echo 'Hello, World!' > /mnt/additional_storage/shared_data.txt; " + "echo 'Successfully wrote data to persistent storage';", + ], }, }, { "name": "read-data", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "cat /mnt/additional_storage/shared_data.txt; " - "echo 'Successfully read data from persistent storage';", - ], - }, + "command": [ + "sh", + "-c", + "cat /mnt/additional_storage/shared_data.txt; " + "echo 'Successfully read data from persistent storage';", + ], }, }, ], @@ -506,11 +491,10 @@ def test_job_invalid_image_format(sdk: NeMoPlatform, workspace: str, bad_image: { "name": "bad-image-step", "executor": { + "kind": "subprocess", "provider": "cpu", - "container": { - "image": bad_image, - "command": ["echo", "This should not run"], - }, + "image": bad_image, + "command": ["echo", "This should not run"], }, }, ], diff --git a/openapi/ga/individual/platform.openapi.yaml b/openapi/ga/individual/platform.openapi.yaml index bd1e211721..6b5d15f300 100644 --- a/openapi/ga/individual/platform.openapi.yaml +++ b/openapi/ga/individual/platform.openapi.yaml @@ -8227,60 +8227,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8740,6 +8686,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9521,60 +9541,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. - DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -9588,6 +9554,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: docker @@ -9735,6 +9710,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11073,60 +11057,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -12173,6 +12103,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -14910,18 +14849,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14960,18 +14895,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -17007,15 +16938,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -17025,20 +16963,30 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -18111,6 +18059,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/openapi/ga/openapi.yaml b/openapi/ga/openapi.yaml index bd1e211721..6b5d15f300 100644 --- a/openapi/ga/openapi.yaml +++ b/openapi/ga/openapi.yaml @@ -8227,60 +8227,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8740,6 +8686,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9521,60 +9541,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. - DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -9588,6 +9554,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: docker @@ -9735,6 +9710,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11073,60 +11057,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -12173,6 +12103,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -14910,18 +14849,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14960,18 +14895,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -17007,15 +16938,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -17025,20 +16963,30 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -18111,6 +18059,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/openapi/openapi.yaml b/openapi/openapi.yaml index bd1e211721..6b5d15f300 100644 --- a/openapi/openapi.yaml +++ b/openapi/openapi.yaml @@ -8227,60 +8227,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8740,6 +8686,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9521,60 +9541,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. - DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -9588,6 +9554,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: docker @@ -9735,6 +9710,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11073,60 +11057,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -12173,6 +12103,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -14910,18 +14849,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14960,18 +14895,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -17007,15 +16938,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -17025,20 +16963,30 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -18111,6 +18059,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py index ea91db6f39..0bcc7747ba 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py @@ -273,6 +273,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: AsyncNeMoPlatform, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> object: @@ -285,6 +286,13 @@ async def compile( must override this method; the plugin service produces the ``PlatformJobSpec`` the Jobs service expects by invoking it. + The ``kind`` parameter is the resolved executor payload shape + (``"container"`` or ``"subprocess"``), resolved by the framework + from the submitter's profile before ``compile()`` is called. + Compilers use this to decide which executor type to emit without + querying execution profiles themselves. ``profile`` is also + provided for compilers that need to stamp it on specific steps. + Args: workspace: Workspace scope. spec: Canonical :attr:`spec_schema` instance. @@ -293,9 +301,11 @@ async def compile( async_sdk: ``AsyncNeMoPlatform`` handle. Same contract as :meth:`to_spec`: this runs in the API process so only the async client is offered. - profile: Submitter-selected profile. The factory applies - ``stamp_profile(spec, profile)`` after this method - returns; per-step overrides set here take precedence. + kind: Resolved executor payload shape — ``"container"`` or + ``"subprocess"``. Defaults to ``"container"``. + profile: The submitter-selected execution profile name + (e.g. ``"subprocess"``, ``"default"``). ``None`` when + no profile was specified. options: Opaque wire ``{"": {...}}`` bag; read keys defensively. diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index 63981b18e2..20c22f00d4 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -34,10 +34,8 @@ from nemo_platform.types.jobs import ( ComputeResourcesParam, ComputeResourceSpecParam, + ContainerExecutionProviderParam, ContainerSpecParam, - CPUExecutionProviderParam, - DistributedGPUExecutionProviderParam, - GPUExecutionProviderParam, PlatformJobEnvironmentVariableParam, PlatformJobSecretEnvironmentVariableRefParam, PlatformJobSpecParam, @@ -75,9 +73,7 @@ PlatformJobStep = PlatformJobStepSpecParam StepLifecycle = StepLifecycleParam ExecutorSpec = Executor -CPUExecutionProviderSpec = CPUExecutionProviderParam -GPUExecutionProviderSpec = GPUExecutionProviderParam -DistributedGPUExecutionProviderSpec = DistributedGPUExecutionProviderParam +ContainerExecutionProviderSpec = ContainerExecutionProviderParam SubprocessExecutionProviderSpec = SubprocessExecutionProviderParam ResourcesSpec = ComputeResourcesParam ResourcesLimitsSpec = ComputeResourceSpecParam @@ -102,6 +98,7 @@ class BaseJobRequest(BaseModel, Generic[JobConfigT]): spec: JobConfigT ownership: dict | None = None custom_fields: dict | None = None + profile: str | None = None class BaseJob(BaseModel, Generic[JobConfigT]): @@ -442,11 +439,15 @@ class PlatformJobResultRoute(BaseModel): # Signature: (workspace, original_spec, transformed_spec, entity_client, job_name, sdk) -> PlatformJobSpec # job_name is the resolved name (user-provided or auto-generated), None when no name is available # sdk is always provided for accessing secrets, files, and models with user context +# kind is the resolved executor payload shape ("container" or "subprocess") +# profile is the submitter-selected execution profile name PlatformJobSpecCompiler = Callable[ - [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform], PlatformJobSpec + [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform, str | None, str | None], + PlatformJobSpec, ] PlatformJobSpecCompilerAsync = Callable[ - [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform], Awaitable[PlatformJobSpec] + [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform, str | None, str | None], + Awaitable[PlatformJobSpec], ] # Input-to-output transformer types: receives job_name to use for related fields (e.g., output) @@ -628,6 +629,8 @@ async def _compile_platform_spec( job_name: str | None, service_name: str, sdk: AsyncNeMoPlatform, + profile: str | None = None, + default_provider: str = "cpu", ) -> PlatformJobSpec: """Compile input and output specs into a PlatformJobSpec for execution. @@ -635,6 +638,10 @@ async def _compile_platform_spec( (with auto-generated fields), allowing it to distinguish between user intent and system-generated values. + The ``kind`` (executor payload shape) is resolved here from the + ``(provider, profile)`` pair before being passed to the compiler, so + individual compilers never need to query execution profiles themselves. + Supports both sync and async compiler callables. Validates the resulting spec for common misconfigurations. @@ -642,15 +649,36 @@ async def _compile_platform_spec( HTTPException(422): If the compiler raises PlatformJobCompilationError. PermissionError: If the compiler raises a PermissionError. """ + from nemo_platform_plugin.jobs.profiles import resolve_profile_kind + + kind: str = "container" + if profile is not None: + try: + kind = await resolve_profile_kind(sdk, default_provider, profile) + except PlatformJobCompilationError: + logger.warning( + "Could not resolve kind for profile '%s/%s', defaulting to container", default_provider, profile + ) + kind = "container" + try: if inspect.iscoroutinefunction(compiler): - platform_spec = await compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk) + platform_spec = await compiler( + workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind, profile + ) else: # Run sync compilers in a thread pool to avoid blocking the event loop. platform_spec = await to_thread.run_sync( - partial(compiler, workspace, original_spec, transformed_spec, entity_client, job_name, sdk) + partial( + compiler, workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind, profile + ) ) + if profile is not None: + from nemo_platform_plugin.jobs.profile import stamp_profile + + stamp_profile(platform_spec, profile) + _validate_job_spec(platform_spec) return platform_spec except PermissionError as e: @@ -819,6 +847,7 @@ async def create_job( job_name, service_name, sdk, + profile=request.profile, ) # Create the job using the SDK pointed to the platform jobs microservice. diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py index 1e45c7405a..fc82afcc75 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py @@ -23,10 +23,9 @@ def compile(self, *, profile, ...): Why it lives in ``nemo_platform_plugin`` and not in the Jobs service: Each step's ``executor`` is a discriminated union -(``CPUExecutionProviderParam`` / ``GPUExecutionProviderParam`` / -``DistributedGPUExecutionProviderParam``) from the generated -``nemo_platform`` SDK. All three carry a ``profile: str`` field — that's the -only attribute the stamper touches. Keeping the helper in ``nemo_platform_plugin`` +(``ContainerExecutionProviderParam`` / ``SubprocessExecutionProviderParam``) +from the generated ``nemo_platform`` SDK. Both carry a ``profile: str`` +field — that's the only attribute the stamper touches. Keeping the helper in ``nemo_platform_plugin`` alongside the factory avoids dragging plugin-service code through the Jobs service's internals and matches where ``add_job_routes()`` already lives. """ diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py new file mode 100644 index 0000000000..86982fd459 --- /dev/null +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Execution profile resolution for plugin compilers. + +Provides :func:`resolve_profile_kind` which queries the Jobs service's +execution profiles endpoint to determine what executor payload shape +(``"container"`` or ``"subprocess"``) a given ``(provider, profile)`` +pair expects. + +Plugin compilers use this to emit the correct executor type without +hardcoding profile-name-to-kind mappings:: + + from nemo_platform_plugin.jobs.profiles import resolve_profile_kind + + kind = await resolve_profile_kind(async_sdk, "cpu", profile or "default") + if kind == "subprocess": + executor = SubprocessExecutionProviderSpec(...) + else: + executor = ContainerExecutionProviderSpec(...) + +.. note:: + + **Not the long-term strategy.** This client-side resolution is a + pragmatic bridge. The end-state (Razvan's ``compile_default`` design + from AIRCORE-397) moves compilation to the Jobs service backend + itself — the backend knows its own kind and constructs the executor + server-side. When that lands, plugins will post a ``PluginJobSpec`` + (just the domain payload + metadata) and this helper becomes + unnecessary. See ``plan-default-compilation.md`` in the AIRCORE-397 + architecture plans. +""" + +from __future__ import annotations + +import logging +import time +from typing import Literal + +from nemo_platform import AsyncNeMoPlatform +from nemo_platform.types.jobs.job_list_execution_profiles_response import JobListExecutionProfilesResponseItem +from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError + +ExecutorKind = Literal["container", "subprocess"] +"""Executor payload shape: ``"container"`` for image-backed work, ``"subprocess"`` for host commands.""" + +logger = logging.getLogger(__name__) + +# TODO(AIRCORE-397): Remove this module when compile_default() lands on +# the backend classes. At that point the Jobs service resolves the +# profile kind server-side and plugin compilers no longer need to query +# execution profiles. + +_CACHE_TTL_SECONDS = 300 # 5 minutes +_cached_profiles: list[JobListExecutionProfilesResponseItem] | None = None +_cached_at: float = 0.0 + + +async def _fetch_execution_profiles(sdk: AsyncNeMoPlatform) -> list[JobListExecutionProfilesResponseItem]: + """Fetch execution profiles from the Jobs service, with caching.""" + global _cached_profiles, _cached_at + now = time.monotonic() + if _cached_profiles is not None and (now - _cached_at) < _CACHE_TTL_SECONDS: + return _cached_profiles + + profiles = await sdk.jobs.list_execution_profiles() + _cached_profiles = profiles + _cached_at = now + return profiles + + +async def resolve_profile_kind( + sdk: AsyncNeMoPlatform, + provider: str, + profile: str, +) -> ExecutorKind: + """Resolve the executor payload kind for a ``(provider, profile)`` pair. + + Queries the Jobs service's ``GET /v2/execution-profiles`` endpoint + (cached for 5 minutes) and returns the profile's ``kind`` field + (``"container"`` or ``"subprocess"``). + + Args: + sdk: Async platform SDK client. + provider: Compute provider (``"cpu"``, ``"gpu"``, ``"gpu_distributed"``). + profile: Execution profile name (``"default"``, ``"subprocess"``, etc.). + + Returns: + ``"container"`` or ``"subprocess"``. + + Raises: + PlatformJobCompilationError: If no matching execution profile is found. + """ + profiles = await _fetch_execution_profiles(sdk) + for p in profiles: + if p.provider == provider and p.profile == profile and p.kind is not None: + return p.kind + + raise PlatformJobCompilationError( + f"Execution profile '{provider}/{profile}' not found. " + f"Check that the Jobs service has a profile registered for provider='{provider}', profile='{profile}'." + ) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py index 4a921a613d..caa2df6fbc 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py @@ -58,7 +58,6 @@ job_route_factory, ) from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError -from nemo_platform_plugin.jobs.profile import stamp_profile if TYPE_CHECKING: from collections.abc import Callable @@ -258,11 +257,12 @@ def _adapt_compile( """Bridge ``NemoJob.compile`` to the factory's ``platform_job_config_compiler`` shape. The factory calls ``compiler(workspace, original_spec, transformed_spec, - entity_client, job_name, sdk)``. :meth:`NemoJob.compile` is an - ``async classmethod`` that uses kwargs and also accepts - ``profile`` / ``options`` — phase 1 MR 1.1b passes ``None`` for - both (body-field wiring is a follow-up). After ``compile`` returns, - the adapter applies :func:`stamp_profile` with ``default_profile``. + entity_client, job_name, sdk, kind)`` where ``kind`` is the resolved + executor payload shape (``"container"`` or ``"subprocess"``), already + resolved by ``_compile_platform_spec`` from the submitter's profile. + + ``stamp_profile`` is applied by ``_compile_platform_spec`` after this + adapter returns — the adapter only handles the NemoJob.compile bridge. Missing-override errors from the ``NemoJob.compile`` base marker become :class:`PlatformJobCompilationError` so the factory's @@ -276,6 +276,8 @@ async def compile_adapter( entity_client: Any, job_name: str | None, sdk: Any, + kind: str = "container", + profile: str | None = None, ) -> Any: del original_spec # NemoJob.compile only needs the canonical (transformed) spec try: @@ -285,13 +287,13 @@ async def compile_adapter( entity_client=entity_client, job_name=job_name, async_sdk=sdk, - profile=None, + kind=kind, + profile=profile, options=None, ) except NotImplementedError as exc: raise PlatformJobCompilationError(str(exc)) from exc - stamp_profile(result, default_profile) return result return compile_adapter diff --git a/packages/nemo_platform_plugin/tests/test_jobs_filter.py b/packages/nemo_platform_plugin/tests/test_jobs_filter.py index 1fafe7d9e5..e14d4d83ab 100644 --- a/packages/nemo_platform_plugin/tests/test_jobs_filter.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_filter.py @@ -42,7 +42,7 @@ class _Spec(BaseModel): foo: str = "bar" -def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk): +def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind="container", profile=None): return {"steps": []} diff --git a/packages/nemo_platform_plugin/tests/test_jobs_routes.py b/packages/nemo_platform_plugin/tests/test_jobs_routes.py index e114a4a32d..51193d2ebc 100644 --- a/packages/nemo_platform_plugin/tests/test_jobs_routes.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_routes.py @@ -73,6 +73,7 @@ async def compile( entity_client, job_name, async_sdk, + kind="container", profile=None, options=None, ): @@ -115,6 +116,7 @@ async def compile( entity_client, job_name, async_sdk, + kind="container", profile=None, options=None, ): @@ -274,15 +276,14 @@ def run(self, config: dict) -> dict: @pytest.mark.asyncio -async def test_compile_adapter_invokes_nemo_compile_and_stamps_default_profile() -> None: +async def test_compile_adapter_invokes_nemo_compile() -> None: adapter = _adapt_compile(_WidgetJob, default_profile="research") spec = _WidgetSpec(name="w") - platform_spec = await adapter("ws", spec, spec, "entity_client", "job-1", "sdk") + # Adapter receives kind and profile from _compile_platform_spec. + # Profile stamping is now done by _compile_platform_spec, not the adapter. + platform_spec = await adapter("ws", spec, spec, "entity_client", "job-1", "sdk", "container", "research") assert isinstance(platform_spec, _FakePlatformSpec) - # Profile stamped on every step since the compiler didn't set one. - for step in platform_spec.steps: - assert step.executor.profile == "research" @pytest.mark.asyncio @@ -299,7 +300,7 @@ async def compile(cls, **kwargs): return _FakePlatformSpec(steps=[_FakeStep(profile="explicit")]) adapter = _adapt_compile(CompileSetsProfile, default_profile="default") - platform_spec = await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk") + platform_spec = await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk", None, None) assert platform_spec.steps[0].executor.profile == "explicit" @@ -308,7 +309,7 @@ async def compile(cls, **kwargs): async def test_compile_adapter_converts_not_implemented_to_compilation_error() -> None: adapter = _adapt_compile(_NoCompileJob, default_profile="default") with pytest.raises(PlatformJobCompilationError, match="must override compile"): - await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk") + await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk", None, None) # --------------------------------------------------------------------------- diff --git a/packages/nmp_common/tests/api_factory/test_api_factory.py b/packages/nmp_common/tests/api_factory/test_api_factory.py index cff87af28f..17a33ee14e 100644 --- a/packages/nmp_common/tests/api_factory/test_api_factory.py +++ b/packages/nmp_common/tests/api_factory/test_api_factory.py @@ -18,8 +18,8 @@ from nemo_platform.types.shared.platform_job_status import PlatformJobStatus from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, FileResultSerializer, JobRouteOption, PlatformJobResultRoute, @@ -64,12 +64,15 @@ def foo_job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk, + kind: str = "container", + profile: str | None = None, ) -> PlatformJobSpec: return PlatformJobSpec( steps=[ PlatformJobStep( name="foo_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec( @@ -121,7 +124,9 @@ def test_api_factory_routes(): def test_validate_job_spec(): - executor = CPUExecutionProviderSpec(provider="cpu", profile="default", container=ContainerSpec(image="foo_image")) + executor = ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image") + ) valid_job = PlatformJobSpec( steps=[ PlatformJobStep( @@ -1386,7 +1391,14 @@ def test_create_job_injects_workspace_and_entity_client(): received_entity_client = None def compiler( - workspace: str, input_spec: FooJobConfig, output_spec: FooJobConfig, entity_client, job_name: str | None, sdk + workspace: str, + input_spec: FooJobConfig, + output_spec: FooJobConfig, + entity_client, + job_name: str | None, + sdk, + kind: str = "container", + profile: str | None = None, ) -> PlatformJobSpec: nonlocal received_workspace received_workspace = workspace @@ -1396,7 +1408,8 @@ def compiler( steps=[ PlatformJobStep( name="test_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="test_image"), @@ -1435,7 +1448,14 @@ def test_sync_compiler_is_called_correctly(): compiler_called = False def sync_compiler( - workspace: str, input_spec: FooJobConfig, output_spec: FooJobConfig, entity_client, job_name: str | None, sdk + workspace: str, + input_spec: FooJobConfig, + output_spec: FooJobConfig, + entity_client, + job_name: str | None, + sdk, + kind: str = "container", + profile: str | None = None, ) -> PlatformJobSpec: nonlocal compiler_called compiler_called = True @@ -1443,7 +1463,8 @@ def sync_compiler( steps=[ PlatformJobStep( name="test_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="test_image"), @@ -1659,7 +1680,8 @@ def _make_platform_spec(self, output_spec: FooJobConfig) -> PlatformJobSpec: steps=[ PlatformJobStep( name="step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="img"), @@ -1675,7 +1697,7 @@ async def test_sync_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): return expected result = await _compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1687,7 +1709,7 @@ async def test_async_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): return expected result = await _compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1698,7 +1720,7 @@ async def test_compilation_error_becomes_422(self): """PlatformJobCompilationError is wrapped in HTTPException 422.""" from fastapi import HTTPException - def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): raise PlatformJobCompilationError("missing field") spec = FooJobConfig(foo="a", bar=1) @@ -1713,13 +1735,16 @@ async def test_validate_job_spec_is_called(self): """_validate_job_spec is invoked on the compiled result (catches non-serializable config).""" from fastapi import HTTPException - def compiler_bad_config(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def compiler_bad_config( + workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None + ): # Return a spec whose step config is not JSON serializable return PlatformJobSpec( steps=[ PlatformJobStep( name="step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="img"), diff --git a/packages/nmp_common/tests/jobs/test_docker.py b/packages/nmp_common/tests/jobs/test_docker.py index 60b54000b0..042a3c057f 100644 --- a/packages/nmp_common/tests/jobs/test_docker.py +++ b/packages/nmp_common/tests/jobs/test_docker.py @@ -7,9 +7,8 @@ import pytest from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, - GPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, ) @@ -20,11 +19,11 @@ def test_spec_has_gpu_step(): """spec_has_gpu_step returns True when any step has provider gpu or gpu_distributed.""" - cpu_executor = CPUExecutionProviderSpec( - provider="cpu", profile="default", container=ContainerSpec(image="foo_image") + cpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image") ) - gpu_executor = GPUExecutionProviderSpec( - provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") + gpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") ) cpu_only_job = PlatformJobSpec( steps=[ @@ -61,8 +60,8 @@ def test_spec_has_gpu_step(): ) def test_validate_gpu_available_for_docker(runtime, reserved_gpu_ids, config_raises, expect_raise, message_contains): """GPU job validation: raise when Docker has no GPUs; pass or skip otherwise.""" - gpu_executor = GPUExecutionProviderSpec( - provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") + gpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") ) gpu_job = PlatformJobSpec( steps=[ diff --git a/packages/nmp_platform/config/local.yaml b/packages/nmp_platform/config/local.yaml index 7b7542da0c..9c67656982 100644 --- a/packages/nmp_platform/config/local.yaml +++ b/packages/nmp_platform/config/local.yaml @@ -62,8 +62,8 @@ jobs: # keys; `merge_executor_profiles` keys on (provider, profile) so subprocess # and Docker entries coexist. executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs diff --git a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml index bcb5a60a55..ad351abadd 100644 --- a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml +++ b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml @@ -19,8 +19,8 @@ entities: {} jobs: executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs diff --git a/packages/nmp_testing/src/nmp/testing/jobs.py b/packages/nmp_testing/src/nmp/testing/jobs.py index c5388c39bd..1bd5b0f234 100644 --- a/packages/nmp_testing/src/nmp/testing/jobs.py +++ b/packages/nmp_testing/src/nmp/testing/jobs.py @@ -30,7 +30,10 @@ def subprocess_job_executor_patch( from nmp.core.jobs.api.v2.jobs import endpoints as jobs_endpoints patched_executors = list(executors) - if not any(executor.provider == "subprocess" and executor.profile == profile for executor in patched_executors): + if not any( + getattr(executor, "backend", None) == "subprocess" and getattr(executor, "profile", None) == profile + for executor in patched_executors + ): patched_executors.insert(0, SubprocessJobExecutionProfile(profile=profile)) with ExitStack() as stack: diff --git a/plugins/nemo-agents/openapi/openapi.yaml b/plugins/nemo-agents/openapi/openapi.yaml index 18c511b9f1..4b00e75c7b 100644 --- a/plugins/nemo-agents/openapi/openapi.yaml +++ b/plugins/nemo-agents/openapi/openapi.yaml @@ -2549,6 +2549,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec @@ -2838,6 +2841,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec @@ -3053,6 +3059,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec @@ -3408,6 +3417,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec @@ -3624,6 +3636,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py index 36f55842f6..6cb1a51dff 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py @@ -64,6 +64,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: @@ -107,7 +108,8 @@ async def compile( # type: ignore[override] PlatformJobStep( name="analyze", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.analyze"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py index 2eeda79456..9a7af615ea 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py @@ -158,6 +158,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: @@ -182,7 +183,8 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-agent", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py index 61146380ab..d955efe9c5 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py @@ -137,6 +137,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: @@ -201,7 +202,8 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-suite", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py index 9f7edd10e1..e337e98dde 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py @@ -138,6 +138,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: @@ -164,7 +165,8 @@ async def compile( # type: ignore[override] PlatformJobStep( name="optimize-agent", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py index 6993b491bc..9006951762 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py @@ -87,6 +87,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: @@ -147,7 +148,8 @@ async def compile( # type: ignore[override] PlatformJobStep( name="optimize-skills", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize_skills"], ), config=spec_dict, diff --git a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py index 83252f691e..3747e4966f 100644 --- a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py +++ b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py @@ -36,8 +36,8 @@ async def test_compile_produces_single_cpu_step() -> None: assert len(steps) == 1 step = steps[0] assert step["name"] == "evaluate-agent" - assert step["executor"]["provider"] == "subprocess" - assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate"] + assert step["executor"]["provider"] == "cpu" + assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate"] # type: ignore[invalid-key] assert step["config"]["agent"] == "calc" assert step["config"]["eval_config"] == "config.yml" assert step["config"]["eval_config_fileset"] == "nemo-agent-eval-calc" diff --git a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py index a80e3bd803..1221f448c8 100644 --- a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py +++ b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py @@ -86,8 +86,8 @@ async def test_evaluate_suite_compile_produces_single_subprocess_step() -> None: assert len(steps) == 1 step = steps[0] assert step["name"] == "evaluate-suite" - assert step["executor"]["provider"] == "subprocess" - assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"] + assert step["executor"]["provider"] == "cpu" + assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"] # type: ignore[invalid-key] assert step["config"]["evals"] == "/abs/evals" assert step["config"]["agent"] == "/abs/agent" @@ -199,7 +199,7 @@ async def test_optimize_skills_compile_produces_single_subprocess_step() -> None step = steps[0] assert step["name"] == "optimize-skills" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.optimize_skills"] env = {e["name"]: e for e in step["environment"]} @@ -279,7 +279,7 @@ async def test_analyze_compile_produces_single_subprocess_step() -> None: step = next(iter(platform_spec["steps"])) assert step["name"] == "analyze" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.analyze"] @@ -328,7 +328,7 @@ async def test_optimize_agent_compile_produces_single_subprocess_step() -> None: step = next(iter(platform_spec["steps"])) assert step["name"] == "optimize-agent" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.optimize"] assert step["config"]["workspace"] == "staging" diff --git a/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py b/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py index 78905bbbe6..5dcb54f472 100644 --- a/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py +++ b/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py @@ -26,8 +26,8 @@ from nemo_platform_plugin.job import NemoJob from nemo_platform_plugin.job_context import JobContext from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -114,7 +114,8 @@ async def compile( steps=[ PlatformJobStep( name="anonymizer-job", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", profile=profile or "default", provider="cpu", container=ContainerSpec( diff --git a/plugins/nemo-data-designer/openapi/openapi.yaml b/plugins/nemo-data-designer/openapi/openapi.yaml index 6da475527a..509f111f17 100644 --- a/plugins/nemo-data-designer/openapi/openapi.yaml +++ b/plugins/nemo-data-designer/openapi/openapi.yaml @@ -834,6 +834,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py index 6deb3aa05d..04db71e5e5 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py @@ -16,10 +16,11 @@ from nemo_platform_plugin.job import NemoJob from nemo_platform_plugin.job_context import JobContext from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, + SubprocessExecutionProviderSpec, ) from nemo_platform_plugin.jobs.image import get_qualified_image from pydantic import BaseModel @@ -66,22 +67,36 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: + resolved_profile = profile or "default" + + if kind == "subprocess": + executor = SubprocessExecutionProviderSpec( + kind="subprocess", + provider="cpu", + profile=resolved_profile, + command=["python", "-m", "nemo_data_designer_plugin.jobs.bridge"], + ) + else: + executor = ContainerExecutionProviderSpec( + kind="container", + provider="cpu", + profile=resolved_profile, + container=ContainerSpec( + image=get_qualified_image("nmp-cpu-tasks"), + entrypoint=["python", "-m"], + command=["nemo_data_designer_plugin.jobs.bridge"], + ), + ) + return PlatformJobSpec( steps=[ PlatformJobStep( name="data-designer-job", - executor=CPUExecutionProviderSpec( - profile=profile or "default", - provider="cpu", - container=ContainerSpec( - image=get_qualified_image("nmp-cpu-tasks"), - entrypoint=["python", "-m"], - command=["nemo_data_designer_plugin.jobs.bridge"], - ), - ), + executor=executor, config=spec.model_dump(), environment=[], ) diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index ecbed63932..2fada6a0c6 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -630,6 +630,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py index 79f9fd557d..57dce42839 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py @@ -8,8 +8,8 @@ from nemo_evaluator.jobs.evaluate import EvaluateSpec from nemo_evaluator_sdk.values import Agent, Model, RunConfig, RunConfigOnline, RunConfigOnlineModel from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, EnvironmentVariableFromSecret, PlatformJobSpec, @@ -80,7 +80,8 @@ def _secret_environment(spec: EvaluateSpec) -> list[EnvironmentVariable]: def _evaluate_step(spec: EvaluateSpec, profile: str | None) -> PlatformJobStep: return PlatformJobStep( name=EVALUATE_STEP_NAME, - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", profile=profile or "default", provider="cpu", container=ContainerSpec( diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py index bade6c835d..620cd79fa6 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py @@ -169,6 +169,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml index 1f8097ffe2..1694156aef 100644 --- a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml +++ b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml @@ -1502,6 +1502,9 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string type: object required: - spec diff --git a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py index bbb663cdc6..5925cae185 100644 --- a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py +++ b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py @@ -15,11 +15,11 @@ from nemo_platform.filesets import FilesetPathError, parse_fileset_ref from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, EnvironmentVariable, EnvironmentVariableFromSecret, FileResultSerializer, - GPUExecutionProviderSpec, PlatformJobResultRoute, PlatformJobSpec, PlatformJobStep, @@ -62,7 +62,8 @@ def _create_job_step(job_config: SafeSynthesizerJobConfig, environment: list[Env return PlatformJobStep( name="safe-synthesizer", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + kind="subprocess", + provider="cpu", profile=config.job_executor_profile, command=command, ), @@ -85,7 +86,8 @@ def _create_job_step(job_config: SafeSynthesizerJobConfig, environment: list[Env ) return PlatformJobStep( name="safe-synthesizer", - executor=GPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile=config.job_executor_profile, container=ContainerSpec( @@ -106,6 +108,8 @@ async def job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, + kind: str = "container", + profile: str | None = None, ) -> PlatformJobSpec: """Compile Safe Synthesizer job config into a platform job.""" del original_spec, entity_client, job_name diff --git a/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py b/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py index 446aee4637..00c118062f 100644 --- a/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py +++ b/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py @@ -92,7 +92,7 @@ async def test_job_config_compiler_with_classify_provider(mock_sdk): mock_sdk.inference.providers.retrieve.assert_awaited_once_with("my-nim", workspace="default") step = next(iter(result["steps"])) - assert step["executor"]["provider"] == "subprocess" + assert step["executor"]["provider"] == "cpu" assert step["executor"]["command"] == ["/runtime/bin/python", "-m", TASK_MODULE] env = {e["name"]: e.get("value") for e in step.get("environment", [])} assert env["CLASSIFY_LLM_ENDPOINT_PATH"] == "/apis/inference-gateway/v2/workspaces/default/provider/my-nim/-/v1" diff --git a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml index bd1e211721..6b5d15f300 100644 --- a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml +++ b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml @@ -8227,60 +8227,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8740,6 +8686,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9521,60 +9541,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. - DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -9588,6 +9554,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: docker @@ -9735,6 +9710,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11073,60 +11057,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -12173,6 +12103,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -14910,18 +14849,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14960,18 +14895,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -17007,15 +16938,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -17025,20 +16963,30 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -18111,6 +18059,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/sdk/python/nemo-platform/.nmpcontext/stainless.yaml b/sdk/python/nemo-platform/.nmpcontext/stainless.yaml index fcbb5c2a58..21f2393f0a 100644 --- a/sdk/python/nemo-platform/.nmpcontext/stainless.yaml +++ b/sdk/python/nemo-platform/.nmpcontext/stainless.yaml @@ -519,20 +519,16 @@ resources: models: compute_resource_spec: ComputeResourceSpec compute_resources: ComputeResources + container_execution_provider: ContainerExecutionProviderOutput + container_execution_provider_param: ContainerExecutionProviderInput container_spec: ContainerSpec - cpu_execution_provider: CPUExecutionProviderOutput - cpu_execution_provider_param: CPUExecutionProviderInput create_platform_job_request: CreatePlatformJobRequest - distributed_gpu_execution_provider: DistributedGPUExecutionProviderOutput - distributed_gpu_execution_provider_param: DistributedGPUExecutionProviderInput docker_job_execution_profile: DockerJobExecutionProfile docker_job_execution_profile_config: DockerJobExecutionProfileConfig docker_job_network_config: DockerJobNetworkConfig docker_job_storage_config: DockerJobStorageConfig docker_volume_mount: DockerVolumeMount e2e_job_execution_profile: E2EJobExecutionProfile - gpu_execution_provider: GPUExecutionProviderOutput - gpu_execution_provider_param: GPUExecutionProviderInput image_pull_secret: ImagePullSecret job_execution_profile_config: JobExecutionProfileConfig kubernetes_empty_dir_volume: KubernetesEmptyDirVolume diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md b/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md index 882f649add..72e7b5ca66 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md @@ -33,7 +33,7 @@ Methods: Types: ```python -from nemo_platform.types.files import FilesetFilter, FilesetMetadata, FilesetMetadataParam +from nemo_platform.types.files import FilesetFilter ``` Methods: diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py b/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py index 2fbd9935dc..9b7afcb95f 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py @@ -34,7 +34,6 @@ from ...pagination import SyncDefaultPagination, AsyncDefaultPagination from ...types.files import ( FilesetPurpose, - FilesetMetadataParam, fileset_list_params, fileset_create_params, fileset_update_params, diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md b/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md index f5bfd41e0c..954ef664c9 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md @@ -6,20 +6,16 @@ Types: from nemo_platform.types.jobs import ( ComputeResourceSpec, ComputeResources, + ContainerExecutionProvider, + ContainerExecutionProviderParam, ContainerSpec, - CPUExecutionProvider, - CPUExecutionProviderParam, CreatePlatformJobRequest, - DistributedGPUExecutionProvider, - DistributedGPUExecutionProviderParam, DockerJobExecutionProfile, DockerJobExecutionProfileConfig, DockerJobNetworkConfig, DockerJobStorageConfig, DockerVolumeMount, E2EJobExecutionProfile, - GPUExecutionProvider, - GPUExecutionProviderParam, ImagePullSecret, JobExecutionProfileConfig, KubernetesEmptyDirVolume, diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py index 571b87927b..eb3af5c4f7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py @@ -32,6 +32,7 @@ PlatformJobLog as PlatformJobLog, ToolCallConfig as ToolCallConfig, APIEndpointData as APIEndpointData, + FilesetMetadata as FilesetMetadata, FileStorageType as FileStorageType, InferenceParams as InferenceParams, LinearLayerSpec as LinearLayerSpec, diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py index b76dd4a694..920d1b889d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py @@ -22,7 +22,7 @@ from .cache_status import CacheStatus as CacheStatus from .fileset_file import FilesetFile as FilesetFile from .fileset_purpose import FilesetPurpose as FilesetPurpose -from .fileset_metadata import FilesetMetadata as FilesetMetadata +from ..shared.fileset_metadata import FilesetMetadata as FilesetMetadata from .s3_storage_config import S3StorageConfig as S3StorageConfig from .ngc_storage_config import NGCStorageConfig as NGCStorageConfig from .fileset_list_params import FilesetListParams as FilesetListParams @@ -33,7 +33,6 @@ from .fileset_create_params import FilesetCreateParams as FilesetCreateParams from .fileset_update_params import FilesetUpdateParams as FilesetUpdateParams from .file_list_files_params import FileListFilesParams as FileListFilesParams -from .fileset_metadata_param import FilesetMetadataParam as FilesetMetadataParam from .file_upload_file_params import FileUploadFileParams as FileUploadFileParams from .s3_storage_config_param import S3StorageConfigParam as S3StorageConfigParam from .ngc_storage_config_param import NGCStorageConfigParam as NGCStorageConfigParam diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py index e6d9642b7a..810d5ce990 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py @@ -20,10 +20,10 @@ from ..._models import BaseModel from .fileset_purpose import FilesetPurpose -from .fileset_metadata import FilesetMetadata from .s3_storage_config import S3StorageConfig from .ngc_storage_config import NGCStorageConfig from .local_storage_config import LocalStorageConfig +from ..shared.fileset_metadata import FilesetMetadata from .huggingface_storage_config import HuggingfaceStorageConfig __all__ = ["Fileset", "Storage"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py index ea3cb763f7..9836fcb477 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py @@ -21,7 +21,6 @@ from typing_extensions import Required, TypeAlias, TypedDict from .fileset_purpose import FilesetPurpose -from .fileset_metadata_param import FilesetMetadataParam from .s3_storage_config_param import S3StorageConfigParam from .ngc_storage_config_param import NGCStorageConfigParam from .local_storage_config_param import LocalStorageConfigParam diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py deleted file mode 100644 index 66f37de921..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import TypedDict - -from ..shared_params.model_metadata_content import ModelMetadataContent -from ..shared_params.dataset_metadata_content import DatasetMetadataContent - -__all__ = ["FilesetMetadataParam"] - - -class FilesetMetadataParam(TypedDict, total=False): - """Tagged metadata container - the key indicates the type. - - Example: - metadata = FilesetMetadata( - dataset=DatasetMetadataContent( - schema={"columns": ["id", "name"]}, - ) - ) - """ - - dataset: DatasetMetadataContent - """Content for dataset-type filesets.""" - - model: ModelMetadataContent - """Content for model-type filesets. - - Contains tool calling configuration that is merged into the ModelSpec during - checkpoint analysis. - """ diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py index 6c6a8d01c1..34c9777606 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py @@ -36,8 +36,6 @@ from .step_lifecycle_param import StepLifecycleParam as StepLifecycleParam from .compute_resource_spec import ComputeResourceSpec as ComputeResourceSpec from .platform_job_response import PlatformJobResponse as PlatformJobResponse -from .cpu_execution_provider import CPUExecutionProvider as CPUExecutionProvider -from .gpu_execution_provider import GPUExecutionProvider as GPUExecutionProvider from .platform_job_step_spec import PlatformJobStepSpec as PlatformJobStepSpec from .compute_resources_param import ComputeResourcesParam as ComputeResourcesParam from .kubernetes_volume_mount import KubernetesVolumeMount as KubernetesVolumeMount @@ -51,9 +49,8 @@ from .compute_resource_spec_param import ComputeResourceSpecParam as ComputeResourceSpecParam from .kubernetes_empty_dir_volume import KubernetesEmptyDirVolume as KubernetesEmptyDirVolume from .platform_job_responses_page import PlatformJobResponsesPage as PlatformJobResponsesPage -from .cpu_execution_provider_param import CPUExecutionProviderParam as CPUExecutionProviderParam +from .container_execution_provider import ContainerExecutionProvider as ContainerExecutionProvider from .docker_job_execution_profile import DockerJobExecutionProfile as DockerJobExecutionProfile -from .gpu_execution_provider_param import GPUExecutionProviderParam as GPUExecutionProviderParam from .job_execution_profile_config import JobExecutionProfileConfig as JobExecutionProfileConfig from .platform_job_step_spec_param import PlatformJobStepSpecParam as PlatformJobStepSpecParam from .task_create_or_update_params import TaskCreateOrUpdateParams as TaskCreateOrUpdateParams @@ -67,7 +64,7 @@ from .kubernetes_job_execution_profile import KubernetesJobExecutionProfile as KubernetesJobExecutionProfile from .subprocess_job_execution_profile import SubprocessJobExecutionProfile as SubprocessJobExecutionProfile from .platform_job_environment_variable import PlatformJobEnvironmentVariable as PlatformJobEnvironmentVariable -from .distributed_gpu_execution_provider import DistributedGPUExecutionProvider as DistributedGPUExecutionProvider +from .container_execution_provider_param import ContainerExecutionProviderParam as ContainerExecutionProviderParam from .kubernetes_persistent_volume_claim import KubernetesPersistentVolumeClaim as KubernetesPersistentVolumeClaim from .docker_job_execution_profile_config import DockerJobExecutionProfileConfig as DockerJobExecutionProfileConfig from .subprocess_execution_provider_param import SubprocessExecutionProviderParam as SubprocessExecutionProviderParam @@ -84,9 +81,6 @@ from .subprocess_job_execution_profile_config import ( SubprocessJobExecutionProfileConfig as SubprocessJobExecutionProfileConfig, ) -from .distributed_gpu_execution_provider_param import ( - DistributedGPUExecutionProviderParam as DistributedGPUExecutionProviderParam, -) from .platform_job_secret_environment_variable_ref import ( PlatformJobSecretEnvironmentVariableRef as PlatformJobSecretEnvironmentVariableRef, ) diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py similarity index 75% rename from sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py rename to sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py index 8df8dc74d4..f18f5c41b7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py @@ -22,14 +22,15 @@ from .container_spec import ContainerSpec from .compute_resources import ComputeResources -__all__ = ["GPUExecutionProvider"] +__all__ = ["ContainerExecutionProvider"] -class GPUExecutionProvider(BaseModel): - """GPU-based execution provider. +class ContainerExecutionProvider(BaseModel): + """Container-based execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ container: ContainerSpec @@ -38,9 +39,11 @@ class GPUExecutionProvider(BaseModel): Defines the container image and related configuration for job execution. """ + kind: Optional[Literal["container"]] = None + profile: Optional[str] = None - provider: Optional[Literal["gpu"]] = None + provider: Optional[Literal["cpu", "gpu", "gpu_distributed"]] = None resources: Optional[ComputeResources] = None """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py similarity index 75% rename from sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py rename to sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py index 2ad674ab22..83971919e7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py @@ -22,14 +22,15 @@ from .container_spec_param import ContainerSpecParam from .compute_resources_param import ComputeResourcesParam -__all__ = ["DistributedGPUExecutionProviderParam"] +__all__ = ["ContainerExecutionProviderParam"] -class DistributedGPUExecutionProviderParam(TypedDict, total=False): - """GPU-based execution provider. +class ContainerExecutionProviderParam(TypedDict, total=False): + """Container-based execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ container: Required[ContainerSpecParam] @@ -38,9 +39,11 @@ class DistributedGPUExecutionProviderParam(TypedDict, total=False): Defines the container image and related configuration for job execution. """ + kind: Literal["container"] + profile: str - provider: Literal["gpu_distributed"] + provider: Literal["cpu", "gpu", "gpu_distributed"] resources: ComputeResourcesParam """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py deleted file mode 100644 index d890dbb9e9..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from ..._models import BaseModel -from .container_spec import ContainerSpec -from .compute_resources import ComputeResources - -__all__ = ["CPUExecutionProvider"] - - -class CPUExecutionProvider(BaseModel): - """CPU-based execution provider. - - Provides configuration for running jobs on CPU resources with - resource requests and limits. - """ - - container: ContainerSpec - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. - """ - - profile: Optional[str] = None - - provider: Optional[Literal["cpu"]] = None - - resources: Optional[ComputeResources] = None - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py deleted file mode 100644 index 02eac5b152..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Literal, Required, TypedDict - -from .container_spec_param import ContainerSpecParam -from .compute_resources_param import ComputeResourcesParam - -__all__ = ["CPUExecutionProviderParam"] - - -class CPUExecutionProviderParam(TypedDict, total=False): - """CPU-based execution provider. - - Provides configuration for running jobs on CPU resources with - resource requests and limits. - """ - - container: Required[ContainerSpecParam] - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. - """ - - profile: str - - provider: Literal["cpu"] - - resources: ComputeResourcesParam - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py deleted file mode 100644 index 5594c9b31c..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from ..._models import BaseModel -from .container_spec import ContainerSpec -from .compute_resources import ComputeResources - -__all__ = ["DistributedGPUExecutionProvider"] - - -class DistributedGPUExecutionProvider(BaseModel): - """GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - container: ContainerSpec - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. - """ - - profile: Optional[str] = None - - provider: Optional[Literal["gpu_distributed"]] = None - - resources: Optional[ComputeResources] = None - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py index db6bfb344a..19b52806dc 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py @@ -36,6 +36,9 @@ class DockerJobExecutionProfile(BaseModel): backend: Optional[Literal["docker"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py index c7a8d0f4df..04f77975c3 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py @@ -36,6 +36,9 @@ class E2EJobExecutionProfile(BaseModel): config: Optional[JobExecutionProfileConfig] = None """Configuration for the e2e test executor""" + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py deleted file mode 100644 index 4471def866..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Literal, Required, TypedDict - -from .container_spec_param import ContainerSpecParam -from .compute_resources_param import ComputeResourcesParam - -__all__ = ["GPUExecutionProviderParam"] - - -class GPUExecutionProviderParam(TypedDict, total=False): - """GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - container: Required[ContainerSpecParam] - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. - """ - - profile: str - - provider: Literal["gpu"] - - resources: ComputeResourcesParam - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py index 81aeec0675..d64092149c 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py @@ -36,6 +36,9 @@ class KubernetesJobExecutionProfile(BaseModel): backend: Optional[Literal["kubernetes_job"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py index c23f1ec802..52267abf3d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py @@ -21,17 +21,14 @@ from ..._utils import PropertyInfo from ..._models import BaseModel from .step_lifecycle import StepLifecycle -from .cpu_execution_provider import CPUExecutionProvider -from .gpu_execution_provider import GPUExecutionProvider +from .container_execution_provider import ContainerExecutionProvider from .subprocess_execution_provider import SubprocessExecutionProvider from .platform_job_environment_variable import PlatformJobEnvironmentVariable -from .distributed_gpu_execution_provider import DistributedGPUExecutionProvider __all__ = ["PlatformJobStepSpec", "Executor"] Executor: TypeAlias = Annotated[ - Union[CPUExecutionProvider, GPUExecutionProvider, DistributedGPUExecutionProvider, SubprocessExecutionProvider], - PropertyInfo(discriminator="provider"), + Union[ContainerExecutionProvider, SubprocessExecutionProvider], PropertyInfo(discriminator="kind") ] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py index d0577d2eb7..0c7397010a 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py @@ -21,20 +21,13 @@ from typing_extensions import Required, TypeAlias, TypedDict from .step_lifecycle_param import StepLifecycleParam -from .cpu_execution_provider_param import CPUExecutionProviderParam -from .gpu_execution_provider_param import GPUExecutionProviderParam +from .container_execution_provider_param import ContainerExecutionProviderParam from .subprocess_execution_provider_param import SubprocessExecutionProviderParam from .platform_job_environment_variable_param import PlatformJobEnvironmentVariableParam -from .distributed_gpu_execution_provider_param import DistributedGPUExecutionProviderParam __all__ = ["PlatformJobStepSpecParam", "Executor"] -Executor: TypeAlias = Union[ - CPUExecutionProviderParam, - GPUExecutionProviderParam, - DistributedGPUExecutionProviderParam, - SubprocessExecutionProviderParam, -] +Executor: TypeAlias = Union[ContainerExecutionProviderParam, SubprocessExecutionProviderParam] class PlatformJobStepSpecParam(TypedDict, total=False): diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py index 8b5536767b..278b24063e 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py @@ -24,10 +24,16 @@ class SubprocessExecutionProvider(BaseModel): - """Host subprocess execution provider.""" + """Host subprocess execution provider. + + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. + """ command: List[str] + kind: Optional[Literal["subprocess"]] = None + profile: Optional[str] = None - provider: Optional[Literal["subprocess"]] = None + provider: Optional[Literal["cpu", "gpu"]] = None diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py index 979c4ab4f5..f46f456f27 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py @@ -25,10 +25,16 @@ class SubprocessExecutionProviderParam(TypedDict, total=False): - """Host subprocess execution provider.""" + """Host subprocess execution provider. + + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. + """ command: Required[SequenceNotStr[str]] + kind: Literal["subprocess"] + profile: str - provider: Literal["subprocess"] + provider: Literal["cpu", "gpu"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py index a83d6acedc..c5154eaf6d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py @@ -30,7 +30,9 @@ class SubprocessJobExecutionProfile(BaseModel): config: Optional[SubprocessJobExecutionProfileConfig] = None """Additional configuration for the subprocess executor""" + kind: Optional[Literal["subprocess"]] = None + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" - provider: Optional[Literal["subprocess"]] = None + provider: Optional[Literal["cpu"]] = None diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py index f4e1456ff2..0954f4563a 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py @@ -32,6 +32,9 @@ class VolcanoJobExecutionProfile(BaseModel): backend: Optional[Literal["volcano_job"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py index 70ea9bdc92..d16fead87f 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py @@ -26,6 +26,7 @@ from .delete_response import DeleteResponse as DeleteResponse from .finetuning_type import FinetuningType as FinetuningType from .pagination_data import PaginationData as PaginationData +from .fileset_metadata import FilesetMetadata as FilesetMetadata from .inference_params import InferenceParams as InferenceParams from .platform_job_log import PlatformJobLog as PlatformJobLog from .tool_call_config import ToolCallConfig as ToolCallConfig diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py similarity index 91% rename from sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py rename to sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py index 36573bd374..b35b6d8ecc 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py @@ -18,8 +18,8 @@ from typing import Optional from ..._models import BaseModel -from ..shared.model_metadata_content import ModelMetadataContent -from ..shared.dataset_metadata_content import DatasetMetadataContent +from .model_metadata_content import ModelMetadataContent +from .dataset_metadata_content import DatasetMetadataContent __all__ = ["FilesetMetadata"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py index 66f37de921..e3f510ca6e 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py @@ -19,8 +19,8 @@ from typing_extensions import TypedDict -from ..shared_params.model_metadata_content import ModelMetadataContent -from ..shared_params.dataset_metadata_content import DatasetMetadataContent +from .model_metadata_content import ModelMetadataContent +from .dataset_metadata_content import DatasetMetadataContent __all__ = ["FilesetMetadataParam"] diff --git a/sdk/python/nemo-platform/tests/api_resources/test_jobs.py b/sdk/python/nemo-platform/tests/api_resources/test_jobs.py index 16c770f140..112805e814 100644 --- a/sdk/python/nemo-platform/tests/api_resources/test_jobs.py +++ b/sdk/python/nemo-platform/tests/api_resources/test_jobs.py @@ -53,7 +53,7 @@ def test_method_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -78,6 +78,7 @@ def test_method_create_with_all_params(self, client: NeMoPlatform) -> None: "entrypoint": ["string"], "image": "x", }, + "kind": "container", "profile": "profile", "provider": "cpu", "resources": { @@ -127,7 +128,7 @@ def test_raw_response_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -152,7 +153,7 @@ def test_streaming_response_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -180,7 +181,7 @@ def test_path_params_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -747,7 +748,7 @@ async def test_method_create(self, async_client: AsyncNeMoPlatform) -> None: { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -772,6 +773,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncNeMoPlatfo "entrypoint": ["string"], "image": "x", }, + "kind": "container", "profile": "profile", "provider": "cpu", "resources": { @@ -821,7 +823,7 @@ async def test_raw_response_create(self, async_client: AsyncNeMoPlatform) -> Non { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -846,7 +848,7 @@ async def test_streaming_response_create(self, async_client: AsyncNeMoPlatform) { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -874,7 +876,7 @@ async def test_path_params_create(self, async_client: AsyncNeMoPlatform) -> None { "executor": { "container": {}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } diff --git a/sdk/stainless.yaml b/sdk/stainless.yaml index fcbb5c2a58..21f2393f0a 100644 --- a/sdk/stainless.yaml +++ b/sdk/stainless.yaml @@ -519,20 +519,16 @@ resources: models: compute_resource_spec: ComputeResourceSpec compute_resources: ComputeResources + container_execution_provider: ContainerExecutionProviderOutput + container_execution_provider_param: ContainerExecutionProviderInput container_spec: ContainerSpec - cpu_execution_provider: CPUExecutionProviderOutput - cpu_execution_provider_param: CPUExecutionProviderInput create_platform_job_request: CreatePlatformJobRequest - distributed_gpu_execution_provider: DistributedGPUExecutionProviderOutput - distributed_gpu_execution_provider_param: DistributedGPUExecutionProviderInput docker_job_execution_profile: DockerJobExecutionProfile docker_job_execution_profile_config: DockerJobExecutionProfileConfig docker_job_network_config: DockerJobNetworkConfig docker_job_storage_config: DockerJobStorageConfig docker_volume_mount: DockerVolumeMount e2e_job_execution_profile: E2EJobExecutionProfile - gpu_execution_provider: GPUExecutionProviderOutput - gpu_execution_provider_param: GPUExecutionProviderInput image_pull_secret: ImagePullSecret job_execution_profile_config: JobExecutionProfileConfig kubernetes_empty_dir_volume: KubernetesEmptyDirVolume diff --git a/services/automodel/src/nmp/automodel/app/jobs/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/compiler.py index 047b73798d..6cf37be7a2 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/compiler.py +++ b/services/automodel/src/nmp/automodel/app/jobs/compiler.py @@ -8,8 +8,8 @@ from nemo_platform import AsyncNeMoPlatform, NotFoundError from nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -447,7 +447,8 @@ async def platform_job_config_compiler( # Step 1: Download model and dataset files from Files service PlatformJobStep( name="model-and-dataset-download", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -469,7 +470,8 @@ async def platform_job_config_compiler( # Step 3: Upload customized model PlatformJobStep( name="model-upload", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -484,7 +486,8 @@ async def platform_job_config_compiler( # Step 4: Create model entity PlatformJobStep( name="model-entity-creation", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py index 9eb1120cca..3b419ece61 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py +++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py @@ -7,10 +7,9 @@ from nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - DistributedGPUExecutionProviderSpec, EnvironmentVariable, - GPUExecutionProviderSpec, PlatformJobStep, ResourcesSpec, StepLifecycle, @@ -193,7 +192,8 @@ def compile_training_step( if p.num_nodes > 1: logger.debug(f"Using distributed GPU executor: num_nodes={p.num_nodes}, num_gpus_per_node={num_gpus_per_node}") - executor = DistributedGPUExecutionProviderSpec( + executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu_distributed", profile=profile, container=container, @@ -204,7 +204,8 @@ def compile_training_step( ) else: logger.debug(f"Using single-node GPU executor: num_gpus={num_gpus_per_node}") - executor = GPUExecutionProviderSpec( + executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile=profile, container=container, diff --git a/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py b/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py index 9a9312895e..2fa61dba8d 100644 --- a/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py +++ b/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py @@ -46,11 +46,10 @@ from nmp.core.jobs.app.ctx import JobContext from nmp.core.jobs.app.dispatcher import JobDispatcher, StateTransitionConflictError from nmp.core.jobs.app.profiles import ExecutionProfileT -from nmp.core.jobs.app.providers import CPUExecutionProvider, SubprocessExecutionProvider from nmp.core.jobs.app.schemas import ( PlatformJobSpec, ) -from nmp.core.jobs.config import config, profiles +from nmp.core.jobs.config import profiles from nmp.core.jobs.entities import PlatformJobStep, PlatformJobTask from pydantic import ValidationError from starlette.responses import FileResponse @@ -102,34 +101,6 @@ def validate_job_spec( ) from e -def translate_cpu_container_steps_to_subprocess( - job_spec: PlatformJobSpec, - subprocess_profiles: set[str], -) -> PlatformJobSpec: - """Translate CPU container steps when explicitly configured for subprocess compatibility.""" - if not subprocess_profiles: - return job_spec - - translated_spec = job_spec.model_copy(deep=True) - for step in translated_spec.steps: - executor = step.executor - if not isinstance(executor, CPUExecutionProvider) or executor.profile not in subprocess_profiles: - continue - command = [*executor.container.entrypoint, *executor.container.command] - if not command: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_CONTENT, - detail=f"Subprocess execution for step '{step.name}' requires container.entrypoint and/or container.command.", - ) - step.executor = SubprocessExecutionProvider(provider="subprocess", profile=executor.profile, command=command) - return translated_spec - - -def configured_subprocess_translation_profiles() -> set[str]: - """Return explicitly configured subprocess profiles that should accept CPU container jobs.""" - return {profile.profile for profile in config.executors if profile.provider == "subprocess"} - - # Execution Profiles Endpoint @router.get("/v2/execution-profiles") async def get_execution_profiles() -> list[ExecutionProfileT]: @@ -149,10 +120,6 @@ async def create_job( sdk: AsyncNeMoPlatform = Depends(get_sdk_client), ) -> PlatformJobResponse: """Create a new platform job.""" - platform_spec = translate_cpu_container_steps_to_subprocess( - request.platform_spec, configured_subprocess_translation_profiles() - ) - request = request.model_copy(update={"platform_spec": platform_spec}) validate_job_spec(request.platform_spec, profiles) try: diff --git a/services/core/jobs/src/nmp/core/jobs/app/providers.py b/services/core/jobs/src/nmp/core/jobs/app/providers.py index c57f6c08bd..a543dc74f7 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/providers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/providers.py @@ -85,80 +85,48 @@ class TaskSpec(BaseModel): """Arguments to pass to the command. Can be a list of strings or a single string.""" -class CPUExecutionProvider(BaseModel): - """ - CPU-based execution provider. - - Provides configuration for running jobs on CPU resources with - resource requests and limits. - """ - - provider: Literal["cpu"] = "cpu" - """The provider type, always 'cpu' for CPU execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" - - container: ContainerSpec - """Container specification defining the execution environment.""" +ExecutorKind = Literal["container", "subprocess"] +"""Executor payload shape: ``"container"`` for image-backed work, ``"subprocess"`` for host commands.""" - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for CPU execution." - ) +class ContainerExecutionProvider(BaseModel): + """Container-based execution provider. -class GPUExecutionProvider(BaseModel): + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ - GPU-based execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ + kind: Literal["container"] = "container" + """Executor payload shape — always ``"container"`` for image-backed work.""" - provider: Literal["gpu"] = "gpu" - """The provider type, always 'gpu' for GPU execution.""" + provider: Literal["cpu", "gpu", "gpu_distributed"] = "cpu" + """Compute requirement: ``cpu``, ``gpu``, or ``gpu_distributed``.""" profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" + """Operator-configured execution profile (e.g. ``"default"``, ``"a100"``).""" container: ContainerSpec """Container specification defining the execution environment.""" - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for GPU execution." - ) + resources: ComputeResources = Field(default_factory=ComputeResources, description="Resource requests and limits.") -class DistributedGPUExecutionProvider(BaseModel): - """ - GPU-based execution provider. +class SubprocessExecutionProvider(BaseModel): + """Host subprocess execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. """ - provider: Literal["gpu_distributed"] = "gpu_distributed" - """The provider type, always 'gpu_distributed' for distributed GPU execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" - - container: ContainerSpec - """Container specification defining the execution environment.""" - - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for distributed GPU execution." - ) + kind: Literal["subprocess"] = "subprocess" + """Executor payload shape — always ``"subprocess"`` for host command execution.""" + provider: Literal["cpu", "gpu"] = "cpu" + """Compute requirement: ``"cpu"`` or ``"gpu"`` (GPU subprocess inherits host devices).""" -class SubprocessExecutionProvider(BaseModel): - """Host subprocess execution provider.""" - - provider: Literal["subprocess"] = "subprocess" - """The provider type, always 'subprocess' for host subprocess execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" + profile: str = "subprocess" + """Execution profile. Defaults to ``"subprocess"`` to match the registered backend.""" command: list[str] """The host command to execute as a list of strings (e.g., ['python', '-m', 'my_task']).""" @@ -170,20 +138,9 @@ def validate_command(self) -> "SubprocessExecutionProvider": return self -# Type alias for the current execution provider implementation -ExecutionProviderT = Union[ - CPUExecutionProvider, GPUExecutionProvider, DistributedGPUExecutionProvider, SubprocessExecutionProvider -] -"""Type alias representing the current execution provider type.""" - -# Discriminated union type for execution providers +# Discriminated union type for execution providers. +# Uses ``kind`` to distinguish container vs subprocess payload shapes. Provider = Annotated[ - ExecutionProviderT, - Field(discriminator="provider"), + Union[ContainerExecutionProvider, SubprocessExecutionProvider], + Field(discriminator="kind"), ] -""" -Discriminated union type for execution providers. - -Uses the 'provider' field to determine the specific provider type. -Currently supports CPU execution providers, with extensibility for future provider types. -""" diff --git a/services/core/jobs/src/nmp/core/jobs/app/schemas.py b/services/core/jobs/src/nmp/core/jobs/app/schemas.py index d5e8b75992..c3f15b19d1 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/schemas.py +++ b/services/core/jobs/src/nmp/core/jobs/app/schemas.py @@ -7,7 +7,7 @@ from nmp.common.entities.constants import NAME_PATTERN, NAME_PATTERN_DESCRIPTION from nmp.common.jobs.constants import PERSISTENT_JOB_STORAGE_PATH_ENVVAR -from nmp.core.jobs.app.providers import Provider +from nmp.core.jobs.app.providers import ExecutorKind, Provider from pydantic import BaseModel, ConfigDict, Field, model_validator # ============================================================================= @@ -130,6 +130,10 @@ class BaseExecutionProfile(BaseModel): default="default", description="The profile name for the executor, e.g., high_priority_a100, low_priority, etc.", ) + kind: ExecutorKind = Field( + default="container", + description="The executor payload shape this profile expects: 'container' or 'subprocess'.", + ) @property def supports_persistent_storage(self) -> bool: diff --git a/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py b/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py index 004560f00b..1d0f469577 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py @@ -7,7 +7,7 @@ that are shared across test files. """ -from nmp.core.jobs.app.providers import ComputeResources, ComputeResourceSpec, ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ComputeResources, ComputeResourceSpec, ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobSpec, PlatformJobStepSpec, @@ -45,7 +45,7 @@ class TestConstants: # etc.) all set both fields; the fixture mirrors that so submissions through the # core /apis/jobs/v2/workspaces/{ws}/jobs endpoint validate successfully and we # exercise the same translation path the user-facing tutorials do. - TEST_EXECUTOR = CPUExecutionProvider( + TEST_EXECUTOR = ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec( diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py index f5d4724fae..8c2870c663 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py @@ -96,8 +96,8 @@ def get_default_executor_profiles_for_runtime( if enable_subprocess_executor: executors.append( SubprocessJobExecutionProfile( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", backend="subprocess", config=defaults.subprocess, ) diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py index 7353358f08..4ae90b318f 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py @@ -61,9 +61,7 @@ from nmp.core.jobs.app.ctx import JobContext from nmp.core.jobs.app.providers import ( ComputeResources, - CPUExecutionProvider, - ExecutionProviderT, - GPUExecutionProvider, + ContainerExecutionProvider, ) from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import ( @@ -113,7 +111,7 @@ def k8s_shm_quantity_to_docker(quantity: str) -> str: DOCKER_STOP_TIMEOUT = int(os.getenv("NEMO_JOBS_DEFAULT_DOCKER_STOP_TIMEOUT", "30")) -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class DockerVolumeMount(BaseModel): @@ -1346,12 +1344,12 @@ def name_for_step(self, step: PlatformJobStepWithContext) -> str: return f"{step.job}-{step.name}" -class CPUDockerJobBackend(DockerJobBackend[CPUExecutionProvider]): +class CPUDockerJobBackend(DockerJobBackend[ContainerExecutionProvider]): """Docker job backend for CPU execution.""" def schedule( self, - executor_config: CPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_single_container(executor_config, step) @@ -1362,12 +1360,12 @@ def sync( ) -> JobUpdate: return self._sync(step) - def configure_container(self, container_args: dict, executor_config: CPUExecutionProvider) -> dict: + def configure_container(self, container_args: dict, executor_config: ContainerExecutionProvider) -> dict: """Customize container arguments for CPU execution.""" return self.apply_resource_limits(container_args, executor_config.resources) -class GPUDockerJobBackend(DockerJobBackend[GPUExecutionProvider]): +class GPUDockerJobBackend(DockerJobBackend[ContainerExecutionProvider]): """Docker job backend for GPU execution.""" def init(self) -> None: @@ -1385,7 +1383,7 @@ def init(self) -> None: def schedule( self, - executor_config: GPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_single_container(executor_config, step) @@ -1406,7 +1404,7 @@ def sync( self.gpu_pool.release_gpu(step.id) return job_update - def configure_container(self, container_args: dict, executor_config: GPUExecutionProvider) -> dict: + def configure_container(self, container_args: dict, executor_config: ContainerExecutionProvider) -> dict: """Customize container arguments for GPU execution.""" # Apply resource limits container_args = self.apply_resource_limits(container_args, executor_config.resources) diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py index d00870793e..16dab8de18 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py @@ -24,10 +24,8 @@ ) from nmp.core.jobs.app.providers import ( ComputeResources, + ContainerExecutionProvider, ContainerSpec, - CPUExecutionProvider, - ExecutionProviderT, - GPUExecutionProvider, ) from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobUpdate, staleness_error_message @@ -51,7 +49,7 @@ logger = logging.getLogger(__name__) -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class KubernetesJobExecutionProfileConfig(BaseKubernetesExecutionProfileConfig): @@ -546,12 +544,12 @@ def cleanup_steps(self): self.terminate_job(job) -class CPUKubernetesJobBackend(KubernetesJobBackend[CPUExecutionProvider]): +class CPUKubernetesJobBackend(KubernetesJobBackend[ContainerExecutionProvider]): """Kubernetes job backend for CPU execution.""" def schedule( self, - executor_config: CPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_job(executor_config.container, step) @@ -563,12 +561,12 @@ def sync( return self._sync(step) -class GPUKubernetesJobBackend(KubernetesJobBackend[GPUExecutionProvider]): +class GPUKubernetesJobBackend(KubernetesJobBackend[ContainerExecutionProvider]): """Kubernetes job backend for GPU execution.""" def schedule( self, - executor_config: GPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: if executor_config.resources is not None and executor_config.resources.num_gpus is not None: diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py index 0d07a89aa0..3d8ca76b21 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py @@ -24,7 +24,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import DistributedGPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobUpdate, staleness_error_message from nmp.core.jobs.controllers.backends.kubernetes.common import ( @@ -86,7 +86,7 @@ def supports_persistent_storage(self) -> bool: class VolcanoJobBackend( - JobBackend[DistributedGPUExecutionProvider, VolcanoJobExecutionProfileConfig], + JobBackend[ContainerExecutionProvider, VolcanoJobExecutionProfileConfig], ): BACKEND_NAME: str = "volcano_job" @@ -151,7 +151,7 @@ def get_volcano_job_list_by_labels(self, labels: dict[str, str]) -> list[dict]: def schedule( self, - executor_config: DistributedGPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: """ diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py index f5fd76accc..73ec1d9bad 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py @@ -56,7 +56,8 @@ class BackendKey: BackendKey("cpu", "kubernetes_job"): CPUKubernetesJobBackend, BackendKey("gpu", "kubernetes_job"): GPUKubernetesJobBackend, BackendKey("gpu_distributed", "volcano_job"): VolcanoJobBackend, - BackendKey("subprocess", "subprocess"): SubprocessJobBackend, + BackendKey("cpu", "subprocess"): SubprocessJobBackend, + BackendKey("gpu", "subprocess"): SubprocessJobBackend, BackendKey("cpu", "e2e"): TestE2ECPUJobBackend, BackendKey("gpu", "e2e"): TestE2EGPUJobBackend, } diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py index 35610fd19a..7b69ab4b22 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py @@ -83,8 +83,9 @@ class SubprocessJobExecutionProfileConfig(JobExecutionProfileConfig): class SubprocessJobExecutionProfile(BaseExecutionProfile): - provider: Literal["subprocess"] = "subprocess" + provider: Literal["cpu"] = "cpu" backend: Literal["subprocess"] = "subprocess" + kind: Literal["subprocess"] = "subprocess" config: SubprocessJobExecutionProfileConfig = Field( default_factory=SubprocessJobExecutionProfileConfig, description="Additional configuration for the subprocess executor", diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py index 0604f82657..7ead19ac6a 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py @@ -6,14 +6,14 @@ from nemo_platform.types.jobs import PlatformJobStepWithContext from nmp.common.jobs.schemas import PlatformJobStatus -from nmp.core.jobs.app.providers import CPUExecutionProvider, ExecutionProviderT, GPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobExecutionProfileConfig, JobUpdate from nmp.core.jobs.controllers.backends.docker import DockerJobExecutionProfileConfig from nmp.core.jobs.controllers.backends.kubernetes import KubernetesJobExecutionProfileConfig from pydantic import Field -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class E2EJobExecutionProfile(BaseExecutionProfile): @@ -136,25 +136,25 @@ def cleanup_steps(self): return -class TestE2ECPUJobBackend(TestE2EJobBackend[CPUExecutionProvider]): +class TestE2ECPUJobBackend(TestE2EJobBackend[ContainerExecutionProvider]): pass -class TestE2EGPUJobBackend(TestE2EJobBackend[GPUExecutionProvider]): +class TestE2EGPUJobBackend(TestE2EJobBackend[ContainerExecutionProvider]): pass -class MockDockerCPUJobBackend(MockDockerJobBackend[CPUExecutionProvider]): +class MockDockerCPUJobBackend(MockDockerJobBackend[ContainerExecutionProvider]): pass -class MockDockerGPUJobBackend(MockDockerJobBackend[GPUExecutionProvider]): +class MockDockerGPUJobBackend(MockDockerJobBackend[ContainerExecutionProvider]): pass -class MockKubernetesCPUJobBackend(MockKubernetesJobBackend[CPUExecutionProvider]): +class MockKubernetesCPUJobBackend(MockKubernetesJobBackend[ContainerExecutionProvider]): pass -class MockKubernetesGPUJobBackend(MockKubernetesJobBackend[GPUExecutionProvider]): +class MockKubernetesGPUJobBackend(MockKubernetesJobBackend[ContainerExecutionProvider]): pass diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 03dbfbbbc4..30f43f0c87 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -12,8 +12,10 @@ from fastapi import FastAPI from httpx import ASGITransport, AsyncClient from nemo_platform import AsyncNeMoPlatform +from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec as FactoryContainerExecutionProviderSpec, +) from nemo_platform_plugin.jobs.api_factory import ContainerSpec as FactoryContainerSpec -from nemo_platform_plugin.jobs.api_factory import CPUExecutionProviderSpec as FactoryCPUExecutionProviderSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobEnvironmentVariableParam, job_route_factory from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec as FactoryPlatformJobSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobStep as FactoryPlatformJobStep @@ -35,7 +37,7 @@ PlatformJobStepWithContext, ) from nmp.core.jobs.app.dispatcher import JobDispatcher -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobStepSpec, @@ -212,6 +214,7 @@ def sample_job_dict(): { "name": "docker-step-cpu-1", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "ubuntu:latest", "command": ["c1", "c2"], "entrypoint": ["a1", "a2"]}, @@ -221,6 +224,7 @@ def sample_job_dict(): { "name": "docker-step-gpu", "executor": { + "kind": "container", "provider": "gpu", "profile": "default", "container": {"image": "ubuntu:latest", "command": ["c1", "c2"], "entrypoint": ["a1", "a2"]}, @@ -231,6 +235,7 @@ def sample_job_dict(): { "name": "docker-step-no-command-or-entrypoint", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "ubuntu:latest"}, @@ -341,7 +346,7 @@ def create_step_with_status(status: PlatformJobStatus) -> PlatformJobStepWithCon fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -385,16 +390,10 @@ def job_config_with_many_profiles() -> JobsServiceConfig: volume_name: test_jobs_storage jobs: - # Executor profiles configuration. The subprocess/default entry mirrors what - # ships in `packages/nmp_platform/config/local.yaml` and opts the documented - # `cpu/default` plugin steps into the cpu→subprocess translation in the Jobs - # API (see `translate_cpu_container_steps_to_subprocess`). Tests that submit - # jobs through the core /apis/jobs/v2/workspaces/{ws}/jobs endpoint with a - # `cpu/default` step will get rewritten to `subprocess/default` before - # validation, matching production deployment behavior. + # Executor profiles configuration. executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs @@ -464,16 +463,14 @@ def backend_registry(mock_nmp_client, job_config_with_many_profiles) -> BackendR nmp_sdk=mock_nmp_client, profiles=job_config_with_many_profiles.executors, # Mock the backends. Register the real SubprocessJobBackend to satisfy - # the subprocess/default executor that ships in - # `job_config_with_many_profiles` (added so test_client picks up the - # subprocess profile and the cpu→subprocess translation in the Jobs - # API fires consistently with production deployments). + # the cpu/subprocess executor that ships in + # `job_config_with_many_profiles`. backends={ BackendKey("cpu", "docker"): MockDockerCPUJobBackend, BackendKey("gpu", "docker"): MockDockerGPUJobBackend, BackendKey("cpu", "kubernetes_job"): MockKubernetesCPUJobBackend, BackendKey("gpu", "kubernetes_job"): MockKubernetesGPUJobBackend, - BackendKey("subprocess", "subprocess"): SubprocessJobBackend, + BackendKey("cpu", "subprocess"): SubprocessJobBackend, }, ) @@ -490,12 +487,15 @@ def hello_world_job_config( entity_client: EntityClient, job_name: str | None, sdk, + kind: str = "container", + profile: str | None = None, ) -> FactoryPlatformJobSpec: return FactoryPlatformJobSpec( steps=[ FactoryPlatformJobStep( name="hello-world-step-1", - executor=FactoryCPUExecutionProviderSpec( + executor=FactoryContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=FactoryContainerSpec( @@ -513,8 +513,7 @@ def hello_world_job_config( @pytest_asyncio.fixture async def test_client(mock_dispatcher, mock_store, job_config_with_many_profiles) -> AsyncGenerator[AsyncClient, None]: - # Mock the config.executors to have the test execution profiles, including - # subprocess/default for cpu/default to subprocess/default translation. + # Mock the config.executors to have the test execution profiles. from nmp.common.auth.middleware import AuthorizationMiddleware from nmp.common.service.dependencies import get_sdk_client diff --git a/services/core/jobs/tests/controllers/test_base.py b/services/core/jobs/tests/controllers/test_base.py index 6d85637939..c3ef2d37a2 100644 --- a/services/core/jobs/tests/controllers/test_base.py +++ b/services/core/jobs/tests/controllers/test_base.py @@ -9,7 +9,7 @@ from nmp.common.config import PlatformConfig from nmp.common.jobs.schemas import PlatformJobStatus from nmp.core.jobs.api.v2.jobs.schemas import PlatformJobStepWithContext -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import PlatformJobStepSpec, StepLifecycle from nmp.core.jobs.controllers.backends.base import get_logs_endpoint_from_fileset, resolve_task_image from nmp.core.jobs.controllers.backends.test import MockKubernetesCPUJobBackend @@ -164,7 +164,9 @@ def _make_step( if step_spec is ...: step_spec = PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider(provider="cpu", profile="default", container=ContainerSpec(image="img")), + executor=ContainerExecutionProvider( + provider="cpu", profile="default", container=ContainerSpec(image="img") + ), config={}, lifecycle=StepLifecycle(staleness_timeout_seconds=staleness_timeout), ) diff --git a/services/core/jobs/tests/controllers/test_docker_backend.py b/services/core/jobs/tests/controllers/test_docker_backend.py index 0a9e1e4b08..542b5d396a 100644 --- a/services/core/jobs/tests/controllers/test_docker_backend.py +++ b/services/core/jobs/tests/controllers/test_docker_backend.py @@ -38,9 +38,8 @@ from nmp.core.jobs.app.providers import ( ComputeResources, ComputeResourceSpec, + ContainerExecutionProvider, ContainerSpec, - CPUExecutionProvider, - GPUExecutionProvider, ) from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, @@ -128,7 +127,7 @@ def test_job_step(): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -161,7 +160,7 @@ def test_job_step_with_persistence(): workspace="default", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -467,7 +466,7 @@ def test_docker_job_sync_cancelling_sigkill(docker_job, docker_client_mock, test def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): """Test that scheduling works with providers that don't have resources attribute.""" # Create a provider without resources attribute - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -478,7 +477,7 @@ def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -516,7 +515,7 @@ def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): def test_docker_job_schedule_with_secrets(docker_job, docker_client_mock): """Test that scheduling works when secrets are provided.""" # Create a provider without resources attribute - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -527,7 +526,7 @@ def test_docker_job_schedule_with_secrets(docker_job, docker_client_mock): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -576,7 +575,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, Format must be ENV_VAR=workspace/secret_name per SECRETS.md; cross-workspace refs use the explicit workspace/secret_name from from_secret.name. """ - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) # Step in workspace "default"; one secret in same workspace, one in other workspace test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -587,7 +586,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -633,7 +632,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, def test_docker_job_profile_environment_applied(mock_nmp_client, docker_client_mock, mock_platform_config): """Profile environment (e.g. HOME=/tmp) is applied to scheduled job containers.""" - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) config = DockerJobExecutionProfileConfig( storage=DockerJobStorageConfig(volume_name="test_jobs_storage"), env={"HOME": "/tmp"}, @@ -651,7 +650,7 @@ def test_docker_job_profile_environment_applied(mock_nmp_client, docker_client_m name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -690,7 +689,7 @@ def test_schedule_docker_gpu(mock_nmp_client, docker_client_mock): """Test successful job scheduling.""" gpus = 2 - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -828,7 +827,7 @@ def test_schedule_docker_gpu(mock_nmp_client, docker_client_mock): def test_gpu_cleanup_on_job_completion(mock_nmp_client, docker_client_mock): """Test that GPU resources are released when a job completes successfully.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -928,7 +927,7 @@ def test_gpu_cleanup_on_job_completion(mock_nmp_client, docker_client_mock): def test_gpu_cleanup_on_job_error(mock_nmp_client, docker_client_mock): """Test that GPU resources are released when a job fails with an error.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -1836,7 +1835,7 @@ def test_job_step_with_auth_context(): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), diff --git a/services/core/jobs/tests/controllers/test_kubernetes_backend.py b/services/core/jobs/tests/controllers/test_kubernetes_backend.py index 2986c892bd..8641138626 100644 --- a/services/core/jobs/tests/controllers/test_kubernetes_backend.py +++ b/services/core/jobs/tests/controllers/test_kubernetes_backend.py @@ -28,7 +28,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider, GPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, @@ -126,7 +126,7 @@ def kubernetes_execution_profile_config(): @pytest.fixture def cpu_execution_provider(): """Create a test CPU execution provider.""" - return CPUExecutionProvider( + return ContainerExecutionProvider( container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -1184,7 +1184,7 @@ def test_name_for_job_truncation(kubernetes_job): name="test-step-", # Job name with trailing dash. step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="k8s_profile", container=ContainerSpec(image="test-image") ), config={"command": ["echo", "Hello"]}, @@ -1204,7 +1204,7 @@ def test_name_for_job_truncation(kubernetes_job): def test_schedule_kubernetes_gpu(mock_nmp_client, kubernetes_execution_profile_config): """Test successful job scheduling.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -1392,7 +1392,7 @@ def test_schedule_nemo_job_secrets_format_same_and_cross_workspace(kubernetes_jo fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -1439,7 +1439,7 @@ def test_schedule_without_storage_no_label(kubernetes_job, cpu_execution_provide fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -1705,7 +1705,7 @@ def test_step_pending_with_auth_context() -> PlatformJobStepWithContext: name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="nvidia/cuda:11.8-runtime-ubuntu20.04"), diff --git a/services/core/jobs/tests/controllers/test_subprocess_backend.py b/services/core/jobs/tests/controllers/test_subprocess_backend.py index 2f40410326..50446eb69d 100644 --- a/services/core/jobs/tests/controllers/test_subprocess_backend.py +++ b/services/core/jobs/tests/controllers/test_subprocess_backend.py @@ -26,16 +26,14 @@ def _subprocess_backend(mock_nmp_client, tmp_path, mock_platform_config) -> Subp def _step_with_command(step, command: list[str]): updated_step = step.model_copy(deep=True) - updated_step.step_spec.executor = SubprocessExecutionProvider( - provider="subprocess", profile="default", command=command - ) + updated_step.step_spec.executor = SubprocessExecutionProvider(provider="cpu", profile="subprocess", command=command) return updated_step def _step_with_unvalidated_command(step, command: list[str]): updated_step = step.model_copy(deep=True) updated_step.step_spec.executor = SubprocessExecutionProvider.model_construct( - provider="subprocess", profile="default", command=command + provider="cpu", profile="subprocess", command=command ) return updated_step @@ -219,8 +217,8 @@ def test_missing_command_fails_without_process(mock_nmp_client, tmp_path, mock_p def test_build_command_uses_current_interpreter_for_python_module_commands() -> None: executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python", "-m", "nemo_evaluator.tasks.evaluate"], ) @@ -233,8 +231,8 @@ def test_build_command_uses_current_interpreter_for_python_module_commands() -> def test_build_command_uses_current_interpreter_for_python3_commands() -> None: executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python3", "-m", "nemo_evaluator.tasks.evaluate"], ) @@ -251,8 +249,8 @@ def test_build_command_prefers_virtual_env_python(tmp_path) -> None: venv_python.write_text("#!/bin/sh\n", encoding="utf-8") venv_python.chmod(0o755) executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python", "-m", "nemo_evaluator.tasks.evaluate"], ) diff --git a/services/core/jobs/tests/controllers/test_volcano_backend.py b/services/core/jobs/tests/controllers/test_volcano_backend.py index fcaf352ec2..317fa08dc9 100644 --- a/services/core/jobs/tests/controllers/test_volcano_backend.py +++ b/services/core/jobs/tests/controllers/test_volcano_backend.py @@ -29,7 +29,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import ComputeResources, ContainerSpec, DistributedGPUExecutionProvider +from nmp.core.jobs.app.providers import ComputeResources, ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, @@ -115,7 +115,8 @@ def volcano_execution_profile_config(): @pytest.fixture def distributed_gpu_execution_provider(): """Create a test Distributed GPU execution provider.""" - return DistributedGPUExecutionProvider( + return ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -335,7 +336,8 @@ def test_schedule_job_single_node_success( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Tweak the distributed_gpu_execution_provider for this one - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -469,7 +471,7 @@ def test_volcano_job_nemo_job_secrets_format_same_and_cross_workspace( fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=DistributedGPUExecutionProvider( + executor=ContainerExecutionProvider( provider="gpu_distributed", profile="default", container=ContainerSpec(image="test-image"), @@ -551,7 +553,8 @@ def test_multi_node_networking_annotations_added( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create multi-node job (num_nodes > 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -591,7 +594,8 @@ def test_single_node_no_networking_annotations( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create single-node job (num_nodes = 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -658,7 +662,8 @@ def test_networking_annotations_disabled_via_config( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create multi-node job (num_nodes > 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -801,7 +806,7 @@ def test_name_for_job_truncation(volcano_job: VolcanoJobBackend): name="test-step-", # Job name with trailing dash. step_spec=PlatformJobStepSpec( name="test-step", - executor=DistributedGPUExecutionProvider( + executor=ContainerExecutionProvider( provider="gpu_distributed", profile="volcano_profile", container=ContainerSpec(image="test-image") ), config={"command": ["echo", "Hello"]}, diff --git a/services/core/jobs/tests/integration/test_jobs_auth_propagation.py b/services/core/jobs/tests/integration/test_jobs_auth_propagation.py index 1efbc6cab7..d501d8e042 100644 --- a/services/core/jobs/tests/integration/test_jobs_auth_propagation.py +++ b/services/core/jobs/tests/integration/test_jobs_auth_propagation.py @@ -61,6 +61,7 @@ def test_auth_context_stripped_for_regular_user(self, sdk: NeMoPlatform): { "name": "test-step", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { @@ -97,6 +98,7 @@ def test_auth_context_visible_to_service_principal(self, sdk: NeMoPlatform): { "name": "test-step", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { diff --git a/services/core/jobs/tests/integration/test_jobs_secrets_access.py b/services/core/jobs/tests/integration/test_jobs_secrets_access.py index 5ed5e411f7..cdb4412a83 100644 --- a/services/core/jobs/tests/integration/test_jobs_secrets_access.py +++ b/services/core/jobs/tests/integration/test_jobs_secrets_access.py @@ -49,6 +49,7 @@ def _platform_spec_with_secret(secret_ref: str, env_var_name: str = "MY_SECRET") { "name": "step-with-secret", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { diff --git a/services/core/jobs/tests/test_config.py b/services/core/jobs/tests/test_config.py index f26eb23a48..b808ac5595 100644 --- a/services/core/jobs/tests/test_config.py +++ b/services/core/jobs/tests/test_config.py @@ -8,9 +8,8 @@ from nmp.common.config import Configuration, Runtime from nmp.core.jobs.app.providers import ( ComputeResources, + ContainerExecutionProvider, ContainerSpec, - CPUExecutionProvider, - GPUExecutionProvider, SubprocessExecutionProvider, ) from nmp.core.jobs.app.schemas import PlatformJobEnvironmentVariable @@ -67,7 +66,7 @@ def test_job_instantiation_and_validation(sample_job_dict): assert cpu_step.executor.profile == "default" assert cpu_step.executor.container.image == "ubuntu:latest" assert cpu_step.environment == [PlatformJobEnvironmentVariable(name="TEST_ENV", value="test_value")] - assert isinstance(cpu_step.executor, CPUExecutionProvider) + assert isinstance(cpu_step.executor, ContainerExecutionProvider) # Validate second step (GPU) gpu_step = job.platform_spec.steps[1] @@ -77,7 +76,7 @@ def test_job_instantiation_and_validation(sample_job_dict): assert gpu_step.executor.container.image == "ubuntu:latest" assert gpu_step.environment == [PlatformJobEnvironmentVariable(name="TEST_ENV", value="test_value")] assert gpu_step.executor.resources.num_gpus == 2 - assert isinstance(gpu_step.executor, GPUExecutionProvider) + assert isinstance(gpu_step.executor, ContainerExecutionProvider) def test_step_container_command_configuration(sample_job_dict): @@ -309,13 +308,13 @@ def test_default_profiles_include_subprocess_for_docker_runtime(): assert ("cpu", "default", "docker") in [(p.provider, p.profile, p.backend) for p in profiles] assert ("gpu", "default", "docker") in [(p.provider, p.profile, p.backend) for p in profiles] - assert ("subprocess", "default", "subprocess") in [(p.provider, p.profile, p.backend) for p in profiles] + assert ("cpu", "subprocess", "subprocess") in [(p.provider, p.profile, p.backend) for p in profiles] def test_default_profiles_include_subprocess_for_none_runtime(): profiles = get_default_executor_profiles_for_runtime(Runtime.NONE, DefaultExecutionProfileConfig()) - assert [(p.provider, p.profile, p.backend) for p in profiles] == [("subprocess", "default", "subprocess")] + assert [(p.provider, p.profile, p.backend) for p in profiles] == [("cpu", "subprocess", "subprocess")] def test_backend_registry_resolves_subprocess_default(mock_nmp_client): @@ -330,23 +329,23 @@ def __init__(self, nmp_sdk, execution_profile_config, profile_name): registry = BackendRegistry.from_config( nmp_sdk=mock_nmp_client, profiles=profiles, - backends={BackendKey("subprocess", "subprocess"): DummyBackend}, + backends={BackendKey("cpu", "subprocess"): DummyBackend}, ) - assert registry.get_backend(provider="subprocess", profile="default") is not None + assert registry.get_backend(provider="cpu", profile="subprocess") is not None -def test_subprocess_execution_profile_defaults_provider_to_subprocess(): - profile = SubprocessJobExecutionProfile(profile="default") +def test_subprocess_execution_profile_defaults_provider_to_cpu(): + profile = SubprocessJobExecutionProfile(profile="subprocess") - assert profile.provider == "subprocess" + assert profile.provider == "cpu" assert profile.backend == "subprocess" def test_default_profiles_exclude_subprocess_for_kubernetes_runtime(): profiles = get_default_executor_profiles_for_runtime(Runtime.KUBERNETES, DefaultExecutionProfileConfig()) - assert ("subprocess", "default", "subprocess") not in [(p.provider, p.profile, p.backend) for p in profiles] + assert ("cpu", "subprocess", "subprocess") not in [(p.provider, p.profile, p.backend) for p in profiles] def test_merged_profiles(): @@ -422,7 +421,7 @@ def test_merged_profiles(): assert type(gpu_distributed.config) is VolcanoJobExecutionProfileConfig assert gpu_distributed.config.storage.pvc_name == "default-pvc" - subprocess_default = next((p for p in merged if p.provider == "subprocess" and p.profile == "default"), None) + subprocess_default = next((p for p in merged if p.provider == "cpu" and p.profile == "subprocess"), None) assert subprocess_default is not None assert type(subprocess_default.config) is SubprocessJobExecutionProfileConfig diff --git a/services/core/jobs/tests/test_job_search.py b/services/core/jobs/tests/test_job_search.py index 370a8fdad4..9cd0741575 100644 --- a/services/core/jobs/tests/test_job_search.py +++ b/services/core/jobs/tests/test_job_search.py @@ -20,7 +20,15 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -31,7 +39,15 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -53,7 +69,15 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -65,7 +89,15 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -85,7 +117,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -96,7 +136,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -107,7 +155,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -133,7 +189,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -145,7 +209,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -157,7 +229,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -181,7 +261,15 @@ async def test_search_jobs_case_insensitive(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -204,7 +292,15 @@ async def test_search_jobs_partial_match(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -224,7 +320,15 @@ async def test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -235,7 +339,15 @@ async def test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -259,7 +371,15 @@ async def test_search_no_results(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -278,7 +398,15 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -289,7 +417,15 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -330,7 +466,12 @@ async def test_search_pagination(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, } ] }, @@ -359,7 +500,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -370,7 +519,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -388,7 +545,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -410,7 +575,15 @@ async def test_search_long_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -434,7 +607,12 @@ async def test_search_result_limit(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, } ] }, @@ -473,7 +651,15 @@ async def test_search_special_characters(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) diff --git a/services/core/jobs/tests/test_jobs_api.py b/services/core/jobs/tests/test_jobs_api.py index 218c6f6c05..27b6d7dbd7 100644 --- a/services/core/jobs/tests/test_jobs_api.py +++ b/services/core/jobs/tests/test_jobs_api.py @@ -25,7 +25,7 @@ PlatformJobStepsListFilter, ) from nmp.core.jobs.app.dispatcher import JobDispatcher -from nmp.core.jobs.app.providers import ContainerSpec, GPUExecutionProvider, SubprocessExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobSpec, PlatformJobStepSpec, @@ -48,23 +48,13 @@ def to_sdk_create_params(request: CreatePlatformJobRequest) -> Dict[str, Any]: return data -def expected_translated_executor_dump() -> Dict[str, Any]: +def expected_persisted_executor_dump() -> Dict[str, Any]: """Return the expected persisted executor for ``TestConstants.TEST_EXECUTOR``. - The Jobs API rewrites ``cpu/`` steps into ``subprocess/`` - steps before persistence (see - ``translate_cpu_container_steps_to_subprocess`` in - ``services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py``), so the - round-trip representation of a step submitted with ``TestConstants.TEST_EXECUTOR`` - is the translated subprocess executor — with ``command`` set to - ``container.entrypoint + container.command``. + The executor is stored as-is — a ``ContainerExecutionProvider`` with + ``provider="cpu"`` and the container spec from the test constant. """ - container = TestConstants.TEST_EXECUTOR.container - return SubprocessExecutionProvider( - provider="subprocess", - profile=TestConstants.TEST_EXECUTOR.profile, - command=[*container.entrypoint, *container.command], - ).model_dump() + return TestConstants.TEST_EXECUTOR.model_dump() @pytest.mark.asyncio @@ -79,14 +69,9 @@ async def test_create_job_using_sdk(test_sdk: AsyncNeMoPlatform): { "name": "basic", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", - # entrypoint+command are required so the cpu→subprocess - # translation hop in the Jobs API (see - # `translate_cpu_container_steps_to_subprocess`) can - # produce a non-empty subprocess command. Real plugin - # compilers always set both; mirroring that here keeps - # the SDK round-trip path realistic. "container": { "image": "test-image", "entrypoint": ["python", "-m"], @@ -180,6 +165,7 @@ async def test_create_job_with_secrets(test_sdk: AsyncNeMoPlatform): { "name": "basic", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test-image"}, @@ -238,7 +224,7 @@ async def test_create_job_gpu_fail_fast_when_docker_no_gpus(test_client: AsyncCl """Direct Jobs API create with GPU step fails fast with 422 when platform is Docker with no GPUs.""" from nmp.common.config import Runtime - gpu_executor = GPUExecutionProvider( + gpu_executor = ContainerExecutionProvider( provider="gpu", profile="default", container=ContainerSpec(image="gpu-image"), @@ -398,7 +384,7 @@ async def test_job_lifecycle_single_step(test_client: AsyncClient): # Assert that the platform_spec is created correctly assert len(get_data["platform_spec"]["steps"]) == 1 assert get_data["platform_spec"]["steps"][0]["name"] == "step1" - assert get_data["platform_spec"]["steps"][0]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][0]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][0]["config"] == {} # list all steps (scoped to this job name — list_steps injects filter.job = name) @@ -557,10 +543,10 @@ async def test_job_lifecycle_multi_step(test_client: AsyncClient): # Assert that the platform_spec is created correctly assert len(get_data["platform_spec"]["steps"]) == 2 assert get_data["platform_spec"]["steps"][0]["name"] == "step1" - assert get_data["platform_spec"]["steps"][0]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][0]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][0]["config"] == {} assert get_data["platform_spec"]["steps"][1]["name"] == "step2" - assert get_data["platform_spec"]["steps"][1]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][1]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][1]["config"] == {} # Assert from the api that the first step is created correctly diff --git a/services/core/jobs/tests/test_jobs_endpoint_translation.py b/services/core/jobs/tests/test_jobs_endpoint_translation.py index ed424ef5a7..4687a25d10 100644 --- a/services/core/jobs/tests/test_jobs_endpoint_translation.py +++ b/services/core/jobs/tests/test_jobs_endpoint_translation.py @@ -3,48 +3,18 @@ import pytest from fastapi import HTTPException -from nmp.core.jobs.api.v2.jobs.endpoints import translate_cpu_container_steps_to_subprocess, validate_job_spec -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider, SubprocessExecutionProvider +from nmp.core.jobs.api.v2.jobs.endpoints import validate_job_spec +from nmp.core.jobs.app.providers import SubprocessExecutionProvider from nmp.core.jobs.app.schemas import PlatformJobSpec, PlatformJobStepSpec from nmp.core.jobs.controllers.backends.docker import DockerJobExecutionProfile, DockerJobExecutionProfileConfig -def _cpu_step(name: str, profile: str = "default") -> PlatformJobStepSpec: - return PlatformJobStepSpec( - name=name, - executor=CPUExecutionProvider( - provider="cpu", - profile=profile, - container=ContainerSpec(image="image", entrypoint=["python", "-m"], command=["task"]), - ), - ) - - -def test_translate_cpu_container_steps_to_subprocess_uses_explicit_compat_profiles() -> None: - spec = PlatformJobSpec(steps=[_cpu_step("local-step"), _cpu_step("docker-step", profile="docker")]) - - translated = translate_cpu_container_steps_to_subprocess(spec, {"default"}) - - assert isinstance(translated.steps[0].executor, SubprocessExecutionProvider) - assert translated.steps[0].executor.command == ["python", "-m", "task"] - assert isinstance(translated.steps[1].executor, CPUExecutionProvider) - assert isinstance(spec.steps[0].executor, CPUExecutionProvider) - - -def test_translate_cpu_container_steps_to_subprocess_does_not_use_implicit_defaults() -> None: - spec = PlatformJobSpec(steps=[_cpu_step("docker-step")]) - - translated = translate_cpu_container_steps_to_subprocess(spec, set()) - - assert isinstance(translated.steps[0].executor, CPUExecutionProvider) - - def test_validate_job_spec_matches_provider_and_profile() -> None: spec = PlatformJobSpec( steps=[ PlatformJobStepSpec( name="local-step", - executor=SubprocessExecutionProvider(provider="subprocess", profile="default", command=["true"]), + executor=SubprocessExecutionProvider(provider="cpu", profile="subprocess", command=["true"]), ) ] ) @@ -54,5 +24,5 @@ def test_validate_job_spec_matches_provider_and_profile() -> None: ) ] - with pytest.raises(HTTPException, match="subprocess/default"): + with pytest.raises(HTTPException, match="cpu/subprocess"): validate_job_spec(spec, profiles) diff --git a/services/core/models/src/nmp/core/models/api/v2/models.py b/services/core/models/src/nmp/core/models/api/v2/models.py index aa84d33ffd..b023c7681b 100644 --- a/services/core/models/src/nmp/core/models/api/v2/models.py +++ b/services/core/models/src/nmp/core/models/api/v2/models.py @@ -6,8 +6,8 @@ from fastapi import APIRouter, Depends, HTTPException, Query, status from nemo_platform import APIError, AsyncNeMoPlatform from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -267,7 +267,8 @@ async def start_update_model_spec_job(model_entity: ModelEntity): # Step 1: Download model and dataset files from Files service PlatformJobStep( name="model-spec-analysis", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_qualified_image("nmp-automodel-tasks"), diff --git a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py index 711a80e9df..a118b6b868 100644 --- a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py +++ b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py @@ -7,8 +7,8 @@ from nemo_platform import AsyncNeMoPlatform from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, job_route_factory, @@ -24,6 +24,8 @@ def compile_hello_world_job( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, + kind: str = "container", + profile: str | None = None, ) -> PlatformJobSpec: """Compile a hello world job config into a platform job spec. @@ -41,7 +43,8 @@ def compile_hello_world_job( steps=[ PlatformJobStep( name="hello-world", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec( diff --git a/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py b/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py index 5ce588af7b..e492d9cb94 100644 --- a/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py +++ b/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py @@ -19,8 +19,8 @@ from nemo_platform import AsyncNeMoPlatform from nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -240,7 +240,8 @@ async def platform_job_config_compiler( steps: list[PlatformJobStep] = [ PlatformJobStep( name="model-and-dataset-download", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -260,7 +261,8 @@ async def platform_job_config_compiler( ), PlatformJobStep( name="model-upload", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -274,7 +276,8 @@ async def platform_job_config_compiler( ), PlatformJobStep( name="model-entity-creation", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), diff --git a/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py b/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py index 13e120c00e..a64e40b532 100644 --- a/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py +++ b/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py @@ -13,9 +13,9 @@ import logging from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, EnvironmentVariable, - GPUExecutionProviderSpec, PlatformJobStep, ResourcesSpec, ) @@ -66,7 +66,8 @@ def compile_training_step( output_path=DEFAULT_OUTPUT_MODEL_PATH, ) - executor: GPUExecutionProviderSpec = { + executor: ContainerExecutionProviderSpec = { + "kind": "container", "provider": "gpu", "container": ContainerSpec( image=get_training_image(), diff --git a/tools/lint/lint-python-types.sh b/tools/lint/lint-python-types.sh index ae6077aef3..d132eb5b96 100755 --- a/tools/lint/lint-python-types.sh +++ b/tools/lint/lint-python-types.sh @@ -9,7 +9,8 @@ set -euo pipefail # Counts reflect the violation count at the time of suppression. ci_ignored_rules=( invalid-argument-type # 148 - unused-ignore-comment # 14 + unused-ignore-comment # 14 + unused-type-ignore-comment # triggered by SDK type changes unresolved-attribute # 141 not-subscriptable # 19 invalid-assignment # 9