Resolve Merge Conflict & Rename replica_groups to replicas

Bihan  Rana · Bihan  Rana · commit 5abbcad6ae91 · 2025-12-23T15:12:39.000+05:45
diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py
@@ -383,10 +383,10 @@ def get_runs_table(
 
         # Replica Group Changes: Build mapping from replica group names to indices
         group_name_to_index: Dict[str, int] = {}
-        # Replica Group Changes: Check if replica_groups attribute exists (only available for ServiceConfiguration)
-        replica_groups = getattr(run.run_spec.configuration, "replica_groups", None)
-        if replica_groups:
-            for idx, group in enumerate(replica_groups):
+        # Replica Group Changes: Check if replicas attribute exists (only available for ServiceConfiguration)
+        replicas = getattr(run.run_spec.configuration, "replicas", None)
+        if replicas:
+            for idx, group in enumerate(replicas):
                 group_name_to_index[group.name] = idx
 
         run_row: Dict[Union[str, int], Any] = {
diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py
@@ -612,8 +612,8 @@ class ConfigurationWithCommandsParams(CoreModel):
 
     @root_validator
     def check_image_or_commands_present(cls, values):
-        # If replica_groups is present, skip validation - commands come from replica groups
-        replica_groups = values.get("replica_groups")
+        # If `replicas` is truthy, skip validation - commands come from the replica groups
+        replica_groups = values.get("replicas")
         if replica_groups:
             return values
 
@@ -838,25 +838,25 @@ class ServiceConfigurationParams(CoreModel):
         SERVICE_HTTPS_DEFAULT
     )
     auth: Annotated[bool, Field(description="Enable the authorization")] = True
-    replicas: Annotated[
-        Range[int],
-        Field(
-            description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). "
-            "If it's a range, the `scaling` property is required"
-        ),
-    ] = Range[int](min=1, max=1)
-    scaling: Annotated[
-        Optional[ScalingSpec],
-        Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
-    ] = None
+    # replicas: Annotated[
+    #     Range[int],
+    #     Field(
+    #         description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). "
+    #         "If it's a range, the `scaling` property is required"
+    #     ),
+    # ] = Range[int](min=1, max=1)
+    # scaling: Annotated[
+    #     Optional[ScalingSpec],
+    #     Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
+    # ] = None
     rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
     probes: Annotated[
         list[ProbeConfig],
         Field(description="List of probes used to determine job health"),
     ] = []
 
-    replica_groups: Annotated[
-        Optional[List[ReplicaGroup]],
+    replicas: Annotated[
+        Optional[Union[Range[int], List[ReplicaGroup], int, str]],
         Field(
             description=(
                 "List of replica groups. Each group defines replicas with shared configuration "
@@ -882,15 +882,15 @@ def convert_model(cls, v: Optional[Union[AnyModel, str]]) -> Optional[AnyModel]:
             return OpenAIChatModel(type="chat", name=v, format="openai")
         return v
 
-    @validator("replicas")
-    def convert_replicas(cls, v: Range[int]) -> Range[int]:
-        if v.max is None:
-            raise ValueError("The maximum number of replicas is required")
-        if v.min is None:
-            v.min = 0
-        if v.min < 0:
-            raise ValueError("The minimum number of replicas must be greater than or equal to 0")
-        return v
+    # @validator("replicas")
+    # def convert_replicas(cls, v: Range[int]) -> Range[int]:
+    #     if v.max is None:
+    #         raise ValueError("The maximum number of replicas is required")
+    #     if v.min is None:
+    #         v.min = 0
+    #     if v.min < 0:
+    #         raise ValueError("The minimum number of replicas must be greater than or equal to 0")
+    #     return v
 
     @validator("gateway")
     def validate_gateway(
@@ -902,53 +902,43 @@ def validate_gateway(
             )
         return v
 
-    @root_validator()
-    def validate_scaling(cls, values):
-        replica_groups = values.get("replica_groups")
-        # If replica_groups are set, we don't need to validate scaling.
-        # Each replica group has its own scaling.
-        if replica_groups:
-            return values
-
-        scaling = values.get("scaling")
-        replicas = values.get("replicas")
-        if replicas and replicas.min != replicas.max and not scaling:
-            raise ValueError("When you set `replicas` to a range, ensure to specify `scaling`.")
-        if replicas and replicas.min == replicas.max and scaling:
-            raise ValueError("To use `scaling`, `replicas` must be set to a range.")
-        return values
+    # @root_validator()
+    # def validate_scaling(cls, values):
+    #     replica_groups = values.get("replica_groups")
+    #     # If replica_groups are set, we don't need to validate scaling.
+    #     # Each replica group has its own scaling.
+    #     if replica_groups:
+    #         return values
+
+    #     scaling = values.get("scaling")
+    #     replicas = values.get("replicas")
+    #     if replicas and replicas.min != replicas.max and not scaling:
+    #         raise ValueError("When you set `replicas` to a range, ensure to specify `scaling`.")
+    #     if replicas and replicas.min == replicas.max and scaling:
+    #         raise ValueError("To use `scaling`, `replicas` must be set to a range.")
+    #     return values
 
     @root_validator()
-    def normalize_to_replica_groups(cls, values):
-        replica_groups = values.get("replica_groups")
-        if replica_groups:
-            return values
-
-        # TEMP: prove we’re here and see the inputs
-        print(
-            "[normalize_to_replica_groups]",
-            "commands:",
-            values.get("commands"),
-            "replicas:",
-            values.get("replicas"),
-            "resources:",
-            values.get("resources"),
-            "scaling:",
-            values.get("scaling"),
-            "probes:",
-            values.get("probes"),
-            "rate_limits:",
-            values.get("rate_limits"),
-        )
-        # If replica_groups is not set, we need to normalize the configuration to replica groups.
-        values["replica_groups"] = [
+    def normalize_replicas(cls, values):
+        replicas = values.get("replicas")
+        if isinstance(replicas, list) and len(replicas) > 0:
+            if all(isinstance(item, ReplicaGroup) for item in replicas):
+                return values
+
+        # Handle backward compatibility: convert old-style replica config to groups
+        old_replicas = values.get("replicas")
+        if isinstance(old_replicas, Range):
+            replica_count = old_replicas
+        else:
+            replica_count = Range[int](min=1, max=1)
+        values["replicas"] = [
             ReplicaGroup(
                 name="default",
-                replicas=values.get("replicas"),
-                commands=values.get("commands"),
+                replicas=replica_count,
+                commands=values.get("commands", []),
                 resources=values.get("resources"),
                 scaling=values.get("scaling"),
-                probes=values.get("probes"),
+                probes=values.get("probes", []),
                 rate_limits=values.get("rate_limits"),
             )
         ]
@@ -975,22 +965,24 @@ def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
             raise ValueError("Probes must be unique")
         return v
 
-    @validator("replica_groups")
-    def validate_replica_groups(
-        cls, v: Optional[List[ReplicaGroup]]
-    ) -> Optional[List[ReplicaGroup]]:
+    @validator("replicas")
+    def validate_replicas(cls, v: Optional[List[ReplicaGroup]]) -> Optional[List[ReplicaGroup]]:
         if v is None:
             return v
-        if not v:
-            raise ValueError("`replica_groups` cannot be an empty list")
-        # Check for duplicate names
-        names = [group.name for group in v]
-        if len(names) != len(set(names)):
-            duplicates = [name for name in set(names) if names.count(name) > 1]
-            raise ValueError(
-                f"Duplicate replica group names found: {duplicates}. "
-                "Each replica group must have a unique name."
-            )
+        if isinstance(v, (Range, int, str)):
+            return v
+
+        if isinstance(v, list):
+            if not v:
+                raise ValueError("`replicas` cannot be an empty list")
+            # Check for duplicate names
+            names = [group.name for group in v]
+            if len(names) != len(set(names)):
+                duplicates = [name for name in set(names) if names.count(name) > 1]
+                raise ValueError(
+                    f"Duplicate replica group names found: {duplicates}. "
+                    "Each replica group must have a unique name."
+                )
         return v
 
 
diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py
@@ -198,7 +198,9 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
 
     # run_model.desired_replica_count = 1
     if run.run_spec.configuration.type == "service":
-        run_model.desired_replica_count = run.run_spec.configuration.replicas.min or 0
+        run_model.desired_replica_count = sum(
+            group.replicas.min or 0 for group in run.run_spec.configuration.replicas
+        )
         await update_service_desired_replica_count(
             session,
             run_model,
@@ -211,15 +213,14 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
             # stay zero scaled
             return
 
-
         # Per group scaling because single replica is also normalized to replica groups.
-        replica_groups = run.run_spec.configuration.replica_groups or []
+        replicas = run.run_spec.configuration.replicas or []
         counts = (
             json.loads(run_model.desired_replica_counts)
             if run_model.desired_replica_counts
             else {}
         )
-        await scale_run_replicas_per_group(session, run_model, replica_groups, counts)
+        await scale_run_replicas_per_group(session, run_model, replicas, counts)
     else:
         run_model.desired_replica_count = 1
         await scale_run_replicas(session, run_model, replicas_diff=run_model.desired_replica_count)
@@ -460,24 +461,24 @@ async def _handle_run_replicas(
             # FIXME: should only include scaling events, not retries and deployments
             last_scaled_at=max((r.timestamp for r in replicas_info), default=None),
         )
-        replica_groups = run_spec.configuration.replica_groups or []
-        if replica_groups:
+        replicas = run_spec.configuration.replicas or []
+        if replicas:
             counts = (
                 json.loads(run_model.desired_replica_counts)
                 if run_model.desired_replica_counts
                 else {}
             )
-            await scale_run_replicas_per_group(session, run_model, replica_groups, counts)
+            await scale_run_replicas_per_group(session, run_model, replicas, counts)
 
             # Handle per-group rolling deployment
             await _update_jobs_to_new_deployment_in_place(
                 session=session,
                 run_model=run_model,
                 run_spec=run_spec,
-                replica_groups=replica_groups,
+                replicas=replicas,
             )
             # Process per-group rolling deployment
-            for group in replica_groups:
+            for group in replicas:
                 await _handle_rolling_deployment_for_group(
                     session=session,
                     run_model=run_model,
@@ -554,7 +555,7 @@ async def _update_jobs_to_new_deployment_in_place(
     session: AsyncSession,
     run_model: RunModel,
     run_spec: RunSpec,
-    replica_groups: Optional[List] = None,
+    replicas: Optional[List] = None,
 ) -> None:
     """
     Bump deployment_num for jobs that do not require redeployment.
@@ -575,11 +576,11 @@ async def _update_jobs_to_new_deployment_in_place(
         replica_group_name = None
         group_run_spec = base_run_spec
 
-        if replica_groups:
+        if replicas:
             job_spec = JobSpec.__response__.parse_raw(job_models[0].job_spec_data)
             replica_group_name = job_spec.replica_group or "default"
 
-            for group in replica_groups:
+            for group in replicas:
                 if group.name == replica_group_name:
                     group_run_spec = create_group_run_spec(base_run_spec, group)
                     break
diff --git a/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py b/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py
@@ -11,7 +11,7 @@
 
 # revision identifiers, used by Alembic.
 revision = "706e0acc3a7d"
-down_revision = "22d74df9897e"
+down_revision = "903c91e24634"
 branch_labels = None
 depends_on = None
 
diff --git a/src/dstack/_internal/server/services/runs/__init__.py b/src/dstack/_internal/server/services/runs/__init__.py
@@ -520,7 +520,7 @@ async def submit_run(
 
             global_replica_num = 0  # Global counter across all groups for unique replica_num
 
-            for replica_group in service_config.replica_groups:
+            for replica_group in service_config.replicas:
                 if run_spec.merged_profile.schedule is not None:
                     group_initial_replicas = 0
                 else:
diff --git a/src/dstack/_internal/server/services/runs/replicas.py b/src/dstack/_internal/server/services/runs/replicas.py
@@ -36,8 +36,8 @@ async def retry_run_replica_jobs(
     replica_group = None
 
     # Find matching replica group
-    if replica_group_name and base_run_spec.configuration.replica_groups:
-        for group in base_run_spec.configuration.replica_groups:
+    if replica_group_name and base_run_spec.configuration.replicas:
+        for group in base_run_spec.configuration.replicas:
             if group.name == replica_group_name:
                 replica_group = group
                 break
@@ -245,14 +245,14 @@ async def _scale_up_replicas(
 async def scale_run_replicas_per_group(
     session: AsyncSession,
     run_model: RunModel,
-    replica_groups: List[ReplicaGroup],
+    replicas: List[ReplicaGroup],
     desired_replica_counts: Dict[str, int],
 ) -> None:
     """Scale each replica group independently"""
-    if not replica_groups:
+    if not replicas:
         return
 
-    for group in replica_groups:
+    for group in replicas:
         group_desired = desired_replica_counts.get(group.name, group.replicas.min or 0)
 
         # Build replica lists filtered by this group
diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py
@@ -50,7 +50,6 @@
         "env",
         "shell",
         "commands",
-        "replica_groups",
     ],
 }
 
@@ -89,7 +88,10 @@ def validate_run_spec_and_set_defaults(
             f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
     if isinstance(run_spec.configuration, ServiceConfiguration):
-        if run_spec.merged_profile.schedule and run_spec.configuration.replicas.min == 0:
+        # Reject scheduled services if any replica group can scale to zero (min == 0)
+        if run_spec.merged_profile.schedule and any(
+            group.replicas.min == 0 for group in run_spec.configuration.replicas
+        ):
             raise ServerClientError(
                 "Scheduled services with autoscaling to zero are not supported"
             )
@@ -150,11 +152,10 @@ def get_nodes_required_num(run_spec: RunSpec) -> int:
     nodes_required_num = 1
     if run_spec.configuration.type == "task":
         nodes_required_num = run_spec.configuration.nodes
-    elif (
-        run_spec.configuration.type == "service"
-        and run_spec.configuration.replicas.min is not None
-    ):
-        nodes_required_num = run_spec.configuration.replicas.min
+    elif run_spec.configuration.type == "service":
+        nodes_required_num = sum(
+            group.replicas.min or 0 for group in run_spec.configuration.replicas
+        )
     return nodes_required_num
 
 
diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py