
Commit ee7e02d

Add dpo to cli, fix typing mismatch with API

1 parent: 3f1ec6a

4 files changed: 59 additions & 20 deletions


src/together/cli/api/finetune.py

Lines changed: 16 additions & 0 deletions
@@ -104,6 +104,18 @@ def fine_tuning(ctx: click.Context) -> None:
     default="all-linear",
     help="Trainable modules for LoRA adapters. For example, 'all-linear', 'q_proj,v_proj'",
 )
+@click.option(
+    "--training-method",
+    type=click.Choice(["sft", "dpo"]),
+    default="sft",
+    help="Training method to use. Options: sft (supervised fine-tuning), dpo (direct preference optimization)",
+)
+@click.option(
+    "--dpo-beta",
+    type=float,
+    default=0.1,
+    help="Beta parameter for DPO training (only used when training-method is 'dpo')",
+)
 @click.option(
     "--suffix", type=str, default=None, help="Suffix for the fine-tuned model name"
 )
@@ -152,6 +164,8 @@ def create(
     wandb_name: str,
     confirm: bool,
     train_on_inputs: bool | Literal["auto"],
+    training_method: str,
+    dpo_beta: float,
 ) -> None:
     """Start fine-tuning"""
     client: Together = ctx.obj
@@ -180,6 +194,8 @@ def create(
         wandb_project_name=wandb_project_name,
         wandb_name=wandb_name,
         train_on_inputs=train_on_inputs,
+        training_method=training_method,
+        dpo_beta=dpo_beta,
     )
 
     model_limits: FinetuneTrainingLimits = client.fine_tuning.get_model_limits(
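
With the two new options wired into the create command, a DPO job could be launched from the CLI roughly as follows. This is only an illustrative invocation: the model name and file ID are placeholders, and a preference-format training file is assumed to have been uploaded already.

together fine-tuning create \
    --training-file file-abc123 \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct-Reference \
    --training-method dpo \
    --dpo-beta 0.1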

src/together/resources/finetune.py

Lines changed: 16 additions & 12 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Literal
+from typing import Literal, Union
 
 from rich import print as rprint
 
@@ -22,7 +22,8 @@
     TrainingType,
     FinetuneLRScheduler,
     FinetuneLinearLRSchedulerArgs,
-    DPOTrainingMethodType,
+    TrainingMethodDPO,
+    TrainingMethodSFT,
 )
 from together.types.finetune import DownloadCheckpointType
 from together.utils import log_warn_once, normalize_key
@@ -108,10 +109,13 @@ def createFinetuneRequest(
         lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
     )
 
+    training_method_cls: Union[TrainingMethodSFT, TrainingMethodDPO] = (
+        TrainingMethodSFT()
+    )
     if training_method == "dpo":
-        training_method_args = DPOTrainingMethodType(dpo_beta=dpo_beta)
-    else:
-        training_method_args = None
+        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+
+    print("\n TRAINING METHOD at CREATE FINE TUNE REQUEST", training_method)
 
     finetune_request = FinetuneRequest(
         model=model,
@@ -133,8 +137,7 @@ def createFinetuneRequest(
         wandb_project_name=wandb_project_name,
         wandb_name=wandb_name,
         train_on_inputs=train_on_inputs,
-        training_method=training_method,
-        training_method_args=training_method_args,
+        training_method=training_method_cls,
     )
 
     return finetune_request
@@ -173,7 +176,7 @@ def create(
         model_limits: FinetuneTrainingLimits | None = None,
         train_on_inputs: bool | Literal["auto"] = "auto",
         training_method: str = "sft",
-        dpo_beta: float = 0.1,
+        dpo_beta: float | None = None,
     ) -> FinetuneResponse:
         """
         Method to initiate a fine-tuning job
@@ -221,7 +224,7 @@ def create(
                 Defaults to "auto".
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
-            dpo_beta (float, optional): DPO beta parameter. Defaults to 0.1.
+            dpo_beta (float, optional): DPO beta parameter. Defaults to None.
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
@@ -233,7 +236,7 @@ def create(
 
         if model_limits is None:
            model_limits = self.get_model_limits(model=model)
-
+        print("\n DPO BETA at CREATE FINE TUNE REQUEST", dpo_beta)
         finetune_request = createFinetuneRequest(
             model_limits=model_limits,
             training_file=training_file,
@@ -268,6 +271,7 @@ def create(
             "Submitting a fine-tuning job with the following parameters:",
             finetune_request,
         )
+        print("\n FINETUNE REQUEST before dump", finetune_request)
         parameter_payload = finetune_request.model_dump(exclude_none=True)
 
         # Print the request payload before sending
@@ -525,7 +529,7 @@ async def create(
         model_limits: FinetuneTrainingLimits | None = None,
         train_on_inputs: bool | Literal["auto"] = "auto",
         training_method: str = "sft",
-        dpo_beta: float = 0.1,
+        dpo_beta: float | None = None,
     ) -> FinetuneResponse:
         """
         Async method to initiate a fine-tuning job
@@ -573,7 +577,7 @@ async def create(
                 Defaults to "auto".
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
-            dpo_beta (float, optional): DPO beta parameter. Defaults to 0.1.
+            dpo_beta (float, optional): DPO beta parameter. Defaults to None.
 
         Returns:
             FinetuneResponse: Object containing information about fine-tuning job.
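
The same parameters are accepted by the Python client's create() shown above. A minimal sketch, assuming TOGETHER_API_KEY is set in the environment and using placeholder model and file identifiers:

from together import Together

client = Together()

# training_method and dpo_beta are forwarded into createFinetuneRequest,
# which wraps them in a TrainingMethodDPO payload for the API.
job = client.fine_tuning.create(
    training_file="file-abc123",  # placeholder file ID
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
    training_method="dpo",
    dpo_beta=0.1,
)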

src/together/types/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -31,7 +31,8 @@
     FileType,
 )
 from together.types.finetune import (
-    DPOTrainingMethodType,
+    TrainingMethodDPO,
+    TrainingMethodSFT,
     FinetuneDownloadResult,
     FinetuneLinearLRSchedulerArgs,
     FinetuneList,
@@ -80,7 +81,8 @@
     "TrainingType",
     "FullTrainingType",
     "LoRATrainingType",
-    "DPOTrainingMethodType",
+    "TrainingMethodDPO",
+    "TrainingMethodSFT",
     "RerankRequest",
     "RerankResponse",
     "FinetuneTrainingLimits",

src/together/types/finetune.py

Lines changed: 23 additions & 6 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import List, Literal
+from typing import List, Literal, Union
 
 from pydantic import StrictBool, Field, validator, field_validator
 
@@ -135,12 +135,29 @@ class LoRATrainingType(TrainingType):
     type: str = "Lora"
 
 
-class DPOTrainingMethodType(BaseModel):
+class TrainingMethod(BaseModel):
+    """
+    Training method type
+    """
+
+    method: str
+
+
+class TrainingMethodSFT(TrainingMethod):
+    """
+    Training method type for SFT training
+    """
+
+    method: str = "sft"
+
+
+class TrainingMethodDPO(TrainingMethod):
     """
     Training method type for DPO training
     """
 
-    dpo_beta: float
+    method: str = "dpo"
+    dpo_beta: float | None = None
 
 
 class FinetuneRequest(BaseModel):
@@ -187,9 +204,9 @@ class FinetuneRequest(BaseModel):
     # train on inputs
     train_on_inputs: StrictBool | Literal["auto"] = "auto"
     # training method
-    training_method: str = "sft"
-    # DPO params
-    training_method_args: DPOTrainingMethodType | None = None
+    training_method: Union[TrainingMethodSFT, TrainingMethodDPO] = Field(
+        default_factory=TrainingMethodSFT
+    )
 
 
 class FinetuneResponse(BaseModel):
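
Since training_method is now a nested Pydantic model rather than a plain string plus a separate training_method_args field, the method name and its parameters serialize as a single object, which appears to be the typing mismatch the commit message refers to. A quick sketch of the serialized forms:

from together.types import TrainingMethodDPO, TrainingMethodSFT

TrainingMethodSFT().model_dump()              # {'method': 'sft'}
TrainingMethodDPO(dpo_beta=0.1).model_dump()  # {'method': 'dpo', 'dpo_beta': 0.1}

# With exclude_none=True (as used on the request before submission),
# an unset dpo_beta is dropped from the payload:
TrainingMethodDPO().model_dump(exclude_none=True)  # {'method': 'dpo'}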
