Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a732d02
[QEff. Finetune]: Added logger and its test cases. (#644)
quic-meetkuma Nov 28, 2025
3564e27
[QEff. Finetune]: Added component registry and factory functionality.…
quic-meetkuma Nov 28, 2025
9715b1b
[QEff. Finetune]: Adding optimizer registry and its test cases (#649)
tchawada Dec 5, 2025
52af924
[QEff. Finetune]: Added Base dataset class and SFT dataset classes al…
quic-dhirajku Dec 5, 2025
5f834bd
[QEff. Finetune] Adding callback and its test cases. (#652)
tchawada Dec 8, 2025
46fca42
"[QEff.finetuning] Adding config_manager and its test cases." (#656)
tchawada Dec 15, 2025
c80d0a8
Revert " "[QEff.finetuning] Adding config_manager and its test cases.…
quic-akuruvil Dec 15, 2025
d9cc6fe
"[QEff.finetuning] Rebasing: hf_config_manager." (#667)
tchawada Dec 15, 2025
389f15a
[QEff. Finetune]: Adding base class and HF class (#658)
quic-swatia Dec 25, 2025
edfef59
Added Trainer classes and tests for FT (#697)
quic-dhirajku Jan 2, 2026
d072a9c
[QEff.finetuning] Adding sample config and ReadMe file (#692)
tchawada Feb 5, 2026
b78efe6
['QEff.finetuning'] Changing some params from training config to mode…
tchawada Feb 5, 2026
68c4a98
Ft experimental rebasing with main (#785)
quic-akuruvil Feb 9, 2026
4414951
[QEff. Finetuning] Adding text field and some other changes in datase…
quic-swatia Feb 9, 2026
ab5918e
[QEff. Finetuning]: Adding FinetuningPipeline (finetune_experimental…
quic-swatia Feb 15, 2026
356355c
Ft experimental rebasing with main (#793)
quic-akuruvil Feb 16, 2026
e8cc003
Aligning with main (#794)
quic-akuruvil Feb 17, 2026
c0ef8cc
[QEff. Finetuning]: Adding PP support in HF trainer stack (#813)
quic-swatia Feb 27, 2026
0c49669
[QEff.finetuning] Hf config update (#795)
tchawada Mar 4, 2026
5942ccc
Restructure and added info in docs
Mar 5, 2026
00a4dbd
Cleanup
Mar 5, 2026
bcf8d34
Cleanup
Mar 5, 2026
d191e8a
[QEff.finetune]Test finetune (#826)
tchawada Mar 6, 2026
cecd2c2
Docs Updated (#833)
quic-akuruvil Mar 8, 2026
9751148
[QEff. Finetuning]: adding example scripts to demonstrate custom data…
smedhe Mar 9, 2026
f2ab8a3
Added per epoch time in the log files
Mar 10, 2026
db2a803
Added per epoch time in the log files
Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
QEFFAutoModelForCausalLM,
QEFFAutoModelForCTC,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSequenceClassification,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
Expand Down Expand Up @@ -53,6 +54,7 @@
"QEFFAutoModelForCTC",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSequenceClassification",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
"QEffFluxPipeline",
Expand All @@ -61,7 +63,7 @@


# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
__version__ = "1.22.0.dev0"


def check_qaic_sdk():
Expand Down
1 change: 1 addition & 0 deletions QEfficient/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
QEFFAutoModelForCausalLM,
QEFFAutoModelForCTC,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSequenceClassification,
QEFFAutoModelForSpeechSeq2Seq,
)
79 changes: 40 additions & 39 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None:
super().__init__()
self.model = model
self.hash_params = create_model_params(self, **kwargs)
self.prefill_onnx_path: Optional[str] = None
self.onnx_path: Optional[str] = None
self.qpc_path: Optional[str] = None
self.qpc_session: Optional[QAICInferenceSession] = None
Expand Down Expand Up @@ -181,7 +180,7 @@ def compile(self, *args, **kwargs) -> Path:
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``

for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
for QAIC compilation path, any flag that is supported by ``qaic-compile`` can be passed. Params are converted to flags as below:

- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand Down Expand Up @@ -240,10 +239,7 @@ def _export(

# Return early if ONNX already exists
if onnx_path.is_file():
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
self.onnx_path = onnx_path
return onnx_path

# check if the model is in meta state or weights are offloaded
Expand Down Expand Up @@ -322,10 +318,7 @@ def _export(
finally:
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)

if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
self.onnx_path = onnx_path
return onnx_path

def get_onnx_path(
Expand All @@ -342,21 +335,18 @@ def get_onnx_path(
"use_onnx_subfunctions": use_onnx_subfunctions,
"retain_full_kv": retain_full_kv,
}

if prefill_only:
if self.prefill_onnx_path is None:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)
self.export(**kwargs)
return self.prefill_onnx_path
else:
if self.onnx_path is None:
self.export(**kwargs)
return self.onnx_path
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)

self.export(**kwargs)
return self.onnx_path

@dump_qconfig
def _compile(
Expand All @@ -379,7 +369,7 @@ def _compile(
**compiler_options,
) -> str:
"""
Interface for qaic-exec compiler
Interface for qaic-compile compiler

Args:
:onnx_path (str): Onnx file to compile
Expand All @@ -392,7 +382,7 @@ def _compile(
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
:compiler_options: Pass any compiler option as input.
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
Any flag that is supported by `qaic-compile` can be passed. Params are converted to flags as below:

- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand All @@ -404,6 +394,8 @@ def _compile(
onnx_path = Path(
onnx_path
if onnx_path
else self.onnx_path
if self.onnx_path
else self.get_onnx_path(
prefill_only,
enable_chunking,
Expand Down Expand Up @@ -446,8 +438,27 @@ def _compile(
+ [f"-m={onnx_path}"]
)

if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None):
# MDP partition config: prioritize dump over load
mdp_dump_json_path = compiler_options.pop("mdp_dump_partition_config", None)
mdp_ts_json_path = compiler_options.pop("mdp_load_partition_config", None)
mdp_ts_json = None
user_provided_load_config = False

if mdp_dump_json_path:
if mdp_ts_json_path:
logger.warning(
"Loading and Dumping partition is not supported at the same time. Prioritizing dump config over load config!"
)
command.append(f"-mdp-dump-partition-config={mdp_dump_json_path}")
elif mdp_ts_json_path:
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
mdp_ts_json = load_json(str(mdp_ts_json_path))
user_provided_load_config = True
elif mdp_ts_num_devices > 1:
# Generate mdp config only if neither dump nor load is provided and num_devices > 1
mdp_ts_json = generate_mdp_partition_config(
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
)

for key, value in compiler_options.items():
option = "-" + key.replace("_", "-")
Expand All @@ -457,16 +468,6 @@ def _compile(
continue
command.append(f"{option}={value}")

# Create a dummy mdp_ts_json if mdp-load-partition-config not provided and num_devices > 1
if mdp_ts_json_path is not None:
mdp_ts_json = load_json(str(mdp_ts_json_path))
elif mdp_ts_num_devices > 1:
mdp_ts_json = generate_mdp_partition_config(
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
)
else:
mdp_ts_json = None

if use_onnx_subfunctions:
logger.info("Using ONNX subfunctions for compilation.")
command.append("-sub-functions")
Expand All @@ -493,8 +494,8 @@ def _compile(
# Probably compilation failure last time, delete directory to start over
shutil.rmtree(qpc_path)

# write the MDP partition config file if not provided
if mdp_ts_json is not None:
# Write the generated MDP partition config file (not if user provided it)
if mdp_ts_json is not None and not user_provided_load_config:
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
create_json(str(mdp_ts_json_path), mdp_ts_json)
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
Expand Down
64 changes: 50 additions & 14 deletions QEfficient/cloud/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
from QEfficient.finetune.utils.device_map import get_device_map
from QEfficient.finetune.utils.helper import Task_Mode, get_world_size
from QEfficient.finetune.utils.helper import Task_Mode, get_local_rank, get_local_world_size, get_rank, get_world_size
from QEfficient.finetune.utils.logging_utils import logger
from QEfficient.finetune.utils.parser import get_finetune_parser
from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
Expand All @@ -52,10 +52,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
"""
Initialize the distributed training environment if Distributed Data Parallel (DDP) is enabled.

This function configures the PyTorch distributed backend based on the device type
and initializes the process group. It also validates device availability and
pipeline parallelism settings.

Supports single-node and multi-node training launched via torchrun
(uses WORLD_SIZE, RANK, LOCAL_RANK, LOCAL_WORLD_SIZE environment variables).
Parameters
----------
train_config : TrainConfig
Expand All @@ -67,32 +65,57 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
If the number of required devices exceeds the total available devices.
If pipeline parallelism (`num_pp_stages`) is enabled but set to 1.
If DDP is enabled with a CPU device or with a specific device index (DDP requires device type only).

Notes
-----
- If `train_config.enable_ddp` is False, this function performs no action.
- Sets the appropriate device for each process in a distributed setup.
"""

torch_device = torch.device(train_config.device)
num_available_devices = getattr(torch, torch_device.type).device_count()
assert get_world_size() * train_config.num_pp_stages <= num_available_devices, (
"Number of devices required should be less than or equal to total available devices."
)

# Validate pipeline parallelism settings
if train_config.enable_pp:
assert train_config.num_pp_stages > 1, (
f"For pipeline parallelism, num_pp_stages should be greater than 1. Got {train_config.num_pp_stages}"
)

# If DDP is disabled, nothing to initialize here
if not train_config.enable_ddp:
# Non-DDP path: allow explicit device index, just set it if present
if torch_device.type != "cpu" and torch_device.index is not None:
getattr(torch, torch_device.type).set_device(torch_device.index)
return

# ---- DDP path (single- or multi-node) ----
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
assert torch_device.index is None, f"DDP requires only device type (qaic/cuda), got: {torch_device}"

# Torchrun-provided env vars
world_size = get_world_size()
rank = get_rank()
local_rank = get_local_rank()
local_world_size = get_local_world_size()

# Per-node device validation
num_available_devices = getattr(torch, torch_device.type).device_count()
assert local_world_size * train_config.num_pp_stages <= num_available_devices, (
"Number of devices required per node (LOCAL_WORLD_SIZE * num_pp_stages) should be <= locally available devices."
)

dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
dist.init_process_group(backend=dist_backend_map[torch_device.type])
dist.init_process_group(dist_backend_map[torch_device.type], rank=rank, world_size=world_size)

# Set the base device index for this process on this node
# For PP: each process controls num_pp_stages devices starting from base_device_index
base_device_index = local_rank * train_config.num_pp_stages
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
getattr(torch, torch_device.type).set_device(dist.get_rank() * train_config.num_pp_stages)
getattr(torch, torch_device.type).set_device(base_device_index)

# persist rank info in the config
train_config.rank = rank
train_config.local_rank = local_rank
train_config.world_size = world_size
train_config.local_world_size = local_world_size


def setup_seeds(seed: int) -> None:
Expand Down Expand Up @@ -362,14 +385,26 @@ def main(**kwargs) -> None:
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
)

# Figure out the concrete device for this process
torch_device = torch.device(train_config.device)
if train_config.enable_ddp and torch_device.type != "cpu":
# setup_distributed_training has already set the current device based on LOCAL_RANK
current_idx = getattr(torch, torch_device.type).current_device()
device = torch.device(torch_device.type, current_idx)
else:
device = torch_device

if not train_config.enable_pp:
model.to(train_config.device)
model.to(device)

optimizer = optim.AdamW(
model.parameters(),
lr=train_config.lr,
weight_decay=train_config.weight_decay,
)
scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)

if train_config.enable_ddp:
ignore_names = set()
for name, param in model.named_parameters():
Expand All @@ -378,6 +413,7 @@ def main(**kwargs) -> None:
# Adding params in ignore list will enforce DDP to ignore them during synchronization,
# which will further reduce the tensor exchange across devices.
torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)

model = nn.parallel.DistributedDataParallel(model)

results = train(
Expand Down
Loading