Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a732d02
[QEff. Finetune]: Added logger and its test cases. (#644)
quic-meetkuma Nov 28, 2025
3564e27
[QEff. Finetune]: Added component registry and factory functionality.…
quic-meetkuma Nov 28, 2025
9715b1b
[QEff. Finetune]: Adding optimizer registry and its test cases (#649)
tchawada Dec 5, 2025
52af924
[QEff. Finetune]: Added Base dataset class and SFT dataset classes al…
quic-dhirajku Dec 5, 2025
5f834bd
[QEff. Finetune] Adding callback and its test cases. (#652)
tchawada Dec 8, 2025
46fca42
"[QEff.finetuning] Adding config_manager and its test cases." (#656)
tchawada Dec 15, 2025
c80d0a8
Revert " "[QEff.finetuning] Adding config_manager and its test cases.…
quic-akuruvil Dec 15, 2025
d9cc6fe
"[QEff.finetuning] Rebasing: hf_config_manager." (#667)
tchawada Dec 15, 2025
389f15a
[QEff. Finetune]: Adding base class and HF class (#658)
quic-swatia Dec 25, 2025
edfef59
Added Trainer classes and tests for FT (#697)
quic-dhirajku Jan 2, 2026
d072a9c
[QEff.finetuning] Adding sample config and ReadMe file (#692)
tchawada Feb 5, 2026
b78efe6
['QEff.finetuning'] Changing some params from training config to mode…
tchawada Feb 5, 2026
68c4a98
Ft experimental rebasing with main (#785)
quic-akuruvil Feb 9, 2026
4414951
[QEff. Finetuning] Adding text field and some other changes in datase…
quic-swatia Feb 9, 2026
ab5918e
[QEff. Finetuning]: Adding FinetuningPipeline (finetune_experimental…
quic-swatia Feb 15, 2026
356355c
Ft experimental rebasing with main (#793)
quic-akuruvil Feb 16, 2026
e8cc003
Aligning with main (#794)
quic-akuruvil Feb 17, 2026
c0ef8cc
[QEff. Finetuning]: Adding PP support in HF trainer stack (#813)
quic-swatia Feb 27, 2026
0c49669
[QEff.finetuning] Hf config update (#795)
tchawada Mar 4, 2026
5942ccc
Restructure and added info in docs
Mar 5, 2026
00a4dbd
Cleanup
Mar 5, 2026
bcf8d34
Cleanup
Mar 5, 2026
d191e8a
[QEff.finetune]Test finetune (#826)
tchawada Mar 6, 2026
cecd2c2
Docs Updated (#833)
quic-akuruvil Mar 8, 2026
9751148
[QEff. Finetuning]: adding example scripts to demonstrate custom data…
smedhe Mar 9, 2026
f2ab8a3
Added per epoch time in the log files
Mar 10, 2026
db2a803
Added per epoch time in the log files
Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
QEFFAutoModelForCausalLM,
QEFFAutoModelForCTC,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSequenceClassification,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
Expand Down Expand Up @@ -53,6 +54,7 @@
"QEFFAutoModelForCTC",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSequenceClassification",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
"QEffFluxPipeline",
Expand All @@ -61,7 +63,7 @@


# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
__version__ = "1.22.0.dev0"


def check_qaic_sdk():
Expand Down
1 change: 1 addition & 0 deletions QEfficient/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
QEFFAutoModelForCausalLM,
QEFFAutoModelForCTC,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSequenceClassification,
QEFFAutoModelForSpeechSeq2Seq,
)
79 changes: 40 additions & 39 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None:
super().__init__()
self.model = model
self.hash_params = create_model_params(self, **kwargs)
self.prefill_onnx_path: Optional[str] = None
self.onnx_path: Optional[str] = None
self.qpc_path: Optional[str] = None
self.qpc_session: Optional[QAICInferenceSession] = None
Expand Down Expand Up @@ -181,7 +180,7 @@ def compile(self, *args, **kwargs) -> Path:
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``

for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
for QAIC compilation path, any flag that is supported by ``qaic-compile`` can be passed. Params are converted to flags as below:

- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand Down Expand Up @@ -240,10 +239,7 @@ def _export(

# Return early if ONNX already exists
if onnx_path.is_file():
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
self.onnx_path = onnx_path
return onnx_path

# check if the model is in meta state or weights are offloaded
Expand Down Expand Up @@ -322,10 +318,7 @@ def _export(
finally:
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)

if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
self.onnx_path = onnx_path
return onnx_path

def get_onnx_path(
Expand All @@ -342,21 +335,18 @@ def get_onnx_path(
"use_onnx_subfunctions": use_onnx_subfunctions,
"retain_full_kv": retain_full_kv,
}

if prefill_only:
if self.prefill_onnx_path is None:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)
self.export(**kwargs)
return self.prefill_onnx_path
else:
if self.onnx_path is None:
self.export(**kwargs)
return self.onnx_path
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)

self.export(**kwargs)
return self.onnx_path

@dump_qconfig
def _compile(
Expand All @@ -379,7 +369,7 @@ def _compile(
**compiler_options,
) -> str:
"""
Interface for qaic-exec compiler
Interface for qaic-compile compiler

Args:
:onnx_path (str): Onnx file to compile
Expand All @@ -392,7 +382,7 @@ def _compile(
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
:compiler_options: Pass any compiler option as input.
Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
Any flag that is supported by `qaic-compile` can be passed. Params are converted to flags as below:

- aic_num_cores=16 -> -aic-num-cores=16
- convert_to_fp16=True -> -convert-to-fp16
Expand All @@ -404,6 +394,8 @@ def _compile(
onnx_path = Path(
onnx_path
if onnx_path
else self.onnx_path
if self.onnx_path
else self.get_onnx_path(
prefill_only,
enable_chunking,
Expand Down Expand Up @@ -446,8 +438,27 @@ def _compile(
+ [f"-m={onnx_path}"]
)

if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None):
# MDP partition config: prioritize dump over load
mdp_dump_json_path = compiler_options.pop("mdp_dump_partition_config", None)
mdp_ts_json_path = compiler_options.pop("mdp_load_partition_config", None)
mdp_ts_json = None
user_provided_load_config = False

if mdp_dump_json_path:
if mdp_ts_json_path:
logger.warning(
"Loading and Dumping partition is not supported at the same time. Prioritizing dump config over load config!"
)
command.append(f"-mdp-dump-partition-config={mdp_dump_json_path}")
elif mdp_ts_json_path:
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
mdp_ts_json = load_json(str(mdp_ts_json_path))
user_provided_load_config = True
elif mdp_ts_num_devices > 1:
# Generate mdp config only if neither dump nor load is provided and num_devices > 1
mdp_ts_json = generate_mdp_partition_config(
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
)

for key, value in compiler_options.items():
option = "-" + key.replace("_", "-")
Expand All @@ -457,16 +468,6 @@ def _compile(
continue
command.append(f"{option}={value}")

# Create a dummy mdp_ts_json if mdp-load-partition-config not provided and num_devices > 1
if mdp_ts_json_path is not None:
mdp_ts_json = load_json(str(mdp_ts_json_path))
elif mdp_ts_num_devices > 1:
mdp_ts_json = generate_mdp_partition_config(
mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
)
else:
mdp_ts_json = None

if use_onnx_subfunctions:
logger.info("Using ONNX subfunctions for compilation.")
command.append("-sub-functions")
Expand All @@ -493,8 +494,8 @@ def _compile(
# Probably compilation failure last time, delete directory to start over
shutil.rmtree(qpc_path)

# write the MDP partition config file if not provided
if mdp_ts_json is not None:
# Write the generated MDP partition config file (not if user provided it)
if mdp_ts_json is not None and not user_provided_load_config:
mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
create_json(str(mdp_ts_json_path), mdp_ts_json)
command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
Expand Down
64 changes: 50 additions & 14 deletions QEfficient/cloud/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
from QEfficient.finetune.utils.device_map import get_device_map
from QEfficient.finetune.utils.helper import Task_Mode, get_world_size
from QEfficient.finetune.utils.helper import Task_Mode, get_local_rank, get_local_world_size, get_rank, get_world_size
from QEfficient.finetune.utils.logging_utils import logger
from QEfficient.finetune.utils.parser import get_finetune_parser
from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
Expand All @@ -52,10 +52,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
"""
Initialize the distributed training environment if Distributed Data Parallel (DDP) is enabled.

This function configures the PyTorch distributed backend based on the device type
and initializes the process group. It also validates device availability and
pipeline parallelism settings.

Supports single-node and multi-node training launched via torchrun
(uses WORLD_SIZE, RANK, LOCAL_RANK, LOCAL_WORLD_SIZE environment variables).
Parameters
----------
train_config : TrainConfig
Expand All @@ -67,32 +65,57 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
If the number of required devices exceeds the total available devices.
If pipeline parallelism (`num_pp_stages`) is enabled but set to 1.
If DDP is enabled with a CPU device or with a specific device index (DDP requires device type only).

Notes
-----
- If `train_config.enable_ddp` is False, this function performs no action.
- Sets the appropriate device for each process in a distributed setup.
"""

torch_device = torch.device(train_config.device)
num_available_devices = getattr(torch, torch_device.type).device_count()
assert get_world_size() * train_config.num_pp_stages <= num_available_devices, (
"Number of devices required should be less than or equal to total available devices."
)

# Validate pipeline parallelism settings
if train_config.enable_pp:
assert train_config.num_pp_stages > 1, (
f"For pipeline parallelism, num_pp_stages should be greater than 1. Got {train_config.num_pp_stages}"
)

# If DDP is disabled, nothing to initialize here
if not train_config.enable_ddp:
# Non-DDP path: allow explicit device index, just set it if present
if torch_device.type != "cpu" and torch_device.index is not None:
getattr(torch, torch_device.type).set_device(torch_device.index)
return

# ---- DDP path (single- or multi-node) ----
assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
assert torch_device.index is None, f"DDP requires only device type (qaic/cuda), got: {torch_device}"

# Torchrun-provided env vars
world_size = get_world_size()
rank = get_rank()
local_rank = get_local_rank()
local_world_size = get_local_world_size()

# Per-node device validation
num_available_devices = getattr(torch, torch_device.type).device_count()
assert local_world_size * train_config.num_pp_stages <= num_available_devices, (
"Number of devices required per node (LOCAL_WORLD_SIZE * num_pp_stages) should be <= locally available devices."
)

dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
dist.init_process_group(backend=dist_backend_map[torch_device.type])
dist.init_process_group(dist_backend_map[torch_device.type], rank=rank, world_size=world_size)

# Set the base device index for this process on this node
# For PP: each process controls num_pp_stages devices starting from base_device_index
base_device_index = local_rank * train_config.num_pp_stages
# from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
getattr(torch, torch_device.type).set_device(dist.get_rank() * train_config.num_pp_stages)
getattr(torch, torch_device.type).set_device(base_device_index)

# persist rank info in the config
train_config.rank = rank
train_config.local_rank = local_rank
train_config.world_size = world_size
train_config.local_world_size = local_world_size


def setup_seeds(seed: int) -> None:
Expand Down Expand Up @@ -362,14 +385,26 @@ def main(**kwargs) -> None:
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
)

# Figure out the concrete device for this process
torch_device = torch.device(train_config.device)
if train_config.enable_ddp and torch_device.type != "cpu":
# setup_distributed_training has already set the current device based on LOCAL_RANK
current_idx = getattr(torch, torch_device.type).current_device()
device = torch.device(torch_device.type, current_idx)
else:
device = torch_device

if not train_config.enable_pp:
model.to(train_config.device)
model.to(device)

optimizer = optim.AdamW(
model.parameters(),
lr=train_config.lr,
weight_decay=train_config.weight_decay,
)
scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)

if train_config.enable_ddp:
ignore_names = set()
for name, param in model.named_parameters():
Expand All @@ -378,6 +413,7 @@ def main(**kwargs) -> None:
# Adding params in ignore list will enforce DDP to ignore them during synchronization,
# which will further reduce the tensor exchange across devices.
torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)

model = nn.parallel.DistributedDataParallel(model)

results = train(
Expand Down
Loading