Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion QEfficient/cloud/finetune_experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]:
dataset_name = dataset_config.get("dataset_name")
train_split = dataset_config.get("train_split", "train")
test_split = dataset_config.get("test_split", "test")
seed = self.config.training["seed"]
seed = dataset_config.get("data_seed", 42)

# Create a copy of dataset_config excluding keys that are passed explicitly
# to avoid duplicate keyword arguments when unpacking
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dataset:
prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
completion_template: "{answer}" # Model will be trained on this part.
config_name: "main" # Config name for the dataset
data_seed: 42 # Random seed for dataset shuffling



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dataset:
dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields
completion_template: "{output}" # Model will be trained on this part.
data_seed: 42 # Random seed for dataset shuffling


# Training configuration
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Dataset: Style-Remix (hallisky/DiSC)
# Model configuration
Comment thread
tchawada marked this conversation as resolved.
model:
model_type: "hf" # Hugging Face model
auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
peft_config:
lora_r: 8
lora_alpha: 16
lora_dropout: 0
target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
dataset_type: "sft_dataset"
dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub
prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
completion_template: "{generation}" # Model will be trained on this part.
dataset_disc_style: "sarcasm_more" # Style of dataset to use
data_seed: 42 # Random seed for dataset shuffling

# Training configuration
training:
type: "sft"
gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
per_device_train_batch_size: 1 # Batch size per device during training
num_train_epochs: 1
torch_compile: False # Whether to use torch.compile

# Optimizer configuration
optimizers:
optimizer_name: "adamw"
lr: 2e-4

scheduler:
scheduler_name: "cosine"

callbacks:
early_stopping:
early_stopping_patience: 3 # Number of epochs to wait before stopping training
early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dataset:
prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
completion_template: "{answer}" # Model will be trained on this part.
config_name: "main" # Config name for the dataset
data_seed: 42 # Random seed for dataset shuffling


# Training configuration
Expand Down
7 changes: 6 additions & 1 deletion QEfficient/finetune/experimental/core/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,13 @@ class DatasetConfig:
metadata={"help": "Function for formatting output completions (e.g., '{output}')."},
)
collate_fn: str = field(
default="dynamic_padding",
default=None,
metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
)
dataset_disc_style: str = field(
default=None,
metadata={"help": "Style of dataset"},
)
group_by_length: bool = field(
default=True,
metadata={"help": "Whether to group samples by length to minimize padding."},
Expand Down Expand Up @@ -184,6 +188,7 @@ class DatasetConfig:
metadata={"help": "Name of the hf configuration file."},
)
json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."})
data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."})


@dataclass
Expand Down
11 changes: 11 additions & 0 deletions QEfficient/finetune/experimental/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,15 @@ def __init__(
**kwargs,
):
self.split_ratio = split_ratio
self.seed = seed
self.json_file_path = kwargs.get("json_file_path", None)
self.prompt_template = kwargs.get("prompt_template", None)
self.completion_template = kwargs.get("completion_template", None)
self.prompt_func_path = kwargs.get("prompt_func", None)
self.completion_func_path = kwargs.get("completion_func", None)
self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True)
self.config_name = kwargs.get("config_name", None)
self.dataset_disc_style = kwargs.get("dataset_disc_style", None)

if self.json_file_path not in (None, ""):
if not os.path.isfile(self.json_file_path):
Expand Down Expand Up @@ -127,6 +129,7 @@ def _initialize_dataset(self):
# Load dataset from JSON file
validate_json_structure(self.json_file_path)
self.dataset = load_dataset("json", data_files=self.json_file_path, split="train")
self.dataset = self.dataset.shuffle(seed=self.seed)
# Apply train/test split if needed
if self.split in ["train", "test"]:
self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
Expand All @@ -149,6 +152,14 @@ def _initialize_dataset(self):
load_split = "train"
# FIXME: Add streaming support for larger datasets.
self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
self.dataset = self.dataset.shuffle(seed=self.seed)
if self.dataset_disc_style:
available_styles = set(self.dataset["category"])
if self.dataset_disc_style not in available_styles:
raise RuntimeError(
f"For DiSC dataset the provided disc_style '{self.dataset_disc_style}' is not supported."
)
self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style)

if len(available_splits) == 1:
self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
Expand Down
11 changes: 7 additions & 4 deletions QEfficient/finetune/experimental/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ def setUp(self):
{"question": "What is AI?", "answer": "Artificial Intelligence"},
{"question": "What is ML?", "answer": "Machine Learning"},
{"question": "What is DL?", "answer": "Deep Learning"},
{"question": "What is LLM?", "answer": "Large Language Model"},
{"question": "What is NLP?", "answer": "Natural Language Processing"},
{"question": "What is VLM?", "answer": "Vision Language Model"},
{"question": "", "answer": "Empty question"}, # Empty question
{"question": "Valid question", "answer": ""}, # Empty answer
{"question": None, "answer": "None question"}, # None question
Expand Down Expand Up @@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder,
def create_mock_dataset():
mock_dataset = MagicMock()
mock_dataset.column_names = ["text", "label"]
mock_dataset.shuffle.return_value = mock_dataset
mock_dataset.num_rows = 3

# Mock __getitem__ to return processed samples
Expand Down Expand Up @@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self):
)

# When filtering is disabled and split="train" is used, it still applies train/test split
# So we get ~80% of 8 samples = ~6 samples
# So we get ~80% of 10 samples = ~8 samples
self.assertGreater(len(dataset), 0)
self.assertLessEqual(len(dataset), 8)

Expand All @@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self):
seed=SEED,
)

# After filtering, we have 4 valid samples
# With split ratio, train should have ~3 samples, test should have ~1 sample
# After filtering, we have 6 valid samples
# With split ratio, train should have ~4 samples, test should have ~2 samples
self.assertGreater(len(train_dataset), 0)
self.assertGreater(len(test_dataset), 0)
# Total should equal the filtered dataset size
self.assertEqual(len(train_dataset) + len(test_dataset), 4)
self.assertEqual(len(train_dataset) + len(test_dataset), 6)

def test_sft_dataset_with_custom_prompt_function(self):
"""Test loading with custom prompt function."""
Expand Down
1 change: 1 addition & 0 deletions QEfficient/finetune/experimental/tests/test_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned(
"dataset_name": "test_dataset",
"train_split": train_split,
"test_split": test_split,
"data_seed": 42,
}

train_ds = MagicMock(name="train_ds")
Expand Down
20 changes: 10 additions & 10 deletions docs/source/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name.
* **train_batch_size**: `default = 1` → Per-device batch size during training.
* **eval_batch_size**: `default = 1` → Per-device batch size during evaluation.
* **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch).
* **dataset_disc_style**: `default = None` → Selects the style remix category to apply to the dataset during preprocessing; when None, no style remixing is applied and the original dataset style is preserved.

* **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching.
* **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field).
* **num_workers**: `default = 4` → Number of subprocesses to use for data loading.
Expand All @@ -88,7 +90,7 @@ dataset:
train_split: "train"
test_split: "test"
max_seq_length: 512
prompt_func: "preprocess/alpaca_func:create_alpaca_prompt"
prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"
completion_template: "{output}"

```
Expand Down Expand Up @@ -144,21 +146,19 @@ dataset:
completion_template: "{answer}"

```

***
#### **4. grammar (grammar_dataset)**

#### **4. Style-Remix (hallisky/DiSC)**

```yaml
dataset:
tokenizer_name: "meta-llama/Llama-3.2-1B"
dataset_type: "sft_dataset"
dataset_name: "grammar"
train_split: "train"
split_ratio: 0.8
prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n"
completion_template: "{target}"
```
dataset_name: "hallisky/DiSC"
prompt_template: "### Original:{original} \n ### Rewrite:\n"
completion_template: "{generation}"
dataset_disc_style: "sarcasm_more"

```
***

## 3. Training Configuration
Expand Down
Loading