Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion QEfficient/cloud/finetune_experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]:
dataset_name = dataset_config.get("dataset_name")
train_split = dataset_config.get("train_split", "train")
test_split = dataset_config.get("test_split", "test")
seed = self.config.training["seed"]
seed = dataset_config.get("data_seed", 42)

# Create a copy of dataset_config excluding keys that are passed explicitly
# to avoid duplicate keyword arguments when unpacking
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dataset:
prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
completion_template: "{answer}" # Model will be trained on this part.
config_name: "main" # Config name for the dataset
data_seed: 42 # Random seed for dataset shuffling



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dataset:
dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields
completion_template: "{output}" # Model will be trained on this part.
data_seed: 42 # Random seed for dataset shuffling


# Training configuration
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Dataset: Style-Remix (hallisky/DiSC)
# Model configuration
Comment thread
tchawada marked this conversation as resolved.
model:
model_type: "hf" # Hugging Face model
auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
peft_config:
lora_r: 8
lora_alpha: 16
lora_dropout: 0
target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
dataset_type: "sft_dataset"
dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub
prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
completion_template: "{generation}" # Model will be trained on this part.
dataset_disc_style: "sarcasm_more" # Style of dataset to use
data_seed: 42 # Random seed for dataset shuffling

# Training configuration
training:
type: "sft"
gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
per_device_train_batch_size: 1 # Batch size per device during training
num_train_epochs: 1
torch_compile: False # Whether to use torch.compile

# Optimizer configuration
optimizers:
optimizer_name: "adamw"
lr: 2e-4

scheduler:
scheduler_name: "cosine"

callbacks:
early_stopping:
early_stopping_patience: 3 # Number of epochs to wait before stopping training
early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dataset:
prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
completion_template: "{answer}" # Model will be trained on this part.
config_name: "main" # Config name for the dataset
data_seed: 42 # Random seed for dataset shuffling


# Training configuration
Expand Down
7 changes: 6 additions & 1 deletion QEfficient/finetune/experimental/core/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,13 @@ class DatasetConfig:
metadata={"help": "Function for formatting output completions (e.g., '{output}')."},
)
collate_fn: str = field(
default="dynamic_padding",
default=None,
metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
)
dataset_disc_style: str = field(
default=None,
metadata={"help": "Style of dataset"},
)
group_by_length: bool = field(
default=True,
metadata={"help": "Whether to group samples by length to minimize padding."},
Expand Down Expand Up @@ -184,6 +188,7 @@ class DatasetConfig:
metadata={"help": "Name of the hf configuration file."},
)
json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."})
data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."})


@dataclass
Expand Down
11 changes: 11 additions & 0 deletions QEfficient/finetune/experimental/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,15 @@ def __init__(
**kwargs,
):
self.split_ratio = split_ratio
self.seed = seed
self.json_file_path = kwargs.get("json_file_path", None)
self.prompt_template = kwargs.get("prompt_template", None)
self.completion_template = kwargs.get("completion_template", None)
self.prompt_func_path = kwargs.get("prompt_func", None)
self.completion_func_path = kwargs.get("completion_func", None)
self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True)
self.config_name = kwargs.get("config_name", None)
self.dataset_disc_style = kwargs.get("dataset_disc_style", None)

if self.json_file_path not in (None, ""):
if not os.path.isfile(self.json_file_path):
Expand Down Expand Up @@ -127,6 +129,7 @@ def _initialize_dataset(self):
# Load dataset from JSON file
validate_json_structure(self.json_file_path)
self.dataset = load_dataset("json", data_files=self.json_file_path, split="train")
self.dataset = self.dataset.shuffle(seed=self.seed)
# Apply train/test split if needed
if self.split in ["train", "test"]:
self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
Expand All @@ -149,6 +152,14 @@ def _initialize_dataset(self):
load_split = "train"
# FIXME: Add streaming support for larger datasets.
self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
self.dataset = self.dataset.shuffle(seed=self.seed)
if self.dataset_disc_style:
available_styles = set(self.dataset["category"])
if self.dataset_disc_style not in available_styles:
raise RuntimeError(
f"For DiSC dataset the provided disc_style '{self.dataset_disc_style}' is not supported."
)
self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style)

if len(available_splits) == 1:
self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
Expand Down
11 changes: 7 additions & 4 deletions QEfficient/finetune/experimental/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ def setUp(self):
{"question": "What is AI?", "answer": "Artificial Intelligence"},
{"question": "What is ML?", "answer": "Machine Learning"},
{"question": "What is DL?", "answer": "Deep Learning"},
{"question": "What is LLM?", "answer": "Large Language Model"},
{"question": "What is NLP?", "answer": "Natural Language Processing"},
{"question": "What is VLM?", "answer": "Vision Language Model"},
{"question": "", "answer": "Empty question"}, # Empty question
{"question": "Valid question", "answer": ""}, # Empty answer
{"question": None, "answer": "None question"}, # None question
Expand Down Expand Up @@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder,
def create_mock_dataset():
mock_dataset = MagicMock()
mock_dataset.column_names = ["text", "label"]
mock_dataset.shuffle.return_value = mock_dataset
mock_dataset.num_rows = 3

# Mock __getitem__ to return processed samples
Expand Down Expand Up @@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self):
)

# When filtering is disabled and split="train" is used, it still applies train/test split
# So we get ~80% of 8 samples = ~6 samples
# So we get ~80% of 10 samples = ~8 samples
self.assertGreater(len(dataset), 0)
self.assertLessEqual(len(dataset), 8)

Expand All @@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self):
seed=SEED,
)

# After filtering, we have 4 valid samples
# With split ratio, train should have ~3 samples, test should have ~1 sample
# After filtering, we have 6 valid samples
# With split ratio, train should have ~4 samples, test should have ~2 samples
self.assertGreater(len(train_dataset), 0)
self.assertGreater(len(test_dataset), 0)
# Total should equal the filtered dataset size
self.assertEqual(len(train_dataset) + len(test_dataset), 4)
self.assertEqual(len(train_dataset) + len(test_dataset), 6)

def test_sft_dataset_with_custom_prompt_function(self):
"""Test loading with custom prompt function."""
Expand Down
1 change: 1 addition & 0 deletions QEfficient/finetune/experimental/tests/test_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned(
"dataset_name": "test_dataset",
"train_split": train_split,
"test_split": test_split,
"data_seed": 42,
}

train_ds = MagicMock(name="train_ds")
Expand Down
20 changes: 10 additions & 10 deletions docs/source/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name.
* **train_batch_size**: `default = 1` → Per-device batch size during training.
* **eval_batch_size**: `default = 1` → Per-device batch size during evaluation.
* **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch).
* **dataset_disc_style**: `default = None` → Selects the style remix category to apply to the dataset during preprocessing; when None, no style remixing is applied and the original dataset style is preserved.

* **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching.
* **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field).
* **num_workers**: `default = 4` → Number of subprocesses to use for data loading.
Expand All @@ -88,7 +90,7 @@ dataset:
train_split: "train"
test_split: "test"
max_seq_length: 512
prompt_func: "preprocess/alpaca_func:create_alpaca_prompt"
prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"
completion_template: "{output}"

```
Expand Down Expand Up @@ -144,21 +146,19 @@ dataset:
completion_template: "{answer}"

```

***
#### **4. grammar (grammar_dataset)**

#### **4. Style-Remix (hallisky/DiSC)**

```yaml
dataset:
tokenizer_name: "meta-llama/Llama-3.2-1B"
dataset_type: "sft_dataset"
dataset_name: "grammar"
train_split: "train"
split_ratio: 0.8
prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n"
completion_template: "{target}"
```
dataset_name: "hallisky/DiSC"
prompt_template: "### Original:{original} \n ### Rewrite:\n"
completion_template: "{generation}"
dataset_disc_style: "sarcasm_more"

```
***

## 3. Training Configuration
Expand Down
Loading