diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..d843f34 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/F2LLM/README.md b/F2LLM/README.md index 6b79819..9093053 100644 --- a/F2LLM/README.md +++ b/F2LLM/README.md @@ -1,6 +1,44 @@ -## F2LLM +# F2LLM -F2LLMs (Foundation-to-Feature Large Language Models) are foundation models directly finetuned on 6 million high-quality query-document pairs, striking a strong balance between model size, training cost, and embedding performance: +F2LLM is a framework for converting decoder-only LLMs to embedding models. + +## LoRA Support + +F2LLM now supports Low-Rank Adaptation (LoRA) for efficient fine-tuning. This allows you to adapt base models with minimal parameter updates, significantly reducing computational costs and memory requirements. + +### Features + +- Support for LoRA with configurable rank (r), alpha, and dropout +- Target module selection for LoRA adaptation +- Full compatibility with existing training and inference pipelines +- Easy model merging capabilities + +### Configuration + +To enable LoRA, set `use_lora: true` in your configuration file and specify the LoRA parameters: + +- `use_lora`: Enable LoRA (boolean) +- `lora_r`: LoRA attention dimension (int, default: 8) +- `lora_alpha`: LoRA scaling factor (int, default: 16) +- `lora_dropout`: Dropout probability for LoRA layers (float, default: 0.05) +- `lora_target_modules`: Target modules for LoRA (string, default: "all-linear") + +### Example Configuration + +See `config_lora_example.json` for a complete example of using LoRA with F2LLM. + +### Usage + +1. Install the required dependencies: `pip install peft` +2. Update your config file to enable LoRA +3. Run training as usual: `python run.py --config your_config.json` + +### Benefits + +- **Memory Efficiency**: Only train a small subset of parameters +- **Computational Efficiency**: Faster training and lower GPU memory usage +- **Modularity**: Multiple adapters can be applied to the same base model +- **Compatibility**: Seamless integration with existing F2LLM pipeline

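For readers new to PEFT, here is a minimal, illustrative sketch of what the LoRA parameters listed in the README translate to. It mirrors the `LoraConfig` that `model.py` builds in this PR when `use_lora` is enabled; the model path is a placeholder, and `lm_head` is left out of the target list here because the encoder loaded via `AutoModel` has no LM head.

```python
# Illustrative sketch only -- mirrors what F2LLM's model.py does when use_lora is true.
# "models/qwen3-0.6b" is a placeholder path; substitute your own base checkpoint.
from transformers import AutoModel
from peft import LoraConfig, TaskType, get_peft_model

base = AutoModel.from_pretrained("models/qwen3-0.6b", trust_remote_code=True)

lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # embedding model, not a causal LM
    r=8,                                    # lora_r
    lora_alpha=16,                          # lora_alpha
    lora_dropout=0.05,                      # lora_dropout
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # expansion of "all-linear"
    bias="none",
)

model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # prints trainable vs. total parameter counts
```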
diff --git a/F2LLM/arguments.py b/F2LLM/arguments.py index b967c8f..8bad43c 100644 --- a/F2LLM/arguments.py +++ b/F2LLM/arguments.py @@ -27,6 +27,12 @@ class Args: log_interval: int = 20 checkpointing_steps: int = 100 validation_steps: int = 100 + # LoRA-specific arguments + use_lora: bool = False + lora_r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_target_modules: str = "all-linear" # Comma-separated list or "all-linear" # just placeholder, for logging purpose num_processes: int=0 diff --git a/F2LLM/config_lora_example.json b/F2LLM/config_lora_example.json new file mode 100644 index 0000000..afef649 --- /dev/null +++ b/F2LLM/config_lora_example.json @@ -0,0 +1,25 @@ +{ + "model_path": "models/qwen3-0.6b", + "experiment_id": "f2llm_lora_example", + "output_dir": "output", + "tb_dir": "tb_logs", + "cache_dir": "cache", + "train_data_path": "data_tokenized_qwen", + "train_batch_size": 4, + "max_seq_length": 1024, + "learning_rate": 1e-4, + "min_lr": 1e-6, + "weight_decay": 1e-2, + "warmup_steps": 100, + "num_hard_neg": 7, + "train_steps": 1000, + "train_epochs": 3, + "log_interval": 20, + "checkpointing_steps": 100, + "validation_steps": 100, + "use_lora": true, + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_modules": "all-linear" +} \ No newline at end of file diff --git a/F2LLM/docs/lora_support.md b/F2LLM/docs/lora_support.md new file mode 100644 index 0000000..bc39dbb --- /dev/null +++ b/F2LLM/docs/lora_support.md @@ -0,0 +1,157 @@ +# LoRA Support in F2LLM + +## Overview + +Low-Rank Adaptation (LoRA) is a parameter-efficient fine-tuning technique that significantly reduces the number of trainable parameters while maintaining model performance. F2LLM provides built-in support for LoRA, allowing users to fine-tune large language models efficiently without requiring full model updates. + +## Key Benefits + +- **Memory Efficiency**: Dramatically reduces memory requirements during training +- **Computational Efficiency**: Faster training with fewer parameters to update +- **Storage Efficiency**: Smaller adapter files compared to full model checkpoints +- **Modularity**: Easy to switch between different LoRA adapters for various tasks + +## Configuration + +LoRA can be enabled by setting the appropriate parameters in your configuration file or through command line arguments. 
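As a concrete illustration of the config-driven setup, the JSON keys in `config_lora_example.json` map directly onto the `Args` dataclass fields added in `arguments.py`. The loader below is a hypothetical sketch, since `run.py`'s actual `--config` parsing is not part of this diff.

```python
# Hypothetical sketch: load config_lora_example.json into the Args dataclass.
# How run.py really parses its --config argument is not shown in this diff.
import json
from arguments import Args

with open("config_lora_example.json") as f:
    args = Args(**json.load(f))

print(args.use_lora, args.lora_r, args.lora_alpha, args.lora_dropout, args.lora_target_modules)
```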
+
+### Configuration Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `use_lora` | bool | `false` | Enable or disable LoRA |
+| `lora_r` | int | `8` | The rank of the LoRA decomposition |
+| `lora_alpha` | int | `16` | Scaling factor for LoRA |
+| `lora_dropout` | float | `0.05` | Dropout rate applied to LoRA layers |
+| `lora_target_modules` | str | `"all-linear"` | Target modules to apply LoRA to |
+
+### Target Modules
+
+The `lora_target_modules` parameter specifies which layers to apply LoRA to:
+
+- **"all-linear"** (default): Applies LoRA to all linear projection layers including:
+  - `q_proj`: Query projections
+  - `v_proj`: Value projections
+  - `k_proj`: Key projections
+  - `o_proj`: Output projections
+  - `gate_proj`: Gate projections (in feed-forward networks)
+  - `up_proj`: Up projections (in feed-forward networks)
+  - `down_proj`: Down projections (in feed-forward networks)
+  - `lm_head`: Language model head
+
+- **Custom list**: Comma-separated module names (e.g., `"q_proj,v_proj"`)
+
+## Example Configuration
+
+```json
+{
+    "model_path": "models/qwen3-0.6b",
+    "experiment_id": "f2llm_lora_example",
+    "output_dir": "output",
+    "tb_dir": "tb_logs",
+    "cache_dir": "cache",
+    "train_data_path": "data_tokenized_qwen",
+    "train_batch_size": 4,
+    "max_seq_length": 1024,
+    "learning_rate": 1e-4,
+    "min_lr": 1e-6,
+    "weight_decay": 1e-2,
+    "warmup_steps": 100,
+    "num_hard_neg": 7,
+    "train_steps": 1000,
+    "train_epochs": 3,
+    "log_interval": 20,
+    "checkpointing_steps": 100,
+    "validation_steps": 100,
+    "use_lora": true,
+    "lora_r": 8,
+    "lora_alpha": 16,
+    "lora_dropout": 0.05,
+    "lora_target_modules": "all-linear"
+}
+```
+
+## Implementation Details
+
+### Model Initialization
+
+When `use_lora` is set to `true`, the model automatically applies LoRA during initialization in the `F2LLM.__init__()` method:
+
+1. The base model is loaded from the specified `model_path`
+2. LoRA configuration is created with the provided parameters
+3. The PEFT (Parameter-Efficient Fine-Tuning) library applies the LoRA adapters
+
+### Parameter Efficiency
+
+With LoRA enabled, only a fraction of the model's parameters are trainable:
+
+- **Full model parameters**: All model weights
+- **Trainable parameters**: Only the LoRA adapter weights (`bias="none"`, so no bias terms are trained)
+- **Memory savings**: Often 90%+ reduction in trainable parameters
+
+## Usage Examples
+
+### Training with LoRA
+
+1. Create a configuration file with LoRA enabled
+2. Run the training script:
+
+```bash
+python run.py --config config_lora_example.json
+```
+
+### Loading Models with LoRA Adapters
+
+Use the `lora_utils.py` module to load models with previously trained adapters:
+
+```python
+from lora_utils import load_model_with_lora
+
+model, tokenizer = load_model_with_lora(
+    base_model_path="path/to/base/model",
+    lora_adapter_path="path/to/lora/adapter"
+)
+```
+
+### Merging LoRA Weights
+
+To permanently merge LoRA weights with the base model:
+
+```python
+from lora_utils import merge_lora_weights
+
+merged_model = merge_lora_weights(model, save_path="path/to/merged/model")
+```
+
+## Utilities
+
+### lora_utils.py
+
+This module provides several utility functions for LoRA operations:
+
+- `load_model_with_lora()`: Load a base model with an optional LoRA adapter
+- `merge_lora_weights()`: Merge LoRA weights with the base model
+- `get_lora_model_info()`: Get information about a LoRA model configuration
+- `count_parameters()`: Count model parameters (trainable vs total)
+
+## Best Practices
+
+1. **Start with default parameters**: Use r=8, alpha=16, dropout=0.05 as a starting point
+2. **Adjust r value**: Higher r values (16, 32) may improve performance but increase memory
+3. **Tune alpha**: An alpha/r ratio of about 2 is often effective (e.g., r=8, alpha=16)
+4. **Monitor parameter count**: Check the trainable vs total parameter ratio during initialization
+5. **Use appropriate target modules**: "all-linear" covers the most important layers, but task-specific modules might be more efficient
+
+## Troubleshooting
+
+### Common Issues
+
+- **PEFT library not found**: Install with `pip install peft`
+- **Memory issues**: Reduce the LoRA rank (`lora_r`) to further decrease memory usage
+- **Performance degradation**: Try increasing the `lora_r` or `lora_alpha` values
+
+### Performance Considerations
+
+- Lower ranks (r=4, 8) use less memory but may underperform
+- Higher ranks (r=32, 64) approach full fine-tuning performance but use more memory
+- The alpha/r ratio is often kept around 2 for good performance
\ No newline at end of file
diff --git a/F2LLM/lora_utils.py b/F2LLM/lora_utils.py
new file mode 100644
index 0000000..ecbd8af
--- /dev/null
+++ b/F2LLM/lora_utils.py
@@ -0,0 +1,123 @@
+"""
+Utilities for LoRA (Low-Rank Adaptation) support in F2LLM.
+This module provides functions for loading LoRA models and converting between full and LoRA models.
+"""
+
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel, LoraConfig, get_peft_model, TaskType
+import torch
+
+
+def load_model_with_lora(base_model_path, lora_adapter_path=None, **lora_kwargs):
+    """
+    Load a base model with optional LoRA adapter.
+ + Args: + base_model_path (str): Path to the base model + lora_adapter_path (str, optional): Path to the LoRA adapter + **lora_kwargs: Additional LoRA configuration arguments + + Returns: + tuple: (model, tokenizer) + """ + # Load the base model + model = AutoModel.from_pretrained( + base_model_path, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + attn_implementation='flash_attention_2' + ) + model.config.use_cache = False + + tokenizer = AutoTokenizer.from_pretrained(base_model_path) + + # Apply LoRA if adapter path is provided + if lora_adapter_path: + model = PeftModel.from_pretrained(model, lora_adapter_path) + print(f"Loaded LoRA adapter from {lora_adapter_path}") + elif lora_kwargs: # Apply new LoRA if configuration is provided + target_modules = lora_kwargs.get("target_modules", "all-linear") + if target_modules == "all-linear": + target_modules = [ + "q_proj", "v_proj", "k_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + "lm_head" + ] + elif isinstance(target_modules, str): + target_modules = [module.strip() for module in target_modules.split(",")] + + lora_config = LoraConfig( + task_type=TaskType.FEATURE_EXTRACTION, + r=lora_kwargs.get("lora_r", 8), + lora_alpha=lora_kwargs.get("lora_alpha", 16), + target_modules=target_modules, + lora_dropout=lora_kwargs.get("lora_dropout", 0.05), + bias="none", + ) + + model = get_peft_model(model, lora_config) + print(f"Applied LoRA with config: {lora_config}") + + return model, tokenizer + + +def merge_lora_weights(model, save_path=None): + """ + Merge LoRA weights with the base model. + + Args: + model: PEFT model with LoRA + save_path (str, optional): Path to save the merged model + + Returns: + Merged model + """ + if hasattr(model, 'merge_and_unload'): + merged_model = model.merge_and_unload() + if save_path: + merged_model.save_pretrained(save_path) + return merged_model + else: + raise ValueError("Model does not support merging. Make sure it's a PEFT model.") + + +def get_lora_model_info(model): + """ + Get information about a LoRA model. + + Args: + model: PEFT model with LoRA + + Returns: + dict: Information about the model's LoRA configuration + """ + if hasattr(model, 'peft_config'): + info = {} + for adapter_name, config in model.peft_config.items(): + info[adapter_name] = { + 'r': config.r, + 'alpha': config.lora_alpha, + 'dropout': config.lora_dropout, + 'target_modules': config.target_modules, + 'bias': config.bias, + } + return info + else: + return {"message": "Model does not have LoRA configuration"} + + +def count_parameters(model, only_trainable=False): + """ + Count the number of parameters in the model. 
+
+    Args:
+        model: PyTorch model
+        only_trainable (bool): Whether to count only trainable parameters
+
+    Returns:
+        int: Number of parameters
+    """
+    if only_trainable:
+        return sum(p.numel() for p in model.parameters() if p.requires_grad)
+    else:
+        return sum(p.numel() for p in model.parameters())
\ No newline at end of file
diff --git a/F2LLM/model.py b/F2LLM/model.py
index d33ade7..ed283dd 100644
--- a/F2LLM/model.py
+++ b/F2LLM/model.py
@@ -12,11 +12,53 @@ def __init__(self,
         self.args = args
         self.dtype = torch.bfloat16
         self.device = None  # set after accelerator.prepare
+
+        # Load base model
         self.lm = AutoModel.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             torch_dtype=self.dtype,
                                             attn_implementation='flash_attention_2')
         self.lm.config.use_cache = False
+
+        # Apply LoRA if enabled
+        if args and args.use_lora:
+            self._apply_lora()
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.max_seq_length = max_seq_length
 
+    def _apply_lora(self):
+        """Apply LoRA to the model if enabled."""
+        try:
+            from peft import LoraConfig, get_peft_model, TaskType
+        except ImportError:
+            raise ImportError(
+                "To use LoRA, please install the `peft` library: `pip install peft`"
+            )
+
+        # Process target modules
+        if self.args.lora_target_modules == "all-linear":
+            # For decoder-only models, common target modules are linear layers
+            target_modules = [
+                "q_proj", "v_proj", "k_proj", "o_proj",
+                "gate_proj", "up_proj", "down_proj",
+                "lm_head"
+            ]
+        else:
+            target_modules = [module.strip() for module in self.args.lora_target_modules.split(",")]
+
+        lora_config = LoraConfig(
+            task_type=TaskType.FEATURE_EXTRACTION,  # Feature extraction for embedding models
+            r=self.args.lora_r,
+            lora_alpha=self.args.lora_alpha,
+            target_modules=target_modules,
+            lora_dropout=self.args.lora_dropout,
+            bias="none",
+            modules_to_save=[],  # We don't need to save any additional modules
+        )
+
+        self.lm = get_peft_model(self.lm, lora_config)
+        print(f"LoRA applied with config: r={self.args.lora_r}, alpha={self.args.lora_alpha}, dropout={self.args.lora_dropout}")
+        print(f"Trainable parameters after LoRA: {self.lm.num_parameters(only_trainable=True)}")
+        print(f"Total parameters: {self.lm.num_parameters()}")
+
     def set_device(self):
         self.device = self.lm.device
diff --git a/F2LLM/requirements.txt b/F2LLM/requirements.txt
index 82fb447..d5deb83 100644
--- a/F2LLM/requirements.txt
+++ b/F2LLM/requirements.txt
@@ -5,3 +5,4 @@ flash-attn
 torch
 transformers
 tensorboard
+peft
diff --git a/F2LLM/run.py b/F2LLM/run.py
index e40b707..aea8b59 100644
--- a/F2LLM/run.py
+++ b/F2LLM/run.py
@@ -124,10 +124,20 @@ def __iter__(self):
 
 # set seed again to make sure that different models share the same seed
 set_seed(0)
-optimizer = AdamW(model.lm.parameters(),
-                  weight_decay=args.weight_decay,
-                  lr=args.learning_rate,
-                  betas=(0.9, 0.98))
+# Determine parameters for optimizer based on LoRA usage
+if args.use_lora:
+    # Only optimize LoRA parameters (PEFT freezes the rest, so pass just the trainable ones)
+    optimizer = AdamW([p for p in model.lm.parameters() if p.requires_grad],
+                      weight_decay=args.weight_decay,
+                      lr=args.learning_rate,
+                      betas=(0.9, 0.98))
+    print(f"Using LoRA - optimizing {model.lm.num_parameters(only_trainable=True)} trainable parameters out of {model.lm.num_parameters()}")
+else:
+    # Optimize all model parameters
+    optimizer = AdamW(model.lm.parameters(),
+                      weight_decay=args.weight_decay,
+                      lr=args.learning_rate,
+                      betas=(0.9, 0.98))
 
 lr_scheduler = get_scheduler("cosine",
                              optimizer=optimizer,
diff --git a/F2LLM/test_lora.py b/F2LLM/test_lora.py
new file mode 100644
index 0000000..6155063
--- /dev/null +++ b/F2LLM/test_lora.py @@ -0,0 +1,118 @@ +""" +test to verify LoRA functionality in F2LLM +""" +import torch +from arguments import Args +from model import F2LLM +import tempfile +import os + +def test_lora_functionality(): + """Test that LoRA can be applied to the model correctly.""" + + # Create a mock args object with LoRA enabled + args = Args( + model_path="microsoft/Phi-3-mini-4k-instruct", # Using a smaller model for testing + experiment_id="test_lora", + output_dir="test_output", + tb_dir="test_tb", + cache_dir="test_cache", + train_data_path="dummy_path", + use_lora=True, + lora_r=8, + lora_alpha=16, + lora_dropout=0.05, + lora_target_modules="all-linear" + ) + + try: + print("Testing LoRA functionality...") + + # Create model with LoRA + model = F2LLM( + model_path=args.model_path, + max_seq_length=512, + args=args + ) + + # Check that model has LoRA applied + total_params = model.lm.num_parameters() + trainable_params = model.lm.num_parameters(only_trainable=True) + + print(f"Total parameters: {total_params}") + print(f"Trainable parameters: {trainable_params}") + print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%") + + # With LoRA, we expect significantly fewer trainable parameters + assert trainable_params < total_params * 0.1, \ + f"Expected fewer trainable parameters with LoRA. Total: {total_params}, Trainable: {trainable_params}" + + print("LoRA functionality test passed!") + return True + + except ImportError as e: + print(f"PEFT library not available: {e}") + print("Please install PEFT: pip install peft") + return False + except Exception as e: + print(f"Error during LoRA test: {e}") + return False + + +def test_non_lora_functionality(): + """Test that the model still works without LoRA.""" + + # Create a mock args object with LoRA disabled + args = Args( + model_path="microsoft/Phi-3-mini-4k-instruct", # Using a smaller model for testing + experiment_id="test_no_lora", + output_dir="test_output", + tb_dir="test_tb", + cache_dir="test_cache", + train_data_path="dummy_path", + use_lora=False + ) + + try: + print("Testing non-LoRA functionality...") + + # Create model without LoRA + model = F2LLM( + model_path=args.model_path, + max_seq_length=512, + args=args + ) + + # Check that model parameters are as expected (all trainable) + total_params = model.lm.num_parameters() + trainable_params = model.lm.num_parameters(only_trainable=True) + + print(f"Total parameters: {total_params}") + print(f"Trainable parameters: {trainable_params}") + + # Without LoRA, most parameters should be trainable + assert abs(trainable_params - total_params) < 10, \ + f"Expected most parameters to be trainable without LoRA. 
Total: {total_params}, Trainable: {trainable_params}" + + print("Non-LoRA functionality test passed!") + return True + + except Exception as e: + print(f"Error during non-LoRA test: {e}") + return False + + +if __name__ == "__main__": + print("Running LoRA functionality tests...") + + # Test LoRA functionality + lora_test_passed = test_lora_functionality() + + # Test non-LoRA functionality + no_lora_test_passed = test_non_lora_functionality() + + if lora_test_passed and no_lora_test_passed: + print("\nAll tests passed!") + else: + print("\nSome tests failed!") + exit(1) diff --git a/F2LLM/utils.py b/F2LLM/utils.py index b167d3c..a839626 100644 --- a/F2LLM/utils.py +++ b/F2LLM/utils.py @@ -21,13 +21,31 @@ def save_checkpoint(args, accelerator, model, output_dir, lr_scheduler): if accelerator.is_main_process: model.tokenizer.save_pretrained(output_dir) + unwrapped_model = accelerator.unwrap_model(model.lm) - unwrapped_model.save_pretrained( - output_dir, - is_main_process=accelerator.is_main_process, - save_function=accelerator.save, - state_dict=accelerator.get_state_dict(model.lm), # this is required for zero 3 - ) + + # Handle LoRA-specific saving + if args.use_lora: + # For LoRA models, save both the base model and adapters + unwrapped_model.save_pretrained( + output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model.lm), # this is required for zero 3 + ) + # Also save the base model config and tokenizer if not saved already + if accelerator.is_main_process: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(args.model_path) + config.save_pretrained(output_dir) + else: + unwrapped_model.save_pretrained( + output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model.lm), # this is required for zero 3 + ) + accelerator.wait_for_everyone()
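To close the loop on the checkpointing logic above, here is a short sketch of how a saved LoRA adapter could later be reloaded and merged back into the base model using the helpers this PR adds in `lora_utils.py`. Both paths are placeholders; the adapter directory is whatever `save_checkpoint` wrote for your run.

```python
# Sketch of the post-training flow using this PR's lora_utils helpers.
# Paths are placeholders, not part of this PR.
from lora_utils import load_model_with_lora, merge_lora_weights, count_parameters

model, tokenizer = load_model_with_lora(
    base_model_path="models/qwen3-0.6b",        # original base weights
    lora_adapter_path="output/your-checkpoint"  # adapter directory saved during training
)
print(f"trainable: {count_parameters(model, only_trainable=True)} / total: {count_parameters(model)}")

# Fold the adapter into the base weights for standalone deployment.
merged = merge_lora_weights(model, save_path="output/merged-model")
```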