Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions lpm_kernel/L2/dpo/dpo_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"):
def train(args):
tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
args.base_model_path,
trust_remote_code=True,
ignore_mismatched_sizes=True,
torch_dtype=torch.float32, # CPU doesn't support bfloat16
)
args.base_model_path,
trust_remote_code=True,
ignore_mismatched_sizes=True,
torch_dtype="auto", # Use auto detection instead of hardcoding float32
)
time_str = get_east_eight_time_formatted()

# merged_model = model.merge_and_unload()
Expand Down
12 changes: 5 additions & 7 deletions lpm_kernel/L2/memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,12 @@ def get_optimal_training_config(self) -> Dict[str, Any]:
"gradient_accumulation_steps": 1,
}

# Enable mixed precision based on hardware support
# Let PyTorch automatically decide the best dtype if CUDA is available
if self.cuda_available:
capability = torch.cuda.get_device_capability()
if capability[0] >= 8: # Ampere or newer (supports BF16)
config["bf16"] = True
elif capability[0] >= 7: # Volta or newer (supports FP16)
config["fp16"] = True

# Instead of manually checking capabilities, use "auto" dtype
# PyTorch will automatically select the best precision for the hardware
config["dtype"] = "auto"

# Adjust accumulation steps based on available memory
vram_gb = self.get_memory_info().get("vram_total_gb", 0)
if vram_gb < 8: # Small GPUs
Expand Down
9 changes: 4 additions & 5 deletions lpm_kernel/L2/merge_lora_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,15 @@ def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path):
# Clean up memory before starting
memory_manager.cleanup_memory(force=True)

# Explicitly set device configuration based on available hardware
# Use auto dtype selection instead of manually choosing based on hardware
device_map = "auto" if use_cuda else None
dtype = torch.float16 if use_cuda else torch.float32

logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}")
logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, using auto dtype")

# Use explicit configuration for GPU utilization
# Use auto dtype configuration for optimal hardware utilization
base_model = AutoModelForCausalLM.from_pretrained(
base_model_path,
torch_dtype=dtype,
torch_dtype="auto",
device_map=device_map
)

Expand Down
16 changes: 14 additions & 2 deletions lpm_kernel/L2/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,20 @@ def main(model_args, data_args, training_args):
# Configure quantization if requested
if model_args.use_4bit_quantization:
from transformers import BitsAndBytesConfig
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)

# Handle "auto" dtype appropriately
if model_args.bnb_4bit_compute_dtype == "auto":
# Let BitsAndBytesConfig handle the dtype automatically
compute_dtype = "auto"
else:
# Use the specified dtype
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)

# Storage dtype follows the same pattern
if model_args.bnb_4bit_quant_storage_dtype == "auto":
quant_storage_dtype = "auto"
else:
quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)

model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=model_args.use_4bit_quantization,
Expand Down
15 changes: 3 additions & 12 deletions lpm_kernel/L2/train_for_user.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ LEARNING_RATE="2e-4"
NUM_TRAIN_EPOCHS="3"
CONCURRENCY_THREADS="2"
DATA_SYNTHESIS_MODE="low"
HALF=False
USE_CUDA=False # Default to False, will be overridden by parameter
IS_COT=False

Expand Down Expand Up @@ -71,19 +70,11 @@ if [ "$CONCURRENCY_THREADS" != "1" ]; then
echo "Set thread environment variables to $CONCURRENCY_THREADS"
fi

# Add BF16 option based on the platform and CUDA availability
if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
HALF=True
echo "Enabling BF16 half precision for non-Apple platform with CUDA"
else
echo "Using standard precision (not using BF16)"
fi

# Print environment for debugging
echo "Environment configuration:"
echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Using half precision: ${HALF}"
echo " Using automatic mixed precision"

# Execute training script with parameters from environment variables
python lpm_kernel/L2/train.py \
Expand All @@ -103,7 +94,6 @@ python lpm_kernel/L2/train.py \
--save_strategy "steps" \
--save_steps 5 \
--push_to_hub False \
--bf16 $HALF \
--packing False \
--learning_rate $LEARNING_RATE \
--lr_scheduler_type "cosine" \
Expand All @@ -121,7 +111,8 @@ python lpm_kernel/L2/train.py \
--lora_target_modules "all-linear" \
--use_4bit_quantization False \
--use_nested_quant False \
--bnb_4bit_compute_dtype "bfloat16" \
--bnb_4bit_compute_dtype "auto" \
--bnb_4bit_quant_storage_dtype "auto" \
--is_cot $IS_COT \
--use_cuda $USE_CUDA

39 changes: 26 additions & 13 deletions lpm_kernel/L2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,9 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
if cuda_available and use_cuda_requested:
device = "cuda"
model_kwargs["device_map"] = "auto"
# Use auto dtype instead of hardcoded dtype
if "torch_dtype" not in model_kwargs:
model_kwargs["torch_dtype"] = "auto"
else:
if use_cuda_requested and not cuda_available:
logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.")
Expand All @@ -326,8 +329,19 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
# Use model_kwargs quantization_config if provided, otherwise build it
if "quantization_config" not in model_kwargs:
if args.use_4bit_quantization:
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
# Handle "auto" dtype appropriately
if args.bnb_4bit_compute_dtype == "auto":
# Let BitsAndBytesConfig handle the dtype automatically
compute_dtype = "auto"
else:
# Use the specified dtype
compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

# Storage dtype follows the same pattern
if args.bnb_4bit_quant_storage_dtype == "auto":
quant_storage_dtype = "auto"
else:
quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)

bnb_config = BitsAndBytesConfig(
load_in_4bit=args.use_4bit_quantization,
Expand All @@ -337,11 +351,6 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
bnb_4bit_quant_storage=quant_storage_dtype,
)
model_kwargs["quantization_config"] = bnb_config

if compute_dtype == torch.float16 and args.use_4bit_quantization:
major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
if major >= 8:
logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
elif args.use_8bit_quantization:
bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
model_kwargs["quantization_config"] = bnb_config
Expand All @@ -358,7 +367,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
unsloth_kwargs = {
"model_name": args.model_name_or_path,
"max_seq_length": data_args.max_seq_length,
"dtype": None,
"dtype": "auto", # Use auto dtype for automatic precision selection
"load_in_4bit": args.use_4bit_quantization,
"load_in_8bit": args.use_8bit_quantization,
"trust_remote_code": True,
Expand All @@ -383,6 +392,10 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
# Set default device_map if not specified
if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
load_kwargs["device_map"] = "auto"

# Ensure automatic dtype selection
if "torch_dtype" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
load_kwargs["torch_dtype"] = "auto"

logger.info(f"Loading model with parameters: {load_kwargs}")
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)
Expand All @@ -396,17 +409,17 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
memory_manager.cleanup_memory(force=True)

try:
# Try with simpler configuration - float16 instead of bfloat16
logger.info("Attempting to load with float16 precision...")
# Try with simpler configuration - use auto dtype instead of float16
logger.info("Attempting to load with auto precision...")
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
device_map="auto" if torch.cuda.is_available() and args.use_cuda else None,
torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None,
torch_dtype="auto" if torch.cuda.is_available() and args.use_cuda else None,
trust_remote_code=True
)
except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
# If that fails too, try even more conservative loading
logger.warning(f"Float16 loading failed: {str(e)}")
logger.warning(f"Auto dtype loading failed: {str(e)}")
memory_manager.cleanup_memory(force=True)

try:
Expand All @@ -417,7 +430,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
device_map="auto",
offload_folder="offload_folder",
offload_state_dict=True,
torch_dtype=torch.float16 if torch.cuda.is_available() else None,
torch_dtype="auto" if torch.cuda.is_available() else None,
trust_remote_code=True,
low_cpu_mem_usage=True
)
Expand Down