Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions lpm_kernel/L2/train.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
import torch
import logging
Expand Down Expand Up @@ -234,7 +237,7 @@ def configure_system_resources(num_cores=None):
tokenizer,
)

response_template = "\n<|im_start|>assistant\n"
response_template = "\n<|im_end|>assistant\n"
Comment thread
Likhithsai2580 marked this conversation as resolved.

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

Expand Down Expand Up @@ -391,7 +394,13 @@ def print_model_structure(model, prefix=""):

# Start training
logger.info("Starting actual training process...")
trainer.train(resume_from_checkpoint=checkpoint)
try:
trainer.train(resume_from_checkpoint=checkpoint)
except torch.cuda.OutOfMemoryError as e:
logger.error(f"CUDA out of memory error: {str(e)}")
torch.cuda.empty_cache()
logger.info("Freed up GPU memory, retrying backward pass...")
trainer.train(resume_from_checkpoint=checkpoint)
except Exception as e:
logger.error(f"Error during training: {str(e)}")
logger.error(f"Error type: {type(e)}")
Expand Down