Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion lpm_kernel/L2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import gc
import requests

progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json")

# Initialize the logger
logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -595,6 +597,15 @@ def setup_logger(log_path, logger_name="download_logger"):

return logger

def _write_progress_to_file(progress_data,progress_file):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

space needed

try:
import json
with open(progress_file, 'w', encoding='utf-8') as f:
json_str = json.dumps(progress_data)
f.write(json_str)
except Exception as e:
logger.error(f"Error writing progress to file: {str(e)}")


def save_hf_model(model_name=None, log_file_path=None) -> str:
"""Saves a Hugging Face model locally.
Expand Down Expand Up @@ -708,10 +719,21 @@ def progress_callback(current, total):
if current % (1024 * 1024 * 10) < 8192:
if total and total > 0:
percent = current / total * 100
progress_data ={
"file_name": filename,
"file_size": total_size / 1024 / 1024 if total_size >0 else None,
"downloaded_mb": current/1024/1024,
"total_mb": total/1024/1024,
"percentage": percent,
"completed": False
}
_write_progress_to_file(progress_data, progress_file)
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)")

else:
logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)")



# Download file with progress tracking
response = requests.get(url, stream=True)
if response.status_code == 200:
Expand Down Expand Up @@ -789,6 +811,18 @@ def progress_callback(current, total):
logger.info(f"Model {model_name} downloaded with {file_count} files.")
except Exception:
logger.info(f"Download completed for model: {model_name}.")

progress_data = {
"file_name": "ALL_FILES",
"file_size": 100,
"downloaded_mb": 100,
"total_mb": 100,
"percentage": 100.0,
"completed": True,
}

_write_progress_to_file(progress_data, progress_file)

except requests.RequestException:
try:
from modelscope.hub.snapshot_download import snapshot_download
Expand Down
124 changes: 52 additions & 72 deletions lpm_kernel/api/domains/trainprocess/trainprocess_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import subprocess
from lpm_kernel.configs.logging import get_train_process_logger, TRAIN_LOG_FILE
logger = get_train_process_logger()
progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json")

class TrainProcessService:
"""Training process service (singleton pattern)"""
Expand Down Expand Up @@ -808,86 +809,65 @@ def _monitor_model_download(self) -> bool:
last_position = 0

# Variables to track download status
current_file = ""
file_name = ""
file_size = 0
total_size = 0 # Total size of all files
file_sizes = {} # Dictionary to store file sizes
last_update_time = time.time()
completed = False

while True:
try:
# Read new log content
with open(log_file, 'r') as f:
f.seek(last_position)
new_lines = f.readlines()
last_position = f.tell()

for line in new_lines:
line = line.strip()

# Check for download start
if "Starting download of model:" in line:
logger.info("Model download started")
continue

# Get file size information when a download starts
if "Starting download of file:" in line:
match = re.search(r"Starting download of file: (.+) \(Size: ([\d\.]+) MB\)", line)
if match:
current_file = match.group(1)
file_size = float(match.group(2))
file_sizes[current_file] = file_size
total_size = sum(file_sizes.values())
# logger.info(f"Starting download of {current_file} ({file_size} MB)")

# Track file download progress
if "Downloaded" in line and "MB /" in line:
match = re.search(r"File (.+): Downloaded ([\d\.]+) MB / ([\d\.]+) MB \(([\d\.]+)%\)", line)
if match:
file_name = match.group(1)
downloaded_mb = float(match.group(2))
total_mb = float(match.group(3))
percentage = float(match.group(4))

# Update file size if it was updated (especially for model.safetensors)
if total_mb > file_sizes.get(file_name, 0):
file_sizes[file_name] = total_mb
total_size = sum(file_sizes.values())

# Calculate overall progress
if total_size > 0:
# Sum up all downloaded data
completed_files_size = sum([file_sizes.get(f, 0) for f in file_sizes if f != file_name])
current_file_downloaded = (percentage / 100.0) * total_mb
overall_downloaded = completed_files_size + current_file_downloaded
current_progress = (overall_downloaded / total_size) * 100
current_progress = min(99.0, current_progress) # Cap at 99% until fully complete
# Update progress at most once per second
current_time = time.time()
if current_time - last_update_time >= 3.0:
if os.path.exists(progress_file):
try:
import json
with open(progress_file, 'r') as f:
progress_data = json.load(f)
file_name = progress_data.get("file_name")
file_size = float(progress_data.get("file_size"))
downloaded_mb = float(progress_data.get("downloaded_mb"))
total_mb = float(progress_data.get("total_mb"))
percentage = float(progress_data.get("percentage"))
completed = progress_data.get("completed")

self._update_progress(
"downloading_the_base_model",
"model_download",
current_progress,
f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)",
file_name
)
last_update_time = current_time
except Exception as e:
logger.error(f"Error reading progress file: {str(e)}")
if completed:
self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED)
logger.info("Model download completed")
return True

file_sizes[file_name] = file_size
total_size = sum(file_sizes.values())

# Update file size if it was updated (especially for model.safetensors)
if total_mb > file_sizes.get(file_name, 0):
file_sizes[file_name] = total_mb
total_size = sum(file_sizes.values())

# Calculate overall progress
if total_size > 0:
# Sum up all downloaded data
completed_files_size = sum(
[file_sizes.get(f, 0) for f in file_sizes if f != file_name])
current_file_downloaded = (percentage / 100.0) * total_mb
overall_downloaded = completed_files_size + current_file_downloaded
current_progress = (overall_downloaded / total_size) * 100
current_progress = min(99.0, current_progress) # Cap at 99% until fully complete
# Update progress at most once per second
current_time = time.time()
if current_time - last_update_time >= 3.0:
self._update_progress(
"downloading_the_base_model",
"model_download",
current_progress,
f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)",
file_name
)
last_update_time = current_time

# Briefly pause to avoid excessive CPU usage
time.sleep(0.1)

if "Model downloaded successfully" in line:
self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED)
logger.info("Model download completed")
return True

# Briefly pause to avoid excessive CPU usage
time.sleep(0.1)

except IOError as e:
logger.error(f"Failed to read log file: {str(e)}")
time.sleep(0.1)
continue

except Exception as e:
logger.error(f"Failed to monitor model download progress: {str(e)}")
return False
Expand Down