Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions skyrl-train/skyrl_train/entrypoints/main_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,23 @@ def run(self):

@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
from skyrl_train.utils.logging import setup_logging

# Set up logging with run name for log file organization
setup_logging(run_name=cfg.trainer.run_name)

# make sure that the training loop is not run on the head node.
exp = BasePPOExp(cfg)
exp.run()


@hydra.main(config_path=config_dir, config_name="ppo_base_config", version_base=None)
def main(cfg: DictConfig) -> None:
from skyrl_train.utils.logging import setup_logging

# Set up logging early to capture all logs
setup_logging(run_name=cfg.trainer.run_name)

# validate the arguments
validate_cfg(cfg)

Expand Down
22 changes: 22 additions & 0 deletions skyrl-train/skyrl_train/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,25 @@

See https://github.com/ray-project/ray/issues/56697 for details on why this is needed.
"""

# ─────────────────────────────────────────────────────────────────────────────
# Logging
# ─────────────────────────────────────────────────────────────────────────────

SKYRL_LOG_LEVEL = os.environ.get("SKYRL_LOG_LEVEL", "INFO").upper()
"""
Log level for SkyRL (default: INFO). Controls log filtering and Ray verbosity.

- Application logs: Filters loguru output (DEBUG, INFO, WARNING, ERROR, CRITICAL)
- Ray verbosity: DEBUG enables verbose Ray logging (worker/raylet logs to stdout);
INFO or higher suppresses Ray infrastructure logs
"""

SKYRL_LOG_DIR = os.environ.get("SKYRL_LOG_DIR", "/tmp/skyrl-logs")
"""
Directory for SkyRL log files (default: /tmp/skyrl-logs).

All logs (infra + training) are written to {SKYRL_LOG_DIR}/{run_name}/logs.log.
By default, only training progress logs go to stdout. Set SKYRL_LOG_LEVEL=DEBUG
to also show infrastructure logs on stdout.
"""
227 changes: 227 additions & 0 deletions skyrl-train/skyrl_train/utils/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
"""
Logging configuration for SkyRL.

This module provides logging setup that separates training progress from infrastructure logs:
- All logs go to a file in SKYRL_LOG_DIR
- Only training progress logs go to stdout by default
- Set SKYRL_LOG_LEVEL=DEBUG to show all logs on stdout
"""

import logging
import os
import sys
from pathlib import Path
from typing import Optional

from loguru import logger

from skyrl_train.env_vars import SKYRL_LOG_DIR, SKYRL_LOG_LEVEL

# Logger names that are considered "training progress" and should go to stdout
# All other loggers are considered "infrastructure" and only go to the log file
TRAINING_PROGRESS_LOGGERS = frozenset(
{
"skyrl_train.trainer",
"skyrl_train.fully_async_trainer",
"skyrl_train.utils.tracking",
"skyrl_train.evaluate",
}
)

# Logger name prefixes to suppress from stdout (infrastructure loggers)
INFRA_LOGGER_PREFIXES = (
"vllm",
"ray",
"transformers",
"torch",
"httpx",
"httpcore",
"urllib3",
"asyncio",
"skyrl_train.workers",
"skyrl_train.inference_engines",
"skyrl_train.distributed",
"skyrl_train.model_wrapper",
"skyrl_train.utils.utils",
"skyrl_train.utils.ppo_utils",
"skyrl_train.dataset",
"skyrl_train.generators",
)


class TrainingProgressFilter(logging.Filter):
"""Filter that only allows training progress logs through."""

def filter(self, record: logging.LogRecord) -> bool:
# Allow if it's a training progress logger
if record.name in TRAINING_PROGRESS_LOGGERS:
return True

# Allow if it starts with a training progress logger prefix
for prefix in TRAINING_PROGRESS_LOGGERS:
if record.name.startswith(prefix + "."):
return True

# Block if it's an infrastructure logger
for prefix in INFRA_LOGGER_PREFIXES:
if record.name.startswith(prefix):
return False

# Allow other loggers (e.g., root logger, unknown loggers)
return True


def setup_logging(run_name: Optional[str] = None) -> Path:
"""
Set up SkyRL logging with file and console handlers.

All logs go to the log file. Only training progress logs go to stdout
unless SKYRL_LOG_LEVEL=DEBUG, in which case all logs go to stdout.

Args:
run_name: Name of the training run for organizing logs. If None,
logs go directly to SKYRL_LOG_DIR.

Returns:
Path to the log directory.
"""
# Create log directory
log_dir = Path(SKYRL_LOG_DIR)
if run_name:
log_dir = log_dir / run_name
log_dir.mkdir(parents=True, exist_ok=True)

log_file = log_dir / "logs.log"

# Determine if we should show all logs on stdout
verbose = SKYRL_LOG_LEVEL == "DEBUG"

# Configure loguru for file output (all logs)
logger.remove()

# File handler - all logs with colors
logger.add(
str(log_file),
colorize=True,
level="DEBUG", # Capture all levels in file
enqueue=True,
backtrace=False,
diagnose=False,
format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
"<level>{level: <8}</level> | "
"<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
"<level>{message}</level>",
)

# Console handler - filtered or all based on log level
def console_filter(record):
"""Filter for console output - only training progress unless DEBUG."""
if verbose:
return True
name = record["name"]
# Check if it's a training progress logger
if name in TRAINING_PROGRESS_LOGGERS:
return True
for prefix in TRAINING_PROGRESS_LOGGERS:
if name.startswith(prefix + "."):
return True
# Check if it's an infra logger to block
for prefix in INFRA_LOGGER_PREFIXES:
if name.startswith(prefix):
return False
return True

logger.add(
sys.stderr,
colorize=True,
level=SKYRL_LOG_LEVEL,
filter=console_filter,
enqueue=True,
backtrace=False,
diagnose=False,
format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
"<level>{level: <8}</level> | "
"<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
"<level>{message}</level>",
)

# Configure stdlib logging to route through loguru
_setup_stdlib_logging(log_file, verbose)

logger.info(f"Logging initialized. Log file: {log_file}")
return log_dir


def _setup_stdlib_logging(log_file: Path, verbose: bool) -> None:
"""
Configure stdlib logging to write to file and optionally stdout.

This captures logs from vLLM, transformers, and other third-party libraries.
"""

class InterceptHandler(logging.Handler):
"""Handler that routes stdlib logging to loguru."""

def emit(self, record: logging.LogRecord) -> None:
try:
level = logger.level(record.levelname).name
except ValueError:
level = record.levelno

# Find caller from where the logged message originated
frame, depth = logging.currentframe(), 2
while frame and frame.f_code.co_filename == logging.__file__:
frame = frame.f_back
depth += 1

logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Remove existing handlers and add our interceptor
logging.root.handlers = [InterceptHandler()]
level = getattr(logging, SKYRL_LOG_LEVEL, logging.INFO)
logging.root.setLevel(level)

# Suppress noisy third-party loggers
for name in ["httpx", "httpcore", "urllib3", "asyncio"]:
logging.getLogger(name).setLevel(logging.WARNING)


def configure_worker_logging() -> None:
"""
Configure logging for Ray workers.

This is called within Ray workers to set up proper logging. Workers
write to stderr which is captured by Ray.
"""
# Use the same setup but without run_name (workers don't create log dirs)
logger.remove()

level_name = SKYRL_LOG_LEVEL

# Console only for workers (Ray handles log forwarding)
logger.level("INFO", color="<bold><green>")
logger.add(
sys.stderr,
colorize=True,
level=level_name,
enqueue=True,
backtrace=False,
diagnose=False,
format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
"<level>{level: <8}</level> | "
"<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
"<level>{message}</level>",
)

# Route stdlib logging through loguru
class InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
try:
level = logger.level(record.levelname).name
except ValueError:
level = record.levelno
logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage())

logging.root.handlers = [InterceptHandler()]
level = getattr(logging, level_name, logging.INFO)
logging.root.setLevel(level)
49 changes: 17 additions & 32 deletions skyrl-train/skyrl_train/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,40 +595,15 @@ def prepare_runtime_environment(cfg: DictConfig) -> dict[str, str]:

def configure_ray_worker_logging() -> None:
"""
Configure logging for Ray workers.

In Ray workers, stderr/stdout are not TTYs, so Loguru disables color.
This method forces color and formatting (e.g., bold) and routes stdlib `logging`
through Loguru so third-party logs match formatting
This method forces color and formatting and routes stdlib logging
through Loguru so third-party logs match formatting.
"""
level_name = os.getenv("LOG_LEVEL", "INFO").upper()

# 1) Loguru formatting (force colors)
logger.remove()
logger.level("INFO", color="<bold><green>")
logger.add(
sys.stderr,
colorize=True, # keep ANSI even without a TTY
level=level_name, # ensure Loguru filters below this level
enqueue=True,
backtrace=False,
diagnose=False,
format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
"<level>{level: <8}</level> | "
"<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
"<level>{message}</level>",
)

# 2) Route stdlib logging -> Loguru (so vLLM/transformers/etc. are formatted)
class _InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
try:
level = logger.level(record.levelname).name
except ValueError:
level = record.levelno
logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage())
from skyrl_train.utils.logging import configure_worker_logging

logging.root.handlers = [_InterceptHandler()]
level = getattr(logging, level_name, logging.INFO)
logging.root.setLevel(level)
configure_worker_logging()


def initialize_ray(cfg: DictConfig):
Expand All @@ -639,9 +614,19 @@ def initialize_ray(cfg: DictConfig):
cfg: Training config
"""
from .ppo_utils import sync_registries
from skyrl_train.env_vars import SKYRL_LOG_LEVEL

# SKYRL_LOG_LEVEL=DEBUG enables verbose Ray logging
verbose_logging = SKYRL_LOG_LEVEL == "DEBUG"

if not verbose_logging:
# Suppress C++ logs (metrics exporter errors) on stdout
os.environ["RAY_BACKEND_LOG_LEVEL"] = "fatal"

env_vars = prepare_runtime_environment(cfg)
ray.init(runtime_env={"env_vars": env_vars})

# log_to_driver=False suppresses worker/raylet logs forwarding to driver stdout
ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=verbose_logging)

# create the named ray actors for the registries to make available to all workers
sync_registries()
Expand Down
Loading