From 93ce8dab747039daeac93fb3625b4b1cfd818a28 Mon Sep 17 00:00:00 2001 From: Tyler Griggs Date: Sun, 25 Jan 2026 01:24:56 +0000 Subject: [PATCH 1/2] Add SKYRL_LOG_LEVEL env var and suppress verbose Ray logs by default - Add SKYRL_LOG_LEVEL to env_vars.py for centralized log level control - Update configure_ray_worker_logging to use SKYRL_LOG_LEVEL - Add log_to_driver=False to suppress worker/raylet log forwarding - Set RAY_BACKEND_LOG_LEVEL=fatal to suppress C++ metrics errors - Enable verbose logging when SKYRL_LOG_LEVEL=DEBUG Co-Authored-By: Claude Opus 4.5 --- skyrl-train/skyrl_train/env_vars.py | 12 ++++++++++++ skyrl-train/skyrl_train/utils/utils.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/skyrl-train/skyrl_train/env_vars.py b/skyrl-train/skyrl_train/env_vars.py index 0fe18e357..abf75baa2 100644 --- a/skyrl-train/skyrl_train/env_vars.py +++ b/skyrl-train/skyrl_train/env_vars.py @@ -65,3 +65,15 @@ See https://github.com/ray-project/ray/issues/56697 for details on why this is needed. """ + +# ───────────────────────────────────────────────────────────────────────────── +# Logging +# ───────────────────────────────────────────────────────────────────────────── + +SKYRL_LOG_LEVEL = os.environ.get("SKYRL_LOG_LEVEL", "INFO").upper() +""" +Log level for SkyRL (default: INFO). Controls both application log filtering and Ray verbosity. + +DEBUG: Shows debug logs and enables verbose Ray logging (raylet/worker logs to stdout). +INFO or higher: Normal logging with suppressed Ray infrastructure logs. +""" diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py index 2c4dc8f7e..42a993035 100644 --- a/skyrl-train/skyrl_train/utils/utils.py +++ b/skyrl-train/skyrl_train/utils/utils.py @@ -599,7 +599,9 @@ def configure_ray_worker_logging() -> None: This method forces color and formatting (e.g., bold) and routes stdlib `logging` through Loguru so third-party logs match formatting """ - level_name = os.getenv("LOG_LEVEL", "INFO").upper() + from skyrl_train.env_vars import SKYRL_LOG_LEVEL + + level_name = SKYRL_LOG_LEVEL # 1) Loguru formatting (force colors) logger.remove() @@ -639,9 +641,19 @@ def initialize_ray(cfg: DictConfig): cfg: Training config """ from .ppo_utils import sync_registries + from skyrl_train.env_vars import SKYRL_LOG_LEVEL + + # SKYRL_LOG_LEVEL=DEBUG enables verbose Ray logging + verbose_logging = SKYRL_LOG_LEVEL == "DEBUG" + + if not verbose_logging: + # Suppress C++ logs (metrics exporter errors) on stdout + os.environ["RAY_BACKEND_LOG_LEVEL"] = "fatal" env_vars = prepare_runtime_environment(cfg) - ray.init(runtime_env={"env_vars": env_vars}) + + # log_to_driver=False suppresses worker/raylet logs forwarding to driver stdout + ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=verbose_logging) # create the named ray actors for the registries to make available to all workers sync_registries() From 234c54e8e1b3da9bcb8a2dc8d6bcf56e26f3b925 Mon Sep 17 00:00:00 2001 From: Tyler Griggs Date: Sun, 25 Jan 2026 01:36:55 +0000 Subject: [PATCH 2/2] Add log file routing and separate training progress from infra logs - Add SKYRL_LOG_DIR env var for log file location (default: /tmp/skyrl-logs) - Create new logging module (skyrl_train/utils/logging.py) with: - setup_logging(): Routes all logs to file, training progress to stdout - configure_worker_logging(): Simplified worker logging setup - Training progress loggers (trainer, fully_async_trainer, tracking, evaluate) go to stdout by default - Infrastructure loggers (vllm, ray, workers, inference_engines, etc.) only go to log file by default - Set SKYRL_LOG_LEVEL=DEBUG to show all logs on stdout - Update main_base.py to call setup_logging at startup Co-Authored-By: Claude Opus 4.5 --- .../skyrl_train/entrypoints/main_base.py | 10 + skyrl-train/skyrl_train/env_vars.py | 16 +- skyrl-train/skyrl_train/utils/logging.py | 227 ++++++++++++++++++ skyrl-train/skyrl_train/utils/utils.py | 39 +-- 4 files changed, 256 insertions(+), 36 deletions(-) create mode 100644 skyrl-train/skyrl_train/utils/logging.py diff --git a/skyrl-train/skyrl_train/entrypoints/main_base.py b/skyrl-train/skyrl_train/entrypoints/main_base.py index fa6bc7000..46d1717e9 100644 --- a/skyrl-train/skyrl_train/entrypoints/main_base.py +++ b/skyrl-train/skyrl_train/entrypoints/main_base.py @@ -316,6 +316,11 @@ def run(self): @ray.remote(num_cpus=1) def skyrl_entrypoint(cfg: DictConfig): + from skyrl_train.utils.logging import setup_logging + + # Set up logging with run name for log file organization + setup_logging(run_name=cfg.trainer.run_name) + # make sure that the training loop is not run on the head node. exp = BasePPOExp(cfg) exp.run() @@ -323,6 +328,11 @@ def skyrl_entrypoint(cfg: DictConfig): @hydra.main(config_path=config_dir, config_name="ppo_base_config", version_base=None) def main(cfg: DictConfig) -> None: + from skyrl_train.utils.logging import setup_logging + + # Set up logging early to capture all logs + setup_logging(run_name=cfg.trainer.run_name) + # validate the arguments validate_cfg(cfg) diff --git a/skyrl-train/skyrl_train/env_vars.py b/skyrl-train/skyrl_train/env_vars.py index abf75baa2..3e5dca507 100644 --- a/skyrl-train/skyrl_train/env_vars.py +++ b/skyrl-train/skyrl_train/env_vars.py @@ -72,8 +72,18 @@ SKYRL_LOG_LEVEL = os.environ.get("SKYRL_LOG_LEVEL", "INFO").upper() """ -Log level for SkyRL (default: INFO). Controls both application log filtering and Ray verbosity. +Log level for SkyRL (default: INFO). Controls log filtering and Ray verbosity. -DEBUG: Shows debug logs and enables verbose Ray logging (raylet/worker logs to stdout). -INFO or higher: Normal logging with suppressed Ray infrastructure logs. +- Application logs: Filters loguru output (DEBUG, INFO, WARNING, ERROR, CRITICAL) +- Ray verbosity: DEBUG enables verbose Ray logging (worker/raylet logs to stdout); + INFO or higher suppresses Ray infrastructure logs +""" + +SKYRL_LOG_DIR = os.environ.get("SKYRL_LOG_DIR", "/tmp/skyrl-logs") +""" +Directory for SkyRL log files (default: /tmp/skyrl-logs). + +All logs (infra + training) are written to {SKYRL_LOG_DIR}/{run_name}/logs.log. +By default, only training progress logs go to stdout. Set SKYRL_LOG_LEVEL=DEBUG +to also show infrastructure logs on stdout. """ diff --git a/skyrl-train/skyrl_train/utils/logging.py b/skyrl-train/skyrl_train/utils/logging.py new file mode 100644 index 000000000..3cb0c40af --- /dev/null +++ b/skyrl-train/skyrl_train/utils/logging.py @@ -0,0 +1,227 @@ +""" +Logging configuration for SkyRL. + +This module provides logging setup that separates training progress from infrastructure logs: +- All logs go to a file in SKYRL_LOG_DIR +- Only training progress logs go to stdout by default +- Set SKYRL_LOG_LEVEL=DEBUG to show all logs on stdout +""" + +import logging +import os +import sys +from pathlib import Path +from typing import Optional + +from loguru import logger + +from skyrl_train.env_vars import SKYRL_LOG_DIR, SKYRL_LOG_LEVEL + +# Logger names that are considered "training progress" and should go to stdout +# All other loggers are considered "infrastructure" and only go to the log file +TRAINING_PROGRESS_LOGGERS = frozenset( + { + "skyrl_train.trainer", + "skyrl_train.fully_async_trainer", + "skyrl_train.utils.tracking", + "skyrl_train.evaluate", + } +) + +# Logger name prefixes to suppress from stdout (infrastructure loggers) +INFRA_LOGGER_PREFIXES = ( + "vllm", + "ray", + "transformers", + "torch", + "httpx", + "httpcore", + "urllib3", + "asyncio", + "skyrl_train.workers", + "skyrl_train.inference_engines", + "skyrl_train.distributed", + "skyrl_train.model_wrapper", + "skyrl_train.utils.utils", + "skyrl_train.utils.ppo_utils", + "skyrl_train.dataset", + "skyrl_train.generators", +) + + +class TrainingProgressFilter(logging.Filter): + """Filter that only allows training progress logs through.""" + + def filter(self, record: logging.LogRecord) -> bool: + # Allow if it's a training progress logger + if record.name in TRAINING_PROGRESS_LOGGERS: + return True + + # Allow if it starts with a training progress logger prefix + for prefix in TRAINING_PROGRESS_LOGGERS: + if record.name.startswith(prefix + "."): + return True + + # Block if it's an infrastructure logger + for prefix in INFRA_LOGGER_PREFIXES: + if record.name.startswith(prefix): + return False + + # Allow other loggers (e.g., root logger, unknown loggers) + return True + + +def setup_logging(run_name: Optional[str] = None) -> Path: + """ + Set up SkyRL logging with file and console handlers. + + All logs go to the log file. Only training progress logs go to stdout + unless SKYRL_LOG_LEVEL=DEBUG, in which case all logs go to stdout. + + Args: + run_name: Name of the training run for organizing logs. If None, + logs go directly to SKYRL_LOG_DIR. + + Returns: + Path to the log directory. + """ + # Create log directory + log_dir = Path(SKYRL_LOG_DIR) + if run_name: + log_dir = log_dir / run_name + log_dir.mkdir(parents=True, exist_ok=True) + + log_file = log_dir / "logs.log" + + # Determine if we should show all logs on stdout + verbose = SKYRL_LOG_LEVEL == "DEBUG" + + # Configure loguru for file output (all logs) + logger.remove() + + # File handler - all logs with colors + logger.add( + str(log_file), + colorize=True, + level="DEBUG", # Capture all levels in file + enqueue=True, + backtrace=False, + diagnose=False, + format="{time:YYYY-MM-DD HH:mm:ss.SSS} | " + "{level: <8} | " + "{name}:{function}:{line} - " + "{message}", + ) + + # Console handler - filtered or all based on log level + def console_filter(record): + """Filter for console output - only training progress unless DEBUG.""" + if verbose: + return True + name = record["name"] + # Check if it's a training progress logger + if name in TRAINING_PROGRESS_LOGGERS: + return True + for prefix in TRAINING_PROGRESS_LOGGERS: + if name.startswith(prefix + "."): + return True + # Check if it's an infra logger to block + for prefix in INFRA_LOGGER_PREFIXES: + if name.startswith(prefix): + return False + return True + + logger.add( + sys.stderr, + colorize=True, + level=SKYRL_LOG_LEVEL, + filter=console_filter, + enqueue=True, + backtrace=False, + diagnose=False, + format="{time:YYYY-MM-DD HH:mm:ss.SSS} | " + "{level: <8} | " + "{name}:{function}:{line} - " + "{message}", + ) + + # Configure stdlib logging to route through loguru + _setup_stdlib_logging(log_file, verbose) + + logger.info(f"Logging initialized. Log file: {log_file}") + return log_dir + + +def _setup_stdlib_logging(log_file: Path, verbose: bool) -> None: + """ + Configure stdlib logging to write to file and optionally stdout. + + This captures logs from vLLM, transformers, and other third-party libraries. + """ + + class InterceptHandler(logging.Handler): + """Handler that routes stdlib logging to loguru.""" + + def emit(self, record: logging.LogRecord) -> None: + try: + level = logger.level(record.levelname).name + except ValueError: + level = record.levelno + + # Find caller from where the logged message originated + frame, depth = logging.currentframe(), 2 + while frame and frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth += 1 + + logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage()) + + # Remove existing handlers and add our interceptor + logging.root.handlers = [InterceptHandler()] + level = getattr(logging, SKYRL_LOG_LEVEL, logging.INFO) + logging.root.setLevel(level) + + # Suppress noisy third-party loggers + for name in ["httpx", "httpcore", "urllib3", "asyncio"]: + logging.getLogger(name).setLevel(logging.WARNING) + + +def configure_worker_logging() -> None: + """ + Configure logging for Ray workers. + + This is called within Ray workers to set up proper logging. Workers + write to stderr which is captured by Ray. + """ + # Use the same setup but without run_name (workers don't create log dirs) + logger.remove() + + level_name = SKYRL_LOG_LEVEL + + # Console only for workers (Ray handles log forwarding) + logger.level("INFO", color="") + logger.add( + sys.stderr, + colorize=True, + level=level_name, + enqueue=True, + backtrace=False, + diagnose=False, + format="{time:YYYY-MM-DD HH:mm:ss.SSS} | " + "{level: <8} | " + "{name}:{function}:{line} - " + "{message}", + ) + + # Route stdlib logging through loguru + class InterceptHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + try: + level = logger.level(record.levelname).name + except ValueError: + level = record.levelno + logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage()) + + logging.root.handlers = [InterceptHandler()] + level = getattr(logging, level_name, logging.INFO) + logging.root.setLevel(level) diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py index 42a993035..dcbcbe462 100644 --- a/skyrl-train/skyrl_train/utils/utils.py +++ b/skyrl-train/skyrl_train/utils/utils.py @@ -595,42 +595,15 @@ def prepare_runtime_environment(cfg: DictConfig) -> dict[str, str]: def configure_ray_worker_logging() -> None: """ + Configure logging for Ray workers. + In Ray workers, stderr/stdout are not TTYs, so Loguru disables color. - This method forces color and formatting (e.g., bold) and routes stdlib `logging` - through Loguru so third-party logs match formatting + This method forces color and formatting and routes stdlib logging + through Loguru so third-party logs match formatting. """ - from skyrl_train.env_vars import SKYRL_LOG_LEVEL - - level_name = SKYRL_LOG_LEVEL - - # 1) Loguru formatting (force colors) - logger.remove() - logger.level("INFO", color="") - logger.add( - sys.stderr, - colorize=True, # keep ANSI even without a TTY - level=level_name, # ensure Loguru filters below this level - enqueue=True, - backtrace=False, - diagnose=False, - format="{time:YYYY-MM-DD HH:mm:ss.SSS} | " - "{level: <8} | " - "{name}:{function}:{line} - " - "{message}", - ) + from skyrl_train.utils.logging import configure_worker_logging - # 2) Route stdlib logging -> Loguru (so vLLM/transformers/etc. are formatted) - class _InterceptHandler(logging.Handler): - def emit(self, record: logging.LogRecord) -> None: - try: - level = logger.level(record.levelname).name - except ValueError: - level = record.levelno - logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage()) - - logging.root.handlers = [_InterceptHandler()] - level = getattr(logging, level_name, logging.INFO) - logging.root.setLevel(level) + configure_worker_logging() def initialize_ray(cfg: DictConfig):