diff --git a/skyrl-train/skyrl_train/entrypoints/main_base.py b/skyrl-train/skyrl_train/entrypoints/main_base.py
index fa6bc7000..46d1717e9 100644
--- a/skyrl-train/skyrl_train/entrypoints/main_base.py
+++ b/skyrl-train/skyrl_train/entrypoints/main_base.py
@@ -316,6 +316,11 @@ def run(self):
@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
+ from skyrl_train.utils.logging import setup_logging
+
+ # Set up logging with run name for log file organization
+ setup_logging(run_name=cfg.trainer.run_name)
+
# make sure that the training loop is not run on the head node.
exp = BasePPOExp(cfg)
exp.run()
@@ -323,6 +328,11 @@ def skyrl_entrypoint(cfg: DictConfig):
@hydra.main(config_path=config_dir, config_name="ppo_base_config", version_base=None)
def main(cfg: DictConfig) -> None:
+ from skyrl_train.utils.logging import setup_logging
+
+ # Set up logging early to capture all logs
+ setup_logging(run_name=cfg.trainer.run_name)
+
# validate the arguments
validate_cfg(cfg)
diff --git a/skyrl-train/skyrl_train/env_vars.py b/skyrl-train/skyrl_train/env_vars.py
index 0fe18e357..3e5dca507 100644
--- a/skyrl-train/skyrl_train/env_vars.py
+++ b/skyrl-train/skyrl_train/env_vars.py
@@ -65,3 +65,25 @@
See https://github.com/ray-project/ray/issues/56697 for details on why this is needed.
"""
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Logging
+# ─────────────────────────────────────────────────────────────────────────────
+
+SKYRL_LOG_LEVEL = os.environ.get("SKYRL_LOG_LEVEL", "INFO").upper()
+"""
+Log level for SkyRL (default: INFO). Controls log filtering and Ray verbosity.
+
+- Application logs: Filters loguru output (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+- Ray verbosity: DEBUG enables verbose Ray logging (worker/raylet logs to stdout);
+ INFO or higher suppresses Ray infrastructure logs
+"""
+
+SKYRL_LOG_DIR = os.environ.get("SKYRL_LOG_DIR", "/tmp/skyrl-logs")
+"""
+Directory for SkyRL log files (default: /tmp/skyrl-logs).
+
+All logs (infra + training) are written to {SKYRL_LOG_DIR}/{run_name}/logs.log.
+By default, only training progress logs go to stdout. Set SKYRL_LOG_LEVEL=DEBUG
+to also show infrastructure logs on stdout.
+"""
diff --git a/skyrl-train/skyrl_train/utils/logging.py b/skyrl-train/skyrl_train/utils/logging.py
new file mode 100644
index 000000000..3cb0c40af
--- /dev/null
+++ b/skyrl-train/skyrl_train/utils/logging.py
@@ -0,0 +1,227 @@
+"""
+Logging configuration for SkyRL.
+
+This module provides logging setup that separates training progress from infrastructure logs:
+- All logs go to a file in SKYRL_LOG_DIR
+- Only training progress logs go to stdout by default
+- Set SKYRL_LOG_LEVEL=DEBUG to show all logs on stdout
+"""
+
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+from loguru import logger
+
+from skyrl_train.env_vars import SKYRL_LOG_DIR, SKYRL_LOG_LEVEL
+
+# Logger names that are considered "training progress" and should go to stdout
+# All other loggers are considered "infrastructure" and only go to the log file
+TRAINING_PROGRESS_LOGGERS = frozenset(
+ {
+ "skyrl_train.trainer",
+ "skyrl_train.fully_async_trainer",
+ "skyrl_train.utils.tracking",
+ "skyrl_train.evaluate",
+ }
+)
+
+# Logger name prefixes to suppress from stdout (infrastructure loggers)
+INFRA_LOGGER_PREFIXES = (
+ "vllm",
+ "ray",
+ "transformers",
+ "torch",
+ "httpx",
+ "httpcore",
+ "urllib3",
+ "asyncio",
+ "skyrl_train.workers",
+ "skyrl_train.inference_engines",
+ "skyrl_train.distributed",
+ "skyrl_train.model_wrapper",
+ "skyrl_train.utils.utils",
+ "skyrl_train.utils.ppo_utils",
+ "skyrl_train.dataset",
+ "skyrl_train.generators",
+)
+
+
+class TrainingProgressFilter(logging.Filter):
+ """Filter that only allows training progress logs through."""
+
+ def filter(self, record: logging.LogRecord) -> bool:
+ # Allow if it's a training progress logger
+ if record.name in TRAINING_PROGRESS_LOGGERS:
+ return True
+
+ # Allow if it starts with a training progress logger prefix
+ for prefix in TRAINING_PROGRESS_LOGGERS:
+ if record.name.startswith(prefix + "."):
+ return True
+
+ # Block if it's an infrastructure logger
+ for prefix in INFRA_LOGGER_PREFIXES:
+ if record.name.startswith(prefix):
+ return False
+
+ # Allow other loggers (e.g., root logger, unknown loggers)
+ return True
+
+
+def setup_logging(run_name: Optional[str] = None) -> Path:
+ """
+ Set up SkyRL logging with file and console handlers.
+
+ All logs go to the log file. Only training progress logs go to stdout
+ unless SKYRL_LOG_LEVEL=DEBUG, in which case all logs go to stdout.
+
+ Args:
+ run_name: Name of the training run for organizing logs. If None,
+ logs go directly to SKYRL_LOG_DIR.
+
+ Returns:
+ Path to the log directory.
+ """
+ # Create log directory
+ log_dir = Path(SKYRL_LOG_DIR)
+ if run_name:
+ log_dir = log_dir / run_name
+ log_dir.mkdir(parents=True, exist_ok=True)
+
+ log_file = log_dir / "logs.log"
+
+ # Determine if we should show all logs on stdout
+ verbose = SKYRL_LOG_LEVEL == "DEBUG"
+
+ # Configure loguru for file output (all logs)
+ logger.remove()
+
+ # File handler - all logs with colors
+ logger.add(
+ str(log_file),
+ colorize=True,
+ level="DEBUG", # Capture all levels in file
+ enqueue=True,
+ backtrace=False,
+ diagnose=False,
+ format="{time:YYYY-MM-DD HH:mm:ss.SSS} | "
+ "{level: <8} | "
+ "{name}:{function}:{line} - "
+ "{message}",
+ )
+
+ # Console handler - filtered or all based on log level
+ def console_filter(record):
+ """Filter for console output - only training progress unless DEBUG."""
+ if verbose:
+ return True
+ name = record["name"]
+ # Check if it's a training progress logger
+ if name in TRAINING_PROGRESS_LOGGERS:
+ return True
+ for prefix in TRAINING_PROGRESS_LOGGERS:
+ if name.startswith(prefix + "."):
+ return True
+ # Check if it's an infra logger to block
+ for prefix in INFRA_LOGGER_PREFIXES:
+ if name.startswith(prefix):
+ return False
+ return True
+
+ logger.add(
+ sys.stderr,
+ colorize=True,
+ level=SKYRL_LOG_LEVEL,
+ filter=console_filter,
+ enqueue=True,
+ backtrace=False,
+ diagnose=False,
+ format="{time:YYYY-MM-DD HH:mm:ss.SSS} | "
+ "{level: <8} | "
+ "{name}:{function}:{line} - "
+ "{message}",
+ )
+
+ # Configure stdlib logging to route through loguru
+ _setup_stdlib_logging(log_file, verbose)
+
+ logger.info(f"Logging initialized. Log file: {log_file}")
+ return log_dir
+
+
+def _setup_stdlib_logging(log_file: Path, verbose: bool) -> None:
+ """
+ Configure stdlib logging to write to file and optionally stdout.
+
+ This captures logs from vLLM, transformers, and other third-party libraries.
+ """
+
+ class InterceptHandler(logging.Handler):
+ """Handler that routes stdlib logging to loguru."""
+
+ def emit(self, record: logging.LogRecord) -> None:
+ try:
+ level = logger.level(record.levelname).name
+ except ValueError:
+ level = record.levelno
+
+ # Find caller from where the logged message originated
+ frame, depth = logging.currentframe(), 2
+ while frame and frame.f_code.co_filename == logging.__file__:
+ frame = frame.f_back
+ depth += 1
+
+ logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
+
+ # Remove existing handlers and add our interceptor
+ logging.root.handlers = [InterceptHandler()]
+ level = getattr(logging, SKYRL_LOG_LEVEL, logging.INFO)
+ logging.root.setLevel(level)
+
+ # Suppress noisy third-party loggers
+ for name in ["httpx", "httpcore", "urllib3", "asyncio"]:
+ logging.getLogger(name).setLevel(logging.WARNING)
+
+
+def configure_worker_logging() -> None:
+ """
+ Configure logging for Ray workers.
+
+ This is called within Ray workers to set up proper logging. Workers
+ write to stderr which is captured by Ray.
+ """
+ # Use the same setup but without run_name (workers don't create log dirs)
+ logger.remove()
+
+ level_name = SKYRL_LOG_LEVEL
+
+ # Console only for workers (Ray handles log forwarding)
+ logger.level("INFO", color="")
+ logger.add(
+ sys.stderr,
+ colorize=True,
+ level=level_name,
+ enqueue=True,
+ backtrace=False,
+ diagnose=False,
+ format="{time:YYYY-MM-DD HH:mm:ss.SSS} | "
+ "{level: <8} | "
+ "{name}:{function}:{line} - "
+ "{message}",
+ )
+
+ # Route stdlib logging through loguru
+ class InterceptHandler(logging.Handler):
+ def emit(self, record: logging.LogRecord) -> None:
+ try:
+ level = logger.level(record.levelname).name
+ except ValueError:
+ level = record.levelno
+ logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage())
+
+ logging.root.handlers = [InterceptHandler()]
+ level = getattr(logging, level_name, logging.INFO)
+ logging.root.setLevel(level)
diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py
index 2c4dc8f7e..dcbcbe462 100644
--- a/skyrl-train/skyrl_train/utils/utils.py
+++ b/skyrl-train/skyrl_train/utils/utils.py
@@ -595,40 +595,15 @@ def prepare_runtime_environment(cfg: DictConfig) -> dict[str, str]:
def configure_ray_worker_logging() -> None:
"""
+ Configure logging for Ray workers.
+
In Ray workers, stderr/stdout are not TTYs, so Loguru disables color.
- This method forces color and formatting (e.g., bold) and routes stdlib `logging`
- through Loguru so third-party logs match formatting
+ This method forces color and formatting and routes stdlib logging
+ through Loguru so third-party logs match formatting.
"""
- level_name = os.getenv("LOG_LEVEL", "INFO").upper()
-
- # 1) Loguru formatting (force colors)
- logger.remove()
- logger.level("INFO", color="")
- logger.add(
- sys.stderr,
- colorize=True, # keep ANSI even without a TTY
- level=level_name, # ensure Loguru filters below this level
- enqueue=True,
- backtrace=False,
- diagnose=False,
- format="{time:YYYY-MM-DD HH:mm:ss.SSS} | "
- "{level: <8} | "
- "{name}:{function}:{line} - "
- "{message}",
- )
-
- # 2) Route stdlib logging -> Loguru (so vLLM/transformers/etc. are formatted)
- class _InterceptHandler(logging.Handler):
- def emit(self, record: logging.LogRecord) -> None:
- try:
- level = logger.level(record.levelname).name
- except ValueError:
- level = record.levelno
- logger.opt(depth=6, exception=record.exc_info).log(level, record.getMessage())
+ from skyrl_train.utils.logging import configure_worker_logging
- logging.root.handlers = [_InterceptHandler()]
- level = getattr(logging, level_name, logging.INFO)
- logging.root.setLevel(level)
+ configure_worker_logging()
def initialize_ray(cfg: DictConfig):
@@ -639,9 +614,19 @@ def initialize_ray(cfg: DictConfig):
cfg: Training config
"""
from .ppo_utils import sync_registries
+ from skyrl_train.env_vars import SKYRL_LOG_LEVEL
+
+ # SKYRL_LOG_LEVEL=DEBUG enables verbose Ray logging
+ verbose_logging = SKYRL_LOG_LEVEL == "DEBUG"
+
+ if not verbose_logging:
+ # Suppress C++ logs (metrics exporter errors) on stdout
+ os.environ["RAY_BACKEND_LOG_LEVEL"] = "fatal"
env_vars = prepare_runtime_environment(cfg)
- ray.init(runtime_env={"env_vars": env_vars})
+
+ # log_to_driver=False suppresses worker/raylet logs forwarding to driver stdout
+ ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=verbose_logging)
# create the named ray actors for the registries to make available to all workers
sync_registries()