From 304fdf9d7a2b81a49040c0dde2c40128e1f696b9 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 19 Nov 2025 21:28:36 -0800 Subject: [PATCH 01/14] first draft of performance tips tutorial --- docs/source/conf.py | 1 + examples/decoding/performance_tips.py | 159 ++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 examples/decoding/performance_tips.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 133bccf2e..87f14f75d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,6 +81,7 @@ def __call__(self, filename): "approximate_mode.py", "sampling.py", "parallel_decoding.py", + "performance_tips.py", "custom_frame_mappings.py", ] else: diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py new file mode 100644 index 000000000..e04d4fb89 --- /dev/null +++ b/examples/decoding/performance_tips.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +==================================== +Performance Tips and Best Practices +==================================== + +This tutorial consolidates performance optimization techniques for video +decoding with TorchCodec. Learn when and how to apply various strategies +to increase performance. +""" + + +# %% +# Overview +# -------- +# +# When decoding videos with TorchCodec, several techniques can significantly +# improve performance depending on your use case. This guide covers: +# +# 1. **Batch APIs** - Decode multiple frames at once +# 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed +# 3. **Multi-threading** - Parallelize decoding across videos or chunks +# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats +# +# We'll explore each technique and when to use it. + +# %% +# 1. Use Batch APIs When Possible +# -------------------------------- +# +# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage +# internal optimizations. +# +# **Key Methods:** +# +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges +# +# **When to use:** +# +# - Decoding multiple frames + +# %% +# .. note:: +# +# For complete examples with runnable code demonstrating batch decoding, +# iteration, and frame retrieval, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` + +# %% +# 2. Approximate Mode & Keyframe Mappings +# ---------------------------------------- +# +# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when +# the decoder is created to build an accurate internal index of frames. This +# ensures frame-accurate seeking but takes longer for decoder initialization, +# especially on long videos. + +# %% +# **Approximate Mode** +# ~~~~~~~~~~~~~~~~~~~~ +# +# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the +# video file's metadata headers. This dramatically speeds up +# :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long +# videos, but may result in slightly less accurate seeking in some cases. +# +# +# **Which mode should you use:** +# +# - If you care about exactness of frame seeking, use “exact”. +# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”. +# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster. +# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster. + +# %% +# **Custom Frame Mappings** +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For advanced use cases, you can pre-compute a custom mapping between desired +# frame indices and actual keyframe locations. This allows you to speed up :class:`~torchcodec.decoders.VideoDecoder` +# instantiation while maintaining the frame seeking accuracy of ``seek_mode="exact"`` +# +# **When to use:** +# +# - Frame accuracy is critical, so approximate mode cannot be used +# - Videos can be preprocessed once and then decoded many times +# +# **Performance impact:** Enables consistent, predictable performance for repeated +# random access without the overhead of exact mode's scanning. + +# %% +# .. note:: +# +# For complete benchmarks showing actual speedup numbers, accuracy comparisons, +# and implementation examples, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` +# +# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` + +# %% +# 3. Multi-threading for Parallel Decoding +# ----------------------------------------- +# +# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: +# +# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities +# - Multiprocessing: Distributing work across multiple processes +# - Multithreading: Using multiple threads within a single process + +# %% +# .. note:: +# +# For complete examples comparing +# sequential, ffmpeg-based parallelism, multi-process, and multi-threaded approaches, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` + +# %% +# 4. BETA: CUDA Acceleration +# --------------------------- +# +# TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder +# (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory, +# avoiding expensive CPU-GPU transfers for downstream GPU operations. +# +# **When to use:** +# +# - Decoding large resolution videos +# - Large batch of videos saturating the CPU +# - GPU-intensive pipelines with transforms like scaling and cropping +# - CPU is saturated and you want to free it up for other work +# +# **When NOT to use:** +# +# - You need bit-exact results +# - Small resolution videos and the PCI-e transfer latency is large +# - GPU is already busy and CPU is idle +# +# **Performance impact:** CUDA decoding can significantly outperform CPU decoding, +# especially for high-resolution videos and when combined with GPU-based transforms. +# Actual speedup varies by hardware, resolution, and codec. + +# %% +# .. note:: +# +# For installation instructions, detailed examples, and visual comparisons +# between CPU and CUDA decoding, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` From 5693776db99e44088be6692e01f6e414b5bd53c4 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Thu, 20 Nov 2025 07:52:46 -0800 Subject: [PATCH 02/14] modify format --- examples/decoding/performance_tips.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index e04d4fb89..e36598e30 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -111,11 +111,11 @@ # 3. Multi-threading for Parallel Decoding # ----------------------------------------- # -# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: +# When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: # -# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities -# - Multiprocessing: Distributing work across multiple processes -# - Multithreading: Using multiple threads within a single process +# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities +# - **Multiprocessing** - Distributing work across multiple processes +# - **Multithreading** - Using multiple threads within a single process # %% # .. note:: From a74f653b477547c33f3bd95f747e4978aee1c96b Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 21 Nov 2025 08:38:38 -0800 Subject: [PATCH 03/14] address feedback --- examples/decoding/performance_tips.py | 33 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index e36598e30..4e1705623 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -25,7 +25,7 @@ # 1. **Batch APIs** - Decode multiple frames at once # 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed # 3. **Multi-threading** - Parallelize decoding across videos or chunks -# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats +# 4. **CUDA Acceleration** - Use GPU decoding for supported formats # # We'll explore each technique and when to use it. @@ -33,8 +33,9 @@ # 1. Use Batch APIs When Possible # -------------------------------- # -# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage -# internal optimizations. +# If you need to decode multiple frames at once, the batch methods are faster than calling single-frame decoding methods multiple times. +# For example, :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` is faster than calling :meth:`~torchcodec.decoders.VideoDecoder.get_frame_at` multiple times. +# TorchCodec's batch APIs reduce overhead and can leverage internal optimizations. # # **Key Methods:** # @@ -59,7 +60,7 @@ # 2. Approximate Mode & Keyframe Mappings # ---------------------------------------- # -# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when +# By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when # the decoder is created to build an accurate internal index of frames. This # ensures frame-accurate seeking but takes longer for decoder initialization, # especially on long videos. @@ -68,7 +69,7 @@ # **Approximate Mode** # ~~~~~~~~~~~~~~~~~~~~ # -# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the +# Setting ``seek_mode="approximate"`` skips the initial :term:`scan` and relies on the # video file's metadata headers. This dramatically speeds up # :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long # videos, but may result in slightly less accurate seeking in some cases. @@ -77,9 +78,7 @@ # **Which mode should you use:** # # - If you care about exactness of frame seeking, use “exact”. -# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”. -# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster. -# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster. +# - If the video is long and you're only decoding a small amount of frames, approximate mode should be faster. # %% # **Custom Frame Mappings** @@ -113,9 +112,11 @@ # # When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: # -# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities +# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities for intra-frame parallelism, where parallelization happens within individual frames rather than across frames # - **Multiprocessing** - Distributing work across multiple processes # - **Multithreading** - Using multiple threads within a single process +# +# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. # %% # .. note:: @@ -126,8 +127,8 @@ # - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` # %% -# 4. BETA: CUDA Acceleration -# --------------------------- +# 4. CUDA Acceleration +# -------------------- # # TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder # (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory, @@ -150,6 +151,16 @@ # especially for high-resolution videos and when combined with GPU-based transforms. # Actual speedup varies by hardware, resolution, and codec. +# %% +# **Recommended Usage for Beta Interface** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. code-block:: python +# +# with set_cuda_backend("beta"): +# decoder = VideoDecoder("file.mp4", device="cuda") +# + # %% # .. note:: # From 547d8e5310c8754556c178c5aabdf1af52d206e5 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 24 Nov 2025 12:58:01 -0800 Subject: [PATCH 04/14] address feedback --- docs/source/index.rst | 8 ++++++++ examples/decoding/performance_tips.py | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 74e8d1298..e25a79827 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,14 @@ Decoding How to sample regular and random clips from a video + .. grid-item-card:: :octicon:`file-code;1em` + Performance Tips + :img-top: _static/img/card-background.svg + :link: generated_examples/decoding/performance_tips.html + :link-type: url + + Tips for optimizing video decoding performance + Encoding ^^^^^^^^ diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index 4e1705623..17781f451 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -39,8 +39,13 @@ # # **Key Methods:** # +# For index-based frame retrieval: +# # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges +# +# For timestamp-based frame retrieval: +# # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges # @@ -61,7 +66,7 @@ # ---------------------------------------- # # By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when -# the decoder is created to build an accurate internal index of frames. This +# you create the decoder to build an accurate internal index of frames. This # ensures frame-accurate seeking but takes longer for decoder initialization, # especially on long videos. @@ -90,8 +95,8 @@ # # **When to use:** # -# - Frame accuracy is critical, so approximate mode cannot be used -# - Videos can be preprocessed once and then decoded many times +# - Frame accuracy is critical, so you cannot use approximate mode +# - You can preprocess videos once and then decode them many times # # **Performance impact:** Enables consistent, predictable performance for repeated # random access without the overhead of exact mode's scanning. @@ -116,7 +121,7 @@ # - **Multiprocessing** - Distributing work across multiple processes # - **Multithreading** - Using multiple threads within a single process # -# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. +# You can use both multiprocessing and multithreading to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. # %% # .. note:: From 9e0f33ad8688bb9dab5be491b55d975584e4347e Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 1 Dec 2025 21:12:51 -0800 Subject: [PATCH 05/14] address feedback --- examples/decoding/performance_tips.py | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index 17781f451..ac247fd64 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -5,9 +5,12 @@ # LICENSE file in the root directory of this source tree. """ -==================================== -Performance Tips and Best Practices -==================================== +.. meta:: + :description: Learn how to optimize TorchCodec video decoding performance with batch APIs, approximate seeking, multi-threading, and CUDA acceleration. + +============================================== +TorchCodec Performance Tips and Best Practices +============================================== This tutorial consolidates performance optimization techniques for video decoding with TorchCodec. Learn when and how to apply various strategies @@ -173,3 +176,25 @@ # between CPU and CUDA decoding, see: # # - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` + +# %% +# Conclusion +# ---------- +# +# TorchCodec offers multiple performance optimization strategies, each suited to +# different scenarios. Use batch APIs for multi-frame decoding, approximate mode +# for faster initialization, parallel processing for high throughput, and CUDA +# acceleration for GPU-intensive workflows. +# +# The best results often come from combining techniques. Profile your specific +# use case and apply optimizations incrementally, using the benchmarks in the +# linked examples as a guide. +# +# For more information, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` - Basic decoding examples +# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` - Approximate mode benchmarks +# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` - Custom frame mappings +# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` - Parallel decoding strategies +# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` - CUDA acceleration guide +# - :class:`torchcodec.decoders.VideoDecoder` - Full API reference From b32e6f3a3acd543f17c39fa0696fd8fac05447be Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 3 Dec 2025 12:40:54 -0800 Subject: [PATCH 06/14] expose cpu_fallback --- src/torchcodec/decoders/__init__.py | 2 +- src/torchcodec/decoders/_video_decoder.py | 85 ++++++++++++++++++++++ test/test_decoders.py | 86 +++++++++++++++++++++++ 3 files changed, 172 insertions(+), 1 deletion(-) diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py index 980ba98a9..a07760a60 100644 --- a/src/torchcodec/decoders/__init__.py +++ b/src/torchcodec/decoders/__init__.py @@ -7,6 +7,6 @@ from .._core import AudioStreamMetadata, VideoStreamMetadata from ._audio_decoder import AudioDecoder # noqa from ._decoder_utils import set_cuda_backend # noqa -from ._video_decoder import VideoDecoder # noqa +from ._video_decoder import FallbackInfo, VideoDecoder # noqa SimpleVideoDecoder = VideoDecoder diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 1b4d4706d..c0eb67e4b 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -7,6 +7,7 @@ import io import json import numbers +from dataclasses import dataclass from pathlib import Path from typing import List, Literal, Optional, Sequence, Tuple, Union @@ -22,6 +23,48 @@ from torchcodec.transforms import DecoderTransform, Resize +@dataclass +class FallbackInfo: + """Information about decoder fallback status. + + This class tracks whether hardware-accelerated decoding failed and the decoder + fell back to software decoding. + + Usage: + - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the fallback status + - Use ``bool(fallback_info)`` to check if any fallback occurred + + Attributes: + status_known (bool): Whether the fallback status has been determined. + """ + + def __init__(self): + self.status_known = False + self.__nvcuvid_unavailable = False + self.__video_not_supported = False + + def __bool__(self): + """Returns True if fallback occurred (and status is known).""" + return self.status_known and ( + self.__nvcuvid_unavailable or self.__video_not_supported + ) + + def __str__(self): + """Returns a human-readable string representation of the fallback status.""" + if not self.status_known: + return "Fallback status: Unknown" + + reasons = [] + if self.__nvcuvid_unavailable: + reasons.append("NVcuvid unavailable") + if self.__video_not_supported: + reasons.append("Video not supported") + + if reasons: + return "Fallback status: Falling back due to: " + ", ".join(reasons) + return "Fallback status: No fallback required" + + class VideoDecoder: """A single-stream video decoder. @@ -180,13 +223,48 @@ def __init__( custom_frame_mappings=custom_frame_mappings_data, ) + # Initialize fallback info + self._fallback_info = FallbackInfo() + def __len__(self) -> int: return self._num_frames + @property + def cpu_fallback(self) -> FallbackInfo: + """Get information about decoder fallback status. + + Returns: + FallbackInfo: Information about whether hardware-accelerated decoding + failed and the decoder fell back to software decoding. + + Note: + The fallback status is only determined after the first frame access. + Before that, the status will be "Unknown". + """ + return self._fallback_info + + def _update_cpu_fallback(self): + """Update the fallback status if it hasn't been determined yet. + + This method should be called after any frame decoding operation to determine + if fallback to software decoding occurred. + """ + if not self._fallback_info.status_known: + backend_details = core._get_backend_details(self._decoder) + + self._fallback_info.status_known = True + + if "CPU fallback" in backend_details: + if "NVCUVID not available" in backend_details: + self._fallback_info._FallbackInfo__nvcuvid_unavailable = True + else: + self._fallback_info._FallbackInfo__video_not_supported = True + def _getitem_int(self, key: int) -> Tensor: assert isinstance(key, int) frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key) + self._update_cpu_fallback() return frame_data def _getitem_slice(self, key: slice) -> Tensor: @@ -199,6 +277,7 @@ def _getitem_slice(self, key: slice) -> Tensor: stop=stop, step=step, ) + self._update_cpu_fallback() return frame_data def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor: @@ -252,6 +331,7 @@ def get_frame_at(self, index: int) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_index( self._decoder, frame_index=index ) + self._update_cpu_fallback() return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -271,6 +351,7 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch: data, pts_seconds, duration_seconds = core.get_frames_at_indices( self._decoder, frame_indices=indices ) + self._update_cpu_fallback() return FrameBatch( data=data, @@ -300,6 +381,7 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc stop=stop, step=step, ) + self._update_cpu_fallback() return FrameBatch(*frames) def get_frame_played_at(self, seconds: float) -> Frame: @@ -329,6 +411,7 @@ def get_frame_played_at(self, seconds: float) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_pts( self._decoder, seconds ) + self._update_cpu_fallback() return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -350,6 +433,7 @@ def get_frames_played_at( data, pts_seconds, duration_seconds = core.get_frames_by_pts( self._decoder, timestamps=seconds ) + self._update_cpu_fallback() return FrameBatch( data=data, pts_seconds=pts_seconds, @@ -394,6 +478,7 @@ def get_frames_played_in_range( start_seconds=start_seconds, stop_seconds=stop_seconds, ) + self._update_cpu_fallback() return FrameBatch(*frames) diff --git a/test/test_decoders.py b/test/test_decoders.py index efa2d11c8..d85387f8b 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1737,6 +1737,92 @@ def test_set_cuda_backend(self): with set_cuda_backend(backend): VideoDecoder(H265_VIDEO.path, device=f"cuda:{bad_device_number}") + def test_cpu_fallback_before_after_decoding(self): + decoder = VideoDecoder(NASA_VIDEO.path) + + # Before accessing any frames, status should be unknown + assert not decoder.cpu_fallback.status_known + assert str(decoder.cpu_fallback) == "Fallback status: Unknown" + assert not bool(decoder.cpu_fallback) + + # After accessing frames, status should be known + _ = decoder[0] + assert decoder.cpu_fallback.status_known + assert str(decoder.cpu_fallback) != "Fallback status: Unknown" + + def test_cpu_fallback_no_fallback_on_cpu_device(self): + """Test that CPU device doesn't trigger fallback (it's not a fallback scenario).""" + decoder = VideoDecoder(NASA_VIDEO.path, device="cpu") + + _ = decoder[0] + + assert decoder.cpu_fallback.status_known + assert not bool(decoder.cpu_fallback) + assert "No fallback required" in str(decoder.cpu_fallback) + + @needs_cuda + def test_cpu_fallback_h265_video_ffmpeg_cuda(self): + """Test that H265 video triggers CPU fallback on FFmpeg CUDA interface.""" + # H265_VIDEO is known to trigger CPU fallback on FFmpeg CUDA + # because its dimensions are too small + decoder = VideoDecoder(H265_VIDEO.path, device="cuda") + + _ = decoder.get_frame_at(0) + + assert decoder.cpu_fallback.status_known + assert bool(decoder.cpu_fallback) + assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) + + @needs_cuda + def test_cpu_fallback_h265_video_beta_cuda(self): + """Test that H265 video triggers CPU fallback on Beta CUDA interface.""" + with set_cuda_backend("beta"): + decoder = VideoDecoder(H265_VIDEO.path, device="cuda") + + _ = decoder.get_frame_at(0) + + assert decoder.cpu_fallback.status_known + assert bool(decoder.cpu_fallback) + assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) + + @needs_cuda + def test_cpu_fallback_no_fallback_on_supported_video(self): + """Test that supported videos don't trigger fallback on CUDA.""" + decoder = VideoDecoder(NASA_VIDEO.path, device="cuda") + + # Access a frame to determine status + _ = decoder[0] + + assert not bool(decoder.cpu_fallback) + + def test_cpu_fallback_status_cached(self): + """Test that cpu_fallback status is determined once and then cached.""" + decoder = VideoDecoder(NASA_VIDEO.path) + + _ = decoder[0] + first_status = str(decoder.cpu_fallback) + assert decoder.cpu_fallback.status_known + + _ = decoder[1] + second_status = str(decoder.cpu_fallback) + assert decoder.cpu_fallback.status_known + + assert first_status == second_status + + def test_cpu_fallback_multiple_access_methods(self): + """Test that cpu_fallback works with different frame access methods.""" + decoder = VideoDecoder(NASA_VIDEO.path) + + _ = decoder.get_frame_at(0) + assert decoder.cpu_fallback.status_known + status_after_get_frame = str(decoder.cpu_fallback) + + _ = decoder.get_frames_in_range(1, 3) + assert str(decoder.cpu_fallback) == status_after_get_frame + + _ = decoder.get_frame_played_at(0.5) + assert str(decoder.cpu_fallback) == status_after_get_frame + class TestAudioDecoder: @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32)) From cf5b718f988b6a34bc8c43c3602b08583022a584 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 3 Dec 2025 12:53:51 -0800 Subject: [PATCH 07/14] modify comments --- src/torchcodec/decoders/_video_decoder.py | 22 ++++++---------------- test/test_decoders.py | 2 +- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index c0eb67e4b..0c5586e61 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -27,11 +27,10 @@ class FallbackInfo: """Information about decoder fallback status. - This class tracks whether hardware-accelerated decoding failed and the decoder - fell back to software decoding. + This class tracks whether the decoder fell back to CPU decoding. Usage: - - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the fallback status + - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the cpu fallback status - Use ``bool(fallback_info)`` to check if any fallback occurred Attributes: @@ -44,13 +43,13 @@ def __init__(self): self.__video_not_supported = False def __bool__(self): - """Returns True if fallback occurred (and status is known).""" + """Returns True if fallback occurred.""" return self.status_known and ( self.__nvcuvid_unavailable or self.__video_not_supported ) def __str__(self): - """Returns a human-readable string representation of the fallback status.""" + """Returns a human-readable string representation of the cpu fallback status.""" if not self.status_known: return "Fallback status: Unknown" @@ -223,7 +222,6 @@ def __init__( custom_frame_mappings=custom_frame_mappings_data, ) - # Initialize fallback info self._fallback_info = FallbackInfo() def __len__(self) -> int: @@ -231,16 +229,8 @@ def __len__(self) -> int: @property def cpu_fallback(self) -> FallbackInfo: - """Get information about decoder fallback status. - - Returns: - FallbackInfo: Information about whether hardware-accelerated decoding - failed and the decoder fell back to software decoding. - - Note: - The fallback status is only determined after the first frame access. - Before that, the status will be "Unknown". - """ + # We can only determine whether fallback to CPU is happening after + # the first frame access. Before that, the status will be "Unknown". return self._fallback_info def _update_cpu_fallback(self): diff --git a/test/test_decoders.py b/test/test_decoders.py index d85387f8b..a54496d97 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1790,10 +1790,10 @@ def test_cpu_fallback_no_fallback_on_supported_video(self): """Test that supported videos don't trigger fallback on CUDA.""" decoder = VideoDecoder(NASA_VIDEO.path, device="cuda") - # Access a frame to determine status _ = decoder[0] assert not bool(decoder.cpu_fallback) + assert "No fallback required" in str(decoder.cpu_fallback) def test_cpu_fallback_status_cached(self): """Test that cpu_fallback status is determined once and then cached.""" From 6e69c8ca771d686d1e55460628538559d515543d Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 3 Dec 2025 15:11:22 -0800 Subject: [PATCH 08/14] modify comments --- src/torchcodec/decoders/_video_decoder.py | 29 +++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 0c5586e61..f8046a0da 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -223,23 +223,26 @@ def __init__( ) self._fallback_info = FallbackInfo() + self._has_decoded_frame = False def __len__(self) -> int: return self._num_frames @property def cpu_fallback(self) -> FallbackInfo: - # We can only determine whether fallback to CPU is happening after - # the first frame access. Before that, the status will be "Unknown". + # We can only determine whether fallback to CPU is happening when this + # property is accessed and requires that at least one frame has been decoded. + self._update_cpu_fallback() return self._fallback_info def _update_cpu_fallback(self): """Update the fallback status if it hasn't been determined yet. - This method should be called after any frame decoding operation to determine - if fallback to software decoding occurred. + This method queries the C++ backend to determine if fallback to CPU + decoding occurred. The query is only performed after at least one frame + has been decoded. """ - if not self._fallback_info.status_known: + if not self._fallback_info.status_known and self._has_decoded_frame: backend_details = core._get_backend_details(self._decoder) self._fallback_info.status_known = True @@ -254,7 +257,7 @@ def _getitem_int(self, key: int) -> Tensor: assert isinstance(key, int) frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key) - self._update_cpu_fallback() + self._has_decoded_frame = True return frame_data def _getitem_slice(self, key: slice) -> Tensor: @@ -267,7 +270,7 @@ def _getitem_slice(self, key: slice) -> Tensor: stop=stop, step=step, ) - self._update_cpu_fallback() + self._has_decoded_frame = True return frame_data def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor: @@ -321,7 +324,7 @@ def get_frame_at(self, index: int) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_index( self._decoder, frame_index=index ) - self._update_cpu_fallback() + self._has_decoded_frame = True return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -341,7 +344,7 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch: data, pts_seconds, duration_seconds = core.get_frames_at_indices( self._decoder, frame_indices=indices ) - self._update_cpu_fallback() + self._has_decoded_frame = True return FrameBatch( data=data, @@ -371,7 +374,7 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc stop=stop, step=step, ) - self._update_cpu_fallback() + self._has_decoded_frame = True return FrameBatch(*frames) def get_frame_played_at(self, seconds: float) -> Frame: @@ -401,7 +404,7 @@ def get_frame_played_at(self, seconds: float) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_pts( self._decoder, seconds ) - self._update_cpu_fallback() + self._has_decoded_frame = True return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -423,7 +426,7 @@ def get_frames_played_at( data, pts_seconds, duration_seconds = core.get_frames_by_pts( self._decoder, timestamps=seconds ) - self._update_cpu_fallback() + self._has_decoded_frame = True return FrameBatch( data=data, pts_seconds=pts_seconds, @@ -468,7 +471,7 @@ def get_frames_played_in_range( start_seconds=start_seconds, stop_seconds=stop_seconds, ) - self._update_cpu_fallback() + self._has_decoded_frame = True return FrameBatch(*frames) From 5ac83215c72b301705df23532918d6a63c7e88f3 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Thu, 4 Dec 2025 11:03:32 -0800 Subject: [PATCH 09/14] address feedback: --- src/torchcodec/_core/CudaDeviceInterface.cpp | 6 ++ src/torchcodec/_core/CudaDeviceInterface.h | 1 + src/torchcodec/decoders/__init__.py | 2 +- src/torchcodec/decoders/_video_decoder.py | 62 ++++++++------------ test/test_decoders.py | 20 +++---- 5 files changed, 40 insertions(+), 51 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 0e20c5e8d..67c274136 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -241,6 +241,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( std::optional preAllocatedOutputTensor) { validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame); + hasDecodedFrame_ = true; + // All of our CUDA decoding assumes NV12 format. We handle non-NV12 formats by // converting them to NV12. avFrame = maybeConvertAVFrameToNV12OrRGB24(avFrame); @@ -358,6 +360,10 @@ std::string CudaDeviceInterface::getDetails() { // Note: for this interface specifically the fallback is only known after a // frame has been decoded, not before: that's when FFmpeg decides to fallback, // so we can't know earlier. + if (!hasDecodedFrame_) { + return std::string( + "FFmpeg CUDA Device Interface. Fallback status unknown (no frames decoded)."); + } return std::string("FFmpeg CUDA Device Interface. Using ") + (usingCPUFallback_ ? "CPU fallback." : "NVDEC."); } diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index c892bd49b..90d359185 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -63,6 +63,7 @@ class CudaDeviceInterface : public DeviceInterface { std::unique_ptr nv12Conversion_; bool usingCPUFallback_ = false; + bool hasDecodedFrame_ = false; }; } // namespace facebook::torchcodec diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py index a07760a60..ef08cce83 100644 --- a/src/torchcodec/decoders/__init__.py +++ b/src/torchcodec/decoders/__init__.py @@ -7,6 +7,6 @@ from .._core import AudioStreamMetadata, VideoStreamMetadata from ._audio_decoder import AudioDecoder # noqa from ._decoder_utils import set_cuda_backend # noqa -from ._video_decoder import FallbackInfo, VideoDecoder # noqa +from ._video_decoder import CpuFallbackStatus, VideoDecoder # noqa SimpleVideoDecoder = VideoDecoder diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index f8046a0da..54dec7bf4 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -24,14 +24,14 @@ @dataclass -class FallbackInfo: - """Information about decoder fallback status. +class CpuFallbackStatus: + """Information about CPU fallback status. This class tracks whether the decoder fell back to CPU decoding. Usage: - - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the cpu fallback status - - Use ``bool(fallback_info)`` to check if any fallback occurred + - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status + - Use ``bool(cpu_fallback_status)`` to check if any fallback occurred Attributes: status_known (bool): Whether the fallback status has been determined. @@ -39,13 +39,13 @@ class FallbackInfo: def __init__(self): self.status_known = False - self.__nvcuvid_unavailable = False - self.__video_not_supported = False + self._nvcuvid_unavailable = False + self._video_not_supported = False def __bool__(self): """Returns True if fallback occurred.""" return self.status_known and ( - self.__nvcuvid_unavailable or self.__video_not_supported + self._nvcuvid_unavailable or self._video_not_supported ) def __str__(self): @@ -54,9 +54,9 @@ def __str__(self): return "Fallback status: Unknown" reasons = [] - if self.__nvcuvid_unavailable: + if self._nvcuvid_unavailable: reasons.append("NVcuvid unavailable") - if self.__video_not_supported: + if self._video_not_supported: reasons.append("Video not supported") if reasons: @@ -142,6 +142,10 @@ class VideoDecoder: stream_index (int): The stream index that this decoder is retrieving frames from. If a stream index was provided at initialization, this is the same value. If it was left unspecified, this is the :term:`best stream`. + cpu_fallback (CpuFallbackStatus): Information about whether the decoder fell back to CPU + decoding. Use ``bool(cpu_fallback)`` to check if fallback occurred, or + ``str(cpu_fallback)`` to get a human-readable status message. The status is only + determined after at least one frame has been decoded. """ def __init__( @@ -222,42 +226,33 @@ def __init__( custom_frame_mappings=custom_frame_mappings_data, ) - self._fallback_info = FallbackInfo() - self._has_decoded_frame = False + self._cpu_fallback = CpuFallbackStatus() def __len__(self) -> int: return self._num_frames @property - def cpu_fallback(self) -> FallbackInfo: + def cpu_fallback(self) -> CpuFallbackStatus: # We can only determine whether fallback to CPU is happening when this # property is accessed and requires that at least one frame has been decoded. - self._update_cpu_fallback() - return self._fallback_info - - def _update_cpu_fallback(self): - """Update the fallback status if it hasn't been determined yet. - - This method queries the C++ backend to determine if fallback to CPU - decoding occurred. The query is only performed after at least one frame - has been decoded. - """ - if not self._fallback_info.status_known and self._has_decoded_frame: + if not self._cpu_fallback.status_known: backend_details = core._get_backend_details(self._decoder) - self._fallback_info.status_known = True + if "status unknown" not in backend_details: + self._cpu_fallback.status_known = True + + if "CPU fallback" in backend_details: + if "NVCUVID not available" in backend_details: + self._cpu_fallback._nvcuvid_unavailable = True + else: + self._cpu_fallback._video_not_supported = True - if "CPU fallback" in backend_details: - if "NVCUVID not available" in backend_details: - self._fallback_info._FallbackInfo__nvcuvid_unavailable = True - else: - self._fallback_info._FallbackInfo__video_not_supported = True + return self._cpu_fallback def _getitem_int(self, key: int) -> Tensor: assert isinstance(key, int) frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key) - self._has_decoded_frame = True return frame_data def _getitem_slice(self, key: slice) -> Tensor: @@ -270,7 +265,6 @@ def _getitem_slice(self, key: slice) -> Tensor: stop=stop, step=step, ) - self._has_decoded_frame = True return frame_data def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor: @@ -324,7 +318,6 @@ def get_frame_at(self, index: int) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_index( self._decoder, frame_index=index ) - self._has_decoded_frame = True return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -344,7 +337,6 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch: data, pts_seconds, duration_seconds = core.get_frames_at_indices( self._decoder, frame_indices=indices ) - self._has_decoded_frame = True return FrameBatch( data=data, @@ -374,7 +366,6 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc stop=stop, step=step, ) - self._has_decoded_frame = True return FrameBatch(*frames) def get_frame_played_at(self, seconds: float) -> Frame: @@ -404,7 +395,6 @@ def get_frame_played_at(self, seconds: float) -> Frame: data, pts_seconds, duration_seconds = core.get_frame_at_pts( self._decoder, seconds ) - self._has_decoded_frame = True return Frame( data=data, pts_seconds=pts_seconds.item(), @@ -426,7 +416,6 @@ def get_frames_played_at( data, pts_seconds, duration_seconds = core.get_frames_by_pts( self._decoder, timestamps=seconds ) - self._has_decoded_frame = True return FrameBatch( data=data, pts_seconds=pts_seconds, @@ -471,7 +460,6 @@ def get_frames_played_in_range( start_seconds=start_seconds, stop_seconds=stop_seconds, ) - self._has_decoded_frame = True return FrameBatch(*frames) diff --git a/test/test_decoders.py b/test/test_decoders.py index a54496d97..95034b259 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1737,19 +1737,6 @@ def test_set_cuda_backend(self): with set_cuda_backend(backend): VideoDecoder(H265_VIDEO.path, device=f"cuda:{bad_device_number}") - def test_cpu_fallback_before_after_decoding(self): - decoder = VideoDecoder(NASA_VIDEO.path) - - # Before accessing any frames, status should be unknown - assert not decoder.cpu_fallback.status_known - assert str(decoder.cpu_fallback) == "Fallback status: Unknown" - assert not bool(decoder.cpu_fallback) - - # After accessing frames, status should be known - _ = decoder[0] - assert decoder.cpu_fallback.status_known - assert str(decoder.cpu_fallback) != "Fallback status: Unknown" - def test_cpu_fallback_no_fallback_on_cpu_device(self): """Test that CPU device doesn't trigger fallback (it's not a fallback scenario).""" decoder = VideoDecoder(NASA_VIDEO.path, device="cpu") @@ -1767,6 +1754,8 @@ def test_cpu_fallback_h265_video_ffmpeg_cuda(self): # because its dimensions are too small decoder = VideoDecoder(H265_VIDEO.path, device="cuda") + assert not decoder.cpu_fallback.status_known + _ = decoder.get_frame_at(0) assert decoder.cpu_fallback.status_known @@ -1779,9 +1768,14 @@ def test_cpu_fallback_h265_video_beta_cuda(self): with set_cuda_backend("beta"): decoder = VideoDecoder(H265_VIDEO.path, device="cuda") + # Before accessing any frames, status should be unknown + assert decoder.cpu_fallback.status_known + _ = decoder.get_frame_at(0) + # After accessing frames, status should be known assert decoder.cpu_fallback.status_known + assert bool(decoder.cpu_fallback) assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) From e97490e27d6cbd9db2ffaa38a0ec8bbaa902c23c Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Thu, 4 Dec 2025 14:44:45 -0800 Subject: [PATCH 10/14] switch _.code._get_backend_details() to new api --- src/torchcodec/decoders/_video_decoder.py | 13 +++++- test/test_decoders.py | 49 +++++++++-------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 54dec7bf4..b25904663 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -41,6 +41,7 @@ def __init__(self): self.status_known = False self._nvcuvid_unavailable = False self._video_not_supported = False + self._backend = "" def __bool__(self): """Returns True if fallback occurred.""" @@ -60,8 +61,11 @@ def __str__(self): reasons.append("Video not supported") if reasons: - return "Fallback status: Falling back due to: " + ", ".join(reasons) - return "Fallback status: No fallback required" + return ( + f"[{self._backend}] Fallback status: Falling back due to: " + + ", ".join(reasons) + ) + return f"[{self._backend}] Fallback status: No fallback required" class VideoDecoder: @@ -241,6 +245,11 @@ def cpu_fallback(self) -> CpuFallbackStatus: if "status unknown" not in backend_details: self._cpu_fallback.status_known = True + for backend in ("FFmpeg CUDA", "Beta CUDA", "CPU"): + if backend_details.startswith(backend): + self._cpu_fallback._backend = backend + break + if "CPU fallback" in backend_details: if "NVCUVID not available" in backend_details: self._cpu_fallback._nvcuvid_unavailable = True diff --git a/test/test_decoders.py b/test/test_decoders.py index 95034b259..b56d70290 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1672,22 +1672,27 @@ def test_beta_cuda_interface_cpu_fallback(self): # to the CPU path, too. ref_dec = VideoDecoder(H265_VIDEO.path, device="cuda") - ref_frames = ref_dec.get_frame_at(0) - assert ( - _core._get_backend_details(ref_dec._decoder) - == "FFmpeg CUDA Device Interface. Using CPU fallback." - ) + + # Before accessing any frames, status should be unknown + assert not ref_dec.cpu_fallback.status_known + + ref_frame = ref_dec.get_frame_at(0) + + assert "FFmpeg CUDA" in str(ref_dec.cpu_fallback) + assert ref_dec.cpu_fallback.status_known + assert bool(ref_dec.cpu_fallback) with set_cuda_backend("beta"): beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda") - assert ( - _core._get_backend_details(beta_dec._decoder) - == "Beta CUDA Device Interface. Using CPU fallback." - ) + assert "Beta CUDA" in str(beta_dec.cpu_fallback) + # For beta interface, status is known immediately + assert beta_dec.cpu_fallback.status_known + assert bool(beta_dec.cpu_fallback) + beta_frame = beta_dec.get_frame_at(0) - assert psnr(ref_frames.data, beta_frame.data) > 25 + assert psnr(ref_frame.data, beta_frame.data) > 25 @needs_cuda def test_beta_cuda_interface_error(self): @@ -1715,7 +1720,8 @@ def test_set_cuda_backend(self): # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - assert _core._get_backend_details(dec._decoder).startswith("FFmpeg CUDA") + _ = dec.get_frame_at(0) + assert "FFmpeg CUDA" in str(dec.cpu_fallback) # Check the setting "beta" effectively uses the BETA backend. # We also show that the affects decoder creation only. When the decoder @@ -1724,9 +1730,9 @@ def test_set_cuda_backend(self): with set_cuda_backend("beta"): dec = VideoDecoder(H265_VIDEO.path, device="cuda") assert _get_cuda_backend() == "ffmpeg" - assert _core._get_backend_details(dec._decoder).startswith("Beta CUDA") + assert "Beta CUDA" in str(dec.cpu_fallback) with set_cuda_backend("ffmpeg"): - assert _core._get_backend_details(dec._decoder).startswith("Beta CUDA") + assert "Beta CUDA" in str(dec.cpu_fallback) # Hacky way to ensure passing "cuda:1" is supported by both backends. We # just check that there's an error when passing cuda:N where N is too @@ -1762,23 +1768,6 @@ def test_cpu_fallback_h265_video_ffmpeg_cuda(self): assert bool(decoder.cpu_fallback) assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) - @needs_cuda - def test_cpu_fallback_h265_video_beta_cuda(self): - """Test that H265 video triggers CPU fallback on Beta CUDA interface.""" - with set_cuda_backend("beta"): - decoder = VideoDecoder(H265_VIDEO.path, device="cuda") - - # Before accessing any frames, status should be unknown - assert decoder.cpu_fallback.status_known - - _ = decoder.get_frame_at(0) - - # After accessing frames, status should be known - assert decoder.cpu_fallback.status_known - - assert bool(decoder.cpu_fallback) - assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) - @needs_cuda def test_cpu_fallback_no_fallback_on_supported_video(self): """Test that supported videos don't trigger fallback on CUDA.""" From 6a05947d2ac6d03dad4335c21671255b33dfd2a0 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Sun, 7 Dec 2025 20:57:48 -0800 Subject: [PATCH 11/14] address feedback --- docs/source/api_ref_decoders.rst | 1 + src/torchcodec/decoders/_video_decoder.py | 47 +++++++++------ test/test_decoders.py | 69 ++++++++--------------- test/utils.py | 7 +++ 4 files changed, 61 insertions(+), 63 deletions(-) diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst index 1417d7aea..b3a1f3250 100644 --- a/docs/source/api_ref_decoders.rst +++ b/docs/source/api_ref_decoders.rst @@ -33,3 +33,4 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_decoding_a VideoStreamMetadata AudioStreamMetadata + CpuFallbackStatus diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 2f91878ca..38e9dd11f 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -7,7 +7,7 @@ import io import json import numbers -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Literal, Optional, Sequence, Tuple, Union @@ -29,20 +29,24 @@ class CpuFallbackStatus: """Information about CPU fallback status. This class tracks whether the decoder fell back to CPU decoding. + Users should not instantiate this class directly; instead, access it + via the :attr:`VideoDecoder.cpu_fallback` attribute. Usage: - - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status - - Use ``bool(cpu_fallback_status)`` to check if any fallback occurred - Attributes: - status_known (bool): Whether the fallback status has been determined. + - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status + - Use ``if cpu_fallback_status:`` to check if any fallback occurred """ - def __init__(self): - self.status_known = False - self._nvcuvid_unavailable = False - self._video_not_supported = False - self._backend = "" + status_known: bool = False + """Whether the fallback status has been determined. + For the Beta CUDA backend (see :func:`~torchcodec.decoders.set_cuda_backend`), + this is always ``True`` immediately after decoder creation. + For the FFmpeg CUDA backend, this becomes ``True`` after decoding + the first frame.""" + _nvcuvid_unavailable: bool = field(default=False, init=False) + _video_not_supported: bool = field(default=False, init=False) + _backend: str = field(default="", init=False) def __bool__(self): """Returns True if fallback occurred.""" @@ -53,7 +57,7 @@ def __bool__(self): def __str__(self): """Returns a human-readable string representation of the cpu fallback status.""" if not self.status_known: - return "Fallback status: Unknown" + return f"[{self._backend}] Fallback status: Unknown" reasons = [] if self._nvcuvid_unavailable: @@ -235,25 +239,32 @@ def __init__( ) self._cpu_fallback = CpuFallbackStatus() + if device.startswith("cuda"): + if device_variant == "beta": + self._cpu_fallback._backend = "Beta CUDA" + else: + self._cpu_fallback._backend = "FFmpeg CUDA" + else: + self._cpu_fallback._backend = "CPU" def __len__(self) -> int: return self._num_frames @property def cpu_fallback(self) -> CpuFallbackStatus: - # We can only determine whether fallback to CPU is happening when this - # property is accessed and requires that at least one frame has been decoded. + # We only query the CPU fallback info if status is unknown. That happens + # either when: + # - this @property has never been called before + # - no frame has been decoded yet on the FFmpeg interface. + # Note that for the beta interface, we're able to know the fallback status + # right when the VideoDecoder is instantiated, but the status_known + # attribute is initialized to False. if not self._cpu_fallback.status_known: backend_details = core._get_backend_details(self._decoder) if "status unknown" not in backend_details: self._cpu_fallback.status_known = True - for backend in ("FFmpeg CUDA", "Beta CUDA", "CPU"): - if backend_details.startswith(backend): - self._cpu_fallback._backend = backend - break - if "CPU fallback" in backend_details: if "NVCUVID not available" in backend_details: self._cpu_fallback._nvcuvid_unavailable = True diff --git a/test/test_decoders.py b/test/test_decoders.py index b56d70290..dee4325e4 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -27,6 +27,7 @@ assert_frames_equal, AV1_VIDEO, BT709_FULL_RANGE, + cuda_devices, cuda_version_used_for_building_torch, get_ffmpeg_major_version, get_python_version, @@ -1680,7 +1681,7 @@ def test_beta_cuda_interface_cpu_fallback(self): assert "FFmpeg CUDA" in str(ref_dec.cpu_fallback) assert ref_dec.cpu_fallback.status_known - assert bool(ref_dec.cpu_fallback) + assert ref_dec.cpu_fallback with set_cuda_backend("beta"): beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda") @@ -1688,7 +1689,7 @@ def test_beta_cuda_interface_cpu_fallback(self): assert "Beta CUDA" in str(beta_dec.cpu_fallback) # For beta interface, status is known immediately assert beta_dec.cpu_fallback.status_known - assert bool(beta_dec.cpu_fallback) + assert beta_dec.cpu_fallback beta_frame = beta_dec.get_frame_at(0) @@ -1720,7 +1721,6 @@ def test_set_cuda_backend(self): # Check that the default is the ffmpeg backend assert _get_cuda_backend() == "ffmpeg" dec = VideoDecoder(H265_VIDEO.path, device="cuda") - _ = dec.get_frame_at(0) assert "FFmpeg CUDA" in str(dec.cpu_fallback) # Check the setting "beta" effectively uses the BETA backend. @@ -1747,65 +1747,44 @@ def test_cpu_fallback_no_fallback_on_cpu_device(self): """Test that CPU device doesn't trigger fallback (it's not a fallback scenario).""" decoder = VideoDecoder(NASA_VIDEO.path, device="cpu") + assert decoder.cpu_fallback.status_known _ = decoder[0] - assert decoder.cpu_fallback.status_known - assert not bool(decoder.cpu_fallback) + assert not decoder.cpu_fallback assert "No fallback required" in str(decoder.cpu_fallback) @needs_cuda - def test_cpu_fallback_h265_video_ffmpeg_cuda(self): - """Test that H265 video triggers CPU fallback on FFmpeg CUDA interface.""" - # H265_VIDEO is known to trigger CPU fallback on FFmpeg CUDA + @pytest.mark.parametrize("device", cuda_devices()) + def test_cpu_fallback_h265_video(self, device): + """Test that H265 video triggers CPU fallback on CUDA interfaces.""" + # H265_VIDEO is known to trigger CPU fallback on CUDA # because its dimensions are too small - decoder = VideoDecoder(H265_VIDEO.path, device="cuda") + decoder, _ = make_video_decoder(H265_VIDEO.path, device=device) - assert not decoder.cpu_fallback.status_known + if "beta" in device: + # For beta interface, status is known immediately + assert decoder.cpu_fallback.status_known + else: + # For FFmpeg interface, status is unknown until first frame is decoded + assert not decoder.cpu_fallback.status_known - _ = decoder.get_frame_at(0) + decoder.get_frame_at(0) assert decoder.cpu_fallback.status_known - assert bool(decoder.cpu_fallback) - assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback) + assert decoder.cpu_fallback + assert "Video not supported" in str(decoder.cpu_fallback) @needs_cuda - def test_cpu_fallback_no_fallback_on_supported_video(self): + @pytest.mark.parametrize("device", cuda_devices()) + def test_cpu_fallback_no_fallback_on_supported_video(self, device): """Test that supported videos don't trigger fallback on CUDA.""" - decoder = VideoDecoder(NASA_VIDEO.path, device="cuda") + decoder, _ = make_video_decoder(NASA_VIDEO.path, device=device) - _ = decoder[0] + decoder[0] - assert not bool(decoder.cpu_fallback) + assert not decoder.cpu_fallback assert "No fallback required" in str(decoder.cpu_fallback) - def test_cpu_fallback_status_cached(self): - """Test that cpu_fallback status is determined once and then cached.""" - decoder = VideoDecoder(NASA_VIDEO.path) - - _ = decoder[0] - first_status = str(decoder.cpu_fallback) - assert decoder.cpu_fallback.status_known - - _ = decoder[1] - second_status = str(decoder.cpu_fallback) - assert decoder.cpu_fallback.status_known - - assert first_status == second_status - - def test_cpu_fallback_multiple_access_methods(self): - """Test that cpu_fallback works with different frame access methods.""" - decoder = VideoDecoder(NASA_VIDEO.path) - - _ = decoder.get_frame_at(0) - assert decoder.cpu_fallback.status_known - status_after_get_frame = str(decoder.cpu_fallback) - - _ = decoder.get_frames_in_range(1, 3) - assert str(decoder.cpu_fallback) == status_after_get_frame - - _ = decoder.get_frame_played_at(0.5) - assert str(decoder.cpu_fallback) == status_after_get_frame - class TestAudioDecoder: @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32)) diff --git a/test/utils.py b/test/utils.py index fb2d84483..9e3ddde00 100644 --- a/test/utils.py +++ b/test/utils.py @@ -52,6 +52,13 @@ def all_supported_devices(): ) +def cuda_devices(): + return ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param(_CUDA_BETA_DEVICE_STR, marks=pytest.mark.needs_cuda), + ) + + def unsplit_device_str(device_str: str) -> str: # helper meant to be used as # device, device_variant = unsplit_device_str(device) From 8b75eacf9605f6839804d9df10ddec07684e0a54 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Sun, 7 Dec 2025 21:06:54 -0800 Subject: [PATCH 12/14] fix lint --- src/torchcodec/decoders/_video_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 040caa946..5f607a47a 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -8,8 +8,8 @@ import io import json import numbers -from dataclasses import dataclass, field from collections.abc import Sequence +from dataclasses import dataclass, field from pathlib import Path from typing import Literal From 14ad6c7e0d7412eb5aec0619b56326d30747ebb3 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 8 Dec 2025 07:30:33 -0800 Subject: [PATCH 13/14] ffmpeg backend logic --- src/torchcodec/decoders/_video_decoder.py | 12 +++++++----- test/test_decoders.py | 7 ++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py index 5f607a47a..b8518f766 100644 --- a/src/torchcodec/decoders/_video_decoder.py +++ b/src/torchcodec/decoders/_video_decoder.py @@ -48,13 +48,12 @@ class CpuFallbackStatus: the first frame.""" _nvcuvid_unavailable: bool = field(default=False, init=False) _video_not_supported: bool = field(default=False, init=False) + _is_fallback: bool = field(default=False, init=False) _backend: str = field(default="", init=False) def __bool__(self): """Returns True if fallback occurred.""" - return self.status_known and ( - self._nvcuvid_unavailable or self._video_not_supported - ) + return self.status_known and self._is_fallback def __str__(self): """Returns a human-readable string representation of the cpu fallback status.""" @@ -64,8 +63,10 @@ def __str__(self): reasons = [] if self._nvcuvid_unavailable: reasons.append("NVcuvid unavailable") - if self._video_not_supported: + elif self._video_not_supported: reasons.append("Video not supported") + elif self._is_fallback: + reasons.append("Unknown reason - try the Beta interface to know more!") if reasons: return ( @@ -268,9 +269,10 @@ def cpu_fallback(self) -> CpuFallbackStatus: self._cpu_fallback.status_known = True if "CPU fallback" in backend_details: + self._cpu_fallback._is_fallback = True if "NVCUVID not available" in backend_details: self._cpu_fallback._nvcuvid_unavailable = True - else: + elif self._cpu_fallback._backend == "Beta CUDA": self._cpu_fallback._video_not_supported = True return self._cpu_fallback diff --git a/test/test_decoders.py b/test/test_decoders.py index ae8170d8a..8fcd78665 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1772,7 +1772,12 @@ def test_cpu_fallback_h265_video(self, device): assert decoder.cpu_fallback.status_known assert decoder.cpu_fallback - assert "Video not supported" in str(decoder.cpu_fallback) + if "beta" in device: + # Beta interface provides the specific reason for fallback + assert "Video not supported" in str(decoder.cpu_fallback) + else: + # FFmpeg interface doesn't know the specific reason + assert "Unknown reason" in str(decoder.cpu_fallback) @needs_cuda @pytest.mark.parametrize("device", cuda_devices()) From bddfa7cb11283558945d1f4e748477458df9092e Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 8 Dec 2025 11:29:51 -0800 Subject: [PATCH 14/14] add cpufallback --- examples/decoding/basic_cuda_example.py | 41 ++++++++++++------------- examples/decoding/performance_tips.py | 22 +++++++++++++ 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/examples/decoding/basic_cuda_example.py b/examples/decoding/basic_cuda_example.py index 8f82940c0..13a0afe52 100644 --- a/examples/decoding/basic_cuda_example.py +++ b/examples/decoding/basic_cuda_example.py @@ -18,28 +18,6 @@ running the transform steps. Encoded packets are often much smaller than decoded frames so CUDA decoding also uses less PCI-e bandwidth. -When to and when not to use CUDA Decoding ------------------------------------------ - -CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: - -#. You are decoding a large resolution video -#. You are decoding a large batch of videos that's saturating the CPU -#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors - after decoding -#. Your CPU is saturated and you want to free it up for other work - - -Here are situations where CUDA Decoding may not make sense: - -#. You want bit-exact results compared to CPU Decoding -#. You have small resolution videos and the PCI-e transfer latency is large -#. Your GPU is already busy and CPU is not - -It's best to experiment with CUDA Decoding to see if it improves your use-case. With -TorchCodec you can simply pass in a device parameter to the -:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. - Installing TorchCodec with CUDA Enabled --------------------------------------- @@ -161,3 +139,22 @@ def plot_cpu_and_cuda_frames(cpu_frames: torch.Tensor, cuda_frames: torch.Tensor print(f"{frames_equal=}") print(f"{mean_abs_diff=}") print(f"{max_abs_diff=}") + + +# %% +# Checking for CPU Fallback +# ------------------------------------- +# +# In some cases, CUDA decoding may fall back to CPU decoding. This can happen +# when the video codec or format is not supported by the NVDEC hardware decoder. +# TorchCodec provides the :class:`~torchcodec.decoders.CpuFallbackStatus` class +# to help you detect when this fallback occurs. +# +# You can access the fallback status via the +# :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute: + +with set_cuda_backend("beta"): + decoder = VideoDecoder(video_file, device="cuda") + +# Check and print the CPU fallback status +print(decoder.cpu_fallback) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index ac247fd64..9bcd64877 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -170,8 +170,30 @@ # # %% +# **Checking for CPU Fallback** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In some cases, CUDA decoding may silently fall back to CPU decoding when the +# video codec or format is not supported by NVDEC. You can detect this using +# the :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute: +# +# .. code-block:: python +# +# decoder = VideoDecoder("file.mp4", device="cuda") +# decoder[0] # Decode at least one frame first (for FFmpeg backend) +# +# # Print detailed fallback status +# print(decoder.cpu_fallback) +# # .. note:: # +# The timing of when you can detect CPU fallback differs between backends: +# +# - **FFmpeg backend**: You can only check fallback status after decoding at +# least one frame, because FFmpeg determines codec support lazily during decoding. +# - **BETA backend**: You can check fallback status immediately after +# decoder creation, as the backend checks codec support upfront. +# # For installation instructions, detailed examples, and visual comparisons # between CPU and CUDA decoding, see: #