From 304fdf9d7a2b81a49040c0dde2c40128e1f696b9 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Wed, 19 Nov 2025 21:28:36 -0800
Subject: [PATCH 01/14] first draft of performance tips tutorial

---
 docs/source/conf.py                   |   1 +
 examples/decoding/performance_tips.py | 159 ++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 examples/decoding/performance_tips.py

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 133bccf2e..87f14f75d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -81,6 +81,7 @@ def __call__(self, filename):
                 "approximate_mode.py",
                 "sampling.py",
                 "parallel_decoding.py",
+                "performance_tips.py",
                 "custom_frame_mappings.py",
             ]
         else:
diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
new file mode 100644
index 000000000..e04d4fb89
--- /dev/null
+++ b/examples/decoding/performance_tips.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+====================================
+Performance Tips and Best Practices
+====================================
+
+This tutorial consolidates performance optimization techniques for video
+decoding with TorchCodec. Learn when and how to apply various strategies
+to increase performance.
+"""
+
+
+# %%
+# Overview
+# --------
+#
+# When decoding videos with TorchCodec, several techniques can significantly
+# improve performance depending on your use case. This guide covers:
+#
+# 1. **Batch APIs** - Decode multiple frames at once
+# 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed
+# 3. **Multi-threading** - Parallelize decoding across videos or chunks
+# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats
+#
+# We'll explore each technique and when to use it.
+
+# %%
+# 1. Use Batch APIs When Possible
+# --------------------------------
+#
+# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage
+# internal optimizations.
+#
+# **Key Methods:**
+#
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges
+#
+# **When to use:**
+#
+# - Decoding multiple frames
+
+# %%
+# .. note::
+#
+#     For complete examples with runnable code demonstrating batch decoding,
+#     iteration, and frame retrieval, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_basic_example.py`
+
+# %%
+# 2. Approximate Mode & Keyframe Mappings
+# ----------------------------------------
+#
+# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when
+# the decoder is created to build an accurate internal index of frames. This
+# ensures frame-accurate seeking but takes longer for decoder initialization,
+# especially on long videos.
+
+# %%
+# **Approximate Mode**
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the
+# video file's metadata headers. This dramatically speeds up
+# :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long
+# videos, but may result in slightly less accurate seeking in some cases.
+#
+#
+# **Which mode should you use:**
+#
+# - If you care about exactness of frame seeking, use “exact”.
+# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”.
+# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster.
+# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster.
+
+# %%
+# **Custom Frame Mappings**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For advanced use cases, you can pre-compute a custom mapping between desired
+# frame indices and actual keyframe locations. This allows you to speed up :class:`~torchcodec.decoders.VideoDecoder`
+# instantiation while maintaining the frame seeking accuracy of ``seek_mode="exact"``.
+#
+# **When to use:**
+#
+# - Frame accuracy is critical, so approximate mode cannot be used
+# - Videos can be preprocessed once and then decoded many times
+#
+# **Performance impact:** Enables consistent, predictable performance for repeated
+# random access without the overhead of exact mode's scanning.
+
+# %%
+# .. note::
+#
+#     For complete benchmarks showing actual speedup numbers, accuracy comparisons,
+#     and implementation examples, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`
+
+# %%
+# 3. Multi-threading for Parallel Decoding
+# -----------------------------------------
+#
+# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
+#
+# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities
+# - Multiprocessing: Distributing work across multiple processes
+# - Multithreading: Using multiple threads within a single process
+
+# %%
+# .. note::
+#
+#     For complete examples comparing
+#     sequential, FFmpeg-based parallelism, multi-process, and multi-threaded approaches, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py`
+
+# %%
+# 4. BETA: CUDA Acceleration
+# ---------------------------
+#
+# TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder
+# (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory,
+# avoiding expensive CPU-GPU transfers for downstream GPU operations.
+#
+# **When to use:**
+#
+# - Decoding large resolution videos
+# - Large batch of videos saturating the CPU
+# - GPU-intensive pipelines with transforms like scaling and cropping
+# - CPU is saturated and you want to free it up for other work
+#
+# **When NOT to use:**
+#
+# - You need bit-exact results
+# - Small-resolution videos when the PCI-e transfer latency is large
+# - GPU is already busy and CPU is idle
+#
+# **Performance impact:** CUDA decoding can significantly outperform CPU decoding,
+# especially for high-resolution videos and when combined with GPU-based transforms.
+# Actual speedup varies by hardware, resolution, and codec.
+
+# %%
+# .. note::
+#
+#     For installation instructions, detailed examples, and visual comparisons
+#     between CPU and CUDA decoding, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py`

From 5693776db99e44088be6692e01f6e414b5bd53c4 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Thu, 20 Nov 2025 07:52:46 -0800
Subject: [PATCH 02/14] modify format

---
 examples/decoding/performance_tips.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index e04d4fb89..e36598e30 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -111,11 +111,11 @@
 # 3. Multi-threading for Parallel Decoding
 # -----------------------------------------
 #
-# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
+# When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
 #
-# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities
-# - Multiprocessing: Distributing work across multiple processes
-# - Multithreading: Using multiple threads within a single process
+# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities
+# - **Multiprocessing** - Distributing work across multiple processes
+# - **Multithreading** - Using multiple threads within a single process
 
 # %%
 # .. note::

From a74f653b477547c33f3bd95f747e4978aee1c96b Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Fri, 21 Nov 2025 08:38:38 -0800
Subject: [PATCH 03/14] address feedback

---
 examples/decoding/performance_tips.py | 33 ++++++++++++++++++---------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index e36598e30..4e1705623 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -25,7 +25,7 @@
 # 1. **Batch APIs** - Decode multiple frames at once
 # 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed
 # 3. **Multi-threading** - Parallelize decoding across videos or chunks
-# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats
+# 4. **CUDA Acceleration** - Use GPU decoding for supported formats
 #
 # We'll explore each technique and when to use it.
 
@@ -33,8 +33,9 @@
 # 1. Use Batch APIs When Possible
 # --------------------------------
 #
-# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage
-# internal optimizations.
+# If you need to decode multiple frames at once, the batch methods are faster than calling single-frame decoding methods multiple times.
+# For example, :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` is faster than calling :meth:`~torchcodec.decoders.VideoDecoder.get_frame_at` multiple times.
+# TorchCodec's batch APIs reduce overhead and can leverage internal optimizations.
 #
 # **Key Methods:**
 #
@@ -59,7 +60,7 @@
 # 2. Approximate Mode & Keyframe Mappings
 # ----------------------------------------
 #
-# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when
+# By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when
 # the decoder is created to build an accurate internal index of frames. This
 # ensures frame-accurate seeking but takes longer for decoder initialization,
 # especially on long videos.
@@ -68,7 +69,7 @@
 # **Approximate Mode**
 # ~~~~~~~~~~~~~~~~~~~~
 #
-# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the
+# Setting ``seek_mode="approximate"`` skips the initial :term:`scan` and relies on the
 # video file's metadata headers. This dramatically speeds up
 # :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long
 # videos, but may result in slightly less accurate seeking in some cases.
@@ -77,9 +78,7 @@
 # **Which mode should you use:**
 #
 # - If you care about exactness of frame seeking, use “exact”.
-# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”.
-# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster.
-# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster.
+# - If the video is long and you're only decoding a small number of frames, approximate mode should be faster.
 
 # %%
 # **Custom Frame Mappings**
@@ -113,9 +112,11 @@
 #
 # When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
 #
-# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities
+# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities for intra-frame parallelism, where parallelization happens within individual frames rather than across frames
 # - **Multiprocessing** - Distributing work across multiple processes
 # - **Multithreading** - Using multiple threads within a single process
+#
+# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
 
 # %%
 # .. note::
@@ -126,8 +127,8 @@
 #     - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py`
 
 # %%
-# 4. BETA: CUDA Acceleration
-# ---------------------------
+# 4. CUDA Acceleration
+# --------------------
 #
 # TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder
 # (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory,
@@ -150,6 +151,16 @@
 # especially for high-resolution videos and when combined with GPU-based transforms.
 # Actual speedup varies by hardware, resolution, and codec.
 
+# %%
+# **Recommended Usage for Beta Interface**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. code-block:: python
+#
+#     with set_cuda_backend("beta"):
+#         decoder = VideoDecoder("file.mp4", device="cuda")
+#
+
 # %%
 # .. note::
 #

From 547d8e5310c8754556c178c5aabdf1af52d206e5 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 24 Nov 2025 12:58:01 -0800
Subject: [PATCH 04/14] address feedback

---
 docs/source/index.rst                 |  8 ++++++++
 examples/decoding/performance_tips.py | 13 +++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 74e8d1298..e25a79827 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -84,6 +84,14 @@ Decoding
 
         How to sample regular and random clips from a video
 
+     .. grid-item-card:: :octicon:`file-code;1em`
+        Performance Tips
+        :img-top: _static/img/card-background.svg
+        :link: generated_examples/decoding/performance_tips.html
+        :link-type: url
+
+        Tips for optimizing video decoding performance
+
 
 Encoding
 ^^^^^^^^
diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index 4e1705623..17781f451 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -39,8 +39,13 @@
 #
 # **Key Methods:**
 #
+# For index-based frame retrieval:
+#
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges
+#
+# For timestamp-based frame retrieval:
+#
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges
 #
@@ -61,7 +66,7 @@
 # ----------------------------------------
 #
 # By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when
-# the decoder is created to build an accurate internal index of frames. This
+# you create the decoder to build an accurate internal index of frames. This
 # ensures frame-accurate seeking but takes longer for decoder initialization,
 # especially on long videos.
 
@@ -90,8 +95,8 @@
 #
 # **When to use:**
 #
-# - Frame accuracy is critical, so approximate mode cannot be used
-# - Videos can be preprocessed once and then decoded many times
+# - Frame accuracy is critical, so you cannot use approximate mode
+# - You can preprocess videos once and then decode them many times
 #
 # **Performance impact:** Enables consistent, predictable performance for repeated
 # random access without the overhead of exact mode's scanning.
@@ -116,7 +121,7 @@
 # - **Multiprocessing** - Distributing work across multiple processes
 # - **Multithreading** - Using multiple threads within a single process
 #
-# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
+# You can use either multiprocessing or multithreading to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
 
 # %%
 # .. note::

From 9e0f33ad8688bb9dab5be491b55d975584e4347e Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 1 Dec 2025 21:12:51 -0800
Subject: [PATCH 05/14] address feedback

---
 examples/decoding/performance_tips.py | 31 ++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index 17781f451..ac247fd64 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -5,9 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-====================================
-Performance Tips and Best Practices
-====================================
+.. meta::
+   :description: Learn how to optimize TorchCodec video decoding performance with batch APIs, approximate seeking, multi-threading, and CUDA acceleration.
+
+==============================================
+TorchCodec Performance Tips and Best Practices
+==============================================
 
 This tutorial consolidates performance optimization techniques for video
 decoding with TorchCodec. Learn when and how to apply various strategies
@@ -173,3 +176,25 @@
 #     between CPU and CUDA decoding, see:
 #
 #     - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py`
+
+# %%
+# Conclusion
+# ----------
+#
+# TorchCodec offers multiple performance optimization strategies, each suited to
+# different scenarios. Use batch APIs for multi-frame decoding, approximate mode
+# for faster initialization, parallel processing for high throughput, and CUDA
+# acceleration for GPU-intensive workflows.
+#
+# The best results often come from combining techniques. Profile your specific
+# use case and apply optimizations incrementally, using the benchmarks in the
+# linked examples as a guide.
+#
+# For more information, see:
+#
+# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` - Basic decoding examples
+# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` - Approximate mode benchmarks
+# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` - Custom frame mappings
+# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` - Parallel decoding strategies
+# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` - CUDA acceleration guide
+# - :class:`torchcodec.decoders.VideoDecoder` - Full API reference

From b32e6f3a3acd543f17c39fa0696fd8fac05447be Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Wed, 3 Dec 2025 12:40:54 -0800
Subject: [PATCH 06/14] expose cpu_fallback

---
 src/torchcodec/decoders/__init__.py       |  2 +-
 src/torchcodec/decoders/_video_decoder.py | 85 ++++++++++++++++++++++
 test/test_decoders.py                     | 86 +++++++++++++++++++++++
 3 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
index 980ba98a9..a07760a60 100644
--- a/src/torchcodec/decoders/__init__.py
+++ b/src/torchcodec/decoders/__init__.py
@@ -7,6 +7,6 @@
 from .._core import AudioStreamMetadata, VideoStreamMetadata
 from ._audio_decoder import AudioDecoder  # noqa
 from ._decoder_utils import set_cuda_backend  # noqa
-from ._video_decoder import VideoDecoder  # noqa
+from ._video_decoder import FallbackInfo, VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 1b4d4706d..c0eb67e4b 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -7,6 +7,7 @@
 import io
 import json
 import numbers
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Literal, Optional, Sequence, Tuple, Union
 
@@ -22,6 +23,48 @@
 from torchcodec.transforms import DecoderTransform, Resize
 
 
+@dataclass
+class FallbackInfo:
+    """Information about decoder fallback status.
+
+    This class tracks whether hardware-accelerated decoding failed and the decoder
+    fell back to software decoding.
+
+    Usage:
+        - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the fallback status
+        - Use ``bool(fallback_info)`` to check if any fallback occurred
+
+    Attributes:
+        status_known (bool): Whether the fallback status has been determined.
+    """
+
+    def __init__(self):
+        self.status_known = False
+        self.__nvcuvid_unavailable = False
+        self.__video_not_supported = False
+
+    def __bool__(self):
+        """Returns True if fallback occurred (and status is known)."""
+        return self.status_known and (
+            self.__nvcuvid_unavailable or self.__video_not_supported
+        )
+
+    def __str__(self):
+        """Returns a human-readable string representation of the fallback status."""
+        if not self.status_known:
+            return "Fallback status: Unknown"
+
+        reasons = []
+        if self.__nvcuvid_unavailable:
+            reasons.append("NVcuvid unavailable")
+        if self.__video_not_supported:
+            reasons.append("Video not supported")
+
+        if reasons:
+            return "Fallback status: Falling back due to: " + ", ".join(reasons)
+        return "Fallback status: No fallback required"
+
+
 class VideoDecoder:
     """A single-stream video decoder.
 
@@ -180,13 +223,48 @@ def __init__(
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
+        # Initialize fallback info
+        self._fallback_info = FallbackInfo()
+
     def __len__(self) -> int:
         return self._num_frames
 
+    @property
+    def cpu_fallback(self) -> FallbackInfo:
+        """Get information about decoder fallback status.
+
+        Returns:
+            FallbackInfo: Information about whether hardware-accelerated decoding
+                         failed and the decoder fell back to software decoding.
+
+        Note:
+            The fallback status is only determined after the first frame access.
+            Before that, the status will be "Unknown".
+        """
+        return self._fallback_info
+
+    def _update_cpu_fallback(self):
+        """Update the fallback status if it hasn't been determined yet.
+
+        This method should be called after any frame decoding operation to determine
+        if fallback to software decoding occurred.
+        """
+        if not self._fallback_info.status_known:
+            backend_details = core._get_backend_details(self._decoder)
+
+            self._fallback_info.status_known = True
+
+            if "CPU fallback" in backend_details:
+                if "NVCUVID not available" in backend_details:
+                    self._fallback_info._FallbackInfo__nvcuvid_unavailable = True
+                else:
+                    self._fallback_info._FallbackInfo__video_not_supported = True
+
     def _getitem_int(self, key: int) -> Tensor:
         assert isinstance(key, int)
 
         frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key)
+        self._update_cpu_fallback()
         return frame_data
 
     def _getitem_slice(self, key: slice) -> Tensor:
@@ -199,6 +277,7 @@ def _getitem_slice(self, key: slice) -> Tensor:
             stop=stop,
             step=step,
         )
+        self._update_cpu_fallback()
         return frame_data
 
     def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor:
@@ -252,6 +331,7 @@ def get_frame_at(self, index: int) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_index(
             self._decoder, frame_index=index
         )
+        self._update_cpu_fallback()
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -271,6 +351,7 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch:
         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
             self._decoder, frame_indices=indices
         )
+        self._update_cpu_fallback()
 
         return FrameBatch(
             data=data,
@@ -300,6 +381,7 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc
             stop=stop,
             step=step,
         )
+        self._update_cpu_fallback()
         return FrameBatch(*frames)
 
     def get_frame_played_at(self, seconds: float) -> Frame:
@@ -329,6 +411,7 @@ def get_frame_played_at(self, seconds: float) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_pts(
             self._decoder, seconds
         )
+        self._update_cpu_fallback()
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -350,6 +433,7 @@ def get_frames_played_at(
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds
         )
+        self._update_cpu_fallback()
         return FrameBatch(
             data=data,
             pts_seconds=pts_seconds,
@@ -394,6 +478,7 @@ def get_frames_played_in_range(
             start_seconds=start_seconds,
             stop_seconds=stop_seconds,
         )
+        self._update_cpu_fallback()
         return FrameBatch(*frames)
 
 
diff --git a/test/test_decoders.py b/test/test_decoders.py
index efa2d11c8..d85387f8b 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1737,6 +1737,92 @@ def test_set_cuda_backend(self):
                 with set_cuda_backend(backend):
                     VideoDecoder(H265_VIDEO.path, device=f"cuda:{bad_device_number}")
 
+    def test_cpu_fallback_before_after_decoding(self):
+        decoder = VideoDecoder(NASA_VIDEO.path)
+
+        # Before accessing any frames, status should be unknown
+        assert not decoder.cpu_fallback.status_known
+        assert str(decoder.cpu_fallback) == "Fallback status: Unknown"
+        assert not bool(decoder.cpu_fallback)
+
+        # After accessing frames, status should be known
+        _ = decoder[0]
+        assert decoder.cpu_fallback.status_known
+        assert str(decoder.cpu_fallback) != "Fallback status: Unknown"
+
+    def test_cpu_fallback_no_fallback_on_cpu_device(self):
+        """Test that CPU device doesn't trigger fallback (it's not a fallback scenario)."""
+        decoder = VideoDecoder(NASA_VIDEO.path, device="cpu")
+
+        _ = decoder[0]
+
+        assert decoder.cpu_fallback.status_known
+        assert not bool(decoder.cpu_fallback)
+        assert "No fallback required" in str(decoder.cpu_fallback)
+
+    @needs_cuda
+    def test_cpu_fallback_h265_video_ffmpeg_cuda(self):
+        """Test that H265 video triggers CPU fallback on FFmpeg CUDA interface."""
+        # H265_VIDEO is known to trigger CPU fallback on FFmpeg CUDA
+        # because its dimensions are too small
+        decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
+
+        _ = decoder.get_frame_at(0)
+
+        assert decoder.cpu_fallback.status_known
+        assert bool(decoder.cpu_fallback)
+        assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
+
+    @needs_cuda
+    def test_cpu_fallback_h265_video_beta_cuda(self):
+        """Test that H265 video triggers CPU fallback on Beta CUDA interface."""
+        with set_cuda_backend("beta"):
+            decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
+
+        _ = decoder.get_frame_at(0)
+
+        assert decoder.cpu_fallback.status_known
+        assert bool(decoder.cpu_fallback)
+        assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
+
+    @needs_cuda
+    def test_cpu_fallback_no_fallback_on_supported_video(self):
+        """Test that supported videos don't trigger fallback on CUDA."""
+        decoder = VideoDecoder(NASA_VIDEO.path, device="cuda")
+
+        # Access a frame to determine status
+        _ = decoder[0]
+
+        assert not bool(decoder.cpu_fallback)
+
+    def test_cpu_fallback_status_cached(self):
+        """Test that cpu_fallback status is determined once and then cached."""
+        decoder = VideoDecoder(NASA_VIDEO.path)
+
+        _ = decoder[0]
+        first_status = str(decoder.cpu_fallback)
+        assert decoder.cpu_fallback.status_known
+
+        _ = decoder[1]
+        second_status = str(decoder.cpu_fallback)
+        assert decoder.cpu_fallback.status_known
+
+        assert first_status == second_status
+
+    def test_cpu_fallback_multiple_access_methods(self):
+        """Test that cpu_fallback works with different frame access methods."""
+        decoder = VideoDecoder(NASA_VIDEO.path)
+
+        _ = decoder.get_frame_at(0)
+        assert decoder.cpu_fallback.status_known
+        status_after_get_frame = str(decoder.cpu_fallback)
+
+        _ = decoder.get_frames_in_range(1, 3)
+        assert str(decoder.cpu_fallback) == status_after_get_frame
+
+        _ = decoder.get_frame_played_at(0.5)
+        assert str(decoder.cpu_fallback) == status_after_get_frame
+
 
 class TestAudioDecoder:
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))

From cf5b718f988b6a34bc8c43c3602b08583022a584 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Wed, 3 Dec 2025 12:53:51 -0800
Subject: [PATCH 07/14] modify comments

---
 src/torchcodec/decoders/_video_decoder.py | 22 ++++++----------------
 test/test_decoders.py                     |  2 +-
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index c0eb67e4b..0c5586e61 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -27,11 +27,10 @@
 class FallbackInfo:
     """Information about decoder fallback status.
 
-    This class tracks whether hardware-accelerated decoding failed and the decoder
-    fell back to software decoding.
+    This class tracks whether the decoder fell back to CPU decoding.
 
     Usage:
-        - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the fallback status
+        - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the cpu fallback status
         - Use ``bool(fallback_info)`` to check if any fallback occurred
 
     Attributes:
@@ -44,13 +43,13 @@ def __init__(self):
         self.__video_not_supported = False
 
     def __bool__(self):
-        """Returns True if fallback occurred (and status is known)."""
+        """Returns True if fallback occurred."""
         return self.status_known and (
             self.__nvcuvid_unavailable or self.__video_not_supported
         )
 
     def __str__(self):
-        """Returns a human-readable string representation of the fallback status."""
+        """Returns a human-readable string representation of the cpu fallback status."""
         if not self.status_known:
             return "Fallback status: Unknown"
 
@@ -223,7 +222,6 @@ def __init__(
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
-        # Initialize fallback info
         self._fallback_info = FallbackInfo()
 
     def __len__(self) -> int:
@@ -231,16 +229,8 @@ def __len__(self) -> int:
 
     @property
     def cpu_fallback(self) -> FallbackInfo:
-        """Get information about decoder fallback status.
-
-        Returns:
-            FallbackInfo: Information about whether hardware-accelerated decoding
-                         failed and the decoder fell back to software decoding.
-
-        Note:
-            The fallback status is only determined after the first frame access.
-            Before that, the status will be "Unknown".
-        """
+        # We can only determine whether fallback to CPU is happening after
+        # the first frame access. Before that, the status will be "Unknown".
         return self._fallback_info
 
     def _update_cpu_fallback(self):
diff --git a/test/test_decoders.py b/test/test_decoders.py
index d85387f8b..a54496d97 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1790,10 +1790,10 @@ def test_cpu_fallback_no_fallback_on_supported_video(self):
         """Test that supported videos don't trigger fallback on CUDA."""
         decoder = VideoDecoder(NASA_VIDEO.path, device="cuda")
 
-        # Access a frame to determine status
         _ = decoder[0]
 
         assert not bool(decoder.cpu_fallback)
+        assert "No fallback required" in str(decoder.cpu_fallback)
 
     def test_cpu_fallback_status_cached(self):
         """Test that cpu_fallback status is determined once and then cached."""

From 6e69c8ca771d686d1e55460628538559d515543d Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Wed, 3 Dec 2025 15:11:22 -0800
Subject: [PATCH 08/14] modify comments

---
 src/torchcodec/decoders/_video_decoder.py | 29 +++++++++++++----------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 0c5586e61..f8046a0da 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -223,23 +223,26 @@ def __init__(
         )
 
         self._fallback_info = FallbackInfo()
+        self._has_decoded_frame = False
 
     def __len__(self) -> int:
         return self._num_frames
 
     @property
     def cpu_fallback(self) -> FallbackInfo:
-        # We can only determine whether fallback to CPU is happening after
-        # the first frame access. Before that, the status will be "Unknown".
+        # We can only determine whether fallback to CPU is happening when this
+        # property is accessed and requires that at least one frame has been decoded.
+        self._update_cpu_fallback()
         return self._fallback_info
 
     def _update_cpu_fallback(self):
         """Update the fallback status if it hasn't been determined yet.
 
-        This method should be called after any frame decoding operation to determine
-        if fallback to software decoding occurred.
+        This method queries the C++ backend to determine if fallback to CPU
+        decoding occurred. The query is only performed after at least one frame
+        has been decoded.
         """
-        if not self._fallback_info.status_known:
+        if not self._fallback_info.status_known and self._has_decoded_frame:
             backend_details = core._get_backend_details(self._decoder)
 
             self._fallback_info.status_known = True
@@ -254,7 +257,7 @@ def _getitem_int(self, key: int) -> Tensor:
         assert isinstance(key, int)
 
         frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key)
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return frame_data
 
     def _getitem_slice(self, key: slice) -> Tensor:
@@ -267,7 +270,7 @@ def _getitem_slice(self, key: slice) -> Tensor:
             stop=stop,
             step=step,
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return frame_data
 
     def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor:
@@ -321,7 +324,7 @@ def get_frame_at(self, index: int) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_index(
             self._decoder, frame_index=index
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -341,7 +344,7 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch:
         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
             self._decoder, frame_indices=indices
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
 
         return FrameBatch(
             data=data,
@@ -371,7 +374,7 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc
             stop=stop,
             step=step,
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return FrameBatch(*frames)
 
     def get_frame_played_at(self, seconds: float) -> Frame:
@@ -401,7 +404,7 @@ def get_frame_played_at(self, seconds: float) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_pts(
             self._decoder, seconds
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -423,7 +426,7 @@ def get_frames_played_at(
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return FrameBatch(
             data=data,
             pts_seconds=pts_seconds,
@@ -468,7 +471,7 @@ def get_frames_played_in_range(
             start_seconds=start_seconds,
             stop_seconds=stop_seconds,
         )
-        self._update_cpu_fallback()
+        self._has_decoded_frame = True
         return FrameBatch(*frames)
 
 

From 5ac83215c72b301705df23532918d6a63c7e88f3 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Thu, 4 Dec 2025 11:03:32 -0800
Subject: [PATCH 09/14] address feedback

---
 src/torchcodec/_core/CudaDeviceInterface.cpp |  6 ++
 src/torchcodec/_core/CudaDeviceInterface.h   |  1 +
 src/torchcodec/decoders/__init__.py          |  2 +-
 src/torchcodec/decoders/_video_decoder.py    | 62 ++++++++------------
 test/test_decoders.py                        | 20 +++----
 5 files changed, 40 insertions(+), 51 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 0e20c5e8d..67c274136 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -241,6 +241,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
 
+  hasDecodedFrame_ = true;
+
   // All of our CUDA decoding assumes NV12 format. We handle non-NV12 formats by
   // converting them to NV12.
   avFrame = maybeConvertAVFrameToNV12OrRGB24(avFrame);
@@ -358,6 +360,10 @@ std::string CudaDeviceInterface::getDetails() {
   // Note: for this interface specifically the fallback is only known after a
   // frame has been decoded, not before: that's when FFmpeg decides to fallback,
   // so we can't know earlier.
+  if (!hasDecodedFrame_) {
+    return std::string(
+        "FFmpeg CUDA Device Interface. Fallback status unknown (no frames decoded).");
+  }
   return std::string("FFmpeg CUDA Device Interface. Using ") +
       (usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
 }
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index c892bd49b..90d359185 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -63,6 +63,7 @@ class CudaDeviceInterface : public DeviceInterface {
   std::unique_ptr<FilterGraph> nv12Conversion_;
 
   bool usingCPUFallback_ = false;
+  bool hasDecodedFrame_ = false;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
index a07760a60..ef08cce83 100644
--- a/src/torchcodec/decoders/__init__.py
+++ b/src/torchcodec/decoders/__init__.py
@@ -7,6 +7,6 @@
 from .._core import AudioStreamMetadata, VideoStreamMetadata
 from ._audio_decoder import AudioDecoder  # noqa
 from ._decoder_utils import set_cuda_backend  # noqa
-from ._video_decoder import FallbackInfo, VideoDecoder  # noqa
+from ._video_decoder import CpuFallbackStatus, VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index f8046a0da..54dec7bf4 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -24,14 +24,14 @@
 
 
 @dataclass
-class FallbackInfo:
-    """Information about decoder fallback status.
+class CpuFallbackStatus:
+    """Information about CPU fallback status.
 
     This class tracks whether the decoder fell back to CPU decoding.
 
     Usage:
-        - Use ``str(fallback_info)`` or ``print(fallback_info)`` to see the cpu fallback status
-        - Use ``bool(fallback_info)`` to check if any fallback occurred
+        - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status
+        - Use ``bool(cpu_fallback_status)`` to check if any fallback occurred
 
     Attributes:
         status_known (bool): Whether the fallback status has been determined.
@@ -39,13 +39,13 @@ class FallbackInfo:
 
     def __init__(self):
         self.status_known = False
-        self.__nvcuvid_unavailable = False
-        self.__video_not_supported = False
+        self._nvcuvid_unavailable = False
+        self._video_not_supported = False
 
     def __bool__(self):
         """Returns True if fallback occurred."""
         return self.status_known and (
-            self.__nvcuvid_unavailable or self.__video_not_supported
+            self._nvcuvid_unavailable or self._video_not_supported
         )
 
     def __str__(self):
@@ -54,9 +54,9 @@ def __str__(self):
             return "Fallback status: Unknown"
 
         reasons = []
-        if self.__nvcuvid_unavailable:
+        if self._nvcuvid_unavailable:
             reasons.append("NVcuvid unavailable")
-        if self.__video_not_supported:
+        if self._video_not_supported:
             reasons.append("Video not supported")
 
         if reasons:
@@ -142,6 +142,10 @@ class VideoDecoder:
         stream_index (int): The stream index that this decoder is retrieving frames from. If a
             stream index was provided at initialization, this is the same value. If it was left
             unspecified, this is the :term:`best stream`.
+        cpu_fallback (CpuFallbackStatus): Information about whether the decoder fell back to CPU
+            decoding. Use ``bool(cpu_fallback)`` to check if fallback occurred, or
+            ``str(cpu_fallback)`` to get a human-readable status message. The status is only
+            determined after at least one frame has been decoded.
     """
 
     def __init__(
@@ -222,42 +226,33 @@ def __init__(
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
-        self._fallback_info = FallbackInfo()
-        self._has_decoded_frame = False
+        self._cpu_fallback = CpuFallbackStatus()
 
     def __len__(self) -> int:
         return self._num_frames
 
     @property
-    def cpu_fallback(self) -> FallbackInfo:
+    def cpu_fallback(self) -> CpuFallbackStatus:
         # We can only determine whether fallback to CPU is happening when this
         # property is accessed and requires that at least one frame has been decoded.
-        self._update_cpu_fallback()
-        return self._fallback_info
-
-    def _update_cpu_fallback(self):
-        """Update the fallback status if it hasn't been determined yet.
-
-        This method queries the C++ backend to determine if fallback to CPU
-        decoding occurred. The query is only performed after at least one frame
-        has been decoded.
-        """
-        if not self._fallback_info.status_known and self._has_decoded_frame:
+        if not self._cpu_fallback.status_known:
             backend_details = core._get_backend_details(self._decoder)
 
-            self._fallback_info.status_known = True
+            if "status unknown" not in backend_details:
+                self._cpu_fallback.status_known = True
+
+                if "CPU fallback" in backend_details:
+                    if "NVCUVID not available" in backend_details:
+                        self._cpu_fallback._nvcuvid_unavailable = True
+                    else:
+                        self._cpu_fallback._video_not_supported = True
 
-            if "CPU fallback" in backend_details:
-                if "NVCUVID not available" in backend_details:
-                    self._fallback_info._FallbackInfo__nvcuvid_unavailable = True
-                else:
-                    self._fallback_info._FallbackInfo__video_not_supported = True
+        return self._cpu_fallback
 
     def _getitem_int(self, key: int) -> Tensor:
         assert isinstance(key, int)
 
         frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key)
-        self._has_decoded_frame = True
         return frame_data
 
     def _getitem_slice(self, key: slice) -> Tensor:
@@ -270,7 +265,6 @@ def _getitem_slice(self, key: slice) -> Tensor:
             stop=stop,
             step=step,
         )
-        self._has_decoded_frame = True
         return frame_data
 
     def __getitem__(self, key: Union[numbers.Integral, slice]) -> Tensor:
@@ -324,7 +318,6 @@ def get_frame_at(self, index: int) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_index(
             self._decoder, frame_index=index
         )
-        self._has_decoded_frame = True
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -344,7 +337,6 @@ def get_frames_at(self, indices: Union[torch.Tensor, list[int]]) -> FrameBatch:
         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
             self._decoder, frame_indices=indices
         )
-        self._has_decoded_frame = True
 
         return FrameBatch(
             data=data,
@@ -374,7 +366,6 @@ def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatc
             stop=stop,
             step=step,
         )
-        self._has_decoded_frame = True
         return FrameBatch(*frames)
 
     def get_frame_played_at(self, seconds: float) -> Frame:
@@ -404,7 +395,6 @@ def get_frame_played_at(self, seconds: float) -> Frame:
         data, pts_seconds, duration_seconds = core.get_frame_at_pts(
             self._decoder, seconds
         )
-        self._has_decoded_frame = True
         return Frame(
             data=data,
             pts_seconds=pts_seconds.item(),
@@ -426,7 +416,6 @@ def get_frames_played_at(
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds
         )
-        self._has_decoded_frame = True
         return FrameBatch(
             data=data,
             pts_seconds=pts_seconds,
@@ -471,7 +460,6 @@ def get_frames_played_in_range(
             start_seconds=start_seconds,
             stop_seconds=stop_seconds,
         )
-        self._has_decoded_frame = True
         return FrameBatch(*frames)
 
 
diff --git a/test/test_decoders.py b/test/test_decoders.py
index a54496d97..95034b259 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1737,19 +1737,6 @@ def test_set_cuda_backend(self):
                 with set_cuda_backend(backend):
                     VideoDecoder(H265_VIDEO.path, device=f"cuda:{bad_device_number}")
 
-    def test_cpu_fallback_before_after_decoding(self):
-        decoder = VideoDecoder(NASA_VIDEO.path)
-
-        # Before accessing any frames, status should be unknown
-        assert not decoder.cpu_fallback.status_known
-        assert str(decoder.cpu_fallback) == "Fallback status: Unknown"
-        assert not bool(decoder.cpu_fallback)
-
-        # After accessing frames, status should be known
-        _ = decoder[0]
-        assert decoder.cpu_fallback.status_known
-        assert str(decoder.cpu_fallback) != "Fallback status: Unknown"
-
     def test_cpu_fallback_no_fallback_on_cpu_device(self):
         """Test that CPU device doesn't trigger fallback (it's not a fallback scenario)."""
         decoder = VideoDecoder(NASA_VIDEO.path, device="cpu")
@@ -1767,6 +1754,8 @@ def test_cpu_fallback_h265_video_ffmpeg_cuda(self):
         # because its dimensions are too small
         decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
 
+        assert not decoder.cpu_fallback.status_known
+
         _ = decoder.get_frame_at(0)
 
         assert decoder.cpu_fallback.status_known
@@ -1779,9 +1768,14 @@ def test_cpu_fallback_h265_video_beta_cuda(self):
         with set_cuda_backend("beta"):
             decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
 
+        # Before accessing any frames, status should be unknown
+        assert decoder.cpu_fallback.status_known
+
         _ = decoder.get_frame_at(0)
 
+        # After accessing frames, status should be known
         assert decoder.cpu_fallback.status_known
+
         assert bool(decoder.cpu_fallback)
         assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
 

From e97490e27d6cbd9db2ffaa38a0ec8bbaa902c23c Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Thu, 4 Dec 2025 14:44:45 -0800
Subject: [PATCH 10/14] switch _core._get_backend_details() to new API

---
 src/torchcodec/decoders/_video_decoder.py | 13 +++++-
 test/test_decoders.py                     | 49 +++++++++--------------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 54dec7bf4..b25904663 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -41,6 +41,7 @@ def __init__(self):
         self.status_known = False
         self._nvcuvid_unavailable = False
         self._video_not_supported = False
+        self._backend = ""
 
     def __bool__(self):
         """Returns True if fallback occurred."""
@@ -60,8 +61,11 @@ def __str__(self):
             reasons.append("Video not supported")
 
         if reasons:
-            return "Fallback status: Falling back due to: " + ", ".join(reasons)
-        return "Fallback status: No fallback required"
+            return (
+                f"[{self._backend}] Fallback status: Falling back due to: "
+                + ", ".join(reasons)
+            )
+        return f"[{self._backend}] Fallback status: No fallback required"
 
 
 class VideoDecoder:
@@ -241,6 +245,11 @@ def cpu_fallback(self) -> CpuFallbackStatus:
             if "status unknown" not in backend_details:
                 self._cpu_fallback.status_known = True
 
+                for backend in ("FFmpeg CUDA", "Beta CUDA", "CPU"):
+                    if backend_details.startswith(backend):
+                        self._cpu_fallback._backend = backend
+                        break
+
                 if "CPU fallback" in backend_details:
                     if "NVCUVID not available" in backend_details:
                         self._cpu_fallback._nvcuvid_unavailable = True
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 95034b259..b56d70290 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1672,22 +1672,27 @@ def test_beta_cuda_interface_cpu_fallback(self):
         # to the CPU path, too.
 
         ref_dec = VideoDecoder(H265_VIDEO.path, device="cuda")
-        ref_frames = ref_dec.get_frame_at(0)
-        assert (
-            _core._get_backend_details(ref_dec._decoder)
-            == "FFmpeg CUDA Device Interface. Using CPU fallback."
-        )
+
+        # Before accessing any frames, status should be unknown
+        assert not ref_dec.cpu_fallback.status_known
+
+        ref_frame = ref_dec.get_frame_at(0)
+
+        assert "FFmpeg CUDA" in str(ref_dec.cpu_fallback)
+        assert ref_dec.cpu_fallback.status_known
+        assert bool(ref_dec.cpu_fallback)
 
         with set_cuda_backend("beta"):
             beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda")
 
-        assert (
-            _core._get_backend_details(beta_dec._decoder)
-            == "Beta CUDA Device Interface. Using CPU fallback."
-        )
+        assert "Beta CUDA" in str(beta_dec.cpu_fallback)
+        # For beta interface, status is known immediately
+        assert beta_dec.cpu_fallback.status_known
+        assert bool(beta_dec.cpu_fallback)
+
         beta_frame = beta_dec.get_frame_at(0)
 
-        assert psnr(ref_frames.data, beta_frame.data) > 25
+        assert psnr(ref_frame.data, beta_frame.data) > 25
 
     @needs_cuda
     def test_beta_cuda_interface_error(self):
@@ -1715,7 +1720,8 @@ def test_set_cuda_backend(self):
         # Check that the default is the ffmpeg backend
         assert _get_cuda_backend() == "ffmpeg"
         dec = VideoDecoder(H265_VIDEO.path, device="cuda")
-        assert _core._get_backend_details(dec._decoder).startswith("FFmpeg CUDA")
+        _ = dec.get_frame_at(0)
+        assert "FFmpeg CUDA" in str(dec.cpu_fallback)
 
         # Check the setting "beta" effectively uses the BETA backend.
         # We also show that the affects decoder creation only. When the decoder
@@ -1724,9 +1730,9 @@ def test_set_cuda_backend(self):
         with set_cuda_backend("beta"):
             dec = VideoDecoder(H265_VIDEO.path, device="cuda")
         assert _get_cuda_backend() == "ffmpeg"
-        assert _core._get_backend_details(dec._decoder).startswith("Beta CUDA")
+        assert "Beta CUDA" in str(dec.cpu_fallback)
         with set_cuda_backend("ffmpeg"):
-            assert _core._get_backend_details(dec._decoder).startswith("Beta CUDA")
+            assert "Beta CUDA" in str(dec.cpu_fallback)
 
         # Hacky way to ensure passing "cuda:1" is supported by both backends. We
         # just check that there's an error when passing cuda:N where N is too
@@ -1762,23 +1768,6 @@ def test_cpu_fallback_h265_video_ffmpeg_cuda(self):
         assert bool(decoder.cpu_fallback)
         assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
 
-    @needs_cuda
-    def test_cpu_fallback_h265_video_beta_cuda(self):
-        """Test that H265 video triggers CPU fallback on Beta CUDA interface."""
-        with set_cuda_backend("beta"):
-            decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
-
-        # Before accessing any frames, status should be unknown
-        assert decoder.cpu_fallback.status_known
-
-        _ = decoder.get_frame_at(0)
-
-        # After accessing frames, status should be known
-        assert decoder.cpu_fallback.status_known
-
-        assert bool(decoder.cpu_fallback)
-        assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
-
     @needs_cuda
     def test_cpu_fallback_no_fallback_on_supported_video(self):
         """Test that supported videos don't trigger fallback on CUDA."""

From 6a05947d2ac6d03dad4335c21671255b33dfd2a0 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Sun, 7 Dec 2025 20:57:48 -0800
Subject: [PATCH 11/14] address feedback

---
 docs/source/api_ref_decoders.rst          |  1 +
 src/torchcodec/decoders/_video_decoder.py | 47 +++++++++------
 test/test_decoders.py                     | 69 ++++++++---------------
 test/utils.py                             |  7 +++
 4 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst
index 1417d7aea..b3a1f3250 100644
--- a/docs/source/api_ref_decoders.rst
+++ b/docs/source/api_ref_decoders.rst
@@ -33,3 +33,4 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_decoding_a
 
     VideoStreamMetadata
     AudioStreamMetadata
+    CpuFallbackStatus
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 2f91878ca..38e9dd11f 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -7,7 +7,7 @@
 import io
 import json
 import numbers
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Literal, Optional, Sequence, Tuple, Union
 
@@ -29,20 +29,24 @@ class CpuFallbackStatus:
     """Information about CPU fallback status.
 
     This class tracks whether the decoder fell back to CPU decoding.
+    Users should not instantiate this class directly; instead, access it
+    via the :attr:`VideoDecoder.cpu_fallback` attribute.
 
     Usage:
-        - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status
-        - Use ``bool(cpu_fallback_status)`` to check if any fallback occurred
 
-    Attributes:
-        status_known (bool): Whether the fallback status has been determined.
+    - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status
+    - Use ``if cpu_fallback_status:`` to check if any fallback occurred
     """
 
-    def __init__(self):
-        self.status_known = False
-        self._nvcuvid_unavailable = False
-        self._video_not_supported = False
-        self._backend = ""
+    status_known: bool = False
+    """Whether the fallback status has been determined.
+    For the Beta CUDA backend (see :func:`~torchcodec.decoders.set_cuda_backend`),
+    this is always ``True`` immediately after decoder creation.
+    For the FFmpeg CUDA backend, this becomes ``True`` after decoding
+    the first frame."""
+    _nvcuvid_unavailable: bool = field(default=False, init=False)
+    _video_not_supported: bool = field(default=False, init=False)
+    _backend: str = field(default="", init=False)
 
     def __bool__(self):
         """Returns True if fallback occurred."""
@@ -53,7 +57,7 @@ def __bool__(self):
     def __str__(self):
         """Returns a human-readable string representation of the cpu fallback status."""
         if not self.status_known:
-            return "Fallback status: Unknown"
+            return f"[{self._backend}] Fallback status: Unknown"
 
         reasons = []
         if self._nvcuvid_unavailable:
@@ -235,25 +239,32 @@ def __init__(
         )
 
         self._cpu_fallback = CpuFallbackStatus()
+        if device.startswith("cuda"):
+            if device_variant == "beta":
+                self._cpu_fallback._backend = "Beta CUDA"
+            else:
+                self._cpu_fallback._backend = "FFmpeg CUDA"
+        else:
+            self._cpu_fallback._backend = "CPU"
 
     def __len__(self) -> int:
         return self._num_frames
 
     @property
     def cpu_fallback(self) -> CpuFallbackStatus:
-        # We can only determine whether fallback to CPU is happening when this
-        # property is accessed and requires that at least one frame has been decoded.
+        # We only query the CPU fallback info if status is unknown. That happens
+        # either when:
+        # - this @property has never been called before
+        # - no frame has been decoded yet on the FFmpeg interface.
+        # Note that for the beta interface, we're able to know the fallback status
+        # right when the VideoDecoder is instantiated, but the status_known
+        # attribute is initialized to False.
         if not self._cpu_fallback.status_known:
             backend_details = core._get_backend_details(self._decoder)
 
             if "status unknown" not in backend_details:
                 self._cpu_fallback.status_known = True
 
-                for backend in ("FFmpeg CUDA", "Beta CUDA", "CPU"):
-                    if backend_details.startswith(backend):
-                        self._cpu_fallback._backend = backend
-                        break
-
                 if "CPU fallback" in backend_details:
                     if "NVCUVID not available" in backend_details:
                         self._cpu_fallback._nvcuvid_unavailable = True
diff --git a/test/test_decoders.py b/test/test_decoders.py
index b56d70290..dee4325e4 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -27,6 +27,7 @@
     assert_frames_equal,
     AV1_VIDEO,
     BT709_FULL_RANGE,
+    cuda_devices,
     cuda_version_used_for_building_torch,
     get_ffmpeg_major_version,
     get_python_version,
@@ -1680,7 +1681,7 @@ def test_beta_cuda_interface_cpu_fallback(self):
 
         assert "FFmpeg CUDA" in str(ref_dec.cpu_fallback)
         assert ref_dec.cpu_fallback.status_known
-        assert bool(ref_dec.cpu_fallback)
+        assert ref_dec.cpu_fallback
 
         with set_cuda_backend("beta"):
             beta_dec = VideoDecoder(H265_VIDEO.path, device="cuda")
@@ -1688,7 +1689,7 @@ def test_beta_cuda_interface_cpu_fallback(self):
         assert "Beta CUDA" in str(beta_dec.cpu_fallback)
         # For beta interface, status is known immediately
         assert beta_dec.cpu_fallback.status_known
-        assert bool(beta_dec.cpu_fallback)
+        assert beta_dec.cpu_fallback
 
         beta_frame = beta_dec.get_frame_at(0)
 
@@ -1720,7 +1721,6 @@ def test_set_cuda_backend(self):
         # Check that the default is the ffmpeg backend
         assert _get_cuda_backend() == "ffmpeg"
         dec = VideoDecoder(H265_VIDEO.path, device="cuda")
-        _ = dec.get_frame_at(0)
         assert "FFmpeg CUDA" in str(dec.cpu_fallback)
 
         # Check the setting "beta" effectively uses the BETA backend.
@@ -1747,65 +1747,44 @@ def test_cpu_fallback_no_fallback_on_cpu_device(self):
         """Test that CPU device doesn't trigger fallback (it's not a fallback scenario)."""
         decoder = VideoDecoder(NASA_VIDEO.path, device="cpu")
 
+        assert decoder.cpu_fallback.status_known
         _ = decoder[0]
 
-        assert decoder.cpu_fallback.status_known
-        assert not bool(decoder.cpu_fallback)
+        assert not decoder.cpu_fallback
         assert "No fallback required" in str(decoder.cpu_fallback)
 
     @needs_cuda
-    def test_cpu_fallback_h265_video_ffmpeg_cuda(self):
-        """Test that H265 video triggers CPU fallback on FFmpeg CUDA interface."""
-        # H265_VIDEO is known to trigger CPU fallback on FFmpeg CUDA
+    @pytest.mark.parametrize("device", cuda_devices())
+    def test_cpu_fallback_h265_video(self, device):
+        """Test that H265 video triggers CPU fallback on CUDA interfaces."""
+        # H265_VIDEO is known to trigger CPU fallback on CUDA
         # because its dimensions are too small
-        decoder = VideoDecoder(H265_VIDEO.path, device="cuda")
+        decoder, _ = make_video_decoder(H265_VIDEO.path, device=device)
 
-        assert not decoder.cpu_fallback.status_known
+        if "beta" in device:
+            # For beta interface, status is known immediately
+            assert decoder.cpu_fallback.status_known
+        else:
+            # For FFmpeg interface, status is unknown until first frame is decoded
+            assert not decoder.cpu_fallback.status_known
 
-        _ = decoder.get_frame_at(0)
+        decoder.get_frame_at(0)
 
         assert decoder.cpu_fallback.status_known
-        assert bool(decoder.cpu_fallback)
-        assert "Fallback status: Falling back due to:" in str(decoder.cpu_fallback)
+        assert decoder.cpu_fallback
+        assert "Video not supported" in str(decoder.cpu_fallback)
 
     @needs_cuda
-    def test_cpu_fallback_no_fallback_on_supported_video(self):
+    @pytest.mark.parametrize("device", cuda_devices())
+    def test_cpu_fallback_no_fallback_on_supported_video(self, device):
         """Test that supported videos don't trigger fallback on CUDA."""
-        decoder = VideoDecoder(NASA_VIDEO.path, device="cuda")
+        decoder, _ = make_video_decoder(NASA_VIDEO.path, device=device)
 
-        _ = decoder[0]
+        decoder[0]
 
-        assert not bool(decoder.cpu_fallback)
+        assert not decoder.cpu_fallback
         assert "No fallback required" in str(decoder.cpu_fallback)
 
-    def test_cpu_fallback_status_cached(self):
-        """Test that cpu_fallback status is determined once and then cached."""
-        decoder = VideoDecoder(NASA_VIDEO.path)
-
-        _ = decoder[0]
-        first_status = str(decoder.cpu_fallback)
-        assert decoder.cpu_fallback.status_known
-
-        _ = decoder[1]
-        second_status = str(decoder.cpu_fallback)
-        assert decoder.cpu_fallback.status_known
-
-        assert first_status == second_status
-
-    def test_cpu_fallback_multiple_access_methods(self):
-        """Test that cpu_fallback works with different frame access methods."""
-        decoder = VideoDecoder(NASA_VIDEO.path)
-
-        _ = decoder.get_frame_at(0)
-        assert decoder.cpu_fallback.status_known
-        status_after_get_frame = str(decoder.cpu_fallback)
-
-        _ = decoder.get_frames_in_range(1, 3)
-        assert str(decoder.cpu_fallback) == status_after_get_frame
-
-        _ = decoder.get_frame_played_at(0.5)
-        assert str(decoder.cpu_fallback) == status_after_get_frame
-
 
 class TestAudioDecoder:
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
diff --git a/test/utils.py b/test/utils.py
index fb2d84483..9e3ddde00 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -52,6 +52,13 @@ def all_supported_devices():
     )
 
 
+def cuda_devices():
+    """Pytest params for the default and Beta CUDA device strings."""
+    needs_cuda_mark = pytest.mark.needs_cuda
+    device_strs = ("cuda", _CUDA_BETA_DEVICE_STR)
+    return tuple(pytest.param(dev, marks=needs_cuda_mark) for dev in device_strs)
+
+
 def unsplit_device_str(device_str: str) -> str:
     # helper meant to be used as
     # device, device_variant = unsplit_device_str(device)

From 8b75eacf9605f6839804d9df10ddec07684e0a54 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Sun, 7 Dec 2025 21:06:54 -0800
Subject: [PATCH 12/14] fix lint

---
 src/torchcodec/decoders/_video_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 040caa946..5f607a47a 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -8,8 +8,8 @@
 import io
 import json
 import numbers
-from dataclasses import dataclass, field
 from collections.abc import Sequence
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Literal
 

From 14ad6c7e0d7412eb5aec0619b56326d30747ebb3 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 8 Dec 2025 07:30:33 -0800
Subject: [PATCH 13/14] ffmpeg backend logic

---
 src/torchcodec/decoders/_video_decoder.py | 12 +++++++-----
 test/test_decoders.py                     |  7 ++++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 5f607a47a..b8518f766 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -48,13 +48,12 @@ class CpuFallbackStatus:
     the first frame."""
     _nvcuvid_unavailable: bool = field(default=False, init=False)
     _video_not_supported: bool = field(default=False, init=False)
+    _is_fallback: bool = field(default=False, init=False)
     _backend: str = field(default="", init=False)
 
     def __bool__(self):
         """Returns True if fallback occurred."""
-        return self.status_known and (
-            self._nvcuvid_unavailable or self._video_not_supported
-        )
+        return self.status_known and self._is_fallback
 
     def __str__(self):
         """Returns a human-readable string representation of the cpu fallback status."""
@@ -64,8 +63,10 @@ def __str__(self):
         reasons = []
         if self._nvcuvid_unavailable:
             reasons.append("NVcuvid unavailable")
-        if self._video_not_supported:
+        elif self._video_not_supported:
             reasons.append("Video not supported")
+        elif self._is_fallback:
+            reasons.append("Unknown reason - try the Beta interface to know more!")
 
         if reasons:
             return (
@@ -268,9 +269,10 @@ def cpu_fallback(self) -> CpuFallbackStatus:
                 self._cpu_fallback.status_known = True
 
                 if "CPU fallback" in backend_details:
+                    self._cpu_fallback._is_fallback = True
                     if "NVCUVID not available" in backend_details:
                         self._cpu_fallback._nvcuvid_unavailable = True
-                    else:
+                    elif self._cpu_fallback._backend == "Beta CUDA":
                         self._cpu_fallback._video_not_supported = True
 
         return self._cpu_fallback
diff --git a/test/test_decoders.py b/test/test_decoders.py
index ae8170d8a..8fcd78665 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1772,7 +1772,12 @@ def test_cpu_fallback_h265_video(self, device):
 
         assert decoder.cpu_fallback.status_known
         assert decoder.cpu_fallback
-        assert "Video not supported" in str(decoder.cpu_fallback)
+        if "beta" in device:
+            # Beta interface provides the specific reason for fallback
+            assert "Video not supported" in str(decoder.cpu_fallback)
+        else:
+            # FFmpeg interface doesn't know the specific reason
+            assert "Unknown reason" in str(decoder.cpu_fallback)
 
     @needs_cuda
     @pytest.mark.parametrize("device", cuda_devices())

From bddfa7cb11283558945d1f4e748477458df9092e Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 8 Dec 2025 11:29:51 -0800
Subject: [PATCH 14/14] add cpufallback

---
 examples/decoding/basic_cuda_example.py | 41 ++++++++++++-------------
 examples/decoding/performance_tips.py   | 22 +++++++++++++
 2 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/examples/decoding/basic_cuda_example.py b/examples/decoding/basic_cuda_example.py
index 8f82940c0..13a0afe52 100644
--- a/examples/decoding/basic_cuda_example.py
+++ b/examples/decoding/basic_cuda_example.py
@@ -18,28 +18,6 @@
 running the transform steps. Encoded packets are often much smaller than decoded frames so
 CUDA decoding also uses less PCI-e bandwidth.
 
-When to and when not to use CUDA Decoding
------------------------------------------
-
-CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios:
-
-#. You are decoding a large resolution video
-#. You are decoding a large batch of videos that's saturating the CPU
-#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors
-   after decoding
-#. Your CPU is saturated and you want to free it up for other work
-
-
-Here are situations where CUDA Decoding may not make sense:
-
-#. You want bit-exact results compared to CPU Decoding
-#. You have small resolution videos and the PCI-e transfer latency is large
-#. Your GPU is already busy and CPU is not
-
-It's best to experiment with CUDA Decoding to see if it improves your use-case. With
-TorchCodec you can simply pass in a device parameter to the
-:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding.
-
 Installing TorchCodec with CUDA Enabled
 ---------------------------------------
 
@@ -161,3 +139,22 @@ def plot_cpu_and_cuda_frames(cpu_frames: torch.Tensor, cuda_frames: torch.Tensor
 print(f"{frames_equal=}")
 print(f"{mean_abs_diff=}")
 print(f"{max_abs_diff=}")
+
+
+# %%
+# Checking for CPU Fallback
+# -------------------------
+#
+# In some cases, CUDA decoding may fall back to CPU decoding. This can happen
+# when NVCUVID is not available, or when the video codec or format is not
+# supported by the NVDEC hardware decoder. TorchCodec provides the
+# :class:`~torchcodec.decoders.CpuFallbackStatus` class to detect this fallback.
+#
+# You can access the fallback status via the
+# :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute:
+
+with set_cuda_backend("beta"):
+    decoder = VideoDecoder(video_file, device="cuda")
+
+# Printing the status reports whether a CPU fallback occurred and, if so, why
+print(decoder.cpu_fallback)
diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index ac247fd64..9bcd64877 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -170,8 +170,30 @@
 #
 
 # %%
+# **Checking for CPU Fallback**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# CUDA decoding may silently fall back to CPU decoding when NVCUVID is not
+# available or when NVDEC does not support the video codec or format. Detect
+# this using the :attr:`~torchcodec.decoders.VideoDecoder.cpu_fallback` attribute:
+#
+# .. code-block:: python
+#
+#     decoder = VideoDecoder("file.mp4", device="cuda")
+#     decoder[0]  # Decode at least one frame first (for FFmpeg backend)
+#
+#     # Print detailed fallback status
+#     print(decoder.cpu_fallback)
+#
 # .. note::
 #
+#     The timing of when you can detect CPU fallback differs between backends:
+#
+#     - **FFmpeg backend**: You can only check fallback status after decoding at
+#       least one frame, because FFmpeg determines codec support lazily during decoding.
+#     - **BETA backend**: You can check fallback status immediately after
+#       decoder creation, as the backend checks codec support upfront.
+#
 #     For installation instructions, detailed examples, and visual comparisons
 #     between CPU and CUDA decoding, see:
 #