lightly-ai · JonasWurst · Jan 19, 2026 · Jan 13, 2026 · Jan 14, 2026 · Jan 14, 2026
diff --git a/src/labelformat/formats/__init__.py b/src/labelformat/formats/__init__.py
@@ -65,7 +65,10 @@
     YOLOv26ObjectDetectionInput,
     YOLOv26ObjectDetectionOutput,
 )
-from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
+from labelformat.formats.youtubevis import (
+    YouTubeVISInstanceSegmentationTrackInput,
+    YouTubeVISObjectDetectionTrackInput,
+)
 
 __all__ = [
     "COCOInstanceSegmentationInput",
@@ -106,5 +109,6 @@
     "YOLOv26ObjectDetectionInput",
     "YOLOv26ObjectDetectionOutput",
     "MaskPairInstanceSegmentationInput",
+    "YouTubeVISInstanceSegmentationTrackInput",
     "YouTubeVISObjectDetectionTrackInput",
 ]
diff --git a/src/labelformat/formats/youtubevis.py b/src/labelformat/formats/youtubevis.py
@@ -3,10 +3,22 @@
 import json
 from argparse import ArgumentParser
 from pathlib import Path
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, cast
 
+import labelformat.formats.coco_segmentation_helpers as segmentation_helpers
+from labelformat.formats.coco_segmentation_helpers import (
+    COCOInstanceSegmentationMultiPolygon,
+    COCOInstanceSegmentationRLE,
+)
+from labelformat.model.binary_mask_segmentation import BinaryMaskSegmentation
 from labelformat.model.bounding_box import BoundingBox, BoundingBoxFormat
 from labelformat.model.category import Category
+from labelformat.model.instance_segmentation_track import (
+    InstanceSegmentationTrackInput,
+    SingleInstanceSegmentationTrack,
+    VideoInstanceSegmentationTrack,
+)
+from labelformat.model.multipolygon import MultiPolygon
 from labelformat.model.object_detection_track import (
     ObjectDetectionTrackInput,
     SingleObjectDetectionTrack,
@@ -16,7 +28,7 @@
 from labelformat.types import JsonDict
 
 
-class YouTubeVISObjectDetectionTrackInput(ObjectDetectionTrackInput):
+class _YouTubeVISBaseInput:
     @staticmethod
     def add_cli_arguments(parser: ArgumentParser) -> None:
         parser.add_argument(
@@ -48,6 +60,10 @@ def get_videos(self) -> Iterable[Video]:
                 number_of_frames=int(video["length"]),
             )
 
+
+class YouTubeVISObjectDetectionTrackInput(
+    _YouTubeVISBaseInput, ObjectDetectionTrackInput
+):
     def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
         video_id_to_video = {video.id: video for video in self.get_videos()}
         category_id_to_category = {
@@ -66,7 +82,7 @@ def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
                 boxes = _get_object_track_boxes(ann=track)
                 objects.append(
                     SingleObjectDetectionTrack(
-                        category=category_id_to_category[ann["category_id"]],
+                        category=category_id_to_category[track["category_id"]],
                         boxes=boxes,
                     )
                 )
@@ -76,6 +92,37 @@ def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
             )
 
 
+class YouTubeVISInstanceSegmentationTrackInput(
+    _YouTubeVISBaseInput, InstanceSegmentationTrackInput
+):
+    def get_labels(self) -> Iterable[VideoInstanceSegmentationTrack]:
+        """Yield a VideoInstanceSegmentationTrack for each video id."""
+        videos_by_id = {video.id: video for video in self.get_videos()}
+        categories_by_id = {
+            category.id: category for category in self.get_categories()
+        }
+        # Pre-seed every video so videos without annotations still yield.
+        tracks_by_video: Dict[int, List[JsonDict]] = {
+            video_id: [] for video_id in videos_by_id
+        }
+        for annotation in self._data["annotations"]:
+            tracks_by_video[annotation["video_id"]].append(annotation)
+
+        for video_id, video in videos_by_id.items():
+            # Segmentations are listed first so they are converted before the
+            # category lookup.
+            yield VideoInstanceSegmentationTrack(
+                video=video,
+                objects=[
+                    SingleInstanceSegmentationTrack(
+                        segmentations=_get_object_track_segmentations(ann=track),
+                        category=categories_by_id[track["category_id"]],
+                    )
+                    for track in tracks_by_video[video_id]
+                ],
+            )
+
+
 def _get_object_track_boxes(
     ann: JsonDict,
 ) -> list[BoundingBox | None]:
@@ -91,3 +138,49 @@ def _get_object_track_boxes(
                 )
             )
     return boxes
+
+
+def _get_object_track_segmentations(
+    ann: JsonDict,
+) -> list[MultiPolygon | BinaryMaskSegmentation | None]:
+    """Convert the per-frame segmentations of one track annotation.
+
+    Args:
+        ann: Track annotation providing the parallel per-frame lists
+            "segmentations" and "bboxes".
+
+    Returns:
+        One entry per item of ann["segmentations"]: None for a missing (None or
+        empty) segmentation, otherwise the result of the matching
+        segmentation_helpers conversion (RLE dict or polygon list).
+
+    Raises:
+        ValueError: If a segmentation is neither a dict nor a list.
+    """
+    segmentations: list[MultiPolygon | BinaryMaskSegmentation | None] = []
+    bboxes = ann["bboxes"]
+    for index, segmentation in enumerate(ann["segmentations"]):
+        if segmentation is None or len(segmentation) == 0:
+            segmentations.append(None)
+            continue
+        if isinstance(segmentation, dict):
+            segmentation_rle = cast(COCOInstanceSegmentationRLE, segmentation)
+            segmentations.append(
+                segmentation_helpers.coco_segmentation_to_binary_mask_rle(
+                    segmentation=segmentation_rle, bbox=bboxes[index]
+                )
+            )
+        elif isinstance(segmentation, list):
+            segmentation_mp = cast(COCOInstanceSegmentationMultiPolygon, segmentation)
+            segmentations.append(
+                segmentation_helpers.coco_segmentation_to_multipolygon(
+                    coco_segmentation=segmentation_mp
+                )
+            )
+        else:
+            # Skipping the entry would shift every later frame by one index.
+            raise ValueError(
+                f"Unsupported segmentation type at frame index {index}: "
+                f"{type(segmentation).__name__}"
+            )
+    return segmentations
diff --git a/tests/unit/formats/test_youtubevis.py b/tests/unit/formats/test_youtubevis.py
@@ -1,9 +1,22 @@
 import json
 from pathlib import Path
 
-from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
+import numpy as np
+
+from labelformat.formats.youtubevis import (
+    YouTubeVISInstanceSegmentationTrackInput,
+    YouTubeVISObjectDetectionTrackInput,
+)
+from labelformat.model.binary_mask_segmentation import (
+    BinaryMaskSegmentation,
+    RLEDecoderEncoder,
+)
 from labelformat.model.bounding_box import BoundingBox
 from labelformat.model.category import Category
+from labelformat.model.instance_segmentation_track import (
+    SingleInstanceSegmentationTrack,
+    VideoInstanceSegmentationTrack,
+)
 from labelformat.model.object_detection_track import (
     SingleObjectDetectionTrack,
     VideoObjectDetectionTrack,
@@ -63,6 +76,39 @@ def test_get_labels(self, tmp_path: Path) -> None:
         ]
 
 
+class TestYouTubeVISInstanceSegmentationTrackInput:
+    def test_get_labels(self, tmp_path: Path) -> None:
+        """An RLE frame becomes a binary mask; a missing frame stays None."""
+        annotation_path = tmp_path / "instances.json"
+        label_input = YouTubeVISInstanceSegmentationTrackInput(
+            input_file=_write_youtube_vis_instance_segmentation_json(annotation_path)
+        )
+
+        # The expected segmentation is built from the same 2x3 mask that the
+        # fixture helper RLE-encodes into the annotation file.
+        expected_mask = BinaryMaskSegmentation.from_binary_mask(
+            binary_mask=np.array([[0, 1, 1], [0, 1, 1]], dtype=int),
+            bounding_box=BoundingBox(xmin=1.0, ymin=0.0, xmax=3.0, ymax=2.0),
+        )
+        expected_video = Video(
+            id=5, filename="video1", width=3, height=2, number_of_frames=2
+        )
+        # The second frame has no segmentation, hence the trailing None.
+        expected_track = SingleInstanceSegmentationTrack(
+            category=Category(id=1, name="cat"),
+            segmentations=[expected_mask, None],
+        )
+
+        labels = list(label_input.get_labels())
+
+        assert labels == [
+            VideoInstanceSegmentationTrack(
+                video=expected_video,
+                objects=[expected_track],
+            )
+        ]
+
+
 def _write_youtube_vis_json(input_file: Path) -> Path:
     data = {
         "categories": [
@@ -90,3 +136,38 @@ def _write_youtube_vis_json(input_file: Path) -> Path:
     }
     input_file.write_text(json.dumps(data))
     return input_file
+
+
+def _write_youtube_vis_instance_segmentation_json(input_file: Path) -> Path:
+    """Write a one-video, one-track YouTube-VIS fixture and return its path.
+
+    The track has an RLE segmentation in frame 0 and nothing in frame 1.
+    """
+    mask = np.array([[0, 1, 1], [0, 1, 1]], dtype=int)
+    rle_segmentation = {
+        "counts": RLEDecoderEncoder.encode_column_wise_rle(mask),
+        "size": [2, 3],
+    }
+    video = {
+        "id": 5,
+        "file_names": ["video1/00000.jpg", "video1/00001.jpg"],
+        "width": 3,
+        "height": 2,
+        "length": 2,
+    }
+    track = {
+        "video_id": 5,
+        "category_id": 1,
+        "bboxes": [
+            [1.0, 0.0, 2.0, 2.0],
+            None,
+        ],
+        "segmentations": [rle_segmentation, None],
+    }
+    payload = {
+        "categories": [{"id": 1, "name": "cat"}],
+        "videos": [video],
+        "annotations": [track],
+    }
+    input_file.write_text(json.dumps(payload))
+    return input_file