Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/labelformat/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@
YOLOv26ObjectDetectionInput,
YOLOv26ObjectDetectionOutput,
)
from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
from labelformat.formats.youtubevis import (
YouTubeVISInstanceSegmentationTrackInput,
YouTubeVISObjectDetectionTrackInput,
)

__all__ = [
"COCOInstanceSegmentationInput",
Expand Down Expand Up @@ -106,5 +109,6 @@
"YOLOv26ObjectDetectionInput",
"YOLOv26ObjectDetectionOutput",
"MaskPairInstanceSegmentationInput",
"YouTubeVISInstanceSegmentationTrackInput",
"YouTubeVISObjectDetectionTrackInput",
]
79 changes: 76 additions & 3 deletions src/labelformat/formats/youtubevis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,22 @@
import json
from argparse import ArgumentParser
from pathlib import Path
from typing import Dict, Iterable, List
from typing import Dict, Iterable, List, cast

import labelformat.formats.coco_segmentation_helpers as segmentation_helpers
from labelformat.formats.coco_segmentation_helpers import (
COCOInstanceSegmentationMultiPolygon,
COCOInstanceSegmentationRLE,
)
from labelformat.model.binary_mask_segmentation import BinaryMaskSegmentation
from labelformat.model.bounding_box import BoundingBox, BoundingBoxFormat
from labelformat.model.category import Category
from labelformat.model.instance_segmentation_track import (
InstanceSegmentationTrackInput,
SingleInstanceSegmentationTrack,
VideoInstanceSegmentationTrack,
)
from labelformat.model.multipolygon import MultiPolygon
from labelformat.model.object_detection_track import (
ObjectDetectionTrackInput,
SingleObjectDetectionTrack,
Expand All @@ -16,7 +28,7 @@
from labelformat.types import JsonDict


class YouTubeVISObjectDetectionTrackInput(ObjectDetectionTrackInput):
class _YouTubeVISBaseInput:
@staticmethod
def add_cli_arguments(parser: ArgumentParser) -> None:
parser.add_argument(
Expand Down Expand Up @@ -48,6 +60,10 @@ def get_videos(self) -> Iterable[Video]:
number_of_frames=int(video["length"]),
)


class YouTubeVISObjectDetectionTrackInput(
_YouTubeVISBaseInput, ObjectDetectionTrackInput
):
def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
video_id_to_video = {video.id: video for video in self.get_videos()}
category_id_to_category = {
Expand All @@ -66,7 +82,7 @@ def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
boxes = _get_object_track_boxes(ann=track)
objects.append(
SingleObjectDetectionTrack(
category=category_id_to_category[ann["category_id"]],
category=category_id_to_category[track["category_id"]],
boxes=boxes,
)
)
Expand All @@ -76,6 +92,37 @@ def get_labels(self) -> Iterable[VideoObjectDetectionTrack]:
)


class YouTubeVISInstanceSegmentationTrackInput(
    _YouTubeVISBaseInput, InstanceSegmentationTrackInput
):
    """Instance segmentation track input read from YouTube-VIS style data."""

    def get_labels(self) -> Iterable[VideoInstanceSegmentationTrack]:
        """Yield one ``VideoInstanceSegmentationTrack`` per video.

        Every entry of ``self._data["annotations"]`` is treated as one object
        track and is grouped under the video named by its ``"video_id"``.
        Videos without any annotation are still yielded, with an empty
        ``objects`` list.

        Raises:
            KeyError: If an annotation references a video id or category id
                that is not among the loaded videos / categories.
        """
        videos_by_id = {video.id: video for video in self.get_videos()}
        categories_by_id = {
            category.id: category for category in self.get_categories()
        }

        # Seed every known video with an empty list so that videos without
        # annotations are reported as well.
        annotations_by_video: Dict[int, List[JsonDict]] = {
            video_id: [] for video_id in videos_by_id
        }
        for annotation in self._data["annotations"]:
            annotations_by_video[annotation["video_id"]].append(annotation)

        for video_id, video_annotations in annotations_by_video.items():
            yield VideoInstanceSegmentationTrack(
                video=videos_by_id[video_id],
                objects=[
                    # Keyword arguments are evaluated left to right: the
                    # segmentations are converted before the category lookup.
                    SingleInstanceSegmentationTrack(
                        segmentations=_get_object_track_segmentations(
                            ann=track_annotation
                        ),
                        category=categories_by_id[track_annotation["category_id"]],
                    )
                    for track_annotation in video_annotations
                ],
            )


def _get_object_track_boxes(
ann: JsonDict,
) -> list[BoundingBox | None]:
Expand All @@ -91,3 +138,29 @@ def _get_object_track_boxes(
)
)
return boxes


def _get_object_track_segmentations(
ann: JsonDict,
) -> list[MultiPolygon | BinaryMaskSegmentation | None]:
segmentations: list[MultiPolygon | BinaryMaskSegmentation | None] = []
bboxes = ann["bboxes"]
for index, segmentation in enumerate(ann["segmentations"]):
if segmentation is None or len(segmentation) == 0:
segmentations.append(None)
continue
if isinstance(segmentation, dict):
segmentation_rle = cast(COCOInstanceSegmentationRLE, segmentation)
segmentations.append(
segmentation_helpers.coco_segmentation_to_binary_mask_rle(
segmentation=segmentation_rle, bbox=bboxes[index]
)
)
elif isinstance(segmentation, list):
segmentation_mp = cast(COCOInstanceSegmentationMultiPolygon, segmentation)
segmentations.append(
segmentation_helpers.coco_segmentation_to_multipolygon(
coco_segmentation=segmentation_mp
)
)
return segmentations
83 changes: 82 additions & 1 deletion tests/unit/formats/test_youtubevis.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
import json
from pathlib import Path

from labelformat.formats.youtubevis import YouTubeVISObjectDetectionTrackInput
import numpy as np

from labelformat.formats.youtubevis import (
YouTubeVISInstanceSegmentationTrackInput,
YouTubeVISObjectDetectionTrackInput,
)
from labelformat.model.binary_mask_segmentation import (
BinaryMaskSegmentation,
RLEDecoderEncoder,
)
from labelformat.model.bounding_box import BoundingBox
from labelformat.model.category import Category
from labelformat.model.instance_segmentation_track import (
SingleInstanceSegmentationTrack,
VideoInstanceSegmentationTrack,
)
from labelformat.model.object_detection_track import (
SingleObjectDetectionTrack,
VideoObjectDetectionTrack,
Expand Down Expand Up @@ -63,6 +76,39 @@ def test_get_labels(self, tmp_path: Path) -> None:
]


class TestYouTubeVISInstanceSegmentationTrackInput:
    """Tests for ``YouTubeVISInstanceSegmentationTrackInput``."""

    def test_get_labels(self, tmp_path: Path) -> None:
        """A track with one dict segmentation and one ``None`` entry is read back."""
        annotations_path = _write_youtube_vis_instance_segmentation_json(
            tmp_path / "instances.json"
        )
        track_input = YouTubeVISInstanceSegmentationTrackInput(
            input_file=annotations_path
        )

        # Expected segmentation for the first entry; the second entry of the
        # track is expected to stay None.
        expected_mask = BinaryMaskSegmentation.from_binary_mask(
            binary_mask=np.array([[0, 1, 1], [0, 1, 1]], dtype=int),
            bounding_box=BoundingBox(xmin=1.0, ymin=0.0, xmax=3.0, ymax=2.0),
        )

        labels = list(track_input.get_labels())

        expected_video = Video(
            id=5,
            filename="video1",
            width=3,
            height=2,
            number_of_frames=2,
        )
        expected_track = SingleInstanceSegmentationTrack(
            category=Category(id=1, name="cat"),
            segmentations=[expected_mask, None],
        )
        assert labels == [
            VideoInstanceSegmentationTrack(
                video=expected_video,
                objects=[expected_track],
            )
        ]


def _write_youtube_vis_json(input_file: Path) -> Path:
data = {
"categories": [
Expand Down Expand Up @@ -90,3 +136,38 @@ def _write_youtube_vis_json(input_file: Path) -> Path:
}
input_file.write_text(json.dumps(data))
return input_file


def _write_youtube_vis_instance_segmentation_json(input_file: Path) -> Path:
    """Write a small YouTube-VIS style JSON fixture with one segmentation track.

    The fixture contains one category, one video with two file names, and one
    annotation whose first entry carries a bbox plus a dict segmentation and
    whose second entry is ``None`` for both.

    Args:
        input_file: Path the JSON document is written to.

    Returns:
        ``input_file``, unchanged, for convenient chaining.
    """
    mask = np.array([[0, 1, 1], [0, 1, 1]], dtype=int)
    first_segmentation = {
        "counts": RLEDecoderEncoder.encode_column_wise_rle(mask),
        "size": [2, 3],
    }
    category = {"id": 1, "name": "cat"}
    video = {
        "id": 5,
        "file_names": ["video1/00000.jpg", "video1/00001.jpg"],
        "width": 3,
        "height": 2,
        "length": 2,
    }
    track = {
        "video_id": 5,
        "category_id": 1,
        "bboxes": [[1.0, 0.0, 2.0, 2.0], None],
        "segmentations": [first_segmentation, None],
    }
    # Key order is kept as categories / videos / annotations so the serialized
    # text stays the same.
    payload = {
        "categories": [category],
        "videos": [video],
        "annotations": [track],
    }
    input_file.write_text(json.dumps(payload))
    return input_file