diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 3ce603c3ed2..a775198842a 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -21,6 +21,7 @@
 import torchvision.transforms.v2 as transforms
 
 from common_utils import (
+    assert_close,
     assert_equal,
     cache,
     cpu_and_cuda,
@@ -41,7 +42,6 @@
 )
 
 from torch import nn
-from torch.testing import assert_close
 from torch.utils._pytree import tree_flatten, tree_map
 from torch.utils.data import DataLoader, default_collate
 from torchvision import tv_tensors
@@ -3505,6 +3505,9 @@ def test_kernel_video(self):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -3520,16 +3523,36 @@ def test_functional(self, make_input):
             (F.crop_mask, tv_tensors.Mask),
             (F.crop_video, tv_tensors.Video),
             (F.crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS)
-    def test_functional_image_correctness(self, kwargs):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
+    def test_functional_image_correctness(self, kwargs, make_input):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = F.crop(image, **kwargs)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs))
 
         assert_equal(actual, expected)
@@ -3548,6 +3571,9 @@ def test_functional_image_correctness(self, kwargs):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, param, value, make_input):
@@ -3614,7 +3640,16 @@ def test_transform_pad_if_needed(self):
         padding_mode=["constant", "edge", "reflect", "symmetric"],
     )
     @pytest.mark.parametrize("seed", list(range(5)))
-    def test_transform_image_correctness(self, param, value, seed):
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
+    def test_transform_image_correctness(self, param, value, seed, make_input):
         kwargs = {param: value}
         if param != "size":
             # 1. size is required
@@ -3625,13 +3660,17 @@ def test_transform_image_correctness(self, param, value, seed):
 
         transform = transforms.RandomCrop(pad_if_needed=True, **kwargs)
 
-        image = make_image(self.INPUT_SIZE)
+        image = make_input(self.INPUT_SIZE)
 
         with freeze_rng_state():
             torch.manual_seed(seed)
             actual = transform(image)
 
             torch.manual_seed(seed)
+
+            if make_input is make_image_cvcuda:
+                image = F.cvcuda_to_tensor(image)[0].cpu()
+
             expected = F.to_image(transform(F.to_pil_image(image)))
 
             assert_equal(actual, expected)
@@ -4458,6 +4497,9 @@ def test_kernel(self, kernel, make_input):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -4474,9 +4516,16 @@ def test_functional(self, make_input):
             (F.resized_crop_mask, tv_tensors.Mask),
             (F.resized_crop_video, tv_tensors.Video),
             (F.resized_crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._resized_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._resized_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type)
 
     @param_value_parametrization(
@@ -4493,6 +4542,9 @@ def test_functional_signature(self, kernel, input_type):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, param, value, make_input):
@@ -4504,20 +4556,33 @@ def test_transform(self, param, value, make_input):
 
     # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2.
     # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT`
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST})
-    def test_functional_image_correctness(self, interpolation):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8)
+    def test_functional_image_correctness(self, make_input, interpolation):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8)
 
         actual = F.resized_crop(
             image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True
         )
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(
             F.resized_crop(
                 F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation
             )
         )
 
-        torch.testing.assert_close(actual, expected, atol=1, rtol=0)
+        assert_close(actual, expected, atol=1, rtol=0)
 
     def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size):
         new_height, new_width = size
@@ -4928,6 +4993,9 @@ def test_kernel_video(self):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -4943,9 +5011,16 @@ def test_functional(self, make_input):
             (F.center_crop_mask, tv_tensors.Mask),
             (F.center_crop_video, tv_tensors.Video),
             (F.center_crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._center_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._center_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize(
@@ -4958,17 +5033,33 @@ def test_functional_signature(self, kernel, input_type):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, make_input):
         check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE))
 
     @pytest.mark.parametrize("output_size", OUTPUT_SIZES)
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)])
-    def test_image_correctness(self, output_size, fn):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    def test_image_correctness(self, output_size, make_input, fn):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, output_size)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size))
 
         assert_equal(actual, expected)
@@ -6243,7 +6334,15 @@ def wrapper(*args, **kwargs):
 
     @pytest.mark.parametrize(
         "make_input",
-        [make_image_tensor, make_image_pil, make_image, make_video],
+        [
+            make_image_tensor,
+            make_image_pil,
+            make_image,
+            make_video,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
     )
     @pytest.mark.parametrize("functional", [F.five_crop, F.ten_crop])
     def test_functional(self, make_input, functional):
@@ -6261,13 +6360,27 @@ def test_functional(self, make_input, functional):
             (F.five_crop, F._geometry._five_crop_image_pil, PIL.Image.Image),
             (F.five_crop, F.five_crop_image, tv_tensors.Image),
             (F.five_crop, F.five_crop_video, tv_tensors.Video),
+            pytest.param(
+                F.five_crop,
+                F._geometry._five_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
             (F.ten_crop, F.ten_crop_image, torch.Tensor),
             (F.ten_crop, F._geometry._ten_crop_image_pil, PIL.Image.Image),
             (F.ten_crop, F.ten_crop_image, tv_tensors.Image),
             (F.ten_crop, F.ten_crop_video, tv_tensors.Video),
+            pytest.param(
+                F.ten_crop,
+                F._geometry._ten_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, functional, kernel, input_type):
+        if kernel is F._geometry._five_crop_image_cvcuda or kernel is F._geometry._ten_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(functional, kernel=kernel, input_type=input_type)
 
     class _TransformWrapper(nn.Module):
@@ -6289,7 +6402,15 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
     @pytest.mark.parametrize(
         "make_input",
-        [make_image_tensor, make_image_pil, make_image, make_video],
+        [
+            make_image_tensor,
+            make_image_pil,
+            make_image,
+            make_video,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
     )
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
     def test_transform(self, make_input, transform_cls):
@@ -6307,19 +6428,41 @@ def test_transform_error(self, make_input, transform_cls):
         with pytest.raises(TypeError, match="not supported"):
             transform(make_input(self.INPUT_SIZE))
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn", [F.five_crop, transform_cls_to_functional(transforms.FiveCrop)])
-    def test_correctness_image_five_crop(self, fn):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    def test_correctness_image_five_crop(self, make_input, fn):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, size=self.OUTPUT_SIZE)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.five_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE)
 
         assert isinstance(actual, tuple)
         assert_equal(actual, [F.to_image(e) for e in expected])
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn_or_class", [F.ten_crop, transforms.TenCrop])
     @pytest.mark.parametrize("vertical_flip", [False, True])
-    def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip):
+    def test_correctness_image_ten_crop(self, make_input, fn_or_class, vertical_flip):
         if fn_or_class is transforms.TenCrop:
             fn = transform_cls_to_functional(fn_or_class, size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
             kwargs = dict()
@@ -6327,9 +6470,13 @@ def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip):
             fn = fn_or_class
             kwargs = dict(size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
 
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, **kwargs)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.ten_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
 
         assert isinstance(actual, tuple)
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
index 96166e05e9a..e84b7c6d7c3 100644
--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -194,6 +194,8 @@ class CenterCrop(Transform):
 
     _v1_transform_cls = _transforms.CenterCrop
 
+    _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]]):
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -252,6 +254,8 @@ class RandomResizedCrop(Transform):
 
     _v1_transform_cls = _transforms.RandomResizedCrop
 
+    _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(
         self,
         size: Union[int, Sequence[int]],
@@ -360,6 +364,8 @@ class FiveCrop(Transform):
 
     _v1_transform_cls = _transforms.FiveCrop
 
+    _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]]) -> None:
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -404,6 +410,8 @@ class TenCrop(Transform):
 
     _v1_transform_cls = _transforms.TenCrop
 
+    _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -811,6 +819,8 @@ class RandomCrop(Transform):
 
     _v1_transform_cls = _transforms.RandomCrop
 
+    _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def _extract_params_for_v1_transform(self) -> dict[str, Any]:
         params = super()._extract_params_for_v1_transform()
 
@@ -1121,6 +1131,8 @@ class RandomIoUCrop(Transform):
             Default, 40.
""" + _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,) + def __init__( self, min_scale: float = 0.3, diff --git a/torchvision/transforms/v2/_utils.py b/torchvision/transforms/v2/_utils.py index bb6051b4e61..7274abaa861 100644 --- a/torchvision/transforms/v2/_utils.py +++ b/torchvision/transforms/v2/_utils.py @@ -16,7 +16,7 @@ from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from torchvision.transforms.v2.functional import get_dimensions, get_size, is_pure_tensor -from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT +from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT, _is_cvcuda_tensor def _setup_number_or_seq(arg: int | float | Sequence[int | float], name: str) -> Sequence[float]: @@ -207,6 +207,7 @@ def query_size(flat_inputs: list[Any]) -> tuple[int, int]: tv_tensors.Mask, tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, + _is_cvcuda_tensor, ), ) } diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 0e27218bc89..2e1bee05e76 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -1924,6 +1924,50 @@ def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int return crop_image(video, top, left, height, width) +def _crop_image_cvcuda( + image: "cvcuda.Tensor", + top: int, + left: int, + height: int, + width: int, +) -> "cvcuda.Tensor": + cvcuda = _import_cvcuda() + + image_height, image_width, channels = image.shape[1:] + top_diff = 0 + left_diff = 0 + height_diff = 0 + width_diff = 0 + if top < 0: + top_diff = int(-1 * top) + if left < 0: + left_diff = int(-1 * left) + if top + height > image_height: + height_diff = int(top + height - image_height) + if left + width > image_width: + width_diff = int(left + width - image_width) + if top_diff or left_diff or height_diff or width_diff: + image = cvcuda.copymakeborder( + image, + border_mode=cvcuda.Border.CONSTANT, + border_value=[0.0] * channels, + top=top_diff, + left=left_diff, + bottom=height_diff, + right=width_diff, + ) + top = top + top_diff + left = left + left_diff + return cvcuda.customcrop( + image, + cvcuda.RectI(x=left, y=top, width=width, height=height), + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(crop, _import_cvcuda().Tensor)(_crop_image_cvcuda) + + def perspective( inpt: torch.Tensor, startpoints: Optional[list[list[int]]], @@ -2674,6 +2718,47 @@ def center_crop_video(video: torch.Tensor, output_size: list[int]) -> torch.Tens return center_crop_image(video, output_size) +def _center_crop_image_cvcuda( + image: "cvcuda.Tensor", + output_size: list[int], +) -> "cvcuda.Tensor": + cvcuda = _import_cvcuda() + + crop_height, crop_width = _center_crop_parse_output_size(output_size) + # we only allow cvcuda conversion for 4 ndim, and always use nhwc layout + image_height = image.shape[1] + image_width = image.shape[2] + channels = image.shape[3] + if crop_height > image_height or crop_width > image_width: + padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) + image = cvcuda.copymakeborder( + image, + border_mode=cvcuda.Border.CONSTANT, + border_value=[0.0] * channels, + top=padding_ltrb[1], + left=padding_ltrb[0], + bottom=padding_ltrb[3], + right=padding_ltrb[2], + ) + + image_height = image.shape[1] + image_width = image.shape[2] + + if crop_width == image_width and crop_height == 
image_height: + return image + + # use customcrop to match crop_image behavior + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, image_height, image_width) + return cvcuda.customcrop( + image, + cvcuda.RectI(x=crop_left, y=crop_top, width=crop_width, height=crop_height), + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(center_crop, _import_cvcuda().Tensor)(_center_crop_image_cvcuda) + + def resized_crop( inpt: torch.Tensor, top: int, @@ -2860,6 +2945,24 @@ def resized_crop_video( ) +def _resized_crop_image_cvcuda( + image: "cvcuda.Tensor", + top: int, + left: int, + height: int, + width: int, + size: list[int], + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + antialias: Optional[bool] = True, +) -> "cvcuda.Tensor": + image = _crop_image_cvcuda(image, top, left, height, width) + return _resize_image_cvcuda(image, size, interpolation=interpolation, antialias=antialias) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(resized_crop, _import_cvcuda().Tensor)(_resized_crop_image_cvcuda) + + def five_crop( inpt: torch.Tensor, size: list[int] ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -2933,6 +3036,29 @@ def five_crop_video( return five_crop_image(video, size) +def _five_crop_image_cvcuda( + image: "cvcuda.Tensor", + size: list[int], +) -> tuple["cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor"]: + crop_height, crop_width = _parse_five_crop_size(size) + image_height, image_width = image.shape[1], image.shape[2] + + if crop_width > image_width or crop_height > image_height: + raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}") + + tl = _crop_image_cvcuda(image, 0, 0, crop_height, crop_width) + tr = _crop_image_cvcuda(image, 0, image_width - crop_width, crop_height, crop_width) + bl = _crop_image_cvcuda(image, image_height - crop_height, 0, crop_height, crop_width) + br = _crop_image_cvcuda(image, image_height - crop_height, image_width - crop_width, crop_height, crop_width) + center = _center_crop_image_cvcuda(image, [crop_height, crop_width]) + + return tl, tr, bl, br, center + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(five_crop, _import_cvcuda().Tensor)(_five_crop_image_cvcuda) + + def ten_crop( inpt: torch.Tensor, size: list[int], vertical_flip: bool = False ) -> tuple[ @@ -3028,3 +3154,35 @@ def ten_crop_video( torch.Tensor, ]: return ten_crop_image(video, size, vertical_flip=vertical_flip) + + +def _ten_crop_image_cvcuda( + image: "cvcuda.Tensor", + size: list[int], + vertical_flip: bool = False, +) -> tuple[ + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", +]: + non_flipped = _five_crop_image_cvcuda(image, size) + + if vertical_flip: + image = _vertical_flip_image_cvcuda(image) + else: + image = _horizontal_flip_image_cvcuda(image) + + flipped = _five_crop_image_cvcuda(image, size) + + return non_flipped + flipped + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(ten_crop, _import_cvcuda().Tensor)(_ten_crop_image_cvcuda)
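
Usage sketch (not part of the patch; assumes CV-CUDA is installed): the new kernels are
registered for cvcuda.Tensor via _register_kernel_internal, so the public functionals
dispatch to them transparently. make_image_cvcuda is the test-only helper imported from
common_utils in the tests above; the size and crop arguments here are illustrative.

    import torch
    import torchvision.transforms.v2.functional as F
    from common_utils import make_image_cvcuda  # test-only helper, requires CV-CUDA

    cv_image = make_image_cvcuda((32, 32), dtype=torch.uint8)  # cvcuda.Tensor, NHWC layout
    # F.crop dispatches to _crop_image_cvcuda; the out-of-bounds region implied by the
    # negative top is zero-padded via cvcuda.copymakeborder before cvcuda.customcrop.
    out = F.crop(cv_image, top=-2, left=3, height=10, width=8)
    out_torch = F.cvcuda_to_tensor(out)[0].cpu()  # back to a torch.Tensor for comparisons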