From 97090ecb772bff7da84d865786ec8f4a0718c61d Mon Sep 17 00:00:00 2001 From: fcdl94 Date: Tue, 30 Sep 2025 13:59:30 +0000 Subject: [PATCH 1/3] feat(augmentation): add mosaic and copy-paste augmentations for dataset enhancement - Introduced MosaicAugmentation class to combine images in a grid layout. - Added CopyPasteAugmentation class to enhance images by pasting objects from other images. - Updated DictDataset methods to support new augmentation functionalities. - Refactored existing methods for clarity and consistency, including deprecating the store_coco_roboflow_format method in favor of save. - Improved annotation handling and bounding box adjustments in the dataset processing. --- focoos/data/datasets/dict_dataset.py | 22 +- scripts/dataset_augmentations.py | 567 +++++++++++++++++++++++++++ 2 files changed, 585 insertions(+), 4 deletions(-) create mode 100644 scripts/dataset_augmentations.py diff --git a/focoos/data/datasets/dict_dataset.py b/focoos/data/datasets/dict_dataset.py index f8a35abc..ee35f72c 100644 --- a/focoos/data/datasets/dict_dataset.py +++ b/focoos/data/datasets/dict_dataset.py @@ -10,6 +10,7 @@ import numpy as np from PIL import Image from torch.utils.data import Dataset +from typing_extensions import deprecated from focoos.data.datasets.serialize import TorchSerializedDataset from focoos.ports import ( @@ -63,9 +64,16 @@ def __getitem__(self, index) -> dict: def __len__(self): return len(self.dicts) + @deprecated("Use save instead") def store_coco_roboflow_format(self, output_dir: str): """ - Store the dataset in COCO format. + Store the dataset in Roboflow COCO format. + """ + self.save(output_dir) + + def save(self, output_dir: str): + """ + Store the dataset in Roboflow COCO format. 
""" def compute_area_seg(seg): @@ -124,10 +132,10 @@ def compute_area_box(bbox): obj = { "id": annotation_idx, "image_id": data.image_id, - "category_id": ann["category_id"], + "category_id": ann["category_id"] + 1, "bbox": ann["bbox"], "area": area, # to compute - "iscrowd": ann["iscrowd"], + "iscrowd": ann.get("iscrowd", 0), } if use_seg: obj["segmentation"] = ann["segmentation"] @@ -405,7 +413,13 @@ def from_segmentation( image_file = os.path.join(ds_dir, images[image_id]) label_file = os.path.join(ds_dir, ann["file_name"]) - dataset_dicts.append(DetectronDict(file_name=image_file, sem_seg_file_name=label_file, image_id=image_id)) + dataset_dicts.append( + DetectronDict( + file_name=image_file, + sem_seg_file_name=label_file, + image_id=image_id, + ) + ) logger.info("Loaded {} images with semantic segmentation from {}".format(len(dataset_dicts), ds_dir)) diff --git a/scripts/dataset_augmentations.py b/scripts/dataset_augmentations.py new file mode 100644 index 00000000..db273a97 --- /dev/null +++ b/scripts/dataset_augmentations.py @@ -0,0 +1,567 @@ +# try to load the downloaded dataset +import argparse +import os +import random +import shutil + +import numpy as np +from PIL import Image +from tqdm import tqdm + +from focoos.data.datasets.dict_dataset import DictDataset +from focoos.ports import DatasetSplitType, DetectronDict, Task +from focoos.structures import BoxMode + +TRAIN_SPLIT_NAME = "train" +VAL_SPLIT_NAME = "valid" + + +def open_image(file_name) -> np.ndarray: + image = np.array(Image.open(file_name)) + if len(image.shape) == 2: + image = np.stack((image,) * 3, axis=-1) + elif len(image.shape) == 3 and image.shape[2] == 1: + image = np.repeat(image, 3, axis=2) + return image + + +class MosaicAugmentation: + """ + Augmentation that combines 2 or 4 images in a simple grid layout. 
+ + This augmentation creates a new image by combining images in a clean grid: + - 2 images: Side by side (horizontal) or stacked (vertical) + - 4 images: 2x2 grid + + Every pixel in the output image belongs to exactly one input image. + """ + + def __init__(self, dataset: DictDataset, output_size=(640, 640)): + """ + Args: + dataset: The dataset to sample images from + output_size: Tuple (width, height) for the output mosaic image size + """ + super().__init__() + self.dataset = dataset + self.output_size = output_size + + def _get_grid_layout(self, num_images): + """Determine the grid layout based on number of images.""" + if num_images == 2: + # Randomly choose horizontal or vertical split + if random.random() < 0.5: + return "horizontal" # Side by side + else: + return "vertical" # Stacked + elif num_images == 4: + return "2x2" # 2x2 grid + else: + raise ValueError(f"Unsupported number of images: {num_images}. Use 2 or 4.") + + def _get_grid_regions(self, layout): + """Calculate the exact regions for each image in the grid.""" + width, height = self.output_size + + if layout == "horizontal": + # Two images side by side + mid_x = width // 2 + return [ + (0, 0, mid_x, height), # Left image + (mid_x, 0, width, height), # Right image + ] + elif layout == "vertical": + # Two images stacked + mid_y = height // 2 + return [ + (0, 0, width, mid_y), # Top image + (0, mid_y, width, height), # Bottom image + ] + elif layout == "2x2": + # Four images in 2x2 grid + mid_x = width // 2 + mid_y = height // 2 + return [ + (0, 0, mid_x, mid_y), # Top-left + (mid_x, 0, width, mid_y), # Top-right + (0, mid_y, mid_x, height), # Bottom-left + (mid_x, mid_y, width, height), # Bottom-right + ] + else: + raise ValueError(f"Unknown layout: {layout}") + + def transform(self, dataset_dict): + """ + Apply mosaic augmentation to create a grid of images. 
+ + Args: + dataset_dict: Dictionary containing image information and annotations + + Returns: + mosaic_dataset_dict: Updated dataset dict with mosaic image and combined annotations + """ + # Determine number of images (2 or 4) + num_images = random.choice([2, 4]) + + # Sample random images from dataset + source_indices = [random.randint(0, len(self.dataset) - 1) for _ in range(num_images - 1)] + source_indices.insert(0, random.randint(0, len(self.dataset) - 1)) # Include original image + + # Load all images + images = [] + all_annotations = [] + + for i, idx in enumerate(source_indices): + if i == 0: + # Use the original image + image = open_image(dataset_dict["file_name"]) + annotations = dataset_dict.get("annotations", []).copy() + else: + # Sample from dataset + sample = self.dataset[idx] + image = open_image(sample["file_name"]) + annotations = sample.get("annotations", []).copy() + + images.append(image) + all_annotations.append(annotations) + + # Determine grid layout + layout = self._get_grid_layout(num_images) + regions = self._get_grid_regions(layout) + + # Create output mosaic image + mosaic_image = np.zeros((self.output_size[1], self.output_size[0], 3), dtype=np.uint8) + mosaic_annotations = [] + + # Process each image and place it in the grid + for i, (image, annotations, region) in enumerate(zip(images, all_annotations, regions)): + x1, y1, x2, y2 = region + region_width = x2 - x1 + region_height = y2 - y1 + + # Resize image to exactly fit the region (no padding, no gaps) + img_h, img_w = image.shape[:2] + + # Resize to fit the exact region dimensions + resized_image = np.array(Image.fromarray(image).resize((region_width, region_height))) + + # Place the image in the exact region + mosaic_image[y1:y2, x1:x2] = resized_image + + # Update annotations for this image + for ann in annotations: + if "bbox" in ann: + # Convert bbox to absolute coordinates + box = BoxMode.convert( + ann["bbox"], + ann.get("bbox_mode", BoxMode.XYWH_ABS), + BoxMode.XYXY_ABS, + 
) + assert ann.get("category_id") is not None, "category_id is None" + + # Scale the bbox to fit the new region + x1_orig, y1_orig, x2_orig, y2_orig = box + + # Calculate scale factors + scale_x = region_width / img_w + scale_y = region_height / img_h + + # Scale and translate the bbox + new_x1 = int(x1_orig * scale_x) + x1 + new_y1 = int(y1_orig * scale_y) + y1 + new_x2 = int(x2_orig * scale_x) + x1 + new_y2 = int(y2_orig * scale_y) + y1 + + # Check if bbox is within the output image bounds + if ( + new_x1 >= 0 + and new_y1 >= 0 + and new_x2 <= self.output_size[0] + and new_y2 <= self.output_size[1] + and new_x2 > new_x1 + and new_y2 > new_y1 + ): + # Convert back to XYWH format + new_w_bbox = new_x2 - new_x1 + new_h_bbox = new_y2 - new_y1 + new_area = new_w_bbox * new_h_bbox + + mosaic_annotations.append( + { + "bbox": [new_x1, new_y1, new_w_bbox, new_h_bbox], + "bbox_mode": BoxMode.XYWH_ABS, + "category_id": ann.get("category_id"), + "area": new_area, + "iscrowd": ann.get("iscrowd", 0), + } + ) + + # Create the mosaic dataset dict + mosaic_dataset_dict = dataset_dict.copy() + mosaic_dataset_dict["image"] = mosaic_image + mosaic_dataset_dict["annotations"] = mosaic_annotations + mosaic_dataset_dict["width"] = self.output_size[0] + mosaic_dataset_dict["height"] = self.output_size[1] + + return mosaic_dataset_dict + + +class CopyPasteAugmentation: + """ + Augmentation that enhances an image by pasting objects from other images onto it. 
+ """ + + def __init__(self, dataset: DictDataset, scale_range=(0.3, 0.7), num_objects=3, blend_factor=1.0): + """ + Args: + dataset: The dataset to sample images from + scale_range: Range of scaling factors for the pasted objects + num_objects: Number of objects to paste onto the original image + blend_factor: Factor controlling the blending of pasted objects + """ + super().__init__() + self.dataset = dataset + self.scale_range = scale_range + self.num_objects = num_objects + self.blend_factor = blend_factor + + def transform(self, dataset_dict): + """ + Apply copy paste augmentation to an image. + + Args: + dataset_dict: Dictionary containing image information and annotations + + Returns: + copy_paste_dataset_dict: Updated dataset dict with augmented image and annotations + """ + # Load the original image + image = open_image(dataset_dict["file_name"]) + # open_image has already expanded grayscale input to 3 channels + h, w = image.shape[:2] + copy_paste_image = image.copy() + + # Create a dataset dict for the copy-paste image + copy_paste_dataset_dict = dataset_dict.copy() + original_annotations = copy_paste_dataset_dict.get("annotations", []).copy() + copy_paste_dataset_dict["annotations"] = original_annotations + + # Create a mask to track occupied regions + occupied_mask = np.zeros((h, w), dtype=bool) + + # Calculate maximum allowed area (10% of original image) + max_allowed_area = 0.1 * h * w + + # Sample random images to get objects from + source_indices = [random.randint(0, len(self.dataset) - 1) for _ in range(self.num_objects)] + + for source_idx in source_indices: + # Get a random image from dataset + sample = self.dataset[source_idx] + source_image = open_image(sample["file_name"]) + + # Get bounding boxes if available + if "annotations" in sample and len(sample["annotations"]) > 0: + source_boxes = [] + category_ids = [] + areas = [] + iscrowds = [] + + for ann in sample["annotations"]: + if "bbox" in ann: + box = BoxMode.convert( + ann["bbox"], + ann.get("bbox_mode", 
BoxMode.XYWH_ABS), + BoxMode.XYXY_ABS, + ) + source_boxes.append(box) + category_ids.append(ann.get("category_id")) + areas.append(ann.get("area", 0)) + iscrowds.append(ann.get("iscrowd", 0)) + source_boxes = np.array(source_boxes) + else: + continue # Skip if no boxes + + if len(source_boxes) == 0: + continue + + # Select a random box + box_idx = random.randint(0, len(source_boxes) - 1) + box = source_boxes[box_idx] + category_id = category_ids[box_idx] + # area = areas[box_idx] + iscrowd = iscrowds[box_idx] + + # Get the object region + x1, y1, x2, y2 = map(int, box) + box_w, box_h = x2 - x1, y2 - y1 + + # Add margin around the box + margin = 0.1 + crop_x1 = max(0, int(x1 - margin * box_w)) + crop_y1 = max(0, int(y1 - margin * box_h)) + crop_x2 = min(source_image.shape[1], int(x2 + margin * box_w)) + crop_y2 = min(source_image.shape[0], int(y2 + margin * box_h)) + + # Crop the object region + crop_w, crop_h = crop_x2 - crop_x1, crop_y2 - crop_y1 + cropped_image = source_image[crop_y1:crop_y2, crop_x1:crop_x2] + + # Scale the cropped image + scale_factor = random.uniform(*self.scale_range) + new_w, new_h = int(crop_w * scale_factor), int(crop_h * scale_factor) + + # Skip if too small + if new_w < 10 or new_h < 10: + continue + + # Skip if area exceeds maximum allowed area (10% of original image) + if new_w * new_h > max_allowed_area: + continue + + scaled_image = np.array(Image.fromarray(cropped_image).resize((new_w, new_h))) + + # Find a place to paste (avoid edges) + max_x = w - new_w + max_y = h - new_h + if max_x <= 0 or max_y <= 0: + continue + + # Try to find a non-overlapping position (max 10 attempts) + found_valid_position = False + for _ in range(10): + paste_x = random.randint(0, max_x) + paste_y = random.randint(0, max_y) + + # Check if the region overlaps with existing objects + region_mask = occupied_mask[paste_y : paste_y + new_h, paste_x : paste_x + new_w] + if region_mask.size > 0 and not np.any(region_mask): + found_valid_position = True + break 
+ + # Skip if we couldn't find a non-overlapping position + if not found_valid_position: + continue + + # Mark this region as occupied + occupied_mask[paste_y : paste_y + new_h, paste_x : paste_x + new_w] = True + + # Create alpha mask for smoother blending + alpha = self.blend_factor # Blend factor + + # Paste the scaled image with alpha blending + roi = copy_paste_image[paste_y : paste_y + new_h, paste_x : paste_x + new_w] + if roi.shape[:2] == scaled_image.shape[:2]: # Ensure shapes match + copy_paste_image[paste_y : paste_y + new_h, paste_x : paste_x + new_w] = ( + alpha * scaled_image + (1 - alpha) * roi + ).astype(np.uint8) + + # Calculate new area + new_area = new_w * new_h + + # Add the new box to the annotations in XYWH_ABS format + copy_paste_dataset_dict["annotations"].append( + { + "bbox": [paste_x, paste_y, new_w, new_h], + "bbox_mode": BoxMode.XYWH_ABS, + "category_id": category_id, + "area": new_area, + "iscrowd": iscrowd, + } + ) + + # Update the image in the dataset dict + copy_paste_dataset_dict["image"] = copy_paste_image + + return copy_paste_dataset_dict + + +# Example usage in a notebook: +def apply_mosaic_augmentation(dataset, num_samples=5, output_size=(640, 640)): + """Apply mosaic augmentation to a few samples and display results""" + mosaic_aug = MosaicAugmentation(dataset, output_size=output_size) + + results = [] + for i in range(num_samples): + sample_idx = random.randint(0, len(dataset) - 1) + sample = dataset[sample_idx] + + # Apply mosaic augmentation + new_dict = mosaic_aug.transform(sample) + + results.append((new_dict)) + + return results + + +def apply_copypaste_augmentation(dataset, num_samples=5, num_objects=5): + """Apply copy-paste augmentation to a few samples and display results""" + copy_paste_aug = CopyPasteAugmentation(dataset, num_objects=num_objects) + + results = [] + for i in range(num_samples): + sample_idx = random.randint(0, len(dataset) - 1) + sample = dataset[sample_idx] + + # Apply copy-paste augmentation + 
new_dict = copy_paste_aug.transform(sample) + + results.append((new_dict)) + + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create augmented dataset with increased size") + parser.add_argument("--dataset_dir", type=str, default="./datasets/coco", help="Path to the original dataset") + parser.add_argument( + "--copy_paste_weight", + type=int, + default=5, + help="Weight of copy-paste augmentation (10 is same as no augmentation)", + ) + parser.add_argument( + "--mosaic_weight", + type=int, + default=5, + help="Weight of mosaic augmentation (10 is same as no augmentation)", + ) + parser.add_argument( + "--target_size", + type=int, + default=None, + help="Target number of images in the resulting dataset. If not specified, will use original size", + ) + parser.add_argument( + "--augmentation_ratio", + type=float, + default=1.0, + help="Ratio of augmented images to original images (e.g., 1.0 means equal number of original images)", + ) + parser.add_argument( + "--copy_paste_num_objects", type=int, default=10, help="Number of objects to paste in copy-paste augmentation" + ) + parser.add_argument( + "--mosaic_output_size", type=str, default="640,640", help="Comma-separated width,height for mosaic output" + ) + parser.add_argument("--copy_val_dataset", type=bool, default=False, help="Whether to copy validation dataset") + parser.add_argument( + "--output_suffix", + type=str, + default="_augmented", + help="Suffix for output dataset directory (default: based on augmentation type)", + ) + args = parser.parse_args() + + # Load original datasets + print("Loading original datasets...") + data = DictDataset.from_roboflow_coco( + ds_dir=args.dataset_dir + "/" + TRAIN_SPLIT_NAME, task=Task.DETECTION, split_type=DatasetSplitType.TRAIN + ) + val_data = DictDataset.from_roboflow_coco( + ds_dir=args.dataset_dir + "/" + VAL_SPLIT_NAME, task=Task.DETECTION, split_type=DatasetSplitType.VAL + ) + + original_size = len(data) + 
print(f"Original training dataset size: {original_size}") + + # Calculate target size + if args.target_size is None: + target_size = int(original_size * (args.augmentation_ratio)) + else: + target_size = args.target_size + + augmented_size = target_size + print(f"Target dataset size: {target_size}") + + # Parse mosaic parameters + mosaic_output_size = tuple(map(int, args.mosaic_output_size.split(","))) + + # Choose augmentation type + copypaste_aug = CopyPasteAugmentation(data, num_objects=args.copy_paste_num_objects) + mosaic_aug = MosaicAugmentation(data, output_size=mosaic_output_size) + + new_dataset_root = args.dataset_dir + args.output_suffix + new_train_root = new_dataset_root + "/" + TRAIN_SPLIT_NAME + + os.makedirs(new_dataset_root, exist_ok=True) + os.makedirs(new_dataset_root + "/" + TRAIN_SPLIT_NAME, exist_ok=True) + + augmentation_weights = args.copy_paste_weight + args.mosaic_weight + 10 + + # Generate augmented images + new_dataset = [] + + if augmented_size > 0: + print(f"Generating {augmented_size} augmented images...") + + # Calculate how many times we need to go through the original dataset + iterations_needed = (augmented_size + original_size - 1) // original_size + + for iteration in range(iterations_needed): + remaining_augmented = augmented_size - len(new_dataset) + if remaining_augmented <= 0: + break + + print(f"Augmentation iteration {iteration + 1}/{iterations_needed}") + + for dic in tqdm(data, desc=f"Generating augmented images (iter {iteration + 1})"): + if len(new_dataset) >= augmented_size: + break + + try: + # Apply augmentation + augmentation = random.random() * augmentation_weights + if augmentation < args.copy_paste_weight: + aug = copypaste_aug + elif augmentation < (args.copy_paste_weight + args.mosaic_weight): + aug = mosaic_aug + else: + aug = None + + if aug is not None: + augmented_dict = aug.transform(dic) + else: + augmented_dict = dic + augmented_dict["image"] = open_image(dic["file_name"]) + + # Generate unique filename 
+ original_file_name = dic["file_name"].split("/")[-1] + name, ext = os.path.splitext(original_file_name) + aug_file_name = f"{name}_aug_{len(new_dataset) + 1}{ext}" + new_file_name = os.path.join(new_train_root, aug_file_name) + + # Save augmented image + Image.fromarray(augmented_dict["image"]).save(new_file_name) + + # Update the dataset dict + augmented_dict["file_name"] = new_file_name + del augmented_dict["image"] # Remove image data to save memory + new_dataset.append(augmented_dict) + + except Exception as e: + print(f"Error generating augmented image ({dic['file_name']}): {e}") + # Skip this augmentation if it fails + continue + + # Create the final dataset + print("Creating final dataset...") + final_dataset = DictDataset( + [DetectronDict(**d) for d in new_dataset], + metadata=data.metadata, + task=Task.DETECTION, + split_type=DatasetSplitType.TRAIN, + ) + + print(f"Final dataset size: {len(final_dataset)}") + + # Save the dataset + final_dataset.save(output_dir=new_dataset_root + "/" + TRAIN_SPLIT_NAME) + + # Copy validation dataset if requested + if args.copy_val_dataset: + print("Copying validation dataset...") + shutil.copytree(args.dataset_dir + "/" + VAL_SPLIT_NAME, new_dataset_root + "/" + VAL_SPLIT_NAME) + + print(f"Augmented dataset saved to: {new_dataset_root}") + print("Dataset augmentation completed successfully!") From e797bf65bf7ca11da3bfba67aae541552153a202 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 10 Dec 2025 09:46:44 +0000 Subject: [PATCH 2/3] Refactor copy_val_dataset argument to use action='store_true' Co-authored-by: ivan.murabito --- scripts/dataset_augmentations.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/dataset_augmentations.py b/scripts/dataset_augmentations.py index db273a97..07cd078a 100644 --- a/scripts/dataset_augmentations.py +++ b/scripts/dataset_augmentations.py @@ -444,7 +444,11 @@ def apply_copypaste_augmentation(dataset, num_samples=5, num_objects=5): 
parser.add_argument( "--mosaic_output_size", type=str, default="640,640", help="Comma-separated width,height for mosaic output" ) - parser.add_argument("--copy_val_dataset", type=bool, default=False, help="Whether to copy validation dataset") + parser.add_argument( + "--copy_val_dataset", + action="store_true", + help="Copy validation dataset alongside the augmented training split", + ) parser.add_argument( "--output_suffix", type=str, From 17c4d113eb16802beb8be0b55a69a0dddfffa1db Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 10 Dec 2025 09:48:50 +0000 Subject: [PATCH 3/3] Fix: Ensure new bounding box area is calculated Co-authored-by: ivan.murabito --- scripts/dataset_augmentations.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/dataset_augmentations.py b/scripts/dataset_augmentations.py index 07cd078a..f333c655 100644 --- a/scripts/dataset_augmentations.py +++ b/scripts/dataset_augmentations.py @@ -356,19 +356,19 @@ def transform(self, dataset_dict): alpha * scaled_image + (1 - alpha) * roi ).astype(np.uint8) - # Calculate new area - new_area = new_w * new_h - - # Add the new box to the annotations in XYWH_ABS format - copy_paste_dataset_dict["annotations"].append( - { - "bbox": [paste_x, paste_y, new_w, new_h], - "bbox_mode": BoxMode.XYWH_ABS, - "category_id": category_id, - "area": new_area, - "iscrowd": iscrowd, - } - ) + # Calculate new area + new_area = new_w * new_h + + # Add the new box to the annotations in XYWH_ABS format + copy_paste_dataset_dict["annotations"].append( + { + "bbox": [paste_x, paste_y, new_w, new_h], + "bbox_mode": BoxMode.XYWH_ABS, + "category_id": category_id, + "area": new_area, + "iscrowd": iscrowd, + } + ) # Update the image in the dataset dict copy_paste_dataset_dict["image"] = copy_paste_image