diff --git a/docs/user_guide/17_active_learning.md b/docs/user_guide/17_active_learning.md new file mode 100644 index 000000000..57c82fc73 --- /dev/null +++ b/docs/user_guide/17_active_learning.md @@ -0,0 +1,285 @@ +# Active Learning submodule + +Active learning in the context of image object detection is a technique for efficiently selecting the most valuable images and objects to annotate, reducing the labeling effort required for model training while maximizing performance gains. This submodule provides active learning utilities. It wraps DeepForest’s training and inference APIs with a small, reproducible loop that selects informative unlabeled images using an entropy-based acquisition function. It is designed to be extensible, so you can swap selection strategies, wire it to an annotation tool (Label Studio), and run multi-round training. + +# Key Features + +- Reproducible training rounds via explicit seeding +- YAML- or dict-based configuration with validation (no `Config` dataclass) +- PyTorch Lightning training with checkpointing and early stopping +- Batch prediction over image lists +- Entropy-based acquisition that scores each image by uncertainty +- DeepForest-compatible CSV and image-path I/O +- Label Studio integration `src/deepforest/label_studio.py`. 
Pre-annotation upload is optional and not included by default + +# Basic flow of the submodule +![alt text](AL_workflow.jpg) + + + +# Quickstart + +```python +# Recommended: configure via YAML and construct the learner with learner_from_yaml() +# Your YAML must include all required keys used by load_config() +from deepforest.active_learning import learner_from_yaml + +learner = learner_from_yaml("active_learning.yml") + +# Train one round and evaluate +ckpt_path = learner.fit_one_round() +metrics = learner.evaluate() +print("Best checkpoint:", ckpt_path) +print("Eval metrics:", {k: v for k, v in metrics.items() if not hasattr(v, "head")}) + +# Select uncertain images from a file of paths (one path per line) or a list +manifest = learner.select_for_labeling("/data/trees/unlabeled_paths.txt", k=50) +print(manifest.head()) +``` + +If you prefer not to use YAML, you can pass a validated dict directly to `ActiveLearner(cfg_dict)`. The dict must include the same keys that `load_config()` enforces (`workdir`, `images_dir`, `train_csv`, `val_csv`, `classes`, and the training, evaluation, and acquisition hyperparameters listed under Configuration). + +`select_for_labeling()` accepts either a Python list of image paths or a text file with one path per line and ranks images by Shannon entropy of predicted class score mass. Label Studio usage (exporting tasks and converting to DeepForest CSV) is handled by separate helper functions in `deepforest.label_studio`; they’re not part of `ActiveLearner` itself. + +## How It Works + +- **Training:** `ActiveLearner.fit_one_round()` uses the PyTorch Lightning `Trainer` that’s created in `__init__` (with checkpointing and early stopping) to train the DeepForest model for `cfg["epochs_per_round"]` epochs. + +- **Evaluation:** `ActiveLearner.evaluate()` runs DeepForest’s evaluation on `cfg["val_csv"]` with the IoU threshold `cfg["iou_eval"]`, returning a metrics dict. 
+ +- **Prediction:** `ActiveLearner.predict_images(paths)` calls `model.predict_image` on each path using `cfg["score_threshold_pred"]` and returns a dict `{image_path: DataFrame}` in DeepForest format (`xmin,ymin,xmax,ymax,label,score,image_path`). + +- **Acquisition:** `select_for_labeling()` aggregates per-image predictions by **summing detection scores per class/label**, computes Shannon entropy (images with no predictions receive `log(C)`), sorts by entropy (desc), and returns the **top-K** rows. The **full ranked manifest** is written to `workdir/acquisition/selection_round.csv` with columns: `image_path,entropy,n_preds,mean_score`. + +# Configuration + +## Utility Functions + +- `load_config(yaml_path)` loads a YAML file and applies no defaults. All of these keys must be present: + +- Paths and labels +`workdir`, `images_dir`, `train_csv`, `val_csv`, `classes` (non-empty list) + +- Training hyperparameters +`epochs_per_round`, `batch_size`, `lr`, `weight_decay`, `precision`, `device`, `num_workers`, `seed`, `use_release_weights` + +- Evaluation +`iou_eval` + +- Acquisition +`k_per_round`, `score_threshold_pred` + +Example of YAML (`active_learning.yml`): +```YAML +workdir: /tmp/df_active +images_dir: /data/trees/images +train_csv: /data/trees/train.csv +val_csv: /data/trees/val.csv +classes: ["Tree"] + +epochs_per_round: 5 +batch_size: 4 +lr: 0.0005 +weight_decay: 0.0001 +precision: 32 # 16, 32, "bf16" are typical +device: auto # "auto", "cuda", or "cpu" +num_workers: 4 +seed: 1337 +use_release_weights: true + +iou_eval: 0.4 +k_per_round: 50 +score_threshold_pred: 0.05 +``` +## Class: ActiveLearner + +Construct from a YAML + +```python +from deepforest.active_learning import learner_from_yaml +learner = learner_from_yaml("active_learning.yml") +``` + +For reproducibility, `_seed_everything(seed)` seeds Python, NumPy, Torch, and Lightning (if available). `_resolve_device(device)` accepts "auto", "cuda...", or "cpu"; it falls back to CPU (with a warning) when CUDA is requested but unavailable. 
+ +Methods: + +- `fit_one_round() -> Path` +Trains for one round with checkpointing and early stopping. Returns best checkpoint path. + +- `evaluate() -> dict` +Evaluates on val_csv with iou_eval. Returns a dict (metrics + any DataFrame entries DeepForest provides). + +- `predict_images(paths: list[str] | str) -> dict[str, pd.DataFrame]` +Runs inference on image paths. Values are DataFrames with DeepForest columns +["xmin","ymin","xmax","ymax","label","score","image_path"]. + +- `select_for_labeling(unlabeled_paths, k=None) -> pd.DataFrame` +Ranks images by Shannon entropy of predicted class score mass. + + - unlabeled_paths can be a list or a text file with one path per line. + + - Uses k or falls back to cfg["k_per_round"]. + + - Writes workdir/acquisition/selection_round.csv with columns: +image_path, entropy, n_preds, mean_score. + +# Input & Output Formats + +## DeepForest CSV (training/validation) + +Columns required: `image_path,xmin,ymin,xmax,ymax,label` + +* `image_path` can be absolute or relative to `images_dir` +* Coordinates are pixel units in the image coordinate system + +## Prediction DataFrame + +Returned by `model.predict_image` and normalized here to columns: + +* `xmin, ymin, xmax, ymax, label, score, image_path` + +# Logging, Checkpoints, and Reproducibility + +* Checkpoints saved under `workdir/logs/checkpoints/` +* Best model path tracked via `ModelCheckpoint` +* Training is deterministic where possible; seeds are set for Python, NumPy, and Torch +* Early stopping monitors `val_map` with patience 3 by default + +# Error Handling Notes + +* Prediction exceptions are caught per-image; an empty DataFrame is substituted +* Evaluation failure returns an empty dict and logs a warning +* CUDA unavailability falls back to CPU with a warning if CUDA was explicitly requested + +# Example: Multi-Round Active Learning Loop + +```python +al = ActiveLearner(cfg) +for round_id in range(5): + print(f"Round {round_id}") + ckpt = al.fit_one_round() + 
 print(al.evaluate()) + manifest = al.select_for_labeling("/data/unlabeled_paths.txt", k=cfg["k_per_round"]) + # Send `manifest` to an annotation workflow (e.g., Label Studio) + # Merge newly labeled data into train.csv, deduplicate, and continue +``` + +# Label Studio Integration Plan + +This section outlines how to connect the acquisition outputs to Label Studio for annotation and then flow the results back into DeepForest. + + +## 1. Helper functions in `src/deepforest/label_studio.py` + +- `get_access_token()` refreshes an access token using `REFRESH_TOKEN`. +- `Health_check(access)` returns the `Authorization` header for subsequent calls. +- `list_projects(access)` fetches Label Studio projects. +- `list_tasks(access, project_id)` retrieves tasks for a project with `fields=all` so annotations and predictions are included. +- `get_task(access, task_id)` pulls a single task and its annotations. +- `export_project_tasks(access, project_id)` downloads tasks with annotations in JSON format (only finished tasks by default; pass `finished_only=False` for all tasks). +- `find_image_field(task_data)` finds the first image-like field in a task’s `data`. +- `absolute_image_url(path_or_url)` converts relative paths to absolute URLs under `BASE_URL`. +- `parse_image_url(url)` extracts a stable filename from an image URL. +- `extract_annotation_pairs(task)` returns `(filename, label)` pairs from choices, textarea, and any `*labels` results. + +## 2. Locate your project + +```python +projects = list_projects(access) +# pick by title or id +project = next(p for p in projects if p["title"] == "Your Project Name") +project_id = project["id"] +``` + +## 3. Fetch tasks for inspection +```python +tasks = list_tasks(access, project_id) +# or fetch a specific task +one = get_task(access, tasks[0]["id"]) +``` + +## 4. Export annotations as JSON +```python +export = export_project_tasks(access, project_id) +# 'export' is a list of task dicts with annotations included +``` + +## 5. 
Resolve image filenames and labels +```python +pairs = [] +for task in export: + image_field = find_image_field(task.get("data", {}) or {}) + if not image_field: + continue + + img_url = absolute_image_url(image_field) + filename = parse_image_url(img_url) + + # extract_annotation_pairs already returns (filename, label) + # but we prefer the resolved 'filename' for consistency + for _, label in extract_annotation_pairs(task): + pairs.append((filename, label)) +``` + +## 6. Persist outputs for training (classification/tag use-case) + +```python +import csv, os +os.makedirs(TRAIN_DIR, exist_ok=True) +out_csv = os.path.join(TRAIN_DIR, "labels.csv") + +with open(out_csv, "w", newline="") as f: + w = csv.writer(f) + w.writerow(["filename", "label"]) + w.writerows(pairs) + +``` + +## Minimum example to pull a project export: +```python +from deepforest.label_studio import get_access_token, list_projects, export_project_tasks + +access = get_access_token() +projects = list_projects(access) +project_id = next(p["id"] for p in projects if p["title"] == "Your Project Name") + +export = export_project_tasks(access, project_id) # list of task dicts +``` +## Converting Label Studio Exports + +The current helper `extract_annotation_pairs` supports classification-like outputs and returns (filename, label) pairs. + +If your Label Studio project collects bounding boxes (DeepForest use-case), use `ls_json_to_deepforest_csv`, which converts `rectanglelabels` results from percent to pixel coordinates; `extract_annotation_pairs` does not compute pixel coordinates. The conversion works as follows: + +**1. For each task:** + +Find the image field (`find_image_field`), resolve it (`absolute_image_url`), choose a local filename (`parse_image_url`), and ensure the corresponding image exists under `images_dir` (download if needed). + +**2. 
For each `annotation.result` item with `type == "rectanglelabels"`:** + +- Convert percent coords to pixels: +```python +xmin = (x/100) * W, ymin = (y/100) * H +xmax = ((x + width)/100) * W, ymax = ((y + height)/100) * H +``` +- Write a DeepForest CSV row: image_path,xmin,ymin,xmax,ymax,label. + + +## Automating the Round-Trip + +* After annotators finish a batch, export JSON, convert to DeepForest CSV, and append to `train_csv`. +* Deduplicate by `(image_path, xmin, ymin, xmax, ymax, label)` if necessary. +* Optionally use Label Studio webhooks to trigger a small script that runs the conversion and kicks off the next `fit_one_round()`. + +# Extending the Acquisition Strategy + +The current entropy score uses class-distribution uncertainty from detection scores. You can plug in other criteria: + +* **Score margin**: difference between top-2 class masses +* **Mean score**: prioritize low-confidence images +* **Diversity**: add image embeddings and do k-center or clustering over features +* **Spatial entropy**: weight by number of boxes or box-area variance +* **Cost-aware**: penalize large, hard-to-annotate images +* **BALD/MC-Dropout**: approximate Bayesian uncertainty via stochastic forward passes + diff --git a/docs/user_guide/AL_workflow.jpg b/docs/user_guide/AL_workflow.jpg new file mode 100644 index 000000000..2d702450c Binary files /dev/null and b/docs/user_guide/AL_workflow.jpg differ diff --git a/src/deepforest/active_learning.py b/src/deepforest/active_learning.py new file mode 100644 index 000000000..a6476c7a9 --- /dev/null +++ b/src/deepforest/active_learning.py @@ -0,0 +1,411 @@ +"""This submodule provides active learning utilities for the +weecology/deepforest library. + +Features: +- Configuration management via YAML files for active learning experiments. +- ActiveLearner class: wraps DeepForest model training, evaluation, prediction, and acquisition routines. +- Entropy-based acquisition function for selecting unlabeled images to label next. 
def load_config(yaml_path: str = "active_learning.yml") -> dict:
    """Load and validate an active-learning configuration from YAML.

    No defaults are applied: every required key must be present in the file.

    Args:
        yaml_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration mapping.

    Raises:
        FileNotFoundError: If ``yaml_path`` does not exist.
        KeyError: If one or more required keys are missing.
        ValueError: If the YAML top level is not a mapping, or ``classes``
            is not a non-empty list/tuple.
    """
    p = Path(yaml_path)
    if not p.exists():
        raise FileNotFoundError(f"Config YAML not found: {yaml_path}")
    with p.open("r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}

    # A YAML list or scalar would otherwise slip through the membership test
    # below and fail later with an unrelated TypeError.
    if not isinstance(cfg, dict):
        raise ValueError(
            f"Config YAML must contain a mapping at the top level: {yaml_path}")

    # Required keys (must all be present in YAML)
    required = [
        # Paths & labels
        "workdir",
        "images_dir",
        "train_csv",
        "val_csv",
        "classes",
        # Training
        "epochs_per_round",
        "batch_size",
        "lr",
        "weight_decay",
        "precision",
        "device",
        "num_workers",
        "seed",
        "use_release_weights",
        # Evaluation
        "iou_eval",
        # Acquisition
        "k_per_round",
        "score_threshold_pred",
    ]
    missing = [k for k in required if k not in cfg]
    if missing:
        raise KeyError(f"Missing required config keys in {yaml_path}: {missing}")

    if not isinstance(cfg["classes"], (list, tuple)) or not cfg["classes"]:
        raise ValueError("Config 'classes' must be a non-empty list.")

    return cfg


def learner_from_yaml(yaml_path: str = "active_learning.yml") -> "ActiveLearner":
    """Create an ActiveLearner by loading configuration from a YAML file.

    Raises whatever ``load_config`` raises for a missing or invalid file.
    """
    cfg = load_config(yaml_path)
    return ActiveLearner(cfg)


def _seed_everything(seed: int):
    """Seed Python, NumPy, Torch and (best effort) Lightning."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    try:
        pl.seed_everything(seed, workers=True)
    except Exception as exc:
        # Best effort only: the manual seeding above already happened, but a
        # failure here affects reproducibility and must not go unnoticed.
        logging.warning(
            "pl.seed_everything failed (%s); continuing with manual seeding only.",
            exc)


def _resolve_device(device: str):
    """Return an (accelerator, devices) tuple understood by PyTorch Lightning.

    Args:
        device: "auto", "cpu", "cuda" or "cuda:<index>" (case-insensitive).

    Returns:
        ("gpu", 1) for "auto"/"cuda" when CUDA is available,
        ("gpu", [index]) for "cuda:<index>" when CUDA is available,
        ("cpu", 1) otherwise.
    """
    dev = str(device).strip().lower()
    if dev == "auto":
        return ("gpu", 1) if torch.cuda.is_available() else ("cpu", 1)
    if dev.startswith("cuda"):
        if not torch.cuda.is_available():
            logging.warning("CUDA requested but not available; falling back to CPU.")
            return ("cpu", 1)
        # Honor an explicit index: Lightning takes a list of device ids, and a
        # bare count of 1 would always pick GPU 0.
        _, _, index = dev.partition(":")
        if index.isdigit():
            return ("gpu", [int(index)])
        return ("gpu", 1)
    if dev != "cpu":
        logging.warning("Unrecognized device %r; falling back to CPU.", device)
    return ("cpu", 1)


def _ensure_dir(p):
    """Create directory p (and parents) if it does not exist."""
    Path(p).mkdir(parents=True, exist_ok=True)


def _read_paths_file(path_or_list):
    """Return image paths from a list/tuple, or from a text file.

    A list or tuple is returned as a list of path strings. Anything else is
    treated as the path of a UTF-8 text file with one image path per line;
    blank lines are skipped and surrounding whitespace is stripped.
    """
    if isinstance(path_or_list, (list, tuple)):
        return [str(Path(p)) for p in path_or_list]
    lines = Path(path_or_list).read_text(encoding="utf-8").splitlines()
    return [ln.strip() for ln in lines if ln.strip()]
def _image_entropy_from_predictions(pred_df, classes):
    """Compute Shannon entropy from per-class aggregated detection scores.

    Scores are clamped to [0, 1], summed per class label, normalized to a
    distribution over ``classes`` and passed through Shannon entropy.

    Args:
        pred_df: DataFrame of predictions (DeepForest format), or None.
        classes: List of known class labels.

    Returns:
        (entropy, n_preds, mean_score)
        - entropy: Shannon entropy of the class distribution.
        - n_preds: Number of predicted boxes.
        - mean_score: Mean detection score (0.0 when there is no usable mass).

    Images with no predictions, or with no score mass on any known class,
    receive maximum uncertainty (log(C)).
    """
    max_entropy = math.log(max(1, len(classes)))
    if pred_df is None or len(pred_df) == 0:
        return (max_entropy, 0, 0.0)

    n_preds = int(len(pred_df))
    if "label" not in pred_df.columns or "score" not in pred_df.columns:
        return (max_entropy, n_preds, 0.0)

    raw_scores = pd.to_numeric(pred_df["score"], errors="coerce")
    # A missing score carries no evidence; it must not count as confidence 1.0.
    scores = raw_scores.fillna(0.0).clip(lower=0.0, upper=1.0)
    mass_by_label = scores.groupby(pred_df["label"].astype(str)).sum()

    # Aggregate score mass per class, in the order given by `classes`.
    class_mass = np.array(
        [float(mass_by_label.get(str(c), 0.0)) for c in classes], dtype=np.float64)

    total = float(class_mass.sum())
    if total <= 0:
        return (max_entropy, n_preds, 0.0)

    probs = class_mass / total
    # The epsilon keeps log() finite for classes with zero mass.
    entropy = float(-(probs * np.log(probs + 1e-12)).sum())
    mean_score = float(raw_scores.mean())
    if math.isnan(mean_score):
        mean_score = 0.0
    return (entropy, n_preds, mean_score)
class ActiveLearner:
    """High-level wrapper for DeepForest active learning.

    Methods:
        fit_one_round() -> Path: Train for one round, return checkpoint path.
        evaluate() -> dict: Evaluate model on validation set.
        predict_images(paths) -> dict[str, DataFrame]: Run predictions on images.
        select_for_labeling(unlabeled_paths, k) -> DataFrame: Rank images by entropy.
        append_and_retrain(new_labels_csv, round_id) -> dict: Merge labels, retrain.
    """

    # Column layout of an empty prediction frame (DeepForest format).
    _PRED_COLUMNS = ("xmin", "ymin", "xmax", "ymax", "label", "score", "image_path")

    def __init__(self, cfg: dict):
        """Create work directories, seed RNGs, build the model and trainer.

        Args:
            cfg: Configuration mapping with the keys enforced by load_config().
        """
        self.cfg = cfg
        self.workdir = Path(cfg["workdir"])
        self.images_dir = Path(cfg["images_dir"])
        self.train_csv = Path(cfg["train_csv"])
        self.val_csv = Path(cfg["val_csv"])
        self.classes = list(cfg["classes"])

        _ensure_dir(self.workdir)
        _ensure_dir(self.workdir / "logs")
        _ensure_dir(self.workdir / "acquisition")
        _seed_everything(int(cfg["seed"]))

        self.model = self._build_model(cfg)
        self.trainer, self._ckpt_cb = self._create_trainer(cfg, self.workdir / "logs")

        self._attach_training_data()

    def _build_model(self, cfg):
        """Initialize DeepForest model with correct class count."""
        model = df_main.deepforest()
        if cfg["use_release_weights"]:
            model.use_release()
        model.config["num_classes"] = len(cfg["classes"])
        model.config["batch_size"] = int(cfg["batch_size"])
        # If desired and supported by your DeepForest version, you may also
        # set optimizer hyperparameters here using cfg["lr"] / cfg["weight_decay"].
        return model

    def _create_trainer(self, cfg, log_dir):
        """Create PyTorch Lightning Trainer with checkpointing and early
        stopping.

        Returns:
            (trainer, checkpoint_callback)
        """
        accelerator, devices = _resolve_device(cfg["device"])
        ckpt_dir = Path(log_dir) / "checkpoints"
        _ensure_dir(ckpt_dir)

        ckpt_cb = ModelCheckpoint(
            dirpath=str(ckpt_dir),
            filename="epoch{epoch:02d}-val_map",
            monitor="val_map",
            mode="max",
            save_top_k=1,
            save_weights_only=True,
            auto_insert_metric_name=False,
        )
        es_cb = EarlyStopping(monitor="val_map", mode="max", patience=3)

        trainer = pl.Trainer(
            max_epochs=int(cfg["epochs_per_round"]),
            accelerator=accelerator,
            devices=devices,
            precision=cfg["precision"],  # int 16/32 or string "bf16"
            default_root_dir=str(log_dir),
            callbacks=[ckpt_cb, es_cb],
            deterministic=True,
            log_every_n_steps=10,
            enable_checkpointing=True,
        )
        return trainer, ckpt_cb

    # This method, and every method below it, must stay indented inside the
    # class: a column-0 `def` here ends the class body and turns the remaining
    # methods into unreachable nested functions.
    def _attach_training_data(self):
        """Attach train/val CSVs and root dirs to model config."""
        cfg = self.model.config

        with open_dict(cfg):
            if "train" not in cfg:
                cfg["train"] = {}
            # Some versions might use "validation"; prefer "val" if absent
            if "val" not in cfg and "validation" not in cfg:
                cfg["val"] = {}

            train = cfg["train"]
            vkey = "val" if "val" in cfg else "validation"

            train["csv_file"] = str(self.train_csv)
            train["root_dir"] = str(self.images_dir)
            cfg[vkey]["csv_file"] = str(self.val_csv)
            cfg[vkey]["root_dir"] = str(self.images_dir)

    def fit_one_round(self):
        """Train for one active learning round and return best checkpoint
        path.

        Falls back to the checkpoint directory when no best checkpoint was
        recorded (for example when ``val_map`` was never logged).
        """
        # NOTE(review): confirm that the installed DeepForest version accepts
        # a `trainer=` keyword in create_trainer().
        self.model.create_trainer(trainer=self.trainer)
        self.trainer.fit(self.model)
        best = self._ckpt_cb.best_model_path if self._ckpt_cb else ""
        # An empty best_model_path would otherwise become Path("."), which
        # looks like a valid path to callers.
        ckpt_path = Path(best) if best else (self.workdir / "logs" / "checkpoints")
        logging.info("Training finished. Best checkpoint: %s", ckpt_path)
        return ckpt_path

    def evaluate(self):
        """Run evaluation on validation CSV and return results dict.

        Returns an empty dict (and logs a warning) when evaluation fails.
        """
        try:
            results = self.model.evaluate(
                csv_file=str(self.val_csv),
                root_dir=str(self.images_dir),
                iou_threshold=float(self.cfg["iou_eval"]),
                predictions=None,
            )
            # DataFrame-like entries are too large to log; keep scalars only.
            log_summary = {k: v for k, v in results.items() if not hasattr(v, "head")}
            logging.info("Evaluation: %s", log_summary)
            return dict(results)
        except Exception as e:
            logging.warning("Evaluation failed: %s", e)
            return {}

    def predict_images(self, paths):
        """Run predictions for image paths, returning dict[path ->
        DataFrame].

        Args:
            paths: Iterable of image paths, or a single path (str/Path).

        A failed or empty prediction yields an empty DataFrame with the
        DeepForest prediction columns.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(paths, (str, Path)):
            paths = [paths]

        self.model.eval()
        out = {}
        for p in paths:
            p_str = str(p)
            df = None
            try:
                with torch.no_grad():
                    # NOTE(review): confirm these keyword names against the
                    # installed DeepForest predict_image() signature; a
                    # mismatch is caught below and yields empty predictions.
                    df = self.model.predict_image(
                        image_path=p_str,
                        return_plot=False,
                        score_threshold=float(self.cfg["score_threshold_pred"]),
                    )
            except Exception as e:
                logging.warning("Prediction error for %s: %s", p_str, e)
            if df is None:
                df = pd.DataFrame(columns=list(self._PRED_COLUMNS))
            out[p_str] = df
        return out

    def select_for_labeling(self, unlabeled_paths, k=None):
        """Rank unlabeled images by entropy and return top-k for labeling.

        The full ranked manifest is written to
        ``workdir/acquisition/selection_round.csv``.

        Args:
            unlabeled_paths: List of image paths or file containing paths.
            k: Number of images to return. If None, uses cfg['k_per_round'].

        Returns:
            DataFrame with ranked images and entropy scores.

        Raises:
            ValueError: If no paths are provided.
        """
        if k is None:
            k = int(self.cfg["k_per_round"])

        paths = _read_paths_file(unlabeled_paths)
        if not paths:
            raise ValueError("No unlabeled paths provided")

        logging.info("Acquisition over %d images", len(paths))
        preds = self.predict_images(paths)

        rows = []
        for img_path, df in preds.items():
            ent, n_preds, mean_score = _image_entropy_from_predictions(df, self.classes)
            rows.append({
                "image_path": img_path,
                "entropy": ent,
                "n_preds": n_preds,
                "mean_score": mean_score,
            })

        # A stable sort keeps tied images in input order, so the selection is
        # reproducible from run to run.
        manifest = pd.DataFrame(rows).sort_values(
            "entropy", ascending=False, kind="stable").reset_index(drop=True)
        out_path = self.workdir / "acquisition" / "selection_round.csv"
        manifest.to_csv(out_path, index=False)
        logging.info("Wrote acquisition manifest: %s", out_path)

        return manifest.head(k).copy()

    def append_and_retrain(self, new_labels_csv: str, round_id=None) -> dict:
        """Append new DeepForest-format labels to train_csv and retrain.

        new_labels_csv must have columns:
            image_path, xmin, ymin, xmax, ymax, label
        It may optionally include a 'round' column; if missing, round_id is
        used (0 when round_id is None).

        Returns:
            dict with counts and checkpoint path.

        Raises:
            FileNotFoundError: If new_labels_csv does not exist.
            ValueError: If a required column is missing.
        """
        new_path = Path(new_labels_csv)
        if not new_path.exists():
            raise FileNotFoundError(f"New labels CSV not found: {new_labels_csv}")

        new_df = pd.read_csv(new_path)
        # A list (not a set) keeps the column order deterministic.
        key_cols = ["image_path", "xmin", "ymin", "xmax", "ymax", "label"]
        if any(col not in new_df.columns for col in key_cols):
            raise ValueError(
                f"{new_labels_csv} must contain columns {sorted(key_cols)}")

        # Ensure labels are in cfg.classes
        before = len(new_df)
        new_df = new_df[new_df["label"].astype(str).isin(self.classes)].copy()
        filtered_out = before - len(new_df)

        # Add round column if needed
        if "round" not in new_df.columns:
            new_df["round"] = round_id if round_id is not None else 0

        # Clamp stray values: integer coordinates, non-negative origin, and at
        # least one pixel of width and height.
        for col in ("xmin", "ymin", "xmax", "ymax"):
            new_df[col] = new_df[col].astype(int)
        new_df["xmin"] = new_df["xmin"].clip(lower=0)
        new_df["ymin"] = new_df["ymin"].clip(lower=0)
        new_df["xmax"] = np.maximum(new_df["xmax"], new_df["xmin"] + 1)
        new_df["ymax"] = np.maximum(new_df["ymax"], new_df["ymin"] + 1)

        # Load existing training CSV if present
        if Path(self.train_csv).exists():
            old_df = pd.read_csv(self.train_csv)
        else:
            old_df = pd.DataFrame(columns=key_cols + ["round"])

        # Deduplicate on exact geometry and label; existing rows win.
        merged = pd.concat([old_df, new_df], ignore_index=True)
        deduped = merged[~merged.duplicated(subset=key_cols, keep="first")]

        # With ignore_index=True, rows from new_df have index >= len(old_df),
        # so the count stays right even if old_df already held duplicates.
        added = deduped[deduped.index >= len(old_df)]
        added_boxes = len(added)
        added_images = added["image_path"].nunique() if added_boxes > 0 else 0

        deduped.to_csv(self.train_csv, index=False)

        logging.info(
            "Appended labels: %d boxes (%d images). Filtered-out labels not in classes: %d. New train_csv size: %d",
            added_boxes, added_images, filtered_out, len(deduped))

        ckpt = self.fit_one_round()
        return {
            "added_boxes": int(added_boxes),
            "added_images": int(added_images),
            "filtered_out": int(filtered_out),
            "checkpoint": str(ckpt),
            "train_csv_size": int(len(deduped)),
        }
def get_access_token() -> str:
    """Obtain a new access token using the refresh token.

    Sends a POST request with ``REFRESH_TOKEN`` to the token refresh endpoint
    under ``BASE_URL``.

    Returns:
        str: The newly obtained access token.

    Raises:
        requests.HTTPError: If the HTTP request to refresh the token fails.
    """
    r = requests.post(
        f"{BASE_URL}/api/token/refresh",
        json={"refresh": REFRESH_TOKEN},
        timeout=10,
        headers={"Content-Type": "application/json"},
    )
    r.raise_for_status()
    return r.json()["access"]


def Health_check(access: str) -> Dict[str, str]:
    """Build the authorization header for Label Studio API requests.

    Despite its name this performs no network call; it only formats the
    header.

    Args:
        access (str): The access token to be used for authentication.

    Returns:
        Dict[str, str]: A dictionary containing the 'Authorization' header
        with the provided access token.
    """
    return {"Authorization": f"Bearer {access}"}


def paginate(url: str, headers: Dict[str, str]) -> List[Dict[str, Any]]:
    """Fetch every page of results from a paginated API endpoint.

    Args:
        url (str): The initial URL to fetch data from.
        headers (Dict[str, str]): HTTP headers to include in the request.

    Returns:
        List[Dict[str, Any]]: Items collected from all pages.

    Raises:
        requests.HTTPError: If an HTTP error occurs during a request.

    Notes:
        - A dict response with a "results" key is treated as one page, and
          its "next" key as the URL of the following page.
        - Any other response (a list or a single dict) is taken as the
          complete result and ends the loop.
    """
    items: List[Dict[str, Any]] = []
    next_url = url
    while next_url:
        r = requests.get(next_url, headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()
        if isinstance(data, dict) and "results" in data:
            items.extend(data["results"])
            next_url = data.get("next")
        else:
            items.extend(data if isinstance(data, list) else [data])
            next_url = None
    return items


def list_projects(access: str) -> List[Dict[str, Any]]:
    """Retrieve all projects from the Label Studio API.

    Args:
        access (str): Access token for API requests.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each representing a project.

    Raises:
        requests.HTTPError: If an HTTP request to the API fails.
    """
    # Follow the server's "next" links instead of requesting one huge page.
    url = f"{BASE_URL}/api/projects?page_size=100"
    return paginate(url, Health_check(access))


def list_tasks(access: str, project_id: int) -> List[Dict[str, Any]]:
    """Retrieve a list of tasks from a Label Studio project, including
    annotations and predictions.

    Args:
        access (str): Access token for authentication.
        project_id (int): The ID of the Label Studio project to fetch tasks from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each representing a task
        with its associated data.

    Raises:
        requests.HTTPError: If an HTTP request to the API fails.
    """
    # fields=all should include annotations & predictions
    url = f"{BASE_URL}/api/tasks?project={project_id}&page_size=200&fields=all"
    return paginate(url, Health_check(access))


def _filter_finished_only(tasks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep tasks that have at least one annotation with a non-empty result."""
    return [
        t for t in tasks
        if any(a.get("result") for a in (t.get("annotations") or []))
    ]
def ls_json_to_deepforest_csv(
    tasks: Iterable[Dict[str, Any]],
    images_dir: str,
    classes: List[str],
    out_csv: str,
    round_id: Optional[int] = None,
    strict_labels: bool = True,
) -> Tuple[int, int]:
    """Convert Label Studio rectanglelabels annotations to a DeepForest CSV.

    Percent coordinates are scaled by the size of the local image under
    ``images_dir``, rounded to integer pixels and clamped to the image.
    Tasks whose image cannot be opened locally are skipped, and so are boxes
    with no positive area.

    Args:
        tasks: Label Studio task dicts (with "data" and "annotations").
        images_dir: Directory holding the local copies of the task images.
        classes: Accepted labels when ``strict_labels`` is True.
        out_csv: Destination CSV path; parent directories are created.
        round_id: When given, a "round" column with this value is added.
        strict_labels: Drop boxes whose label is not in ``classes``.

    Returns (n_boxes_written, n_unique_images_written).
    """
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    size_cache: Dict[str, Tuple[int, int]] = {}
    # Build the lookup once; the check runs for every box.
    allowed_labels = set(classes)

    def _img_size(p: str) -> Optional[Tuple[int, int]]:
        """Return the cached (W, H) of image p, or None if it cannot be opened."""
        if p in size_cache:
            return size_cache[p]
        try:
            with Image.open(p) as im:
                size_cache[p] = im.size  # (W, H)
                return size_cache[p]
        except Exception:
            # Deliberate best effort: an unreadable image means "skip the task".
            return None

    rows: List[List[Any]] = []
    images_used: set = set()

    for task in tasks:
        data = task.get("data", {}) or {}
        img_field = find_image_field(data)
        if not img_field:
            continue

        # Resolve to a filename under images_dir
        fname = parse_image_url(absolute_image_url(img_field))
        img_path = str(Path(images_dir) / fname)

        size = _img_size(img_path)
        if size is None:
            # Image is not available locally; skip to keep dataset consistent
            continue
        W, H = size

        for ann in task.get("annotations") or []:
            for res in ann.get("result") or []:
                if res.get("type") != "rectanglelabels":
                    continue
                val = res.get("value") or {}
                labs = val.get("rectanglelabels") or []
                if not labs:
                    continue
                label = str(labs[0])
                if strict_labels and label not in allowed_labels:
                    continue

                # LS percentages → pixels
                try:
                    x = float(val.get("x", 0.0))
                    y = float(val.get("y", 0.0))
                    w = float(val.get("width", 0.0))
                    h = float(val.get("height", 0.0))
                except Exception:
                    continue

                xmin = int(round((x / 100.0) * W))
                ymin = int(round((y / 100.0) * H))
                xmax = int(round(((x + w) / 100.0) * W))
                ymax = int(round(((y + h) / 100.0) * H))

                # Clamp to image bounds
                xmin = max(0, min(W, xmin))
                ymin = max(0, min(H, ymin))
                xmax = max(0, min(W, xmax))
                ymax = max(0, min(H, ymax))

                # Require positive area
                if xmax <= xmin or ymax <= ymin:
                    continue

                row = [img_path, xmin, ymin, xmax, ymax, label]
                if round_id is not None:
                    row.append(round_id)
                rows.append(row)
                images_used.add(img_path)

    header = ["image_path", "xmin", "ymin", "xmax", "ymax", "label"]
    if round_id is not None:
        header.append("round")

    # An explicit encoding keeps non-ASCII paths and labels portable across
    # platforms.
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

    return len(rows), len(images_used)


def export_project_tasks(access: str,
                         project_id: int,
                         finished_only: bool = True) -> List[Dict[str, Any]]:
    """Export tasks (with annotations) from a Label Studio project.

    If finished_only is True, the server is asked for completed tasks only,
    and a list response is filtered again locally to tasks with at least one
    non-empty annotation result.

    Raises:
        requests.HTTPError: If the export request fails.
    """
    url = (f"{BASE_URL}/api/projects/{project_id}/export"
           f"?exportType=JSON&download_all_tasks=true"
           f"{'&onlyFinished=1' if finished_only else ''}")
    r = requests.get(url, headers=Health_check(access), timeout=60)
    r.raise_for_status()
    data = r.json()
    if finished_only and isinstance(data, list):
        data = _filter_finished_only(data)
    return data


def find_image_field(task_data: Dict[str, Any]) -> Optional[str]:
    """Return the first string value in task_data that looks like an image.

    A value matches when it contains, case-insensitively and anywhere in the
    string, a common image file extension (.jpg, .png, .tiff, ...).

    Args:
        task_data (Dict[str, Any]): The ``data`` mapping of a task.

    Returns:
        Optional[str]: The matching value, or None when nothing matches.
    """
    extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tif", ".tiff")
    for value in task_data.values():
        if not isinstance(value, str):
            continue
        lowered = value.lower()
        if any(ext in lowered for ext in extensions):
            return value
    return None
def absolute_image_url(rel_or_abs: str) -> str:
    """Return an absolute image URL.

    Args:
        rel_or_abs (str): A relative or absolute image URL.

    Returns:
        str: The input unchanged when it starts with "http://" or "https://";
        otherwise the input joined onto ``BASE_URL`` with exactly one slash
        between them.
    """
    if rel_or_abs.startswith(("http://", "https://")):
        return rel_or_abs
    return f"{BASE_URL.rstrip('/')}/{rel_or_abs.lstrip('/')}"


def parse_image_url(image_url: str) -> str:
    """Extract the image filename from an image URL.

    When the URL has a ``d`` query parameter, the filename is the last path
    component of that parameter's percent-decoded value; other query
    parameters and any fragment are ignored. Otherwise the last component of
    the URL path is used.

    Args:
        image_url (str): The URL (or URL-like path) of the image.

    Returns:
        str: The extracted filename, or "unknown_image.jpg" if nothing usable
        could be extracted.
    """
    try:
        parsed = urlparse(image_url)
        # keep_blank_values so an empty "?d=" still counts as "d was given"
        # and falls through to the placeholder instead of the URL path.
        d_values = parse_qs(parsed.query, keep_blank_values=True).get("d")
        candidate = d_values[0] if d_values else pathlib.Path(parsed.path).name
        return pathlib.Path(candidate.lstrip("/")).name or "unknown_image.jpg"
    except Exception:
        # Contract: never raise for a malformed input, return the placeholder.
        return "unknown_image.jpg"


def extract_annotation_pairs(task: Dict[str, Any]) -> List[Tuple[str, str]]:
    """Extract (image filename, label) pairs from a Label Studio task.

    For every annotation result the following are collected, in this order:
    each entry of a ``choices`` result, the first entry of a ``textarea``
    result, and every item of any list-valued field whose key ends with
    "labels" (case-insensitive), e.g. ``rectanglelabels``.

    Args:
        task (Dict[str, Any]): A Label Studio task with ``data`` and
            ``annotations``.

    Returns:
        List[Tuple[str, str]]: Pairs of image filename and stringified label;
        empty when the task has no image field.
    """
    pairs: List[Tuple[str, str]] = []
    image_field = find_image_field(task.get("data", {}) or {})
    if not image_field:
        return pairs
    filename = parse_image_url(image_field)

    for annotation in task.get("annotations") or []:
        for result in annotation.get("result") or []:
            kind = result.get("type")
            value = result.get("value") or {}
            if kind == "choices":
                pairs.extend((filename, str(choice)) for choice in value.get("choices", []))
            elif kind == "textarea":
                texts = value.get("text", [])
                if texts:
                    pairs.append((filename, str(texts[0])))
            # any "*labels" list (rectanglelabels, polygonlabels, brushlabels,
            # keypointlabels, labels, taxonomyLabels, etc.)
            for key, items in value.items():
                if key.lower().endswith("labels") and isinstance(items, list):
                    pairs.extend((filename, str(item)) for item in items)
    return pairs
def _stage_single_asset(tmp_path: Path):
    """Stage the sample image and its annotations under ``tmp_path``.

    The image is copied into ``tmp_path/images`` and the source CSV is written
    twice (``train.csv`` and ``val.csv``) with ``image_path`` replaced by the
    full path of the copied image.

    Returns:
        (train_csv, val_csv, images_dir, staged_image) as ``Path`` objects.
    """
    staged_dir = tmp_path / "images"
    staged_dir.mkdir(parents=True, exist_ok=True)

    # Byte-for-byte copy of the sample raster
    staged_img = staged_dir / SRC_IMG.name
    staged_img.write_bytes(SRC_IMG.read_bytes())

    annotations = pd.read_csv(SRC_CSV)
    assert "image_path" in annotations.columns, "CSV must include 'image_path'"
    assert "label" in annotations.columns, "CSV must include 'label'"
    # Every row now refers to the staged copy by its full path
    annotations = annotations.assign(image_path=str(staged_img))

    train_csv, val_csv = tmp_path / "train.csv", tmp_path / "val.csv"
    for target in (train_csv, val_csv):
        annotations.to_csv(target, index=False)
    return train_csv, val_csv, staged_dir, staged_img


def _make_cfg(tmp_path: Path, train_csv: Path, val_csv: Path, images_dir: Path):
    """Build a minimal learner configuration dict rooted at ``tmp_path``.

    Creates ``tmp_path/work`` and returns a dict whose path entries are plain
    strings.
    """
    work_root = tmp_path / "work"
    work_root.mkdir(parents=True, exist_ok=True)
    return dict(
        workdir=str(work_root),
        images_dir=str(images_dir),
        train_csv=str(train_csv),
        val_csv=str(val_csv),
        # CSV has a single class '0'
        classes=["0"],
        epochs_per_round=1,
        batch_size=1,
        lr=1e-3,
        weight_decay=0.0,
        precision=32,
        device="cpu",
        num_workers=0,
        seed=123,
        # keep offline & fast by default
        use_release_weights=False,
        iou_eval=0.5,
        k_per_round=1,
        score_threshold_pred=0.2,
    )
def test_predict_and_acquisition_with_single_image(tmp_path):
    """Predict on one staged image, then run acquisition over that image."""
    train_csv, val_csv, images_dir, img = _stage_single_asset(tmp_path)
    cfg = _make_cfg(tmp_path, train_csv, val_csv, images_dir)
    learner = al.ActiveLearner(cfg)
    img_key = str(img)

    # Prediction is keyed by the image path
    predictions = learner.predict_images([img_key])
    assert set(predictions.keys()) == {img_key}
    frame = predictions[img_key]
    # Even if model returns nothing, we expect these columns
    expected_columns = ["xmin", "ymin", "xmax", "ymax", "label", "score", "image_path"]
    assert list(frame.columns) == expected_columns

    # Acquisition reads the candidate list from a one-path-per-line file
    unlabeled = tmp_path / "unlabeled.txt"
    unlabeled.write_text(f"{img_key}\n", encoding="utf-8")
    selected = learner.select_for_labeling(unlabeled, k=1)

    # Manifest exists with expected schema and bounded entropy
    manifest_path = Path(cfg["workdir"]) / "acquisition" / "selection_round.csv"
    assert manifest_path.exists()
    manifest = pd.read_csv(manifest_path)
    for column in ("image_path", "entropy", "n_preds", "mean_score"):
        assert column in manifest.columns

    # With one class, entropy is bounded by ln(1) = 0, so it must be 0
    first_entropy = float(manifest["entropy"].iloc[0])
    assert pytest.approx(first_entropy) == 0.0
    assert len(selected) == 1
    assert selected["image_path"].iloc[0] == img_key


def test_load_config_validates_and_loads(tmp_path):
    """A valid YAML loads; missing keys and empty classes are rejected."""
    train_csv, val_csv, images_dir, _ = _stage_single_asset(tmp_path)
    cfg = _make_cfg(tmp_path, train_csv, val_csv, images_dir)

    def _dump(name, payload):
        # Serialize payload as YAML under tmp_path and return its path as str
        target = tmp_path / name
        target.write_text(yaml.safe_dump(payload), encoding="utf-8")
        return str(target)

    # Happy path: all required keys survive and core values match
    loaded = al.load_config(_dump("cfg.yml", cfg))
    required_keys = (
        "workdir", "images_dir", "train_csv", "val_csv", "classes",
        "epochs_per_round", "batch_size", "precision", "device",
        "iou_eval", "k_per_round", "score_threshold_pred",
    )
    for key in required_keys:
        assert key in loaded
    assert loaded["classes"] == ["0"]
    assert int(loaded["epochs_per_round"]) == 1

    # Missing required keys -> KeyError
    dropped = {"classes", "workdir"}
    incomplete_cfg = {key: value for key, value in cfg.items() if key not in dropped}
    incomplete_path = _dump("bad.yml", incomplete_cfg)
    with pytest.raises(KeyError):
        al.load_config(incomplete_path)

    # Invalid classes -> ValueError
    no_classes_path = _dump("empty_classes.yml", {**cfg, "classes": []})
    with pytest.raises(ValueError):
        al.load_config(no_classes_path)


def test_image_entropy_from_predictions_multiclass_and_empty():
    """Check entropy, count and mean score for a two-class and an empty frame."""
    # Two classes; score mass: class "0" gets 1.0, class "1" gets 0.5
    classes = ["0", "1"]
    scores = [0.2, 0.8, 0.5]
    predictions = pd.DataFrame({"label": ["0", "0", "1"], "score": scores})
    entropy, n_preds, mean_score = al._image_entropy_from_predictions(predictions, classes)

    # Expected probabilities: [2/3, 1/3]
    probs = np.array([2 / 3, 1 / 3], dtype=float)
    expected_entropy = float(-(probs * np.log(probs)).sum())
    assert pytest.approx(entropy, rel=1e-6) == expected_entropy
    assert n_preds == 3
    assert pytest.approx(mean_score, rel=1e-6) == np.mean(scores)

    # Empty predictions -> maximum uncertainty log(C)
    no_predictions = pd.DataFrame(columns=["label", "score"])
    entropy_empty, n_empty, mean_empty = al._image_entropy_from_predictions(
        no_predictions, classes)
    assert pytest.approx(entropy_empty, rel=1e-9) == math.log(len(classes))
    assert n_empty == 0
    assert mean_empty == 0.0