Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
90d3dcc
Update minor README
vdplasthijs Jan 26, 2026
3090c73
UrbanHeatIslands use case: Guatemala case
BachirNILU Feb 19, 2026
7299da0
Update base data (add feat_ + add parameter for spatial splitting)
BachirNILU Feb 19, 2026
a813376
Add regression model/head
BachirNILU Feb 19, 2026
dc0d261
Add multimodel encoder (tabular+coords)
BachirNILU Feb 19, 2026
84fe08e
Readme: add diagram
BachirNILU Feb 19, 2026
79a79e2
update comments
BachirNILU Feb 19, 2026
d5d996d
docs: move diagram to docs/figures/
BachirNILU Feb 19, 2026
ab32d62
docs: move diagram to docs/figures/
BachirNILU Feb 19, 2026
a816825
Removing output_normalization='l2'
BachirNILU Feb 20, 2026
27bc9ec
data config + split_dir fixed
BachirNILU Feb 20, 2026
03a202d
Fix the need and remove split_dir parameter
gabrieletijunaityte Feb 23, 2026
2bc0b8e
fix: wire head, trainable modules, F.normalize, use_features param
BachirNILU Feb 23, 2026
adecb1d
fix: remove redundant _tabular_ready check in __init__
BachirNILU Feb 23, 2026
d7a2254
Delete split_indices_heat_guatemala_2026-02-20-1148.pth
gabrieletijunaityte Feb 24, 2026
e34a830
Introduce BaseMetrics and MetricsWrapper classes
gabrieletijunaityte Feb 24, 2026
8661e5e
Introduce BaseMetrics and MetricsWrapper classes
gabrieletijunaityte Feb 24, 2026
9514a83
New metrics
gabrieletijunaityte Feb 24, 2026
96ab9bb
Adapt metrics, losses for metrics wrapper
gabrieletijunaityte Feb 24, 2026
14e8cc7
Introduce metrics wrapper as config
gabrieletijunaityte Feb 24, 2026
b7f094a
Format hooks
gabrieletijunaityte Feb 24, 2026
524958a
Format hooks
gabrieletijunaityte Feb 24, 2026
a95f0b2
Format hooks
gabrieletijunaityte Feb 24, 2026
3e6967a
De-duplicate regression predictive model
gabrieletijunaityte Feb 24, 2026
f096f20
Metrics as configs, use predictive model instead of predictive regres…
gabrieletijunaityte Feb 24, 2026
6882407
Tabular dimensions introduced earlier to remove need for wiring
gabrieletijunaityte Feb 24, 2026
5d91521
Merge branch 'feature/contrastive_setup' of github.com:WUR-AI/aether …
gabrieletijunaityte Mar 2, 2026
539e609
Restructure aux handling in the dataset
gabrieletijunaityte Mar 2, 2026
b819531
Merge branch 'develop' into feature/contrastive_setup
gabrieletijunaityte Mar 2, 2026
810d45b
Fix dataset argument expected types
gabrieletijunaityte Mar 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/data/butterfly_aef.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dataset:
size: 256
format: tif
use_target_data: true # default true
use_aux_data: false # default false
use_aux_data: none # default false
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it none or "none"? I see both (in this file and butterfly_coords.yaml)

seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/butterfly_coords.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dataset:
modalities:
coords:
use_target_data: true
use_aux_data: false
use_aux_data: "none"
seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/butterfly_coords_text.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dataset:
modalities:
coords:
use_target_data: false
use_aux_data: true
use_aux_data: "all"
seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/butterfly_full_param_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dataset:
format: npy
year: 2019
use_target_data: true
use_aux_data: false
use_aux_data: none
seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/butterfly_s2_rgb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ dataset:
channels: rgb
preprocessing: zscored
use_target_data: true # default true
use_aux_data: false # default false
use_aux_data: none # default false
seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/heat_guatemala.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dataset:
coords: {}
use_target_data: true
use_features: true
use_aux_data: false
use_aux_data: none
seed: ${seed}
cache_dir: ${paths.cache_dir}

Expand Down
2 changes: 1 addition & 1 deletion configs/data/satbird-Kenya_coords.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dataset:
modalities:
coords:
use_target_data: true
use_aux_data: false
use_aux_data: none
seed: ${seed}
cache_dir: ${paths.cache_dir}
study_site: Kenya
Expand Down
2 changes: 1 addition & 1 deletion configs/data/satbird_USA-summer_coords.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dataset:
modalities:
coords:
use_target_data: true
use_aux_data: false
use_aux_data: none
seed: ${seed}
cache_dir: ${paths.cache_dir}
study_site: USA-summer
Expand Down
44 changes: 33 additions & 11 deletions src/data/base_caption_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,33 +52,53 @@ def _fill(template: str, fillers: Dict[str, str]) -> str:
return template

@abstractmethod
def _build_from_template(self, template_idx: int, row: List[Any]) -> str:
def _build_from_template(
self, template_idx: int, aux: torch.Tensor, top: List[str] | None = None
) -> str:
"""Build caption text from template and row of auxiliary data."""
pass

def random(self, aux_values: List[Any]) -> List[str]:
def random(self, aux_values) -> List[str]:
"""Return a caption from a randomly sampled template for each data point."""
formatted_rows = []
template_idx = random.choices(

batch_size = len(aux_values["aux"])

template_ids = random.choices(
range(len(self.templates)),
k=len(aux_values),
k=batch_size,
)
for idx, row in zip(template_idx, aux_values):
formatted_rows.append(self._build_from_template(idx, row))
for (
i,
template_idx,
) in enumerate(template_ids):
row_aux = aux_values["aux"][i]
row_top = aux_values.get("top")[i] if aux_values.get("top") else None
formatted_rows.append(
self._build_from_template(template_idx, aux=row_aux, top=row_top)
)

return formatted_rows

def all(self, aux_values: List[Any]) -> List[str]:
def all(self, aux_values) -> List[str]:
"""Return a list of captions from all available templates."""
formatted_rows = []
for row in aux_values:
for i in range(0, len(aux_values["aux"])):
descriptions = []
row_aux = aux_values["aux"][i]
row_top = aux_values.get("top")[i] if aux_values.get("top") else None

for template_idx in range(0, len(self)):
descriptions.append(self._build_from_template(template_idx, row))
descriptions.append(
self._build_from_template(template_idx, aux=row_aux, top=row_top)
)
formatted_rows.append(descriptions)

return formatted_rows

def build_concepts(self, aux_values) -> List[str]:
pass


class DummyCaptionBuilder(BaseCaptionBuilder):
"""Dummy caption builder for testing purposes."""
Expand All @@ -89,8 +109,10 @@ def __init__(self, templates_fname: str, data_dir: str, seed: int) -> None:
def sync_with_dataset(self, dataset) -> None:
pass

def _build_from_template(self, template_idx: int, row: List[Any]) -> str:
first_val = row[0].item() if torch.is_tensor(row) else row[0]
def _build_from_template(
self, template_idx: int, aux: torch.Tensor, top: List[str] | None = None
) -> str:
first_val = aux[0].item()
return f"Location with value {first_val}"


Expand Down
42 changes: 34 additions & 8 deletions src/data/base_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, final
from typing import Any, Dict, List, final

import numpy as np
import pandas as pd
Expand All @@ -16,7 +17,7 @@ def __init__(
data_dir: str,
modalities: dict,
use_target_data: bool = True,
use_aux_data: bool = False,
use_aux_data: Dict[str, List[str] | str] | str | None = None,
dataset_name: str = "BaseDataset",
seed: int = 12345,
mode: str = "train",
Expand Down Expand Up @@ -78,10 +79,26 @@ def __init__(
self.num_classes = None
self.tabular_dim = None
self.seed = seed
self.use_target_data: bool = use_target_data
self.use_aux_data: bool = use_aux_data
self.use_target_data = use_target_data
self.use_features = use_features

if use_aux_data is None or use_aux_data == "all":
self.use_aux_data = {
"aux": {
"pattern": "^aux_(?!.*top).*",
# 'columns' : []
},
"top": {
"pattern": "^aux_.*top.*",
# 'columns' : []
},
}

elif type(use_aux_data) is dict:
self.use_aux_data = use_aux_data
else:
self.use_aux_data = None

self.mode: str = mode # 'train', 'val', 'test'
self.records: dict[str, Any] = self.get_records()

Expand All @@ -107,16 +124,25 @@ def get_records(self) -> dict[str, Any]:
)
columns.append(f"{modality}_path")

# Include targets
# Include targets TODO: this could be moved under geo-modalities
if self.use_target_data:
self.target_names = [c for c in self.df.columns if "target_" in c]
columns.extend(self.target_names)
self.num_classes = len(self.target_names)

# Include aux data
if self.use_aux_data:
self.aux_names = [c for c in self.df.columns if "aux_" in c]
columns.extend(self.aux_names)
if self.use_aux_data is not None:
for k, val in self.use_aux_data.items():
if val.get("pattern"):
pattern = re.compile(val["pattern"])
aux_names = [x for x in self.df.columns if pattern.match(x)]
else:
aux_names = val.get(
"columns",
ValueError('use_aux_data should have "pattern" or "columns" defined'),
)
self.use_aux_data[k] = aux_names
columns.extend(aux_names)

# Include tabular features
if self.use_features:
Expand Down
60 changes: 28 additions & 32 deletions src/data/butterfly_caption_builder.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import math
import os
from typing import Any, List, override

import pandas as pd
import torch

from src.data.base_caption_builder import (
BaseCaptionBuilder,
Expand All @@ -27,19 +27,20 @@ def sync_with_dataset(self, dataset: BaseDataset) -> None:
corine_columns = self.get_corine_column_keys()
humanfootprint_columns = self.get_humanfootprint_column_keys()
aux_columns = {**bioclim_columns, **corine_columns, **humanfootprint_columns}
self.column_to_metadata_map = {}
self.column_to_metadata_map = {k: {} for k in dataset.use_aux_data.keys()}

for id, key in enumerate(dataset.aux_names):
if "aux_corine_frac" in key and "top" in key: # to avoid assert statement
description, units = None, None
else:
description, units = aux_columns.get(key) or (None, None)
assert description is not None, f"Key {key} not found in aux columns"
self.column_to_metadata_map[key] = {
"id": id,
"description": description,
"units": units,
}
for aux_cat, cols in dataset.use_aux_data.items():
for i, c in enumerate(cols):
if "top" in aux_cat:
description, units = None, None
else:
description, units = aux_columns.get(c) or (None, None)

self.column_to_metadata_map[aux_cat][c] = {
"id": i,
"description": description,
"units": units,
}

def get_corine_column_keys(self):
"""Returns metadata for corine columns."""
Expand Down Expand Up @@ -99,33 +100,28 @@ def get_humanfootprint_column_keys(self):
def _build_from_template(
self,
template_idx: int,
row: List[Any],
aux: torch.Tensor,
top: List[str] | None = None,
convert_corine_perc: bool = True,
) -> str:
"""Create caption from template and row of auxiliary data."""
template = self.templates[template_idx]
tokens = self.tokens_in_template[template_idx]
replacements = {}
for token in tokens:
if "aux_corine_frac" in token and "top" in token:
try:
values_dict_top = self.column_to_metadata_map[token]
except KeyError:
raise KeyError(
f"Token {token} not found in column_to_metadata_map {self.column_to_metadata_map}. Check if the token in the template matches the column names in the dataset."
)
idx_top = values_dict_top["id"]
referral_token = row[
idx_top
] # e.g., token 'aux_corine_frac_lowlevel_top_1' might refer to 'corine_frac_211' in this row
referral_token = (
"aux_" + referral_token if "aux_" not in referral_token else referral_token
init_token = token
if "top" in token:
idx = self.column_to_metadata_map["top"][token]["id"]
token = f"aux_{top[idx]}"
try:
values_dict = self.column_to_metadata_map["aux"][token]
except KeyError:
raise KeyError(
f"Token {token} not found in column_to_metadata_map {self.column_to_metadata_map}. Check if the token in the template matches the column names in the dataset."
)
values_dict = self.column_to_metadata_map[referral_token]
else:
values_dict = self.column_to_metadata_map[token]

idx = values_dict["id"]
value = row[idx]
value = aux[idx].item()

formatted_desc = values_dict["description"].lower() or ""
units = values_dict["units"]
Expand All @@ -139,7 +135,7 @@ def _build_from_template(
formatted_desc = formatted_desc + f' ({round(value)} {units if units else ""})'
else:
formatted_desc = formatted_desc + f' of {round(value)} {units if units else ""}'
replacements[token] = formatted_desc
replacements[init_token] = formatted_desc

template = self._fill(template, replacements)
return template
Expand Down
13 changes: 10 additions & 3 deletions src/data/butterfly_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Any, Dict, override
from typing import Any, Dict, List, override

import numpy as np
import pooch
Expand All @@ -17,7 +17,7 @@ def __init__(
data_dir: str,
modalities: dict,
use_target_data: bool = True,
use_aux_data: bool = False,
use_aux_data: Dict[str, List[str] | str] | None = None,
seed: int = 12345,
cache_dir: str = None,
mock: bool = False,
Expand Down Expand Up @@ -165,7 +165,14 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
)

if self.use_aux_data:
formatted_row["aux"] = [row[i] for i in self.aux_names]
formatted_row["aux"] = {}
for aux_cat, vals in self.use_aux_data.items():
if aux_cat == "aux":
formatted_row["aux"][aux_cat] = torch.tensor(
[row[v] for v in vals], dtype=torch.float32
)
else:
formatted_row["aux"][aux_cat] = [row[v] for v in vals]

return formatted_row

Expand Down
Loading