From daa8ce1c6e9df2de80a9a751e8a96d0a9869efc5 Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 7 Jan 2026 11:28:41 +0000 Subject: [PATCH 01/15] Virtual v1 --- compression/make_virtual_snapshot.py | 37 +++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py index 5f6e37f8..dc17eca1 100644 --- a/compression/make_virtual_snapshot.py +++ b/compression/make_virtual_snapshot.py @@ -5,15 +5,24 @@ import shutil -def make_virtual_snapshot(snapshot, membership, output_file, snap_nr): +def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, snap_nr): """ - Given a FLAMINGO snapshot and group membership files, - create a new virtual snapshot with group info. + Given a snapshot and auxilary files, create + a new virtual snapshot with all datasets combined. """ - # Check which datasets exist in the membership files + # Copy the input virtual snapshot to the output + shutil.copyfile(snapshot, output_file) + + # Open the output file + outfile = h5py.File(output_file, "r+") + + # TODO: Loop + auxilary = auxilary_snapshots[0] + + # Check which datasets exist in the auxilary files # and store their attributes and datatype - filename = membership.format(file_nr=0, snap_nr=snap_nr) + filename = auxilary.format(file_nr=0, snap_nr=snap_nr) dset_attrs = {} dset_dtype = {} with h5py.File(filename, "r") as infile: @@ -26,32 +35,26 @@ def make_virtual_snapshot(snapshot, membership, output_file, snap_nr): attrs = dict(infile[f"PartType{ptype}/{dset}"].attrs) dtype = infile[f"PartType{ptype}/{dset}"].dtype - # Some membership files are missing these attributes + # Some auxilary files are missing these attributes if not "Value stored as physical" in attrs: print(f"Setting comoving attrs for PartType{ptype}/{dset}") attrs["Value stored as physical"] = [1] attrs["Property can be converted to comoving"] = [0] - # Add a flag that these are stored in the membership files + # Add a flag that these datasets are stored in the auxilary files attrs["Auxilary file"] = [1] # Store the values we need for later dset_attrs[f"PartType{ptype}"][dset] = attrs dset_dtype[f"PartType{ptype}"][dset] = dtype - # Copy the input virtual snapshot to the output - shutil.copyfile(snapshot, output_file) - - # Open the output file - outfile = h5py.File(output_file, "r+") - - # Loop over input membership files to get dataset shapes + # Loop over input auxilary files to get dataset shapes file_nr = 0 filenames = [] shapes = [] counts = [] while True: - filename = membership.format(file_nr=file_nr, snap_nr=snap_nr) + filename = auxilary.format(file_nr=file_nr, snap_nr=snap_nr) if os.path.exists(filename): filenames.append(filename) with h5py.File(filename, "r") as infile: @@ -73,7 +76,7 @@ def make_virtual_snapshot(snapshot, membership, output_file, snap_nr): break file_nr += 1 if file_nr == 0: - raise IOError(f"Failed to find files matching: {membership}") + raise IOError(f"Failed to find files matching: {auxilary}") # Loop over particle types in the output for ptype in range(7): @@ -174,7 +177,7 @@ def make_virtual_snapshot(snapshot, membership, output_file, snap_nr): output_file = args.output_file.format(snap_nr=args.snap_nr) # Make a new virtual snapshot with group info - make_virtual_snapshot(virtual_snapshot, args.membership, output_file, args.snap_nr) + make_virtual_snapshot(virtual_snapshot, [args.membership], output_file, args.snap_nr) # Set file paths for datasets abs_snapshot_dir = os.path.abspath(os.path.dirname(virtual_snapshot))
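For context before patch 02 reworks this script: the mechanism patch 01 builds on is h5py's virtual dataset API, in which a single VirtualLayout spanning the concatenated particle count is filled slice by slice with one VirtualSource per chunk file. A minimal runnable sketch of that pattern follows; the filenames, counts and dtype are illustrative, not taken from the patches.

import h5py

# Hypothetical chunk files, each holding a 1D PartType4/GroupNr_bound dataset
files = ["membership_0077.0.hdf5", "membership_0077.1.hdf5"]
counts = [100, 80]  # rows in each chunk file

layout = h5py.VirtualLayout(shape=(sum(counts),), dtype="i8")
offset = 0
for fname, n in zip(files, counts):
    # Map rows [offset, offset + n) of the combined dataset onto this chunk file
    layout[offset : offset + n] = h5py.VirtualSource(
        fname, "PartType4/GroupNr_bound", shape=(n,)
    )
    offset += n

with h5py.File("virtual_example.hdf5", "w") as f:
    # fillvalue is what readers see for any region not mapped to a source
    f.create_virtual_dataset("PartType4/GroupNr_bound", layout, fillvalue=-999)

Reading the resulting dataset then behaves as if the chunk files had been concatenated, which is how the combined snapshot-plus-membership view is assembled in these patches.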
From 688baad5cf99671f109dba637b3de48121ee9b5d Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 7 Jan 2026 15:32:43 +0000 Subject: [PATCH 02/15] Combine update_vds_path with make_virtual_snapshot --- compression/make_virtual_snapshot.py | 353 ++++++++++++++++++--------- compression/update_vds_paths.py | 118 --------- 2 files changed, 233 insertions(+), 238 deletions(-) delete mode 100644 compression/update_vds_paths.py diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py index dc17eca1..934b92cc 100644 --- a/compression/make_virtual_snapshot.py +++ b/compression/make_virtual_snapshot.py @@ -3,12 +3,74 @@ import os.path import h5py import shutil +import numpy as np -def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, snap_nr): +class SafeDict(dict): + def __missing__(self, key): + # Return the key back in braces so it remains in the string + return "{" + key + "}" + + +def update_vds_paths(dset, modify_function): + """ + Modify the virtual paths of the specified dataset + + Note that querying the source dataspace and selection does not appear + to work (invalid pointer error from h5py) so here we assume that we're + referencing all of the source dataspace, which is correct for SWIFT + snapshots. + + dset: a h5py.Dataset object + modify_function: a function which takes the old path as its argument and + returns the new path + """ + + # Choose a temporary path for the new virtual dataset + path = dset.name + tmp_path = dset.name + ".__tmp__" + + # Build the creation property list for the new dataset + plist = h5py.h5p.create(h5py.h5p.DATASET_CREATE) + for vs in dset.virtual_sources(): + bounds = vs.vspace.get_select_bounds() + if bounds is not None: + lower, upper = bounds + size = np.asarray(upper, dtype=int) - np.asarray(lower, dtype=int) + 1 + src_space = h5py.h5s.create_simple(tuple(size)) + new_name = modify_function(vs.file_name) + plist.set_virtual( + vs.vspace, new_name.encode(), vs.dset_name.encode(), src_space + ) + + # Create the new dataset + tmp_dset = h5py.h5d.create( + dset.file["/"].id, + tmp_path.encode(), + dset.id.get_type(), + dset.id.get_space(), + dcpl=plist, + ) + tmp_dset = h5py.Dataset(tmp_dset) + for attr_name in dset.attrs: + tmp_dset.attrs[attr_name] = dset.attrs[attr_name] + + # Rename the new dataset + f = dset.file + del f[path] + f[path] = f[tmp_path] + del f[tmp_path] + + +def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, absolute_paths=False): """ Given a snapshot and auxilary files, create a new virtual snapshot with all datasets combined.
+ + snapshot: Path to the snapshot file + auxilary_snapshots: List of auxiliary file patterns + output_file: Path to the output virtual snapshot + absolute_paths: If True, use absolute paths; if False, use relative paths """ # Copy the input virtual snapshot to the output @@ -17,114 +79,169 @@ def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, snap_nr): # Open the output file outfile = h5py.File(output_file, "r+") - # TODO: Loop - auxilary = auxilary_snapshots[0] + # Calculate directories for path updates + abs_snapshot_dir = os.path.abspath(os.path.dirname(snapshot)) + abs_auxilary_dirs = [ + os.path.abspath(os.path.dirname(aux.format(file_nr=0))) + for aux in auxilary_snapshots + ] + abs_output_dir = os.path.abspath(os.path.dirname(output_file)) + + if absolute_paths: + snapshot_dir = abs_snapshot_dir + auxilary_dirs = abs_auxilary_dirs + else: + snapshot_dir = os.path.relpath(abs_snapshot_dir, abs_output_dir) + auxilary_dirs = [ + os.path.relpath(aux_dir, abs_output_dir) + for aux_dir in abs_auxilary_dirs + ] + + # Create path replacement functions + def make_replace_path(target_dir): + def replace_path(old_path): + basename = os.path.basename(old_path) + return os.path.join(target_dir, basename) + return replace_path + + replace_snapshot_path = make_replace_path(snapshot_dir) + auxilary_path_replacers = [make_replace_path(d) for d in auxilary_dirs] + + all_auxilary_datasets = {} + + for aux_index, auxilary in enumerate(auxilary_snapshots): - # Check which datasets exist in the auxilary files - # and store their attributes and datatype - filename = auxilary.format(file_nr=0, snap_nr=snap_nr) - dset_attrs = {} - dset_dtype = {} - with h5py.File(filename, "r") as infile: + # Check which datasets exist in the auxilary files + # and store their attributes and datatype + filename = auxilary.format(file_nr=0) + dset_attrs = {} + dset_dtype = {} + with h5py.File(filename, "r") as infile: + for ptype in range(7): + if not f"PartType{ptype}" in infile: + continue + dset_attrs[f"PartType{ptype}"] = {} + dset_dtype[f"PartType{ptype}"] = {} + for dset in infile[f"PartType{ptype}"].keys(): + attrs = dict(infile[f"PartType{ptype}/{dset}"].attrs) + dtype = infile[f"PartType{ptype}/{dset}"].dtype + + # Some auxilary files are missing these attributes + if not "Value stored as physical" in attrs: + print(f"Setting comoving attrs for PartType{ptype}/{dset}") + attrs["Value stored as physical"] = [1] + attrs["Property can be converted to comoving"] = [0] + + # Add a flag that these datasets are stored in the auxilary files + attrs["Auxilary file"] = [1] + + # Store the values we need for later + dset_attrs[f"PartType{ptype}"][dset] = attrs + dset_dtype[f"PartType{ptype}"][dset] = dtype + + # Check we don't have this dataset in any of the other auxilary files + dset_path = f"PartType{ptype}/{dset}" + if dset_path in all_auxilary_datasets: + other_file = all_auxilary_datasets[f"PartType{ptype}/{dset}"] + raise ValueError(f"{dset_path} is in {auxilary} and {other_file}") + all_auxilary_datasets[dset_path] = auxilary + + # Loop over input auxilary files to get dataset shapes + file_nr = 0 + filenames = [] + shapes = [] + counts = [] + while True: + filename = auxilary.format(file_nr=file_nr) + if os.path.exists(filename): + filenames.append(filename) + with h5py.File(filename, "r") as infile: + shape = {} + count = {} + for ptype in range(7): + if f"PartType{ptype}" not in dset_attrs: + continue + shape[f"PartType{ptype}"] = {} + # Get the shape for each dataset + for dset in 
dset_attrs[f"PartType{ptype}"]: + s = infile[f"PartType{ptype}/{dset}"].shape + shape[f"PartType{ptype}"][dset] = s + # Get the number of particles in this chunk file + count[f"PartType{ptype}"] = s[0] + shapes.append(shape) + counts.append(count) + else: + break + file_nr += 1 + if file_nr == 0: + raise IOError(f"Failed to find files matching: {auxilary}") + + # Loop over particle types in the output for ptype in range(7): - if not f"PartType{ptype}" in infile: + if f"PartType{ptype}" not in dset_attrs: continue - dset_attrs[f"PartType{ptype}"] = {} - dset_dtype[f"PartType{ptype}"] = {} - for dset in infile[f"PartType{ptype}"].keys(): - attrs = dict(infile[f"PartType{ptype}/{dset}"].attrs) - dtype = infile[f"PartType{ptype}/{dset}"].dtype - - # Some auxilary files are missing these attributes - if not "Value stored as physical" in attrs: - print(f"Setting comoving attrs for PartType{ptype}/{dset}") - attrs["Value stored as physical"] = [1] - attrs["Property can be converted to comoving"] = [0] - - # Add a flag that these datasets are stored in the auxilary files - attrs["Auxilary file"] = [1] - - # Store the values we need for later - dset_attrs[f"PartType{ptype}"][dset] = attrs - dset_dtype[f"PartType{ptype}"][dset] = dtype - - # Loop over input auxilary files to get dataset shapes - file_nr = 0 - filenames = [] - shapes = [] - counts = [] - while True: - filename = auxilary.format(file_nr=file_nr, snap_nr=snap_nr) - if os.path.exists(filename): - filenames.append(filename) - with h5py.File(filename, "r") as infile: - shape = {} - count = {} - for ptype in range(7): - if f"PartType{ptype}" not in dset_attrs: - continue - shape[f"PartType{ptype}"] = {} - # Get the shape for each dataset - for dset in dset_attrs[f"PartType{ptype}"]: - s = infile[f"PartType{ptype}/{dset}"].shape - shape[f"PartType{ptype}"][dset] = s - # Get the number of particles in this chunk file - count[f"PartType{ptype}"] = s[0] - shapes.append(shape) - counts.append(count) - else: - break - file_nr += 1 - if file_nr == 0: - raise IOError(f"Failed to find files matching: {auxilary}") - - # Loop over particle types in the output - for ptype in range(7): - if f"PartType{ptype}" not in dset_attrs: - continue - - # Create virtual layout for new datasets - layouts = {} - nr_parts = sum([count[f"PartType{ptype}"] for count in counts]) - for dset in dset_attrs[f"PartType{ptype}"]: - full_shape = list(shapes[0][f"PartType{ptype}"][dset]) - full_shape[0] = nr_parts - full_shape = tuple(full_shape) - dtype = dset_dtype[f"PartType{ptype}"][dset] - layouts[dset] = h5py.VirtualLayout(shape=full_shape, dtype=dtype) - - # Loop over input files - offset = 0 - for filename, count, shape in zip(filenames, counts, shapes): - n_part = count[f"PartType{ptype}"] + + # Create virtual layout for new datasets + layouts = {} + nr_parts = sum([count[f"PartType{ptype}"] for count in counts]) for dset in dset_attrs[f"PartType{ptype}"]: - layouts[dset][offset : offset + n_part] = h5py.VirtualSource( - filename, - f"PartType{ptype}/{dset}", - shape=shape[f"PartType{ptype}"][dset], - ) - offset += n_part - - # Create the virtual datasets, renaming datasets if they - # already exist in the snapshot - for dset, attrs in dset_attrs[f"PartType{ptype}"].items(): - if f"PartType{ptype}/{dset}" in outfile: - outfile.move(f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap") - outfile.create_virtual_dataset( - f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999 - ) - for k, v in attrs.items(): - outfile[f"PartType{ptype}/{dset}"].attrs[k] = v + 
full_shape = list(shapes[0][f"PartType{ptype}"][dset]) + full_shape[0] = nr_parts + full_shape = tuple(full_shape) + dtype = dset_dtype[f"PartType{ptype}"][dset] + layouts[dset] = h5py.VirtualLayout(shape=full_shape, dtype=dtype) + + # Loop over input files + offset = 0 + for filename, count, shape in zip(filenames, counts, shapes): + n_part = count[f"PartType{ptype}"] + for dset in dset_attrs[f"PartType{ptype}"]: + layouts[dset][offset : offset + n_part] = h5py.VirtualSource( + filename, + f"PartType{ptype}/{dset}", + shape=shape[f"PartType{ptype}"][dset], + ) + offset += n_part - # Create the virtual datasets, renaming datasets if they - # already exist in the snapshot - for dset, attrs in dset_attrs[f"PartType{ptype}"].items(): - if f"PartType{ptype}/{dset}" in outfile: - outfile.move(f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap") - outfile.create_virtual_dataset( - f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999 - ) - for k, v in attrs.items(): - outfile[f"PartType{ptype}/{dset}"].attrs[k] = v + # Create the virtual datasets, renaming datasets if they + # already exist in the snapshot + for dset, attrs in dset_attrs[f"PartType{ptype}"].items(): + if f"PartType{ptype}/{dset}" in outfile: + outfile.move(f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap") + outfile.create_virtual_dataset( + f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999 + ) + for k, v in attrs.items(): + outfile[f"PartType{ptype}/{dset}"].attrs[k] = v - # Copy GroupNr_bound to HaloCatalogueIndex, since that is the name in SOAP - if dset == "GroupNr_bound": + # Update paths for this newly created auxiliary dataset + update_vds_paths(outfile[f"PartType{ptype}/{dset}"], auxilary_path_replacers[aux_index]) + + # Copy GroupNr_bound to HaloCatalogueIndex, since + # that is the name in SOAP + if dset == "GroupNr_bound": outfile.create_virtual_dataset( - f"PartType{ptype}/HaloCatalogueIndex", - layouts["GroupNr_bound"], - fillvalue=-999, + f"PartType{ptype}/HaloCatalogueIndex", + layouts["GroupNr_bound"], + fillvalue=-999, ) - for k, v in outfile[f"PartType{ptype}/GroupNr_bound"].attrs.items(): - outfile[f"PartType{ptype}/HaloCatalogueIndex"].attrs[k] = v + for k, v in outfile[f"PartType{ptype}/GroupNr_bound"].attrs.items(): + outfile[f"PartType{ptype}/HaloCatalogueIndex"].attrs[k] = v + + # Update paths for HaloCatalogueIndex too + update_vds_paths(outfile[f"PartType{ptype}/HaloCatalogueIndex"], auxilary_path_replacers[aux_index]) + + # Update paths for all original snapshot datasets + for ptype in range(7): + ptype_name = f"PartType{ptype}" + if ptype_name in outfile: + for dset_name in list(outfile[ptype_name].keys()): + dset = outfile[f"{ptype_name}/{dset_name}"] + if dset.is_virtual: + # Check if this is an auxiliary dataset (skip those, already handled) + if dset.attrs.get("Auxilary file", [0])[0] != 1: + # This is an original snapshot dataset + update_vds_paths(dset, replace_snapshot_path) # Done outfile.close() @@ -133,7 +250,6 @@ if __name__ == "__main__": import argparse - from update_vds_paths import update_virtual_snapshot_paths # For description of parameters run the following: $ python make_virtual_snapshot.py --help parser = argparse.ArgumentParser( @@ -149,9 +265,10 @@ def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, snap_nr): help="Name of the SWIFT virtual snapshot file, e.g. snapshot_{snap_nr:04}.hdf5", ) parser.add_argument( - "membership", + "auxilary_snapshots", type=str, - help="Format string for membership files, e.g. membership_{snap_nr:04}.{file_nr}.hdf5", + nargs="+", + help="One or more format strings for auxilary files, e.g.
membership_{snap_nr:04}.{file_nr}.hdf5", ) parser.add_argument( "output_file", @@ -176,20 +293,16 @@ def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, snap_nr): virtual_snapshot = args.virtual_snapshot.format(snap_nr=args.snap_nr) output_file = args.output_file.format(snap_nr=args.snap_nr) - # Make a new virtual snapshot with group info - make_virtual_snapshot(virtual_snapshot, [args.membership], output_file, args.snap_nr) + # We don't want to replace {file_nr} for auxilary snapshots + auxilary_snapshots = [ + filename.format_map(SafeDict({'snap_nr': args.snap_nr})) + for filename in args.auxilary_snapshots + ] - # Set file paths for datasets - abs_snapshot_dir = os.path.abspath(os.path.dirname(virtual_snapshot)) - abs_membership_dir = os.path.abspath( - os.path.dirname(args.membership.format(snap_nr=args.snap_nr, file_nr=0)) + # Make a new virtual snapshot with group info + make_virtual_snapshot( + virtual_snapshot, + auxilary_snapshots, + output_file, + absolute_paths=args.absolute_paths ) - if args.absolute_paths: - # Ensure all paths in the virtual file are absolute to avoid VDS prefix issues - # (we probably need to pick up datasets from two different directories) - update_virtual_snapshot_paths(output_file, abs_snapshot_dir, abs_membership_dir) - else: - abs_output_dir = os.path.abspath(os.path.dirname(output_file)) - rel_snapshot_dir = os.path.relpath(abs_snapshot_dir, abs_output_dir) - rel_membership_dir = os.path.relpath(abs_membership_dir, abs_output_dir) - update_virtual_snapshot_paths(output_file, rel_snapshot_dir, rel_membership_dir) diff --git a/compression/update_vds_paths.py b/compression/update_vds_paths.py deleted file mode 100644 index da24d57d..00000000 --- a/compression/update_vds_paths.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/env python - -import sys -import h5py -import numpy as np -import os.path - - -def update_vds_paths(dset, modify_function): - """ - Modify the virtual paths of the specified dataset - - Note that querying the source dataspace and selection does not appear - to work (invalid pointer error from h5py) so here we assume that we're - referencing all of the source dataspace, which is correct for SWIFT - snapshots. 
- - dset: a h5py.Dataset object - modify_function: a function which takes the old path as its argument and - returns the new path - """ - - # Choose a temporary path for the new virtual dataset - path = dset.name - tmp_path = dset.name + ".__tmp__" - - # Build the creation property list for the new dataset - plist = h5py.h5p.create(h5py.h5p.DATASET_CREATE) - for vs in dset.virtual_sources(): - bounds = vs.vspace.get_select_bounds() - if bounds is not None: - lower, upper = bounds - size = np.asarray(upper, dtype=int) - np.asarray(lower, dtype=int) + 1 - src_space = h5py.h5s.create_simple(tuple(size)) - new_name = modify_function(vs.file_name) - plist.set_virtual( - vs.vspace, new_name.encode(), vs.dset_name.encode(), src_space - ) - - # Create the new dataset - tmp_dset = h5py.h5d.create( - dset.file["/"].id, - tmp_path.encode(), - dset.id.get_type(), - dset.id.get_space(), - dcpl=plist, - ) - tmp_dset = h5py.Dataset(tmp_dset) - for attr_name in dset.attrs: - tmp_dset.attrs[attr_name] = dset.attrs[attr_name] - - # Rename the new dataset - f = dset.file - del f[path] - f[path] = f[tmp_path] - del f[tmp_path] - - -def update_virtual_snapshot_paths(filename, snapshot_dir=None, membership_dir=None): - """ - Add full paths to virtual datasets in the specified file - """ - f = h5py.File(filename, "r+") - - # Find all datasets in the file - all_datasets = [] - - def visit_datasets(name, obj): - if isinstance(obj, h5py.Dataset): - all_datasets.append(obj) - - f.visititems(visit_datasets) - - def replace_snapshot_path(old_path): - basename = os.path.basename(old_path) - return os.path.join(snapshot_dir, basename) - - def replace_membership_path(old_path): - basename = os.path.basename(old_path) - return os.path.join(membership_dir, basename) - - # Loop over datasets and update paths if necessary - for dset in all_datasets: - if dset.is_virtual: - name = dset.name.split("/")[-1] - # Check if the dataset comes from a membership file - if dset.attrs.get("Auxilary file", [0])[0] == 1: - if membership_dir is not None: - update_vds_paths(dset, replace_membership_path) - # Catch old datasets which didn't have the "Auxilary file" set - elif name in ( - "GroupNr_all", - "GroupNr_bound", - "Rank_bound", - "HaloCatalogueIndex", - "SpecificPotentialEnergies", - ): - if membership_dir is not None: - update_vds_paths(dset, replace_membership_path) - # Catch old case of FOF IDs from membership files - elif (name == "FOFGroupIDs") and ("PartType1/FOFGroupIDs_old" in f): - if membership_dir is not None: - update_vds_paths(dset, replace_membership_path) - # Data comes from the snapshot files - else: - if snapshot_dir is not None: - update_vds_paths(dset, replace_snapshot_path) - - f.close() - - -if __name__ == "__main__": - - filename = sys.argv[1] # Virtual snapshot file to update - snapshot_dir = sys.argv[2] # Directory with the real snapshot files - membership_dir = sys.argv[3] # Directory with the real membership files - - update_virtual_snapshot_paths(filename, snapshot_dir, membership_dir)
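A note on the SafeDict helper introduced at the top of make_virtual_snapshot.py in patch 02: combined with str.format_map it substitutes only the keys it is given and passes unknown placeholders through untouched, which is how {snap_nr} is filled in while {file_nr} survives for the per-chunk loop. An illustrative interpreter session (the template string is an example, not taken from the scripts):

>>> template = "membership_{snap_nr:04d}.{file_nr}.hdf5"
>>> template.format_map(SafeDict({"snap_nr": 77}))
'membership_0077.{file_nr}.hdf5'

One caveat: the pass-through only round-trips placeholders written without a format spec. A missing key spelled {file_nr:d} would have the spec applied to the literal string "{file_nr}" and raise a ValueError; the bare {file_nr} patterns used in these scripts avoid that.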
From a3bde138476ae3397db52bdbe9c4a35b0d6ed6aa Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 7 Jan 2026 15:55:37 +0000 Subject: [PATCH 03/15] Virtual snapshot creation using multiple auxilary files --- compression/make_virtual_snapshot.py | 63 +++++++++++++------- scripts/COLIBRE/compress_group_membership.sh | 5 +- scripts/EAGLE.sh | 6 +- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py index 934b92cc..0679ff7a 100644 --- a/compression/make_virtual_snapshot.py +++ b/compression/make_virtual_snapshot.py @@ -62,11 +62,13 @@ def update_vds_paths(dset, modify_function): del f[tmp_path] -def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, absolute_paths=False): +def make_virtual_snapshot( + snapshot, auxilary_snapshots, output_file, absolute_paths=False +): """ Given a snapshot and auxilary files, create a new virtual snapshot with all datasets combined. - + snapshot: Path to the snapshot file auxilary_snapshots: List of auxiliary file patterns output_file: Path to the output virtual snapshot absolute_paths: If True, use absolute paths; if False, use relative paths @@ -86,15 +88,14 @@ def make_virtual_snapshot(snapshot, auxilary_snapshots, output_file, absolute_pa for aux in auxilary_snapshots ] abs_output_dir = os.path.abspath(os.path.dirname(output_file)) - + if absolute_paths: snapshot_dir = abs_snapshot_dir auxilary_dirs = abs_auxilary_dirs else: snapshot_dir = os.path.relpath(abs_snapshot_dir, abs_output_dir) auxilary_dirs = [ - os.path.relpath(aux_dir, abs_output_dir) - for aux_dir in abs_auxilary_dirs + os.path.relpath(aux_dir, abs_output_dir) for aux_dir in abs_auxilary_dirs ] # Create path replacement functions @@ -102,13 +103,14 @@ def make_replace_path(target_dir): def replace_path(old_path): basename = os.path.basename(old_path) return os.path.join(target_dir, basename) + return replace_path replace_snapshot_path = make_replace_path(snapshot_dir) auxilary_path_replacers = [make_replace_path(d) for d in auxilary_dirs] all_auxilary_datasets = {} - + for aux_index, auxilary in enumerate(auxilary_snapshots): # Check which datasets exist in the auxilary files @@ -143,7 +145,9 @@ def replace_path(old_path): dset_attrs[f"PartType{ptype}"][dset] = attrs dset_dtype[f"PartType{ptype}"][dset] = dtype - raise ValueError(f"{dset_path} is in {auxilary} and {other_file}") + raise ValueError( + f"{dset_path} is in {auxilary} and {other_file}" + ) all_auxilary_datasets[dset_path] = auxilary @@ -207,7 +211,9 @@ def replace_path(old_path): # already exist in the snapshot for dset, attrs in dset_attrs[f"PartType{ptype}"].items(): if f"PartType{ptype}/{dset}" in outfile: - outfile.move(f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap") + outfile.move( + f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap" + ) outfile.create_virtual_dataset( f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999 ) @@ -215,9 +221,12 @@ def replace_path(old_path): outfile[f"PartType{ptype}/{dset}"].attrs[k] = v # Update paths for this newly created auxiliary dataset - update_vds_paths(outfile[f"PartType{ptype}/{dset}"], auxilary_path_replacers[aux_index]) + update_vds_paths( + outfile[f"PartType{ptype}/{dset}"], + auxilary_path_replacers[aux_index], + ) - # Copy GroupNr_bound to HaloCatalogueIndex, since + # Copy GroupNr_bound to HaloCatalogueIndex, since # that is the name in SOAP if dset == "GroupNr_bound": outfile.create_virtual_dataset( @@ -227,9 +236,12 @@ def replace_path(old_path): ) for k, v in outfile[f"PartType{ptype}/GroupNr_bound"].attrs.items(): outfile[f"PartType{ptype}/HaloCatalogueIndex"].attrs[k] = v - + # Update paths for HaloCatalogueIndex too - update_vds_paths(outfile[f"PartType{ptype}/HaloCatalogueIndex"], auxilary_path_replacers[aux_index]) + update_vds_paths( + outfile[f"PartType{ptype}/HaloCatalogueIndex"], + auxilary_path_replacers[aux_index], + ) # Update paths for all original snapshot datasets for ptype in range(7): @@ -260,25 +272,28 @@ def
replace_path(old_path): ) ) parser.add_argument( - "virtual_snapshot", + "--virtual-snapshot", type=str, + required=True, help="Name of the SWIFT virtual snapshot file, e.g. snapshot_{snap_nr:04}.hdf5", ) parser.add_argument( - "auxilary_snapshots", + "--auxilary-snapshots", type=str, nargs="+", + required=True, help="One or more format strings for auxilary files, e.g. membership_{snap_nr:04}.{file_nr}.hdf5", ) parser.add_argument( - "output_file", + "--output-file", type=str, + required=True, help="Name of the virtual snapshot to create, e.g. membership_{snap_nr:04}.hdf5", ) parser.add_argument( - "snap_nr", + "--snap-nr", type=int, - nargs="?", + required=False, default=-1, help="Snapshot number (default: -1). Not required if snap_nr is present in filenames passed.", ) @@ -289,20 +304,24 @@ def replace_path(old_path): ) args = parser.parse_args() + print(f"Creating virtual snapshot") + for k, v in vars(args).items(): + print(f" {k}: {v}") + # Substitute snap number virtual_snapshot = args.virtual_snapshot.format(snap_nr=args.snap_nr) output_file = args.output_file.format(snap_nr=args.snap_nr) # We don't want to replace {file_nr} for auxilary snapshots auxilary_snapshots = [ - filename.format_map(SafeDict({'snap_nr': args.snap_nr})) + filename.format_map(SafeDict({"snap_nr": args.snap_nr})) for filename in args.auxilary_snapshots ] # Make a new virtual snapshot with group info make_virtual_snapshot( - virtual_snapshot, - auxilary_snapshots, - output_file, - absolute_paths=args.absolute_paths + virtual_snapshot, + auxilary_snapshots, + output_file, + absolute_paths=args.absolute_paths, ) diff --git a/scripts/COLIBRE/compress_group_membership.sh b/scripts/COLIBRE/compress_group_membership.sh index ee34965f..69587a22 100644 --- a/scripts/COLIBRE/compress_group_membership.sh +++ b/scripts/COLIBRE/compress_group_membership.sh @@ -90,7 +90,10 @@ echo "Creating virtual snapshot" snapshot="${output_dir}/${sim}/snapshots/colibre_${snapnum}/colibre_${snapnum}.hdf5" membership="${output_filename}.{file_nr}.hdf5" virtual="${outbase}/colibre_with_SOAP_membership_${snapnum}.hdf5" -python compression/make_virtual_snapshot.py $snapshot $membership $virtual +python compression/make_virtual_snapshot.py \ + --virtual-snapshot $snapshot \ + --auxilary-snapshots $membership \ + --output-file $virtual echo "Setting virtual file to be read-only" chmod a=r "${virtual}" diff --git a/scripts/EAGLE.sh b/scripts/EAGLE.sh index 8f87ef72..d8ffd65b 100755 --- a/scripts/EAGLE.sh +++ b/scripts/EAGLE.sh @@ -78,9 +78,9 @@ python "${soap_dir}/create_virtual_snapshot.py" "snap_${snap_nr}.0.hdf5" cd - python compression/make_virtual_snapshot.py \ - "${output_dir}/swift_snapshots/swift_${snap_nr}/snap_${snap_nr}.hdf5" \ - "${output_dir}/SOAP_uncompressed/membership_${snap_nr}/membership_${snap_nr}.{file_nr}.hdf5" \ - "${output_dir}/SOAP_uncompressed/snap_${snap_nr}.hdf5" \ + --virtual-snapshot "${output_dir}/swift_snapshots/swift_${snap_nr}/snap_${snap_nr}.hdf5" \ + --auxilary-snapshots "${output_dir}/SOAP_uncompressed/membership_${snap_nr}/membership_${snap_nr}.{file_nr}.hdf5" \ + --output-file "${output_dir}/SOAP_uncompressed/snap_${snap_nr}.hdf5" ######### Run SOAP From 73dc32ec4051679c0204641ed5b2115b96d3a9c1 Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 09:59:23 +0000 Subject: [PATCH 04/15] BirthHaloCatalogueIndex v1 --- misc/compute_BirthHaloCatalogueIndex.py | 247 ++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 misc/compute_BirthHaloCatalogueIndex.py diff --git
a/misc/compute_BirthHaloCatalogueIndex.py b/misc/compute_BirthHaloCatalogueIndex.py new file mode 100644 index 00000000..3f458f48 --- /dev/null +++ b/misc/compute_BirthHaloCatalogueIndex.py @@ -0,0 +1,247 @@ +#!/bin/env python + +""" +# TODO: Update +match_group_membership.py + +This script matches halos between different simulations run from the +same initial conditions. + +Usage: + + mpirun -- python -u misc/match_group_membership \ + --snap-basename1 SNAP_BASENAME1 \ + --snap-basename2 SNAP_BASENAME2 \ + --membership-basename1 MEMBERSHIP_BASENAME1 \ + --membership-basename2 MEMBERSHIP_BASENAME2 \ + --catalogue-filename1 CATALOGUE_FILENAME1 \ + --catalogue-filename2 CATALOGUE_FILENAME2 \ + --output-filename OUTPUT_FILENAME + +Run "python misc/match_group_membership.py -h" for a discription +of the optional arguments. + +""" + +import argparse +import datetime +import os + +from mpi4py import MPI + +comm = MPI.COMM_WORLD +comm_rank = comm.Get_rank() +comm_size = comm.Get_size() + +import h5py +import numpy as np + +import virgo.mpi.parallel_sort as psort +import virgo.mpi.parallel_hdf5 as phdf5 +from virgo.mpi.gather_array import gather_array + + +def load_particle_data(snap_basename, membership_basename, load_gas, comm): + """ + Load the particle IDs and halo membership for the particle types + we will use to match. Removes unbound particles. + """ + + particle_data = {} + + # Load particle IDs + snap_filename = snap_basename + ".{file_nr}.hdf5" + file = phdf5.MultiFile( + snap_filename, file_nr_attr=("Header", "NumFilesPerSnapshot"), comm=comm + ) + particle_data["PartType4/ParticleIDs"] = file.read("PartType4/ParticleIDs") + if load_gas: + particle_data["PartType0/ParticleIDs"] = file.read("PartType0/ParticleIDs") + + # Membership files don't have a header, so create a list of filenames + n_file = len(file.filenames) + membership_filenames = [f"{membership_basename}.{i}.hdf5" for i in range(n_file)] + # Load membership information + file = phdf5.MultiFile( + membership_filenames, file_nr_attr=("Header", "NumFilesPerSnapshot"), comm=comm + ) + particle_data["PartType4/GroupNr_bound"] = file.read("PartType4/GroupNr_bound") + if load_gas: + particle_data["PartType0/GroupNr_bound"] = file.read("PartType0/GroupNr_bound") + + # Check the two files are partitioned the same way + assert ( + particle_data["PartType4/GroupNr_bound"].shape + == particle_data["PartType4/ParticleIDs"].shape + ) + if load_gas: + assert ( + particle_data["PartType0/GroupNr_bound"].shape + == particle_data["PartType0/ParticleIds"].shape + ) + + return particle_data + + +unit_attrs = { + "Conversion factor to CGS (not including cosmological corrections)": [1.0], + "Conversion factor to physical CGS (including cosmological corrections)": [1.0], + "U_I exponent": [0.0], + "U_L exponent": [0.0], + "U_M exponent": [0.0], + "U_t exponent": [0.0], + "U_T exponent": [0.0], + "a-scale exponent": [0.0], + "h-scale exponent": [0.0], + "Property can be converted to comoving": [0], + "Value stored as physical": [1], +} + + +def mpi_print(string, comm_rank): + if comm_rank == 0: + print(string) + + +if __name__ == "__main__": + + start_time = datetime.datetime.now() + + parser = argparse.ArgumentParser( + description=("Script to calculate BirthHaloCatalogueIndex of star particles"), + ) + parser.add_argument( + "--snap-basename", + type=str, + required=True, + help=( + "The basename of the snapshot files (the snapshot " + "name without the .{file_nr}.hdf5 suffix. 
Use " + "{snap_nr:04d} instead of the snapshot number)" + ), + ) + parser.add_argument( + "--membership-basename", + type=str, + required=True, + help="The basename of the membership files", + ) + parser.add_argument( + "--output-basename", + type=str, + required=True, + help="The basename of the output files", + ) + parser.add_argument( + "--final-snap-nr", + type=int, + required=True, + help=( + # TODO: + "Snapshot at which to load the particles" + ), + ) + # TODO: Skip check gas particles + parser.add_argument( + "--calculate-PreBirthHaloCatalogueIndex", + action="store_true", + help=( + "Whether to calculate and output the subhalo halo catalogue " + "index of the gas particle that formed each star" + ), + ) + + args = parser.parse_args() + + # Log the arguments + for k, v in vars(args).items(): + mpi_print(f" {k}: {v}", comm_rank) + + final_snap_basename = args.snap_basename.format(snap_nr=args.final_snap_nr) + final_membership_basename = args.membership_basename.format( + snap_nr=args.final_snap_nr + ) + mpi_print("Loading stars from final snapshot", comm_rank) + particle_data = load_particle_data( + final_snap_basename, + final_membership_basename, + False, + comm, + ) + star_particle_ids = particle_data["PartType4/ParticleIDs"] + star_birth_ids = particle_data["PartType4/GroupNr_bound"] + star_birth_ids[:] = -99 + + # TODO: + # For the loop we need gas particles from the previous snapshot + # if args.calculate_PreBirthHaloCatalogueIndex: + # star_prebirth_ids = np.copy(star_birth_ids) + + for snap_nr in range(0, args.final_snap_nr + 1): + mpi_print(f"Loading data from snapshot {snap_nr}", comm_rank) + snap_basename = args.snap_basename.format(snap_nr=snap_nr) + membership_basename = args.membership_basename.format(snap_nr=snap_nr) + particle_data = load_particle_data( + snap_basename, + membership_basename, + args.calculate_PreBirthHaloCatalogueIndex, + comm, + ) + + # It would be quicker to make use of the BirthScaleFactors + # instead of checking all stars + idx = psort.parallel_match( + star_particle_ids[star_birth_ids == -99], + particle_data["PartType4/ParticleIDs"], + comm=comm, + ) + + new_birth_ids = psort.fetch_elements( + particle_data["PartType4/GroupNr_bound"], + idx[idx != -1], + comm=comm, + ) + + # TODO: + # if args.calculate_PreBirthHaloCatalogueIndex: + + has_new_birth_id = star_birth_ids == -99 + has_new_birth_id[has_new_birth_id] = idx != -1 + star_birth_ids[has_new_birth_id] = new_birth_ids + + # Check we found a value for every star + assert np.sum(star_birth_ids == -99) == 0 + + mpi_print("Writing output", comm_rank) + snap_file = phdf5.MultiFile( + final_snap_basename + ".{file_nr}.hdf5", + file_nr_attr=("Header", "NumFilesPerSnapshot"), + comm=comm, + ) + elements_per_file = snap_file.get_elements_per_file( + "ParticleIDs", group="PartType4" + ) + output = {"BirthHaloCatalogueIndex": star_birth_ids} + attrs = {"BirthHaloCatalogueIndex": {"Description": "TODO"}} + attrs["BirthHaloCatalogueIndex"].update(unit_attrs) + output_filename = ( + args.output_basename.format(snap_nr=args.final_snap_nr) + ".{file_nr}.hdf5" + ) + if comm_rank == 0: + output_dir = os.path.dirname(output_filename) + os.makedirs(output_dir, exist_ok=True) + comm.barrier() + + snap_file.write( + output, + elements_per_file, + filenames=output_filename, + mode="w", + group="PartType4", + attrs=attrs, + ) + + comm.barrier() + mpi_print(f"Runtime: {datetime.datetime.now() - start_time}", comm_rank) + mpi_print("Done!", comm_rank) + From 03679bbe3984641a3601da47afdac30f05087eaa Mon Sep 17 
From 03679bbe3984641a3601da47afdac30f05087eaa Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 09:55:12 +0000 Subject: [PATCH 05/15] Identify gas progenitor --- misc/compute_BirthHaloCatalogueIndex.py | 106 ++++++++++++++++-------- 1 file changed, 70 insertions(+), 36 deletions(-) diff --git a/misc/compute_BirthHaloCatalogueIndex.py b/misc/compute_BirthHaloCatalogueIndex.py index 3f458f48..5fbed1d0 100644 --- a/misc/compute_BirthHaloCatalogueIndex.py +++ b/misc/compute_BirthHaloCatalogueIndex.py @@ -1,25 +1,21 @@ #!/bin/env python """ -# TODO: Update -match_group_membership.py +compute_BirthHaloCatalogueIndex.py -This script matches halos between different simulations run from the -same initial conditions. +This script produces an auxiliary snapshot which contains the subhalo +id each star was part of when it first formed. Usage: - mpirun -- python -u misc/match_group_membership \ - --snap-basename1 SNAP_BASENAME1 \ - --snap-basename2 SNAP_BASENAME2 \ - --membership-basename1 MEMBERSHIP_BASENAME1 \ - --membership-basename2 MEMBERSHIP_BASENAME2 \ - --catalogue-filename1 CATALOGUE_FILENAME1 \ - --catalogue-filename2 CATALOGUE_FILENAME2 \ - --output-filename OUTPUT_FILENAME + mpirun -- python -u misc/compute_BirthHaloCatalogueIndex.py \ + --snap-basename SNAP_BASENAME \ + --membership-basename MEMBERSHIP_BASENAME \ + --output-basename OUTPUT_FILENAME \ + --final-snap-nr FINAL_SNAP_NR -Run "python misc/match_group_membership.py -h" for a discription -of the optional arguments. +Run "python misc/compute_BirthHaloCatalogueIndex.py -h" for a full description of the arguments, and a list of optional arguments. """ @@ -77,12 +73,12 @@ def load_particle_data(snap_basename, membership_basename, load_gas, comm): if load_gas: assert ( particle_data["PartType0/GroupNr_bound"].shape - == particle_data["PartType0/ParticleIds"].shape + == particle_data["PartType0/ParticleIDs"].shape ) return particle_data - +# Units for the dimensionless fields we will be saving unit_attrs = { "Conversion factor to CGS (not including cosmological corrections)": [1.0], "Conversion factor to physical CGS (including cosmological corrections)": [1.0], @@ -137,11 +133,9 @@ def mpi_print(string, comm_rank): type=int, required=True, help=( - # TODO: "Snapshot at which to load the particles" ), ) - # TODO: Skip check gas particles parser.add_argument( "--calculate-PreBirthHaloCatalogueIndex", action="store_true", @@ -171,14 +165,20 @@ def mpi_print(string, comm_rank): star_particle_ids = particle_data["PartType4/ParticleIDs"] star_birth_ids = particle_data["PartType4/GroupNr_bound"] star_birth_ids[:] = -99 + star_first_snapshot = np.copy(star_birth_ids) - # TODO: - # For the loop we need gas particles from the previous snapshot - # if args.calculate_PreBirthHaloCatalogueIndex: - # star_prebirth_ids = np.copy(star_birth_ids) + if args.calculate_PreBirthHaloCatalogueIndex: + particle_data['PartType0/ParticleIDs'] = np.ones(0) + particle_data['PartType0/GroupNr_bound'] = np.ones(0) + star_prebirth_ids = np.copy(star_birth_ids) for snap_nr in range(0, args.final_snap_nr + 1): + mpi_print(f"Loading data from snapshot {snap_nr}", comm_rank) + if args.calculate_PreBirthHaloCatalogueIndex: + # We need to keep the gas IDs from snapshot N-1 + gas_particle_ids = particle_data['PartType0/ParticleIDs'] + gas_group_nr = particle_data['PartType0/GroupNr_bound'] snap_basename = args.snap_basename.format(snap_nr=snap_nr) membership_basename = args.membership_basename.format(snap_nr=snap_nr) particle_data = load_particle_data( @@ -188,6 +188,7 @@ def mpi_print(string, comm_rank): comm,
) + mpi_print(f"Matching stars", comm_rank) # It would be quicker to make use of the BirthScaleFactors # instead of checking all stars idx = psort.parallel_match( @@ -202,28 +203,50 @@ def mpi_print(string, comm_rank): comm=comm, ) - # TODO: - # if args.calculate_PreBirthHaloCatalogueIndex: - has_new_birth_id = star_birth_ids == -99 has_new_birth_id[has_new_birth_id] = idx != -1 star_birth_ids[has_new_birth_id] = new_birth_ids + star_first_snapshot[has_new_birth_id] = snap_nr + + if args.calculate_PreBirthHaloCatalogueIndex: + mpi_print(f"Matching gas", comm_rank) + # Identify the gas progenitor of the newly formed stars + gas_idx = psort.parallel_match( + star_particle_ids[has_new_birth_id], + gas_particle_ids, + comm=comm, + ) + # The gas progenitor may not exist for all stars due + # to particle splitting. Note this information is + # recoverable if required by using the SplitTrees. + new_prebirth_ids = -99 * np.ones_like(new_birth_ids) + new_prebirth_ids[gas_idx != -1] = psort.fetch_elements( + gas_group_nr, + gas_idx[gas_idx != -1], + comm=comm, + ) + star_prebirth_ids[has_new_birth_id] = new_prebirth_ids # Check we found a value for every star assert np.sum(star_birth_ids == -99) == 0 - mpi_print("Writing output", comm_rank) - snap_file = phdf5.MultiFile( - final_snap_basename + ".{file_nr}.hdf5", - file_nr_attr=("Header", "NumFilesPerSnapshot"), - comm=comm, - ) - elements_per_file = snap_file.get_elements_per_file( - "ParticleIDs", group="PartType4" - ) - output = {"BirthHaloCatalogueIndex": star_birth_ids} - attrs = {"BirthHaloCatalogueIndex": {"Description": "TODO"}} + # Set up what we want to output + output = { + "BirthHaloCatalogueIndex": star_birth_ids, + "FirstSnapshot": star_first_snapshot, + } + attrs = { + "BirthHaloCatalogueIndex": {"Description": "The HaloCatalogueIndex of this particle at the first snapshot it appeared."}, + "FirstSnapshot": {"Description": "Index of the first simulation snapshot in which the star particle is present."}, + } attrs["BirthHaloCatalogueIndex"].update(unit_attrs) + attrs["FirstSnapshot"].update(unit_attrs) + if args.calculate_PreBirthHaloCatalogueIndex: + output["PreBirthHaloCatalogueIndex"] = star_prebirth_ids + attrs["PreBirthHaloCatalogueIndex"] = {"Description": "The HaloCatalogueIndex of gas prognitor at the snapshot before the star formed. 
-99 if no gas progenitor is found."} + attrs["PreBirthHaloCatalogueIndex"].update(unit_attrs) + + # Check the output directory exists output_filename = ( args.output_basename.format(snap_nr=args.final_snap_nr) + ".{file_nr}.hdf5" ) @@ -232,6 +255,16 @@ def mpi_print(string, comm_rank): os.makedirs(output_dir, exist_ok=True) comm.barrier() + # Write the output + mpi_print("Writing output", comm_rank) + snap_file = phdf5.MultiFile( + final_snap_basename + ".{file_nr}.hdf5", + file_nr_attr=("Header", "NumFilesPerSnapshot"), + comm=comm, + ) + elements_per_file = snap_file.get_elements_per_file( + "ParticleIDs", group="PartType4" + ) snap_file.write( output, elements_per_file, @@ -241,6 +274,7 @@ def mpi_print(string, comm_rank): attrs=attrs, ) + # Finished comm.barrier() mpi_print(f"Runtime: {datetime.datetime.now() - start_time}", comm_rank) mpi_print("Done!", comm_rank) From 43e709393b292e0d14a5c7f2dfa5384d8614c226 Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 09:59:23 +0000 Subject: [PATCH 06/15] Add ExSitu fraction property to SOAP --- .../particle_selection/aperture_properties.py | 19 +++ SOAP/property_table.py | 18 ++- parameter_files/COLIBRE_HYBRID.yml | 1 + parameter_files/COLIBRE_THERMAL.yml | 1 + parameter_files/ExSitu.yml | 111 ++++++++++++++++++ 5 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 parameter_files/ExSitu.yml diff --git a/SOAP/particle_selection/aperture_properties.py b/SOAP/particle_selection/aperture_properties.py index 5d0cbe4b..3a424a42 100644 --- a/SOAP/particle_selection/aperture_properties.py +++ b/SOAP/particle_selection/aperture_properties.py @@ -760,6 +760,24 @@ def TotalSNIaRate(self) -> unyt.unyt_quantity: self.star_mask_ap ].sum() + @lazy_property + def ExSituFraction(self) -> unyt.unyt_quantity: + """ + Mass fraction of bound stars that formed in a different subhalo. 
+ """ + if self.Nstar == 0: + return None + + group_nr = self.get_dataset("PartType4/GroupNr_bound")[self.star_mask_all][ + self.star_mask_ap + ] + birth_group_nr = self.get_dataset("PartType4/BirthHaloCatalogueIndex")[self.star_mask_all][ + self.star_mask_ap + ] + ex_situ = group_nr != birth_group_nr + + return self.star_mass_fraction[ex_situ].sum() + @lazy_property def bh_mask_all(self) -> NDArray[bool]: """ @@ -3788,6 +3806,7 @@ class ApertureProperties(HaloProperty): "stellar_age_mw": False, "stellar_age_lw": False, "TotalSNIaRate": False, + "ExSituFraction": False, "HydrogenMass": False, "HeliumMass": False, "MolecularHydrogenMass": False, diff --git a/SOAP/property_table.py b/SOAP/property_table.py index 583f291f..1867517a 100644 --- a/SOAP/property_table.py +++ b/SOAP/property_table.py @@ -1905,6 +1905,22 @@ class PropertyTable: output_physical=True, a_scale_exponent=0, ), + "ExSituFraction": Property( + name="ExSituFraction", + shape=1, + dtype=np.float32, + unit="dimensionless", + description="Mass fraction of bound stars that formed in a different subhalo", + lossy_compression_filter="FMantissa9", + dmo_property=True, + particle_properties=[ + "PartType4/Masses", + "PartType4/BirthHaloCatalogueIndex", + "PartType4/GroupNr_bound", + ], + output_physical=True, + a_scale_exponent=0, + ), "Mgas": Property( name="GasMass", shape=1, @@ -5134,7 +5150,7 @@ def generate_tex_files(self, output_dir: str): # standalone table file footer tailstr = "\\end{document}" - # generate the auxilary documentation files + # generate the auxiliary documentation files with open(f"{output_dir}/timestamp.tex", "w") as ofile: ofile.write(get_version_string()) with open(f"{output_dir}/table.tex", "w") as ofile: diff --git a/parameter_files/COLIBRE_HYBRID.yml b/parameter_files/COLIBRE_HYBRID.yml index 0f9b3f6e..343be68d 100644 --- a/parameter_files/COLIBRE_HYBRID.yml +++ b/parameter_files/COLIBRE_HYBRID.yml @@ -244,6 +244,7 @@ ApertureProperties: StellarCylindricalVelocityDispersionLuminosityWeighted: false StellarCylindricalVelocityDispersionVerticalLuminosityWeighted: false StellarCylindricalVelocityDispersionDiscPlaneLuminosityWeighted: false + ExSituFraction: false TotalMass: true TotalSNIaRate: true GasMassInColdDenseDiffuseMetals: diff --git a/parameter_files/COLIBRE_THERMAL.yml b/parameter_files/COLIBRE_THERMAL.yml index 403267a5..9d93c947 100644 --- a/parameter_files/COLIBRE_THERMAL.yml +++ b/parameter_files/COLIBRE_THERMAL.yml @@ -244,6 +244,7 @@ ApertureProperties: StellarCylindricalVelocityDispersionDiscPlaneLuminosityWeighted: false StellarRotationalVelocity: false StellarRotationalVelocityLuminosityWeighted: false + ExSituFraction: false TotalMass: true TotalSNIaRate: true GasMassInColdDenseDiffuseMetals: diff --git a/parameter_files/ExSitu.yml b/parameter_files/ExSitu.yml new file mode 100644 index 00000000..47adf469 --- /dev/null +++ b/parameter_files/ExSitu.yml @@ -0,0 +1,111 @@ +# Values in this section are substituted into the other sections +Parameters: + sim_dir: /cosma8/data/dp004/jlvc76/COLIBRE/ScienceRuns + output_dir: /snap8/scratch/dp004/dc-mcgi1/soap_BirthTrackId + scratch_dir: /snap8/scratch/dp004/dc-mcgi1/soap_BirthTrackId + +# Location of the Swift snapshots: +Snapshots: + filename: "{sim_dir}/{sim_name}/snapshots/colibre_{snap_nr:04d}/colibre_{snap_nr:04d}.{file_nr}.hdf5" + +# Which halo finder we're using, and base name for halo finder output files +HaloFinder: + type: HBTplus + filename: "{sim_dir}/{sim_name}/HBT-HERONS/sorted_catalogues/OrderedSubSnap_{snap_nr:03d}.hdf5" + # 
fof_filename: "{sim_dir}/{sim_name}/fof/fof_output_{snap_nr:04d}.hdf5" + # fof_radius_filename: "{sim_dir}/{sim_name}/fof/fof_output_{snap_nr:04d}.hdf5" + read_potential_energies: true + #type: VR + #filename: "{sim_dir}/halo_{snap_nr:04d}" + #type: Subfind + #filename: "{sim_dir}/snapdir_{snap_nr:03d}/snapshot_{snap_nr:03d}" + +GroupMembership: + # Where to write the group membership files + filename: "{sim_dir}/{sim_name}/SOAP-HBT/membership_{snap_nr:04d}/membership_{snap_nr:04d}.{file_nr}.hdf5" + +ExtraInput: + situ: "{sim_dir}/{sim_name}/SOAP_BirthTrackId/test_{snap_nr:04d}/test_{snap_nr:04d}.{file_nr}.hdf5" + +HaloProperties: + # Where to write the halo properties file + filename: "{output_dir}/{sim_name}/SOAP_uncompressed/halo_properties_{snap_nr:04d}.hdf5" + # Where to write temporary chunk output + chunk_dir: "{scratch_dir}/{sim_name}/SOAP-tmp/" + +ApertureProperties: + properties: + StellarMass: true + ExSituFraction: true + variations: + exclusive_50_kpc: + inclusive: false + radius_in_kpc: 50.0 +ProjectedApertureProperties: + properties: + {} + variations: + {} +SOProperties: + properties: + {} + variations: + {} +SubhaloProperties: + properties: + EncloseRadius: true + TotalMass: true + NumberOfBlackHoleParticles: true + NumberOfDarkMatterParticles: true + NumberOfGasParticles: true + NumberOfStarParticles: true +aliases: + PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent + PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent + snipshot: + PartType0/SpeciesFractions: PartType0/ReducedSpeciesFractions + PartType0/ElementMassFractions: PartType0/ReducedElementMassFractions + PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent + PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent +filters: + general: + limit: 100 + properties: + - BoundSubhalo/NumberOfGasParticles + - BoundSubhalo/NumberOfDarkMatterParticles + - BoundSubhalo/NumberOfStarParticles + - BoundSubhalo/NumberOfBlackHoleParticles + combine_properties: sum + baryon: + limit: 0 + properties: + - BoundSubhalo/NumberOfGasParticles + - BoundSubhalo/NumberOfStarParticles + combine_properties: sum + dm: + limit: 0 + properties: + - BoundSubhalo/NumberOfDarkMatterParticles + gas: + limit: 0 + properties: + - BoundSubhalo/NumberOfGasParticles + star: + limit: 0 + properties: + - BoundSubhalo/NumberOfStarParticles +defined_constants: + O_H_sun: 4.9e-4 + Fe_H_sun: 3.16e-5 + N_O_sun: 0.138 + C_O_sun: 0.549 + Mg_H_sun: 3.98e-5 +calculations: + calculate_missing_properties: false + strict_halo_copy: false + recently_heated_gas_filter: + delta_time_myr: 15 + use_AGN_delta_T: false + cold_dense_gas_filter: + maximum_temperature_K: 3.16e4 + minimum_hydrogen_number_density_cm3: 0.1 From 257b0be53c971900a0d6000c942f4bede2bcf7cd Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 10:22:22 +0000 Subject: [PATCH 07/15] Add sbatch script --- scripts/COLIBRE/compute_birth_index.sh | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 scripts/COLIBRE/compute_birth_index.sh diff --git a/scripts/COLIBRE/compute_birth_index.sh b/scripts/COLIBRE/compute_birth_index.sh new file mode 100755 index 00000000..bf777fef --- /dev/null +++ b/scripts/COLIBRE/compute_birth_index.sh @@ -0,0 +1,31 @@ +#!/bin/bash -l + +#SBATCH --cpus-per-task=1 +#SBATCH -o ./logs/birth_track_id_%j.out +#SBATCH -p cosma8 +#SBATCH -A dp004 +#SBATCH -J BirthHaloCatalogueIndex +#SBATCH --nodes=1 +#SBATCH -t 
From 257b0be53c971900a0d6000c942f4bede2bcf7cd Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 10:22:22 +0000 Subject: [PATCH 07/15] Add sbatch script --- scripts/COLIBRE/compute_birth_index.sh | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 scripts/COLIBRE/compute_birth_index.sh diff --git a/scripts/COLIBRE/compute_birth_index.sh b/scripts/COLIBRE/compute_birth_index.sh new file mode 100755 index 00000000..bf777fef --- /dev/null +++ b/scripts/COLIBRE/compute_birth_index.sh @@ -0,0 +1,31 @@ +#!/bin/bash -l + +#SBATCH --cpus-per-task=1 +#SBATCH -o ./logs/birth_track_id_%j.out +#SBATCH -p cosma8 +#SBATCH -A dp004 +#SBATCH -J BirthHaloCatalogueIndex +#SBATCH --nodes=1 +#SBATCH -t 12:00:00 +# N0752: 1 node, 2 hours +# N1504: 1 node, 12 hours +# N3008: 4 nodes, 24 hours + +set -e + +base_dir="/cosma8/data/dp004/jlvc76/COLIBRE/ScienceRuns" +sim="L0400N3008/Thermal" +output_dir="/cosma8/data/dp004/dc-mcgi1/COLIBRE/BirthHaloCatalogueIndex" +snapnum="0127" + +snap_basename="${base_dir}/${sim}/snapshots/colibre_{snap_nr:04d}/colibre_{snap_nr:04d}" +membership_basename="${base_dir}/${sim}/SOAP-HBT/membership_{snap_nr:04d}/membership_{snap_nr:04d}" +output_basename="${output_dir}/${sim}/SOAP_BirthTrackId/birth_${snapnum}/birth_${snapnum}" + +mpirun -- python misc/compute_BirthHaloCatalogueIndex.py \ + --snap-basename ${snap_basename} \ + --membership-basename ${membership_basename} \ + --output-basename ${output_basename} \ + --final-snap-nr ${snapnum} \ + --calculate-PreBirthHaloCatalogueIndex + From 2fa51f47c36a7e07628ada2d428c92c4522c5c2d Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 13:15:44 +0000 Subject: [PATCH 08/15] Remove extra parameter file --- parameter_files/ExSitu.yml | 111 ------------------------------------ 1 file changed, 111 deletions(-) delete mode 100644 parameter_files/ExSitu.yml diff --git a/parameter_files/ExSitu.yml b/parameter_files/ExSitu.yml deleted file mode 100644 index 47adf469..00000000 --- a/parameter_files/ExSitu.yml +++ /dev/null @@ -1,111 +0,0 @@ -# Values in this section are substituted into the other sections -Parameters: - sim_dir: /cosma8/data/dp004/jlvc76/COLIBRE/ScienceRuns - output_dir: /snap8/scratch/dp004/dc-mcgi1/soap_BirthTrackId - scratch_dir: /snap8/scratch/dp004/dc-mcgi1/soap_BirthTrackId - -# Location of the Swift snapshots: -Snapshots: - filename: "{sim_dir}/{sim_name}/snapshots/colibre_{snap_nr:04d}/colibre_{snap_nr:04d}.{file_nr}.hdf5" - -# Which halo finder we're using, and base name for halo finder output files -HaloFinder: - type: HBTplus - filename: "{sim_dir}/{sim_name}/HBT-HERONS/sorted_catalogues/OrderedSubSnap_{snap_nr:03d}.hdf5" - # fof_filename: "{sim_dir}/{sim_name}/fof/fof_output_{snap_nr:04d}.hdf5" - # fof_radius_filename: "{sim_dir}/{sim_name}/fof/fof_output_{snap_nr:04d}.hdf5" - read_potential_energies: true - #type: VR - #filename: "{sim_dir}/halo_{snap_nr:04d}" - #type: Subfind - #filename: "{sim_dir}/snapdir_{snap_nr:03d}/snapshot_{snap_nr:03d}" - -GroupMembership: - # Where to write the group membership files - filename: "{sim_dir}/{sim_name}/SOAP-HBT/membership_{snap_nr:04d}/membership_{snap_nr:04d}.{file_nr}.hdf5" - -ExtraInput: - situ: "{sim_dir}/{sim_name}/SOAP_BirthTrackId/test_{snap_nr:04d}/test_{snap_nr:04d}.{file_nr}.hdf5" - -HaloProperties: - # Where to write the halo properties file - filename: "{output_dir}/{sim_name}/SOAP_uncompressed/halo_properties_{snap_nr:04d}.hdf5" - # Where to write temporary chunk output - chunk_dir: "{scratch_dir}/{sim_name}/SOAP-tmp/" - -ApertureProperties: - properties: - StellarMass: true - ExSituFraction: true - variations: - exclusive_50_kpc: - inclusive: false - radius_in_kpc: 50.0 -ProjectedApertureProperties: - properties: - {} - variations: - {} -SOProperties: - properties: - {} - variations: - {} -SubhaloProperties: - properties: - EncloseRadius: true - TotalMass: true - NumberOfBlackHoleParticles: true - NumberOfDarkMatterParticles: true - NumberOfGasParticles: true - NumberOfStarParticles: true -aliases: - PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent - PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent - snipshot: - PartType0/SpeciesFractions:
PartType0/ReducedSpeciesFractions - PartType0/ElementMassFractions: PartType0/ReducedElementMassFractions - PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent - PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent -filters: - general: - limit: 100 - properties: - - BoundSubhalo/NumberOfGasParticles - - BoundSubhalo/NumberOfDarkMatterParticles - - BoundSubhalo/NumberOfStarParticles - - BoundSubhalo/NumberOfBlackHoleParticles - combine_properties: sum - baryon: - limit: 0 - properties: - - BoundSubhalo/NumberOfGasParticles - - BoundSubhalo/NumberOfStarParticles - combine_properties: sum - dm: - limit: 0 - properties: - - BoundSubhalo/NumberOfDarkMatterParticles - gas: - limit: 0 - properties: - - BoundSubhalo/NumberOfGasParticles - star: - limit: 0 - properties: - - BoundSubhalo/NumberOfStarParticles -defined_constants: - O_H_sun: 4.9e-4 - Fe_H_sun: 3.16e-5 - N_O_sun: 0.138 - C_O_sun: 0.549 - Mg_H_sun: 3.98e-5 -calculations: - calculate_missing_properties: false - strict_halo_copy: false - recently_heated_gas_filter: - delta_time_myr: 15 - use_AGN_delta_T: false - cold_dense_gas_filter: - maximum_temperature_K: 3.16e4 - minimum_hydrogen_number_density_cm3: 0.1 From 6a1d3984f771f3118d2385f309aa7f8dfa0586f1 Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 14 Jan 2026 13:30:25 +0000 Subject: [PATCH 09/15] Format --- .../particle_selection/aperture_properties.py | 6 ++--- misc/compute_BirthHaloCatalogueIndex.py | 26 +++++++++++-------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/SOAP/particle_selection/aperture_properties.py b/SOAP/particle_selection/aperture_properties.py index 3a424a42..a14c9be1 100644 --- a/SOAP/particle_selection/aperture_properties.py +++ b/SOAP/particle_selection/aperture_properties.py @@ -771,9 +771,9 @@ def ExSituFraction(self) -> unyt.unyt_quantity: group_nr = self.get_dataset("PartType4/GroupNr_bound")[self.star_mask_all][ self.star_mask_ap ] - birth_group_nr = self.get_dataset("PartType4/BirthHaloCatalogueIndex")[self.star_mask_all][ - self.star_mask_ap - ] + birth_group_nr = self.get_dataset("PartType4/BirthHaloCatalogueIndex")[ + self.star_mask_all + ][self.star_mask_ap] ex_situ = group_nr != birth_group_nr return self.star_mass_fraction[ex_situ].sum() diff --git a/misc/compute_BirthHaloCatalogueIndex.py b/misc/compute_BirthHaloCatalogueIndex.py index 5fbed1d0..8926db16 100644 --- a/misc/compute_BirthHaloCatalogueIndex.py +++ b/misc/compute_BirthHaloCatalogueIndex.py @@ -78,6 +78,7 @@ def load_particle_data(snap_basename, membership_basename, load_gas, comm): return particle_data + # Units for the dimensionless fields we will be saving unit_attrs = { "Conversion factor to CGS (not including cosmological corrections)": [1.0], @@ -132,9 +133,7 @@ def mpi_print(string, comm_rank): "--final-snap-nr", type=int, required=True, - help=( - "Snapshot at which to load the particles" - ), + help=("Snapshot at which to load the particles"), ) parser.add_argument( "--calculate-PreBirthHaloCatalogueIndex", @@ -168,8 +167,8 @@ def mpi_print(string, comm_rank): star_first_snapshot = np.copy(star_birth_ids) if args.calculate_PreBirthHaloCatalogueIndex: - particle_data['PartType0/ParticleIDs'] = np.ones(0) - particle_data['PartType0/GroupNr_bound'] = np.ones(0) + particle_data["PartType0/ParticleIDs"] = np.ones(0) + particle_data["PartType0/GroupNr_bound"] = np.ones(0) star_prebirth_ids = np.copy(star_birth_ids) for snap_nr in range(0, args.final_snap_nr + 1): @@ -177,8 +176,8 @@ def 
mpi_print(string, comm_rank):
         mpi_print(f"Loading data from snapshot {snap_nr}", comm_rank)
         if args.calculate_PreBirthHaloCatalogueIndex:
             # We need to keep the gas IDs from snapshot N-1
-            gas_particle_ids = particle_data['PartType0/ParticleIDs']
-            gas_group_nr = particle_data['PartType0/GroupNr_bound']
+            gas_particle_ids = particle_data["PartType0/ParticleIDs"]
+            gas_group_nr = particle_data["PartType0/GroupNr_bound"]
         snap_basename = args.snap_basename.format(snap_nr=snap_nr)
         membership_basename = args.membership_basename.format(snap_nr=snap_nr)
         particle_data = load_particle_data(
@@ -236,14 +235,20 @@ def mpi_print(string, comm_rank):
         "FirstSnapshot": star_first_snapshot,
     }
     attrs = {
-        "BirthHaloCatalogueIndex": {"Description": "The HaloCatalogueIndex of this particle at the first snapshot it appeared."},
-        "FirstSnapshot": {"Description": "Index of the first simulation snapshot in which the star particle is present."},
+        "BirthHaloCatalogueIndex": {
+            "Description": "The HaloCatalogueIndex of this particle in the first snapshot in which it appeared."
+        },
+        "FirstSnapshot": {
+            "Description": "Index of the first simulation snapshot in which the star particle is present."
+        },
     }
     attrs["BirthHaloCatalogueIndex"].update(unit_attrs)
     attrs["FirstSnapshot"].update(unit_attrs)
     if args.calculate_PreBirthHaloCatalogueIndex:
         output["PreBirthHaloCatalogueIndex"] = star_prebirth_ids
-        attrs["PreBirthHaloCatalogueIndex"] = {"Description": "The HaloCatalogueIndex of gas prognitor at the snapshot before the star formed. -99 if no gas progenitor is found."}
+        attrs["PreBirthHaloCatalogueIndex"] = {
+            "Description": "The HaloCatalogueIndex of the gas progenitor at the snapshot before the star formed. -99 if no gas progenitor is found."
+        }
         attrs["PreBirthHaloCatalogueIndex"].update(unit_attrs)
 
     # Check the output directory exists
@@ -278,4 +283,3 @@ def mpi_print(string, comm_rank):
     comm.barrier()
     mpi_print(f"Runtime: {datetime.datetime.now() - start_time}", comm_rank)
     mpi_print("Done!", comm_rank)
-

From 15d7eaf1e68d73c6ebe488c190f2aaa411ecd794 Mon Sep 17 00:00:00 2001
From: robjmcgibbon
Date: Wed, 4 Feb 2026 10:53:29 +0000
Subject: [PATCH 10/15] Learn to spell

---
 compression/make_virtual_snapshot.py         | 68 ++++++++++----------
 scripts/COLIBRE/compress_group_membership.sh |  2 +-
 scripts/EAGLE.sh                             |  2 +-
 3 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py
index 0679ff7a..d7b10481 100644
--- a/compression/make_virtual_snapshot.py
+++ b/compression/make_virtual_snapshot.py
@@ -63,14 +63,14 @@ def update_vds_paths(dset, modify_function):
 
 
 def make_virtual_snapshot(
-    snapshot, auxilary_snapshots, output_file, absolute_paths=False
+    snapshot, auxiliary_snapshots, output_file, absolute_paths=False
 ):
     """
-    Given a snapshot and auxilary files, create
+    Given a snapshot and auxiliary files, create
     a new virtual snapshot with all datasets combine.
 
snapshot: Path to the snapshot file - auxilary_snapshots: List of auxiliary file patterns + auxiliary_snapshots: List of auxiliary file patterns output_file: Path to the output virtual snapshot absolute_paths: If True, use absolute paths; if False, use relative paths """ @@ -83,19 +83,19 @@ def make_virtual_snapshot( # Calculate directories for path updates abs_snapshot_dir = os.path.abspath(os.path.dirname(snapshot)) - abs_auxilary_dirs = [ + abs_auxiliary_dirs = [ os.path.abspath(os.path.dirname(aux.format(file_nr=0))) - for aux in auxilary_snapshots + for aux in auxiliary_snapshots ] abs_output_dir = os.path.abspath(os.path.dirname(output_file)) if absolute_paths: snapshot_dir = abs_snapshot_dir - auxilary_dirs = abs_auxilary_dirs + auxiliary_dirs = abs_auxiliary_dirs else: snapshot_dir = os.path.relpath(abs_snapshot_dir, abs_output_dir) - auxilary_dirs = [ - os.path.relpath(aux_dir, abs_output_dir) for aux_dir in abs_auxilary_dirs + auxiliary_dirs = [ + os.path.relpath(aux_dir, abs_output_dir) for aux_dir in abs_auxiliary_dirs ] # Create path replacement functions @@ -107,15 +107,15 @@ def replace_path(old_path): return replace_path replace_snapshot_path = make_replace_path(snapshot_dir) - auxilary_path_replacers = [make_replace_path(d) for d in auxilary_dirs] + auxiliary_path_replacers = [make_replace_path(d) for d in auxiliary_dirs] - all_auxilary_datasets = {} + all_auxiliary_datasets = {} - for aux_index, auxilary in enumerate(auxilary_snapshots): + for aux_index, auxiliary in enumerate(auxiliary_snapshots): - # Check which datasets exist in the auxilary files + # Check which datasets exist in the auxiliary files # and store their attributes and datatype - filename = auxilary.format(file_nr=0) + filename = auxiliary.format(file_nr=0) dset_attrs = {} dset_dtype = {} with h5py.File(filename, "r") as infile: @@ -128,35 +128,35 @@ def replace_path(old_path): attrs = dict(infile[f"PartType{ptype}/{dset}"].attrs) dtype = infile[f"PartType{ptype}/{dset}"].dtype - # Some auxilary files are missing these attributes + # Some auxiliary files are missing these attributes if not "Value stored as physical" in attrs: print(f"Setting comoving attrs for PartType{ptype}/{dset}") attrs["Value stored as physical"] = [1] attrs["Property can be converted to comoving"] = [0] - # Add a flag that these datasets are stored in the auxilary files - attrs["Auxilary file"] = [1] + # Add a flag that these datasets are stored in the auxiliary files + attrs["auxiliary file"] = [1] # Store the values we need for later dset_attrs[f"PartType{ptype}"][dset] = attrs dset_dtype[f"PartType{ptype}"][dset] = dtype - # Check we don't have this dataset in any of the other auxilary files + # Check we don't have this dataset in any of the other auxiliary files dset_path = f"PartType{ptype}/{dset}" - if dset_path in all_auxilary_datasets: - other_file = all_auxilary_datasets[f"PartType{ptype}/{dset}"] + if dset_path in all_auxiliary_datasets: + other_file = all_auxiliary_datasets[f"PartType{ptype}/{dset}"] raise ValueError( - f"{dset_path} is in {auxilary} and {other_file}" + f"{dset_path} is in {auxiliary} and {other_file}" ) - all_auxilary_datasets[dset_path] = auxilary + all_auxiliary_datasets[dset_path] = auxiliary - # Loop over input auxilary files to get dataset shapes + # Loop over input auxiliary files to get dataset shapes file_nr = 0 filenames = [] shapes = [] counts = [] while True: - filename = auxilary.format(file_nr=file_nr) + filename = auxiliary.format(file_nr=file_nr) if os.path.exists(filename): 
filenames.append(filename)
                 with h5py.File(filename, "r") as infile:
@@ -178,7 +178,7 @@ def replace_path(old_path):
                 break
             file_nr += 1
         if file_nr == 0:
-            raise IOError(f"Failed to find files matching: {auxilary}")
+            raise IOError(f"Failed to find files matching: {auxiliary}")
 
     # Loop over particle types in the output
     for ptype in range(7):
@@ -223,7 +223,7 @@ def replace_path(old_path):
                 # Update paths for this newly created auxiliary dataset
                 update_vds_paths(
                     outfile[f"PartType{ptype}/{dset}"],
-                    auxilary_path_replacers[aux_index],
+                    auxiliary_path_replacers[aux_index],
                 )
 
             # Copy GroupNr_bound to HaloCatalogueIndex, since
@@ -240,7 +240,7 @@ def replace_path(old_path):
             # Update paths for HaloCatalogueIndex too
             update_vds_paths(
                 outfile[f"PartType{ptype}/HaloCatalogueIndex"],
-                auxilary_path_replacers[aux_index],
+                auxiliary_path_replacers[aux_index],
             )
 
     # Update paths for all original snapshot datasets
@@ -251,7 +251,7 @@ def replace_path(old_path):
             dset = outfile[f"{ptype_name}/{dset_name}"]
             if dset.is_virtual:
                 # Check if this is an auxiliary dataset (skip those, already handled)
-                if dset.attrs.get("Auxilary file", [0])[0] != 1:
+                if dset.attrs.get("auxiliary file", [0])[0] != 1:
                     # This is an original snapshot dataset
                     update_vds_paths(dset, replace_snapshot_path)
 
@@ -266,7 +266,7 @@ def replace_path(old_path):
     # For description of parameters run the following: $ python make_virtual_snapshot.py --help
     parser = argparse.ArgumentParser(
         description=(
-            "Link SWIFT snapshots with SWIFT auxilary snapshots (snapshot-like"
+            "Link SWIFT snapshots with SWIFT auxiliary snapshots (snapshot-like"
             "files with the same number of particles in the same order as the"
             "snapshot, but with less metadata), such as the SOAP memberships"
         )
     )
@@ -278,11 +278,11 @@ def replace_path(old_path):
         help="Name of the SWIFT virtual snapshot file, e.g. snapshot_{snap_nr:04}.hdf5",
     )
     parser.add_argument(
-        "--auxilary-snapshots",
+        "--auxiliary-snapshots",
         type=str,
         nargs="+",
         required=True,
-        help="One of more format strings for auxilary files, e.g. membership_{snap_nr:04}.{file_nr}.hdf5",
+        help="One or more format strings for auxiliary files, e.g. 
membership_{snap_nr:04}.{file_nr}.hdf5", ) parser.add_argument( "--output-file", @@ -312,16 +312,16 @@ def replace_path(old_path): virtual_snapshot = args.virtual_snapshot.format(snap_nr=args.snap_nr) output_file = args.output_file.format(snap_nr=args.snap_nr) - # We don't want to replace {file_nr} for auxilary snapshots - auxilary_snapshots = [ + # We don't want to replace {file_nr} for auxiliary snapshots + auxiliary_snapshots = [ filename.format_map(SafeDict({"snap_nr": args.snap_nr})) - for filename in args.auxilary_snapshots + for filename in args.auxiliary_snapshots ] # Make a new virtual snapshot with group info make_virtual_snapshot( virtual_snapshot, - auxilary_snapshots, + auxiliary_snapshots, output_file, absolute_paths=args.absolute_paths, ) diff --git a/scripts/COLIBRE/compress_group_membership.sh b/scripts/COLIBRE/compress_group_membership.sh index 69587a22..0e57df43 100644 --- a/scripts/COLIBRE/compress_group_membership.sh +++ b/scripts/COLIBRE/compress_group_membership.sh @@ -92,7 +92,7 @@ membership="${output_filename}.{file_nr}.hdf5" virtual="${outbase}/colibre_with_SOAP_membership_${snapnum}.hdf5" python compression/make_virtual_snapshot.py \ --virtual-snapshot $snapshot \ - --auxilary-snapshots $membership \ + --auxiliary-snapshots $membership \ --output-file $virtual echo "Setting virtual file to be read-only" diff --git a/scripts/EAGLE.sh b/scripts/EAGLE.sh index d8ffd65b..569bb28b 100755 --- a/scripts/EAGLE.sh +++ b/scripts/EAGLE.sh @@ -79,7 +79,7 @@ cd - python compression/make_virtual_snapshot.py \ --virtual-snapshot "${output_dir}/swift_snapshots/swift_${snap_nr}/snap_${snap_nr}.hdf5" \ - --auxilary-snapshots "${output_dir}/SOAP_uncompressed/membership_${snap_nr}/membership_${snap_nr}.{file_nr}.hdf5" \ + --auxiliary-snapshots "${output_dir}/SOAP_uncompressed/membership_${snap_nr}/membership_${snap_nr}.{file_nr}.hdf5" \ --output-file "${output_dir}/SOAP_uncompressed/snap_${snap_nr}.hdf5" ######### Run SOAP From 78704fc4a58b444280cc5b165f7351a6e98ad32b Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Wed, 4 Feb 2026 12:50:52 +0000 Subject: [PATCH 11/15] Add to BoundSubhalo --- SOAP/particle_selection/subhalo_properties.py | 17 ++++++++++++ parameter_files/COLIBRE_HYBRID.yml | 1 + parameter_files/COLIBRE_THERMAL.yml | 1 + scripts/COLIBRE/compute_birth_index.sh | 26 +++++++++++++++---- tests/COLIBRE/run_L0025N0188_Thermal.sh | 2 +- 5 files changed, 41 insertions(+), 6 deletions(-) diff --git a/SOAP/particle_selection/subhalo_properties.py b/SOAP/particle_selection/subhalo_properties.py index bd2c94cd..66b1fa85 100644 --- a/SOAP/particle_selection/subhalo_properties.py +++ b/SOAP/particle_selection/subhalo_properties.py @@ -481,6 +481,22 @@ def stellar_age_lw(self) -> unyt.unyt_array: Lrtot = Lr.sum() return ((Lr / Lrtot) * self.stellar_ages).sum() + @lazy_property + def ExSituFraction(self) -> unyt.unyt_quantity: + """ + Mass fraction of bound stars that formed in a different subhalo. 
+ """ + if self.Nstar == 0: + return None + + group_nr = self.get_dataset("PartType4/GroupNr_bound")[self.star_mask_all] + birth_group_nr = self.get_dataset("PartType4/BirthHaloCatalogueIndex")[ + self.star_mask_all + ] + ex_situ = group_nr != birth_group_nr + + return self.star_mass_fraction[ex_situ].sum() + @lazy_property def bh_mask_all(self) -> NDArray[bool]: """ @@ -2455,6 +2471,7 @@ class SubhaloProperties(HaloProperty): "Lstar_luminosity_weighted", "stellar_age_mw", "stellar_age_lw", + "ExSituFraction", "Mgas_SF", "gasmetalfrac_SF", "MedianStellarBirthDensity", diff --git a/parameter_files/COLIBRE_HYBRID.yml b/parameter_files/COLIBRE_HYBRID.yml index 343be68d..4f823849 100644 --- a/parameter_files/COLIBRE_HYBRID.yml +++ b/parameter_files/COLIBRE_HYBRID.yml @@ -743,6 +743,7 @@ SubhaloProperties: StellarCylindricalVelocityDispersionLuminosityWeighted: false StellarCylindricalVelocityDispersionVerticalLuminosityWeighted: false StellarCylindricalVelocityDispersionDiscPlaneLuminosityWeighted: false + ExSituFraction: false aliases: PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent diff --git a/parameter_files/COLIBRE_THERMAL.yml b/parameter_files/COLIBRE_THERMAL.yml index 9d93c947..7e2eb418 100644 --- a/parameter_files/COLIBRE_THERMAL.yml +++ b/parameter_files/COLIBRE_THERMAL.yml @@ -743,6 +743,7 @@ SubhaloProperties: StellarCylindricalVelocityDispersionLuminosityWeighted: false StellarCylindricalVelocityDispersionVerticalLuminosityWeighted: false StellarCylindricalVelocityDispersionDiscPlaneLuminosityWeighted: false + ExSituFraction: false aliases: PartType0/LastSNIIKineticFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent PartType0/LastSNIIThermalFeedbackDensities: PartType0/DensitiesAtLastSupernovaEvent diff --git a/scripts/COLIBRE/compute_birth_index.sh b/scripts/COLIBRE/compute_birth_index.sh index bf777fef..8ac4c0c2 100755 --- a/scripts/COLIBRE/compute_birth_index.sh +++ b/scripts/COLIBRE/compute_birth_index.sh @@ -5,22 +5,23 @@ #SBATCH -p cosma8 #SBATCH -A dp004 #SBATCH -J BirthHaloCatalogueIndex -#SBATCH --nodes=1 -#SBATCH -t 12:00:00 +#SBATCH --nodes=4 +#SBATCH -t 24:00:00 # N0752: 1 node, 2 hours # N1504: 1 node, 12 hours # N3008: 4 nodes, 24 hours set -e -base_dir="/cosma8/data/dp004/jlvc76/COLIBRE/ScienceRuns" -sim="L0400N3008/Thermal" +# TODO: Set these values +base_dir="/cosma8/data/dp004/dc-mcgi1/COLIBRE/BirthHaloCatalogueIndex" output_dir="/cosma8/data/dp004/dc-mcgi1/COLIBRE/BirthHaloCatalogueIndex" +sim="L0400N3008/Thermal" snapnum="0127" snap_basename="${base_dir}/${sim}/snapshots/colibre_{snap_nr:04d}/colibre_{snap_nr:04d}" membership_basename="${base_dir}/${sim}/SOAP-HBT/membership_{snap_nr:04d}/membership_{snap_nr:04d}" -output_basename="${output_dir}/${sim}/SOAP_BirthTrackId/birth_${snapnum}/birth_${snapnum}" +output_basename="${output_dir}/${sim}/SOAP-ExSitu/birth_${snapnum}/birth_${snapnum}" mpirun -- python misc/compute_BirthHaloCatalogueIndex.py \ --snap-basename ${snap_basename} \ @@ -29,3 +30,18 @@ mpirun -- python misc/compute_BirthHaloCatalogueIndex.py \ --final-snap-nr ${snapnum} \ --calculate-PreBirthHaloCatalogueIndex +chmod a=r "${output_basename}"* + +snapshot="${snap_basename}.hdf5" +membership="${membership_basename}.{file_nr}.hdf5" +output="${output_basename}.{file_nr}.hdf5" +virtual="${output_dir}/${sim}/SOAP-ExSitu/birth_${snapnum}.hdf5" +python compression/make_virtual_snapshot.py \ + --virtual-snapshot "$snapshot" \ + 
--auxiliary-snapshots "$membership" "$output" \
+    --output-file "$virtual" \
+    --snap-nr "$snapnum"
+
+chmod a=r "${virtual}"
+
+echo "Job complete!"
diff --git a/tests/COLIBRE/run_L0025N0188_Thermal.sh b/tests/COLIBRE/run_L0025N0188_Thermal.sh
index 27a43e1e..58969b78 100755
--- a/tests/COLIBRE/run_L0025N0188_Thermal.sh
+++ b/tests/COLIBRE/run_L0025N0188_Thermal.sh
@@ -30,7 +30,7 @@ python tests/COLIBRE/create_parameters_file.py
 rm -r output/SOAP-tmp
 
 # Run SOAP on eight cores processing the selected halos. Use 'python3 -m pdb' to start in the debugger.
-mpirun -np 8 python3 -u -m mpi4py SOAP/compute_halo_properties.py \
+mpirun -np 8 python SOAP/compute_halo_properties.py \
     ./tests/COLIBRE/test_parameters.yml \
     --halo-indices ${halo_indices} \
     --sim-name=${sim} --snap-nr=${snapnum} --chunks=1

From c9271bf15b7b47396a82931d4a15d738dc4762ea Mon Sep 17 00:00:00 2001
From: robjmcgibbon
Date: Fri, 20 Feb 2026 15:17:05 +0000
Subject: [PATCH 12/15] Handle named columns

---
 compression/make_virtual_snapshot.py | 41 ++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py
index d7b10481..6f4799e2 100644
--- a/compression/make_virtual_snapshot.py
+++ b/compression/make_virtual_snapshot.py
@@ -63,7 +63,7 @@ def update_vds_paths(dset, modify_function):
 
 
 def make_virtual_snapshot(
-    snapshot, auxiliary_snapshots, output_file, absolute_paths=False
+    snapshot, auxiliary_snapshots, output_file, absolute_paths=False, discard_duplicate_datasets=False,
 ):
     """
     Given a snapshot and auxiliary files, create
@@ -150,6 +150,23 @@ def replace_path(old_path):
             )
         all_auxiliary_datasets[dset_path] = auxiliary
 
+        # Copy over the named column values, handling the case where we have
+        # dataset names that already exist in the original snapshot
+        for dset in infile.get('SubgridScheme/NamedColumns', []):
+            outfile_named_cols = outfile['SubgridScheme/NamedColumns']
+            if dset in outfile_named_cols:
+                if discard_duplicate_datasets:
+                    del outfile_named_cols[dset]
+                else:
+                    outfile.move(
+                        f"SubgridScheme/NamedColumns/{dset}",
+                        f"SubgridScheme/NamedColumns/{dset}_snap",
+                    )
+            outfile_named_cols.create_dataset(
+                dset,
+                data=infile[f'SubgridScheme/NamedColumns/{dset}'],
+            )
+
     # Loop over input auxiliary files to get dataset shapes
     file_nr = 0
     filenames = []
     shapes = []
     counts = []
@@ -211,9 +228,12 @@ def replace_path(old_path):
             # already exist in the snapshot
             for dset, attrs in dset_attrs[f"PartType{ptype}"].items():
                 if f"PartType{ptype}/{dset}" in outfile:
-                    outfile.move(
-                        f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap"
-                    )
+                    if discard_duplicate_datasets:
+                        del outfile[f"PartType{ptype}/{dset}"]
+                    else:
+                        outfile.move(
+                            f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap"
+                        )
                 outfile.create_virtual_dataset(
                     f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999
                 )
@@ -268,7 +288,7 @@ def replace_path(old_path):
         description=(
             "Link SWIFT snapshots with SWIFT auxiliary snapshots (snapshot-like"
             "files with the same number of particles in the same order as the"
-            "snapshot, but with less metadata), such as the SOAP memberships"
+            "snapshot, but with less metadata), such as the SOAP memberships."
)
     )
     parser.add_argument(
         "--virtual-snapshot",
         type=str,
         required=True,
         help="Name of the SWIFT virtual snapshot file, e.g. snapshot_{snap_nr:04}.hdf5",
     )
     parser.add_argument(
         "--output-file",
@@ -302,6 +322,16 @@ def replace_path(old_path):
         action="store_true",
         help="Use absolute paths in the virtual dataset",
     )
+    parser.add_argument(
+        "--discard-duplicate-datasets",
+        action="store_true",
+        help=(
+            "This flag determines the behaviour when a dataset exists in both the original snapshot "
+            "and the auxiliary file. By default the virtual file will rename the original snapshot "
+            "dataset as {dataset_name}_snap. If this flag is passed then the dataset from the original "
+            "snapshot will not be linked to."
+        ),
+    )
 
     args = parser.parse_args()
     print(f"Creating virtual snapshot")
@@ -324,4 +354,5 @@ def replace_path(old_path):
         auxiliary_snapshots,
         output_file,
         absolute_paths=args.absolute_paths,
+        discard_duplicate_datasets=args.discard_duplicate_datasets,
     )

From 640f30d3ac5c56cf898b282627ea4293f5fa2878 Mon Sep 17 00:00:00 2001
From: robjmcgibbon
Date: Fri, 20 Feb 2026 15:17:44 +0000
Subject: [PATCH 13/15] Format

---
 compression/make_virtual_snapshot.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/compression/make_virtual_snapshot.py b/compression/make_virtual_snapshot.py
index 6f4799e2..2311840e 100644
--- a/compression/make_virtual_snapshot.py
+++ b/compression/make_virtual_snapshot.py
@@ -63,7 +63,11 @@ def update_vds_paths(dset, modify_function):
 
 
 def make_virtual_snapshot(
-    snapshot, auxiliary_snapshots, output_file, absolute_paths=False, discard_duplicate_datasets=False,
+    snapshot,
+    auxiliary_snapshots,
+    output_file,
+    absolute_paths=False,
+    discard_duplicate_datasets=False,
 ):
     """
     Given a snapshot and auxiliary files, create
@@ -152,8 +156,8 @@ def replace_path(old_path):
 
         # Copy over the named column values, handling the case where we have
         # dataset names that already exist in the original snapshot
-        for dset in infile.get('SubgridScheme/NamedColumns', []):
-            outfile_named_cols = outfile['SubgridScheme/NamedColumns']
+        for dset in infile.get("SubgridScheme/NamedColumns", []):
+            outfile_named_cols = outfile["SubgridScheme/NamedColumns"]
             if dset in outfile_named_cols:
                 if discard_duplicate_datasets:
                     del outfile_named_cols[dset]
@@ -164,7 +168,7 @@ def replace_path(old_path):
                     )
             outfile_named_cols.create_dataset(
                 dset,
-                data=infile[f'SubgridScheme/NamedColumns/{dset}'],
+                data=infile[f"SubgridScheme/NamedColumns/{dset}"],
             )
 
     # Loop over input auxiliary files to get dataset shapes

From 29b09684e1461526f97a0947f5ffc9b3aeeb1786 Mon Sep 17 00:00:00 2001
From: robjmcgibbon
Date: Mon, 16 Mar 2026 14:48:54 +0000
Subject: [PATCH 14/15] Move compression dir

---
 README.md                                                     | 2 +-
 {compression => SOAP/compression}/README.md                   | 0
 {compression => SOAP/compression}/compress_soap_catalogue.py  | 0
 .../compression}/create_empty_SOAP_catalogue.py               | 0
 {compression => SOAP/compression}/extract_filters.py          | 0
 {compression => SOAP/compression}/filters.yml                 | 0
 {compression => SOAP/compression}/make_virtual_snapshot.py    | 0
 {compression => SOAP/compression}/wrong_compression.yml       | 0
 format.sh                                                     | 2 +-
 scripts/COLIBRE/compress_group_membership.sh                  | 2 +-
 scripts/COLIBRE/compress_halo_properties.sh                   | 4 ++--
 scripts/COLIBRE/compute_birth_index.sh                        | 2 +-
 scripts/EAGLE.sh                                              | 2 +-
 .../L1000N1800/compress_halo_properties_L1000N1800.sh         | 2 +-
 14 files changed, 8 insertions(+), 8 deletions(-)
 rename {compression => SOAP/compression}/README.md (100%)
 rename {compression => SOAP/compression}/compress_soap_catalogue.py (100%)
 rename {compression => SOAP/compression}/create_empty_SOAP_catalogue.py (100%)
 rename {compression => SOAP/compression}/extract_filters.py 
(100%) rename {compression => SOAP/compression}/filters.yml (100%) rename {compression => SOAP/compression}/make_virtual_snapshot.py (100%) rename {compression => SOAP/compression}/wrong_compression.yml (100%) diff --git a/README.md b/README.md index 469fee13..7b72e845 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ The first is lossless compression via GZIP, the second is lossy compression. For the group membership files we only apply lossless compression. However, each property in the final SOAP catalogue has a lossy compression filter associated with it, which are set in `SOAP/property_table.py`. The script -`compression/compress_soap_catalogue.py` will apply both lossy and +`SOAP/compression/compress_soap_catalogue.py` will apply both lossy and lossless compression to SOAP catalogues. ### Documentation diff --git a/compression/README.md b/SOAP/compression/README.md similarity index 100% rename from compression/README.md rename to SOAP/compression/README.md diff --git a/compression/compress_soap_catalogue.py b/SOAP/compression/compress_soap_catalogue.py similarity index 100% rename from compression/compress_soap_catalogue.py rename to SOAP/compression/compress_soap_catalogue.py diff --git a/compression/create_empty_SOAP_catalogue.py b/SOAP/compression/create_empty_SOAP_catalogue.py similarity index 100% rename from compression/create_empty_SOAP_catalogue.py rename to SOAP/compression/create_empty_SOAP_catalogue.py diff --git a/compression/extract_filters.py b/SOAP/compression/extract_filters.py similarity index 100% rename from compression/extract_filters.py rename to SOAP/compression/extract_filters.py diff --git a/compression/filters.yml b/SOAP/compression/filters.yml similarity index 100% rename from compression/filters.yml rename to SOAP/compression/filters.yml diff --git a/compression/make_virtual_snapshot.py b/SOAP/compression/make_virtual_snapshot.py similarity index 100% rename from compression/make_virtual_snapshot.py rename to SOAP/compression/make_virtual_snapshot.py diff --git a/compression/wrong_compression.yml b/SOAP/compression/wrong_compression.yml similarity index 100% rename from compression/wrong_compression.yml rename to SOAP/compression/wrong_compression.yml diff --git a/format.sh b/format.sh index f532a46d..1c142b87 100755 --- a/format.sh +++ b/format.sh @@ -20,7 +20,7 @@ fi black="./black_formatting_env/bin/python3 -m black" # Make sure we don't try and format any virtual environments -files=$(echo {compression/*.py,misc/*.py,SOAP/*.py,SOAP/*/*.py,tests/*.py}) +files=$(echo {misc/*.py,SOAP/*.py,SOAP/*/*.py,tests/*.py}) # Run formatting if [[ "$1" == "--check" ]]; then diff --git a/scripts/COLIBRE/compress_group_membership.sh b/scripts/COLIBRE/compress_group_membership.sh index 0e57df43..f04ec4c8 100644 --- a/scripts/COLIBRE/compress_group_membership.sh +++ b/scripts/COLIBRE/compress_group_membership.sh @@ -90,7 +90,7 @@ echo "Creating virtual snapshot" snapshot="${output_dir}/${sim}/snapshots/colibre_${snapnum}/colibre_${snapnum}.hdf5" membership="${output_filename}.{file_nr}.hdf5" virtual="${outbase}/colibre_with_SOAP_membership_${snapnum}.hdf5" -python compression/make_virtual_snapshot.py \ +python SOAP/compression/make_virtual_snapshot.py \ --virtual-snapshot $snapshot \ --auxiliary-snapshots $membership \ --output-file $virtual diff --git a/scripts/COLIBRE/compress_halo_properties.sh b/scripts/COLIBRE/compress_halo_properties.sh index b75a7632..5acbca36 100755 --- a/scripts/COLIBRE/compress_halo_properties.sh +++ 
b/scripts/COLIBRE/compress_halo_properties.sh @@ -29,7 +29,7 @@ output_dir="/cosma8/data/dp004/dc-mcgi1/COLIBRE/Runs" scratch_dir="/snap8/scratch/dp004/dc-mcgi1/COLIBRE/Runs" # compression script -script="./compression/compress_soap_catalogue.py" +script="./SOAP/compression/compress_soap_catalogue.py" # Which snapshot to do snapnum=`printf '%04d' ${SLURM_ARRAY_TASK_ID}` @@ -41,7 +41,7 @@ sim="${SLURM_JOB_NAME}" input_filename="${input_dir}/${sim}/SOAP_uncompressed/halo_properties_${snapnum}.hdf5" # Location and name of the output SOAP catalogue -outbase="${output_dir}/${sim}/SOAP" +outbase="${output_dir}/${sim}/SOAP-ExSitu" mkdir -p $outbase output_filename="${outbase}/halo_properties_${snapnum}.hdf5" diff --git a/scripts/COLIBRE/compute_birth_index.sh b/scripts/COLIBRE/compute_birth_index.sh index 8ac4c0c2..35181cbb 100755 --- a/scripts/COLIBRE/compute_birth_index.sh +++ b/scripts/COLIBRE/compute_birth_index.sh @@ -36,7 +36,7 @@ snapshot="${snap_basename}.hdf5" membership="${membership_basename}.{file_nr}.hdf5" output="${output_basename}.{file_nr}.hdf5" virtual="${output_dir}/${sim}/SOAP-ExSitu/birth_${snapnum}.hdf5" -python compression/make_virtual_snapshot.py \ +python SOAP/compression/make_virtual_snapshot.py \ --virtual-snapshot "$snapshot" \ --auxiliary-snapshots "$membership" "$output" \ --output-file "$virtual" \ diff --git a/scripts/EAGLE.sh b/scripts/EAGLE.sh index 569bb28b..9c83e6f4 100755 --- a/scripts/EAGLE.sh +++ b/scripts/EAGLE.sh @@ -77,7 +77,7 @@ cd "${output_dir}/swift_snapshots/swift_${snap_nr}" python "${soap_dir}/create_virtual_snapshot.py" "snap_${snap_nr}.0.hdf5" cd - -python compression/make_virtual_snapshot.py \ +python SOAP/compression/make_virtual_snapshot.py \ --virtual-snapshot "${output_dir}/swift_snapshots/swift_${snap_nr}/snap_${snap_nr}.hdf5" \ --auxiliary-snapshots "${output_dir}/SOAP_uncompressed/membership_${snap_nr}/membership_${snap_nr}.{file_nr}.hdf5" \ --output-file "${output_dir}/SOAP_uncompressed/snap_${snap_nr}.hdf5" diff --git a/scripts/FLAMINGO/L1000N1800/compress_halo_properties_L1000N1800.sh b/scripts/FLAMINGO/L1000N1800/compress_halo_properties_L1000N1800.sh index 60d202d0..ea032de7 100644 --- a/scripts/FLAMINGO/L1000N1800/compress_halo_properties_L1000N1800.sh +++ b/scripts/FLAMINGO/L1000N1800/compress_halo_properties_L1000N1800.sh @@ -29,7 +29,7 @@ output_dir="/cosma8/data/dp004/dc-mcgi1/FLAMINGO/Runs" scratch_dir="/snap8/scratch/dp004/dc-mcgi1/FLAMINGO/Runs" # compression script -script="./compression/compress_soap_catalogue.py" +script="./SOAP/compression/compress_soap_catalogue.py" # Which snapshot to do snapnum=`printf '%04d' ${SLURM_ARRAY_TASK_ID}` From d501b1954b30d97f21b8e2a2c44dbc0e90ddc506 Mon Sep 17 00:00:00 2001 From: robjmcgibbon Date: Mon, 16 Mar 2026 14:54:40 +0000 Subject: [PATCH 15/15] Format --- SOAP/catalogue_readers/read_hbtplus.py | 2 +- SOAP/compute_halo_properties.py | 1 - SOAP/core/combine_chunks.py | 2 +- SOAP/core/halo_tasks.py | 1 - SOAP/particle_filter/recently_heated_gas_filter.py | 1 - misc/check_subhalo_ranking.py | 1 - tests/test_SO_properties.py | 2 +- 7 files changed, 3 insertions(+), 7 deletions(-) diff --git a/SOAP/catalogue_readers/read_hbtplus.py b/SOAP/catalogue_readers/read_hbtplus.py index d91c5849..4dff8d93 100644 --- a/SOAP/catalogue_readers/read_hbtplus.py +++ b/SOAP/catalogue_readers/read_hbtplus.py @@ -264,7 +264,7 @@ def read_hbtplus_catalogue( MassInMsunh = None VelInKmS = None sorted_file = None - (LengthInMpch, MassInMsunh, VelInKmS) = comm.bcast( + LengthInMpch, MassInMsunh, VelInKmS = 
comm.bcast( (LengthInMpch, MassInMsunh, VelInKmS) ) sorted_file = comm.bcast(sorted_file) diff --git a/SOAP/compute_halo_properties.py b/SOAP/compute_halo_properties.py index d9a9be12..46e72258 100644 --- a/SOAP/compute_halo_properties.py +++ b/SOAP/compute_halo_properties.py @@ -40,7 +40,6 @@ from SOAP.particle_filter.cold_dense_gas_filter import ColdDenseGasFilter from SOAP.particle_filter.recently_heated_gas_filter import RecentlyHeatedGasFilter - # Set numpy to raise divide by zero, overflow and invalid operation errors as exceptions np.seterr(divide="raise", over="raise", invalid="raise") diff --git a/SOAP/core/combine_chunks.py b/SOAP/core/combine_chunks.py index 1f969570..99c12ab8 100644 --- a/SOAP/core/combine_chunks.py +++ b/SOAP/core/combine_chunks.py @@ -420,7 +420,7 @@ def combine_chunks( fof_reg = None fof_com_unit = None fof_mass_unit = None - (fof_reg, fof_com_unit, fof_mass_unit) = comm_world.bcast( + fof_reg, fof_com_unit, fof_mass_unit = comm_world.bcast( (fof_reg, fof_com_unit, fof_mass_unit) ) diff --git a/SOAP/core/halo_tasks.py b/SOAP/core/halo_tasks.py index 53f91598..0261771c 100644 --- a/SOAP/core/halo_tasks.py +++ b/SOAP/core/halo_tasks.py @@ -10,7 +10,6 @@ from SOAP.particle_selection.halo_properties import SearchRadiusTooSmallError from SOAP.property_table import PropertyTable - # Factor by which to increase search radius when looking for density threshold SEARCH_RADIUS_FACTOR = 1.2 diff --git a/SOAP/particle_filter/recently_heated_gas_filter.py b/SOAP/particle_filter/recently_heated_gas_filter.py index 8cdb7721..63d4467d 100644 --- a/SOAP/particle_filter/recently_heated_gas_filter.py +++ b/SOAP/particle_filter/recently_heated_gas_filter.py @@ -16,7 +16,6 @@ requires knowledge of the cosmology. """ - from astropy.cosmology import w0waCDM, z_at_value import astropy.constants as const import astropy.units as astropy_units diff --git a/misc/check_subhalo_ranking.py b/misc/check_subhalo_ranking.py index e10d5ad9..9b94af6f 100644 --- a/misc/check_subhalo_ranking.py +++ b/misc/check_subhalo_ranking.py @@ -3,7 +3,6 @@ import numpy as np import h5py - # Read VR IDs and positions filename = "/cosma8/data/dp004/flamingo/Runs/L1000N0900/HYDRO_FIDUCIAL_DATA/HYDRO_FIDUCIAL/VR/catalogue_0077/vr_catalogue_0077.properties.0" with h5py.File(filename, "r") as infile: diff --git a/tests/test_SO_properties.py b/tests/test_SO_properties.py index 1ba8c03c..bd4459a4 100644 --- a/tests/test_SO_properties.py +++ b/tests/test_SO_properties.py @@ -419,7 +419,7 @@ def calculate_SO_properties_nfw_halo(seed, num_part, c): "crit", ) - (input_halo, data, rmax, Mtot, Npart, particle_numbers) = dummy_halos.gen_nfw_halo( + input_halo, data, rmax, Mtot, Npart, particle_numbers = dummy_halos.gen_nfw_halo( 100, c, num_part )
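
A minimal sketch of how the combined virtual snapshot produced by
SOAP/compression/make_virtual_snapshot.py might be inspected with h5py.
The file name below is hypothetical; the "auxiliary file" attribute, the
HaloCatalogueIndex copy of GroupNr_bound, and the "{dataset_name}_snap"
rename convention follow the behaviour introduced in the patches above.

    import h5py

    # Hypothetical path; substitute the virtual file written by the script
    with h5py.File("colibre_with_SOAP_membership_0127.hdf5", "r") as snap:
        dset = snap["PartType4/GroupNr_bound"]
        # Datasets linked in from an auxiliary file carry this flag
        print(dset.attrs.get("auxiliary file", [0])[0] == 1)
        # HaloCatalogueIndex is the copy of GroupNr_bound made by the script
        print(snap["PartType4/HaloCatalogueIndex"].shape)
        # Unless --discard-duplicate-datasets was passed, any dataset that also
        # existed in the original snapshot is kept under "<name>_snap"
        print([name for name in snap["PartType4"] if name.endswith("_snap")])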