
Commit 86c3bfd

Calculate birth halo catalogue index (#204)
* Virtual v1
* Combine update_vds_path with make_virtual_snapshot
* Virtual snapshot creation using multiple auxiliary files
* BirthHaloCatalogueIndex v1
* Identify gas progenitor
* Add ExSitu fraction property to SOAP
* Add sbatch script
* Remove extra parameter file
* Format
* Learn to spell
* Add to BoundSubhalo
* Handle named columns
* Format
* Move compression dir
* Format
1 parent e5aa7fa commit 86c3bfd

30 files changed: 768 additions & 329 deletions

README.md

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ The first is lossless compression via GZIP, the second is lossy compression.
 For the group membership files we only apply lossless compression. However,
 each property in the final SOAP catalogue has a lossy compression filter
 associated with it, which are set in `SOAP/property_table.py`. The script
-`compression/compress_soap_catalogue.py` will apply both lossy and
+`SOAP/compression/compress_soap_catalogue.py` will apply both lossy and
 lossless compression to SOAP catalogues.
 
 ### Documentation
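
The hunk above just fixes the script path after the compression directory moved into SOAP/. For background, the two kinds of compression the README describes are standard HDF5 filters; here is a minimal h5py sketch (filenames and the 3-digit precision are illustrative, not SOAP's actual per-property settings from SOAP/property_table.py):

import h5py
import numpy as np

data = np.random.default_rng(0).random((1000, 3)).astype(np.float32)
with h5py.File("example.hdf5", "w") as f:
    # Lossless only: GZIP, as applied to the group membership files
    f.create_dataset("lossless", data=data, compression="gzip")
    # Lossy + lossless: scale-offset keeps ~3 decimal digits, then GZIP
    f.create_dataset("lossy", data=data, compression="gzip", scaleoffset=3)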

SOAP/catalogue_readers/read_hbtplus.py

Lines changed: 1 addition & 1 deletion
@@ -264,7 +264,7 @@ def read_hbtplus_catalogue(
     MassInMsunh = None
     VelInKmS = None
     sorted_file = None
-    (LengthInMpch, MassInMsunh, VelInKmS) = comm.bcast(
+    LengthInMpch, MassInMsunh, VelInKmS = comm.bcast(
         (LengthInMpch, MassInMsunh, VelInKmS)
     )
     sorted_file = comm.bcast(sorted_file)
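
The change drops redundant parentheses around the unpacking targets. For context, a minimal standalone sketch of the mpi4py pattern in use here (the values are made up for illustration):

from mpi4py import MPI

comm = MPI.COMM_WORLD
if comm.Get_rank() == 0:
    values = (1.0, 2.0, 3.0)  # illustrative unit factors
else:
    values = (None, None, None)
# comm.bcast returns rank 0's object on every rank, so the tuple can be
# unpacked directly into the three variables
LengthInMpch, MassInMsunh, VelInKmS = comm.bcast(values)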
File renamed without changes.
File renamed without changes.
File renamed without changes.
make_virtual_snapshot.py (new file)

Lines changed: 362 additions & 0 deletions
@@ -0,0 +1,362 @@
#!/bin/env python

import os.path
import h5py
import shutil
import numpy as np


class SafeDict(dict):
    def __missing__(self, key):
        # Return the key back in braces so it remains in the string
        return "{" + key + "}"


def update_vds_paths(dset, modify_function):
    """
    Modify the virtual paths of the specified dataset.

    Note that querying the source dataspace and selection does not appear
    to work (invalid pointer error from h5py), so here we assume that we're
    referencing all of the source dataspace, which is correct for SWIFT
    snapshots.

    dset: a h5py.Dataset object
    modify_function: a function which takes the old path as its argument and
        returns the new path
    """

    # Choose a temporary path for the new virtual dataset
    path = dset.name
    tmp_path = dset.name + ".__tmp__"

    # Build the creation property list for the new dataset
    plist = h5py.h5p.create(h5py.h5p.DATASET_CREATE)
    for vs in dset.virtual_sources():
        bounds = vs.vspace.get_select_bounds()
        if bounds is not None:
            lower, upper = bounds
            size = np.asarray(upper, dtype=int) - np.asarray(lower, dtype=int) + 1
            src_space = h5py.h5s.create_simple(tuple(size))
            new_name = modify_function(vs.file_name)
            plist.set_virtual(
                vs.vspace, new_name.encode(), vs.dset_name.encode(), src_space
            )

    # Create the new dataset
    tmp_dset = h5py.h5d.create(
        dset.file["/"].id,
        tmp_path.encode(),
        dset.id.get_type(),
        dset.id.get_space(),
        dcpl=plist,
    )
    tmp_dset = h5py.Dataset(tmp_dset)
    for attr_name in dset.attrs:
        tmp_dset.attrs[attr_name] = dset.attrs[attr_name]

    # Rename the new dataset
    f = dset.file
    del f[path]
    f[path] = f[tmp_path]
    del f[tmp_path]
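
# Illustrative use (hypothetical paths): repoint every source of one virtual
# dataset at a sibling directory, keeping the basenames:
#
#   with h5py.File("virtual.hdf5", "r+") as f:
#       update_vds_paths(
#           f["PartType0/Coordinates"],
#           lambda old: os.path.join("../snapshots", os.path.basename(old)),
#       )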


def make_virtual_snapshot(
    snapshot,
    auxiliary_snapshots,
    output_file,
    absolute_paths=False,
    discard_duplicate_datasets=False,
):
    """
    Given a snapshot and auxiliary files, create a new virtual snapshot
    with all datasets combined.

    snapshot: Path to the snapshot file
    auxiliary_snapshots: List of auxiliary file patterns
    output_file: Path to the output virtual snapshot
    absolute_paths: If True, use absolute paths; if False, use relative paths
    discard_duplicate_datasets: If True, drop snapshot datasets that also
        exist in an auxiliary file; if False, rename them with an `_snap`
        suffix
    """

    # Copy the input virtual snapshot to the output
    shutil.copyfile(snapshot, output_file)

    # Open the output file
    outfile = h5py.File(output_file, "r+")

    # Calculate directories for path updates
    abs_snapshot_dir = os.path.abspath(os.path.dirname(snapshot))
    abs_auxiliary_dirs = [
        os.path.abspath(os.path.dirname(aux.format(file_nr=0)))
        for aux in auxiliary_snapshots
    ]
    abs_output_dir = os.path.abspath(os.path.dirname(output_file))

    if absolute_paths:
        snapshot_dir = abs_snapshot_dir
        auxiliary_dirs = abs_auxiliary_dirs
    else:
        snapshot_dir = os.path.relpath(abs_snapshot_dir, abs_output_dir)
        auxiliary_dirs = [
            os.path.relpath(aux_dir, abs_output_dir) for aux_dir in abs_auxiliary_dirs
        ]

    # Create path replacement functions
    def make_replace_path(target_dir):
        def replace_path(old_path):
            basename = os.path.basename(old_path)
            return os.path.join(target_dir, basename)

        return replace_path

    replace_snapshot_path = make_replace_path(snapshot_dir)
    auxiliary_path_replacers = [make_replace_path(d) for d in auxiliary_dirs]

    all_auxiliary_datasets = {}

    for aux_index, auxiliary in enumerate(auxiliary_snapshots):

        # Check which datasets exist in the auxiliary files
        # and store their attributes and datatype
        filename = auxiliary.format(file_nr=0)
        dset_attrs = {}
        dset_dtype = {}
        with h5py.File(filename, "r") as infile:
            for ptype in range(7):
                if f"PartType{ptype}" not in infile:
                    continue
                dset_attrs[f"PartType{ptype}"] = {}
                dset_dtype[f"PartType{ptype}"] = {}
                for dset in infile[f"PartType{ptype}"].keys():
                    attrs = dict(infile[f"PartType{ptype}/{dset}"].attrs)
                    dtype = infile[f"PartType{ptype}/{dset}"].dtype

                    # Some auxiliary files are missing these attributes
                    if "Value stored as physical" not in attrs:
                        print(f"Setting comoving attrs for PartType{ptype}/{dset}")
                        attrs["Value stored as physical"] = [1]
                        attrs["Property can be converted to comoving"] = [0]

                    # Add a flag that these datasets are stored in the auxiliary files
                    attrs["auxiliary file"] = [1]

                    # Store the values we need for later
                    dset_attrs[f"PartType{ptype}"][dset] = attrs
                    dset_dtype[f"PartType{ptype}"][dset] = dtype

                    # Check we don't have this dataset in any of the other auxiliary files
                    dset_path = f"PartType{ptype}/{dset}"
                    if dset_path in all_auxiliary_datasets:
                        other_file = all_auxiliary_datasets[f"PartType{ptype}/{dset}"]
                        raise ValueError(
                            f"{dset_path} is in {auxiliary} and {other_file}"
                        )
                    all_auxiliary_datasets[dset_path] = auxiliary

            # Copy over the named column values, handling the case where we have
            # dataset names that already exist in the original snapshot
            for dset in infile.get("SubgridScheme/NamedColumns", []):
                outfile_named_cols = outfile["SubgridScheme/NamedColumns"]
                if dset in outfile_named_cols:
                    if discard_duplicate_datasets:
                        del outfile_named_cols[dset]
                    else:
                        outfile.move(
                            f"SubgridScheme/NamedColumns/{dset}",
                            f"SubgridScheme/NamedColumns/{dset}_snap",
                        )
                outfile_named_cols.create_dataset(
                    dset,
                    data=infile[f"SubgridScheme/NamedColumns/{dset}"],
                )

        # Loop over input auxiliary files to get dataset shapes
        file_nr = 0
        filenames = []
        shapes = []
        counts = []
        while True:
            filename = auxiliary.format(file_nr=file_nr)
            if os.path.exists(filename):
                filenames.append(filename)
                with h5py.File(filename, "r") as infile:
                    shape = {}
                    count = {}
                    for ptype in range(7):
                        if f"PartType{ptype}" not in dset_attrs:
                            continue
                        shape[f"PartType{ptype}"] = {}
                        # Get the shape for each dataset
                        for dset in dset_attrs[f"PartType{ptype}"]:
                            s = infile[f"PartType{ptype}/{dset}"].shape
                            shape[f"PartType{ptype}"][dset] = s
                            # Get the number of particles in this chunk file
                            count[f"PartType{ptype}"] = s[0]
                    shapes.append(shape)
                    counts.append(count)
            else:
                break
            file_nr += 1
        if file_nr == 0:
            raise IOError(f"Failed to find files matching: {auxiliary}")

        # Loop over particle types in the output
        for ptype in range(7):
            if f"PartType{ptype}" not in dset_attrs:
                continue

            # Create virtual layout for new datasets
            layouts = {}
            nr_parts = sum([count[f"PartType{ptype}"] for count in counts])
            for dset in dset_attrs[f"PartType{ptype}"]:
                full_shape = list(shapes[0][f"PartType{ptype}"][dset])
                full_shape[0] = nr_parts
                full_shape = tuple(full_shape)
                dtype = dset_dtype[f"PartType{ptype}"][dset]
                layouts[dset] = h5py.VirtualLayout(shape=full_shape, dtype=dtype)

            # Loop over input files
            offset = 0
            for filename, count, shape in zip(filenames, counts, shapes):
                n_part = count[f"PartType{ptype}"]
                for dset in dset_attrs[f"PartType{ptype}"]:
                    layouts[dset][offset : offset + n_part] = h5py.VirtualSource(
                        filename,
                        f"PartType{ptype}/{dset}",
                        shape=shape[f"PartType{ptype}"][dset],
                    )
                offset += n_part
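
            # Each layout now stitches the chunk files end to end: rows
            # [offset, offset + n_part) of the virtual dataset read directly
            # from the corresponding chunk file on disk.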

            # Create the virtual datasets, renaming datasets if they
            # already exist in the snapshot
            for dset, attrs in dset_attrs[f"PartType{ptype}"].items():
                if f"PartType{ptype}/{dset}" in outfile:
                    if discard_duplicate_datasets:
                        del outfile[f"PartType{ptype}/{dset}"]
                    else:
                        outfile.move(
                            f"PartType{ptype}/{dset}", f"PartType{ptype}/{dset}_snap"
                        )
                outfile.create_virtual_dataset(
                    f"PartType{ptype}/{dset}", layouts[dset], fillvalue=-999
                )
                for k, v in attrs.items():
                    outfile[f"PartType{ptype}/{dset}"].attrs[k] = v

                # Update paths for this newly created auxiliary dataset
                update_vds_paths(
                    outfile[f"PartType{ptype}/{dset}"],
                    auxiliary_path_replacers[aux_index],
                )

                # Copy GroupNr_bound to HaloCatalogueIndex, since
                # that is the name in SOAP
                if dset == "GroupNr_bound":
                    outfile.create_virtual_dataset(
                        f"PartType{ptype}/HaloCatalogueIndex",
                        layouts["GroupNr_bound"],
                        fillvalue=-999,
                    )
                    for k, v in outfile[f"PartType{ptype}/GroupNr_bound"].attrs.items():
                        outfile[f"PartType{ptype}/HaloCatalogueIndex"].attrs[k] = v

                    # Update paths for HaloCatalogueIndex too
                    update_vds_paths(
                        outfile[f"PartType{ptype}/HaloCatalogueIndex"],
                        auxiliary_path_replacers[aux_index],
                    )

    # Update paths for all original snapshot datasets
    for ptype in range(7):
        ptype_name = f"PartType{ptype}"
        if ptype_name in outfile:
            for dset_name in list(outfile[ptype_name].keys()):
                dset = outfile[f"{ptype_name}/{dset_name}"]
                if dset.is_virtual:
                    # Skip auxiliary datasets; their paths were already updated
                    if dset.attrs.get("auxiliary file", [0])[0] != 1:
                        # This is an original snapshot dataset
                        update_vds_paths(dset, replace_snapshot_path)

    # Done
    outfile.close()


if __name__ == "__main__":

    import argparse

    # For a description of the parameters, run: $ python make_virtual_snapshot.py --help
    parser = argparse.ArgumentParser(
        description=(
            "Link SWIFT snapshots with SWIFT auxiliary snapshots (snapshot-like "
            "files with the same number of particles in the same order as the "
            "snapshot, but with less metadata), such as the SOAP memberships."
        )
    )
    parser.add_argument(
        "--virtual-snapshot",
        type=str,
        required=True,
        help="Name of the SWIFT virtual snapshot file, e.g. snapshot_{snap_nr:04}.hdf5",
    )
    parser.add_argument(
        "--auxiliary-snapshots",
        type=str,
        nargs="+",
        required=True,
        help="One or more format strings for auxiliary files, e.g. membership_{snap_nr:04}.{file_nr}.hdf5",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="Name of the virtual snapshot to create, e.g. membership_{snap_nr:04}.hdf5",
    )
    parser.add_argument(
        "--snap-nr",
        type=int,
        required=False,
        default=-1,
        help="Snapshot number (default: -1). Not required if snap_nr is present in the filenames passed.",
    )
    parser.add_argument(
        "--absolute-paths",
        action="store_true",
        help="Use absolute paths in the virtual dataset",
    )
    parser.add_argument(
        "--discard-duplicate-datasets",
        action="store_true",
        help=(
            "This flag determines the behaviour when a dataset exists in both the original snapshot "
            "and the auxiliary file. By default the virtual file will rename the original snapshot "
            "dataset as {dataset_name}_snap. If this flag is passed then the dataset from the original "
            "snapshot will not be linked to."
        ),
    )
    args = parser.parse_args()

    print("Creating virtual snapshot")
    for k, v in vars(args).items():
        print(f" {k}: {v}")

    # Substitute snap number
    virtual_snapshot = args.virtual_snapshot.format(snap_nr=args.snap_nr)
    output_file = args.output_file.format(snap_nr=args.snap_nr)

    # We don't want to replace {file_nr} for auxiliary snapshots
    auxiliary_snapshots = [
        filename.format_map(SafeDict({"snap_nr": args.snap_nr}))
        for filename in args.auxiliary_snapshots
    ]

    # Make a new virtual snapshot with group info
    make_virtual_snapshot(
        virtual_snapshot,
        auxiliary_snapshots,
        output_file,
        absolute_paths=args.absolute_paths,
        discard_duplicate_datasets=args.discard_duplicate_datasets,
    )
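
For reference, a minimal sketch of calling the routine directly from Python instead of via the command line. The file paths are placeholders, and the import assumes the script is importable as a module named make_virtual_snapshot:

from make_virtual_snapshot import make_virtual_snapshot  # hypothetical import path

# Stitch one set of membership chunk files onto a snapshot; {file_nr} is
# expanded internally, once per chunk file found on disk.
make_virtual_snapshot(
    "snapshots/snapshot_0077.hdf5",  # placeholder snapshot
    ["membership/membership_0077.{file_nr}.hdf5"],  # placeholder pattern
    "snapshot_with_membership_0077.hdf5",  # placeholder output
    absolute_paths=False,
    discard_duplicate_datasets=False,
)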

SOAP/compute_halo_properties.py

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@
 from SOAP.particle_filter.cold_dense_gas_filter import ColdDenseGasFilter
 from SOAP.particle_filter.recently_heated_gas_filter import RecentlyHeatedGasFilter
 
-
 # Set numpy to raise divide by zero, overflow and invalid operation errors as exceptions
 np.seterr(divide="raise", over="raise", invalid="raise")
 