Skip to content

Commit 8e1ecd1

Browse files
committed
Updated the spectra generation script to include validation data.
1 parent d482143 commit 8e1ecd1

5 files changed

Lines changed: 184 additions & 47 deletions

File tree

cosmopower/examples/2_create_spectra.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,17 @@
1414
tqdm = lambda x: x # noqa: E731
1515

1616
"""
17-
In the previous file, we generated an LHC and saved it to the file
17+
In the previous tutorial, we generated an LHC and saved it to the file
1818
`example/spectra/parameters.hdf5`. Now we'll generate the spectra associated
1919
with this dataset.
20+
21+
The result of this file will be several training spectra. You can compare the
22+
result of this tutorial with invoking the command
23+
24+
python -m cosmopower generate example.yaml
25+
26+
which generates both the LHC from tutorial 1 and the spectra from this
27+
tutorial.
2028
"""
2129
parser = YAMLParser("example.yaml")
2230

cosmopower/examples/3_train_emulator.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
"""
1212
In the previous file, we created a training dataset for a linear P(k,z)
1313
emulator. Here, we will train the emulator over this dataset.
14+
15+
The result of this file will be an emulator trained over the spectra generated
16+
before. You can compare the result from this file with the results from
17+
invoking the command
18+
19+
python -m cosmopower train example.yaml
20+
21+
which loads the data, initializes the emulators, and trains them.
1422
"""
1523
parser = YAMLParser("example.yaml")
1624

@@ -42,7 +50,7 @@
4250
trainable=True,
4351
**settings.get("n_traits", {}))
4452

45-
with tf.device("/device:CPU:0"):
53+
with tf.device(None):
4654
network.train(training_data=datasets,
4755
filename_saved_model=output_file,
4856
validation=validation,

cosmopower/examples/example.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ emulated_code:
1414

1515
samples:
1616
# How many training and validation samples do we want to generate.
17-
Ntraining: 1000
18-
Nvalidation: 100
17+
Ntraining: 400
18+
Nvalidation: 25
1919

2020
# The parameters of the LHC over which the samples are generated.
2121
parameters:

cosmopower/spectra.py

Lines changed: 156 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import numpy as np
88
import h5py
99
from importlib import import_module
10-
from typing import Optional
10+
from typing import Optional, Tuple
11+
from types import ModuleType
1112

1213

1314
def setup_path(parser: YAMLParser, args: object) -> bool:
@@ -93,7 +94,8 @@ def get_boltzmann_spectra(parser: YAMLParser, state: dict, args: dict = {},
9394

9495

9596
def cycle_spectrum_file(parser: YAMLParser, quantity: str,
96-
fp: Optional[Dataset], n: int = 0) -> Dataset:
97+
fp: Optional[Dataset], n: int = 0,
98+
validation: bool = False) -> Dataset:
9799
"""
98100
Cycle the given spectrum file, i.e. open the next file we expect will
99101
contain spectrum data.
@@ -103,18 +105,77 @@ def cycle_spectrum_file(parser: YAMLParser, quantity: str,
103105
can contain spectra for quantity.
104106
If the resulting file does not exist, it will automatically create one.
105107
"""
108+
suffix = "_validation" if validation else ""
106109
if fp is None:
107110
dataset = Dataset(parser, quantity,
108-
quantity.replace("/", "_") + f".{n}.hdf5")
111+
quantity.replace("/", "_") + f"{suffix}.{n}.hdf5")
109112
else:
110113
i = int(fp.filename.split(".")[1]) + 1
111114
dataset = Dataset(parser, quantity,
112-
quantity.replace("/", "_") + f".{i}.hdf5")
115+
quantity.replace("/", "_") + f"{suffix}.{i}.hdf5")
113116
fp.close()
114117
dataset.open()
115118
return dataset
116119

117120

121+
def split_samples(MPI: Optional[ModuleType], parser: YAMLParser, samples: dict,
                  nsamples: int) -> Tuple:
    """
    Given a set of samples, check how to split them evenly between the MPI
    processes, and get the range over which this process needs to operate.

    Parameters
    ----------
    MPI: the mpi4py ``MPI`` module, or None when running serially.
    parser: YAMLParser providing ``max_filesize`` (samples per output file).
    samples: the parameter samples; only rank 0 needs the real dict, it is
        broadcast to all other ranks here.
    nsamples: total number of samples to divide over all processes.

    Returns
    -------
    (samples, first, last, first_file, last_file): the (broadcast) samples,
    the inclusive sample-index range for this process, and the inclusive
    file-index range this process may write to.
    """
    # Serial defaults: this process handles every sample and file.
    first, last = 0, nsamples - 1
    first_file, last_file = 0, last // parser.max_filesize

    if MPI is not None:
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        n_tot = comm.Get_size()

        # Rank 0 generated (or loaded) the samples; share with everyone.
        samples = comm.bcast(samples, root=0)

        # Samples per rank, rounded up so that the union of all per-rank
        # ranges covers 0..nsamples-1 (the old floor-based split dropped
        # the trailing samples; with a single rank it lost the last one).
        chunk = int(np.ceil(nsamples / n_tot))
        # File indices reserved per rank, so no two ranks ever write to
        # the same file.
        fcut = int(np.ceil(chunk / parser.max_filesize))

        first = rank * chunk
        # The last rank may get fewer samples than `chunk`.
        last = min(first + chunk - 1, nsamples - 1)
        first_file = rank * fcut
        last_file = first_file + fcut - 1

    return samples, first, last, first_file, last_file
156+
157+
158+
def check_open_files(parser: YAMLParser, files: dict, n: int,
                     first_file: int, validation: bool = False
                     ) -> Tuple[dict, list]:
    """
    Ensure an output file that can hold sample ``n`` is open for every
    quantity, and report which quantities still need to be computed.

    For each quantity the currently open file (if any) is cycled forward
    while it is full and only covers indices below ``n``. A quantity is
    scheduled for computation when index ``n`` is not yet stored in its
    file.

    Returns the (possibly updated) ``files`` mapping together with the
    list of quantities for which sample ``n`` must still be computed.
    """
    pending = []

    for quantity in parser.quantities:
        current = files[quantity]

        if current is None:
            # Nothing open yet for this quantity: open the first file.
            current = cycle_spectrum_file(parser, quantity, None,
                                          n=first_file,
                                          validation=validation)

        # Cycle onward while the open file is full and does not reach n.
        while current.empty_size == 0 and current.indices.max() < n:
            current = cycle_spectrum_file(parser, quantity, current,
                                          n=first_file,
                                          validation=validation)

        files[quantity] = current

        if n not in current.indices:
            pending.append(quantity)

    return files, pending
177+
178+
118179
def generate_spectra(args: list = None) -> None:
119180
"""
120181
Hook for the "generate spectra" command.
@@ -184,29 +245,22 @@ def generate_spectra(args: list = None) -> None:
184245

185246
if rank == 0:
186247
# TODO: Generate the validation samples.
187-
samples, validation_samples = \
188-
parser.get_parameter_samples(force_new = args.force)
248+
if parser.nvalidation:
249+
samples, validation_samples = \
250+
parser.get_parameter_samples(force_new=args.force_overwrite,
251+
return_validation=True)
252+
else:
253+
samples = \
254+
parser.get_parameter_samples(force_new=args.force_overwrite)
255+
validation_samples = {}
189256
else:
190-
samples = None
257+
samples, validation_samples = None, None
191258

192-
first = 0
193-
last = parser.nsamples
194-
first_file = 0
195-
last_file = (last // parser.max_filesize)
259+
samples, first, last, first_file, last_file = \
260+
split_samples(MPI, parser, samples, parser.nsamples)
196261

197-
# If mpi-ing, share the data here.
198-
if comm is not None:
199-
samples = comm.bcast(samples, root=0)
200-
# Amount of spectra/files handled by each runner
201-
cut = (last - first) // n_tot
202-
fcut = int(np.ceil(float(cut) / parser.max_filesize))
203-
204-
first, last = first + (rank) * cut, first + (rank + 1) * cut - 1
205-
first_file, last_file = (first_file + (rank) * fcut,
206-
first_file + (rank + 1) * fcut - 1)
207-
208-
print(f"[{rank}]: Iterating over samples {first}--{last} in files \
209-
{first_file}--{last_file}.")
262+
print(f"[{rank}]: Iterating over samples {first}--{last} in files " \
263+
f"{first_file}--{last_file}.")
210264

211265
state = init_boltzmann_code(parser)
212266
extra_args = parser.boltzmann_extra_args
@@ -221,20 +275,9 @@ def generate_spectra(args: list = None) -> None:
221275
+ f"{accepted/n:.1%} success rate")
222276

223277
boltzmann_params = {k: samples[k][n] for k in parser.boltzmann_inputs}
224-
quantities_to_be_computed = []
225-
226-
for q in parser.quantities:
227-
if files[q] is None:
228-
# Open first file to read from.
229-
files[q] = cycle_spectrum_file(parser, q, files[q],
230-
n=first_file)
231-
while files[q].empty_size == 0 and files[q].indices.max() < n:
232-
# Current file is full, so we have to cycle to the next one.
233-
files[q] = cycle_spectrum_file(parser, q, files[q],
234-
n=first_file)
235278

236-
if n not in files[q].indices:
237-
quantities_to_be_computed.append(q)
279+
files, quantities_to_be_computed = \
280+
check_open_files(parser, files, n, first_file,validation=False)
238281

239282
if len(quantities_to_be_computed) == 0:
240283
accepted += 1
@@ -254,10 +297,79 @@ def generate_spectra(args: list = None) -> None:
254297
else:
255298
spec = state.get(q, None)
256299

300+
if spec is None:
301+
continue
302+
303+
if parser.is_log(q):
304+
spec = np.log10(spec)
305+
306+
if np.any(np.isnan(spec)):
307+
continue
308+
257309
network_params = np.array([
258310
samples[k][n] for k in parser.network_input_parameters(q)
259311
])
260312

313+
if files[q] is None:
314+
files[q] = cycle_spectrum_file(parser, q, files[q],
315+
n=first_file,
316+
validation=False)
317+
318+
files[q].write_data(n, network_params, spec)
319+
320+
for q in files:
321+
if files[q] is not None and files[q].is_open:
322+
files[q].close()
323+
files[q] = None
324+
325+
if validation_samples == {}:
326+
if rank == 0:
327+
print(f"Finished generating {accepted} spectra.")
328+
print(f"You can now run\n\tcosmopower train {args.yamlfile}\n" \
329+
"to train the networks on this dataset.")
330+
return
331+
332+
# Do the exact same thing all over again, but for validation samples.
333+
validation_samples, first, last, first_file, last_file = \
334+
split_samples(MPI, parser, validation_samples, parser.nvalidation)
335+
336+
print(f"[{rank}]: Iterating over validation samples {first}--{last} in " \
337+
f"files {first_file}--{last_file}.")
338+
339+
accepted = 0
340+
tbar = tqdm.tqdm(np.arange(first, last + 1))
341+
342+
Barrier()
343+
344+
for n in tbar:
345+
tbar.set_description(("" if MPI is None else f"[{rank}] ")
346+
+ f"{accepted/n:.1%} success rate")
347+
348+
boltzmann_params = {
349+
k: validation_samples[k][n] for k in parser.boltzmann_inputs
350+
}
351+
352+
files, quantities_to_be_computed = \
353+
check_open_files(parser, files, n, first_file, validation=True)
354+
355+
if len(quantities_to_be_computed) == 0:
356+
accepted += 1
357+
continue
358+
359+
if get_boltzmann_spectra(parser, state, boltzmann_params,
360+
quantities_to_be_computed, extra_args):
361+
accepted += 1
362+
363+
for k in state["derived"]:
364+
validation_samples[k][n] = state["derived"][k]
365+
366+
for q in quantities_to_be_computed:
367+
if q == "derived":
368+
spec = np.asarray([state["derived"].get(p)
369+
for p in parser.computed_parameters])
370+
else:
371+
spec = state.get(q, None)
372+
261373
if spec is None:
262374
continue
263375

@@ -267,17 +379,18 @@ def generate_spectra(args: list = None) -> None:
267379
if np.any(np.isnan(spec)):
268380
continue
269381

382+
network_params = np.array([
383+
validation_samples[k][n]
384+
for k in parser.network_input_parameters(q)
385+
])
386+
270387
if files[q] is None:
271388
files[q] = cycle_spectrum_file(parser, q, files[q],
272-
n=first_file)
389+
n=first_file,
390+
validation=True)
273391

274392
files[q].write_data(n, network_params, spec)
275393

276394
for q in files:
277395
if files[q] is not None and files[q].is_open:
278396
files[q].close()
279-
280-
if rank == 0:
281-
print(f"Finished generating {accepted} spectra.")
282-
print(f"You can now run\n\tcosmopower train {args.yamlfile}\n\
283-
to train the networks on this dataset.")

cosmopower/train.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ def train_network_NN(parser: YAMLParser, quantity: str, device: str = "",
5757
"_validation.*.hdf5"))
5858
validation = [Dataset(parser, quantity, os.path.basename(filename))
5959
for filename in filenames]
60+
61+
if len(validation) == 0:
62+
print(f"No validation data found? Defaulting to 10% split.")
63+
validation = 0.1
6064

6165
with tf.device(device):
6266
print("\tTraining NN.")
@@ -108,6 +112,10 @@ def train_network_PCAplusNN(parser: YAMLParser, quantity: str,
108112
"_validation.*.hdf5"))
109113
validation = [Dataset(parser, quantity, os.path.basename(filename))
110114
for filename in filenames]
115+
116+
if len(validation) == 0:
117+
print(f"No validation data found? Defaulting to 10% split.")
118+
validation = 0.1
111119

112120
cp_pca = cosmopower_PCA(parameters=parameters, modes=modes, n_pcas=n_pcas,
113121
n_batches=n_batches, verbose=True)

0 commit comments

Comments
 (0)