From 38df93b7cf112a683efa52d1f2762def6967976c Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 14 Nov 2025 13:36:29 -0500 Subject: [PATCH 01/34] Start of example of sophisticated aggregating compute ensemble --- .../024-aggregated-compute-ensemble/README.md | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples-proposed/024-aggregated-compute-ensemble/README.md diff --git a/examples-proposed/024-aggregated-compute-ensemble/README.md b/examples-proposed/024-aggregated-compute-ensemble/README.md new file mode 100644 index 00000000..97c7e95b --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/README.md @@ -0,0 +1,34 @@ +# An example ensemble simulation for aggregated computing + +This example demonstrates how to set up an ensemble simulation in IPS that +performs aggregated computing across multiple ensemble instances. Each ensemble +instance runs a component for ${COMPUTE} that reports values local to the +instance, but that is then aggregated at the top-level after the instances +have finished. + +Note that there will be Dask related errors and warnings at the end that can be +ignored. These are due to Dask not having a clean shutdown. + +## Contents + +* `__init__.py` -- empty python init file +* `driver.py` -- top-level driver +* `instance_component.py` -- component worker code +* `instance_driver.py` -- component driver code + +* `ensemble.conf` -- top-level configuration file +* `platform.conf` -- platform configuration file +* `template.conf` -- ensemble instance configuration file + + +## Instructions + +To run the code, run: + +```bash +PORTAL_API_KEY=changeme ips.py --platform platform.conf --simulation ensemble.conf +``` + +Depending on the web portal instance you want to connect to, you will need to +change `PORTAL_API_KEY` in the run command and `PORTAL_URL` in the +`ensemble.conf` file. 
From 7c1d62f876f62222b7abf9406a496987743ecb44 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 14 Nov 2025 14:20:42 -0500 Subject: [PATCH 02/34] Start of example of sophisticated aggregating compute ensemble. Many of the files were copied from another example and will undergo significant changes for a reasonable pretend compute problem that uses ensemble of instances. --- .../024-aggregated-compute-ensemble/README.md | 1 - .../__init__.py | 0 .../024-aggregated-compute-ensemble/driver.py | 47 ++++++++++++ .../ensemble.conf | 32 ++++++++ .../instance_component.py | 74 +++++++++++++++++++ .../instance_driver.py | 17 +++++ .../platform.conf | 9 +++ .../template.conf | 40 ++++++++++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 examples-proposed/024-aggregated-compute-ensemble/__init__.py create mode 100644 examples-proposed/024-aggregated-compute-ensemble/driver.py create mode 100644 examples-proposed/024-aggregated-compute-ensemble/ensemble.conf create mode 100644 examples-proposed/024-aggregated-compute-ensemble/instance_component.py create mode 100644 examples-proposed/024-aggregated-compute-ensemble/instance_driver.py create mode 100644 examples-proposed/024-aggregated-compute-ensemble/platform.conf create mode 100644 examples-proposed/024-aggregated-compute-ensemble/template.conf diff --git a/examples-proposed/024-aggregated-compute-ensemble/README.md b/examples-proposed/024-aggregated-compute-ensemble/README.md index 97c7e95b..cc68ad63 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/README.md +++ b/examples-proposed/024-aggregated-compute-ensemble/README.md @@ -11,7 +11,6 @@ ignored. These are due to Dask not having a clean shutdown. 
## Contents -* `__init__.py` -- empty python init file * `driver.py` -- top-level driver * `instance_component.py` -- component worker code * `instance_driver.py` -- component driver code diff --git a/examples-proposed/024-aggregated-compute-ensemble/__init__.py b/examples-proposed/024-aggregated-compute-ensemble/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples-proposed/024-aggregated-compute-ensemble/driver.py b/examples-proposed/024-aggregated-compute-ensemble/driver.py new file mode 100644 index 00000000..070fefa5 --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/driver.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +""" +Simple ensemble driver that just dispatches an IPS ensemble. +""" + +from pathlib import Path + +from ipsframework import Component + + +class EnsembleDriver(Component): + """Kicks off a simple ensemble""" + + def init(self, timestamp=0.0): + NOTEBOOK_TEMPLATE = 'notebook.ipynb' + self.services.stage_input_files([NOTEBOOK_TEMPLATE]) + try: + self.services.initialize_jupyter_notebook(NOTEBOOK_TEMPLATE) + except Exception: + print('did not add notebook to portal') + + def step(self, timestamp=0.0): + # FIXME get a hold of INPUT FILES for + variables = params_from_csv('../../../variables.csv') + + # This is the IPS configuration file for the instances that looks like + # a regular configuration file except there are slots for the 'base_x', 'base_y', + # and 'word' for variable substitution. 'TEMPLATE' is specified in the + # config file section for this driver. + template = Path(self.config['TEMPLATE']) + self.services.info(f'Using template config file {template}') + + if not template.exists(): + raise RuntimeError(f'{template} config template file does not exist') + + # Now spin up and run the instances. This function will return a list + # with each list element corresponding to an instance. 
You can use + # this information to find the specific instance run directory for a + # given set of variables. E.g., the instance corresponding to + # {'base_x' : 2, 'base_y' : 5.82, 'word' : 'baz'} is probably found in the + # `my_simple_ensemble1` subdirectory. + # + # The "name" parameter must be unique for each ensemble within a run, and will be used as an identifier on the Portal. + mapping = self.services.run_ensemble(template, variables, run_dir=Path('.').absolute(), name='my_simple_ensemble', num_nodes=1, cores_per_instance=1) + # Print each mapping of instance name to what variable values were used. + for instance in mapping: + self.services.info(f'{instance!s}') diff --git a/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf b/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf new file mode 100644 index 00000000..4b01f9a1 --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf @@ -0,0 +1,32 @@ +SIM_NAME = simpleensemble +SIM_ROOT = $PWD/ENSEMBLES +LOG_FILE = log +LOG_LEVEL = INFO +SIMULATION_MODE = NORMAL + +INPUT_DIR = $PWD/input_dir/ + +PORTAL_URL=https://lb.ipsportal.development.svc.spin.nersc.org +# do not commit actual PORTAL_API_KEY value to version control, best to set as an environment variable +#PORTAL_API_KEY=changeme + +[PORTS] + NAMES = DRIVER + [[DRIVER]] + IMPLEMENTATION = ensemble_driver + +[ensemble_driver] + CLASS = DRIVER + SUB_CLASS = + NAME = EnsembleDriver + NPROC = 1 + BIN_PATH = + INPUT_FILES = + OUTPUT_FILES = + SCRIPT = $PWD/driver.py + MODULE = + # Specifies the template configuration file used for instances + TEMPLATE = $PWD/template.conf + # Specifies the parameter values for each instance + PARAMETER_FILE = $PWD/values.csv + diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py new file mode 100644 index 00000000..14197c41 --- /dev/null +++ 
b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Component to be stepped in instance +""" + +import csv +import itertools +import json +import sys +from time import time +from typing import Any + +from ipsframework import Component +from ipsframework.resourceHelper import get_platform_info + + +def generate_fake_data(timestamp: float, base_x: float, base_y: float, word: str) -> dict[str, Any]: + x_data = [] + y_data = [] + + for idx, perm in enumerate(itertools.permutations(word)): + x = timestamp + (idx + 1) * base_x + y = timestamp + (idx + 1) * base_y + for ch_idx, character in enumerate(perm): + shrink_factor = 1 if ch_idx % 2 == 0 else -1 + x = abs(x + ((ord(character) + ch_idx + 1) * shrink_factor)) + y = abs(y + ((ord(character) * (ch_idx + 1)) * shrink_factor)) + x_data.append(x) + y_data.append(y) + + return { + 'base_x': base_x, + 'base_y': base_y, + 'word': word, + 'x_data': x_data, + 'y_data': y_data, + } + + +class InstanceComponent(Component): + def step(self, timestamp: float = 0.0, **keywords): + start = time() + + # ENSEMBLE_INSTANCE is a special IPS variable that contains the + # string uniquely identifying this instance. Each instance will have + # the `run_ensemble()` `name` argument prepended to a unique number + # for each instance. E.g., ENSEMBLE_INSTANCE might be "MY_INSTANCE_23". 
+ instance_id = self.services.get_config_param('ENSEMBLE_INSTANCE') + self.services.info(f'{instance_id}: Start of step of instance component.') + + # Echo the parameters we're expecting, A, B, and C + self.services.info(f'{instance_id}: instance component parameters: base_x={self.base_x}, base_y={self.base_y}, word={self.word}') + + # generate some fake data and save it + data_fname = f'generated_{timestamp}.json' + data = generate_fake_data(timestamp, float(self.base_x), float(self.base_y), self.word) + with open(data_fname, 'w') as fd: + json.dump(data, fd) + + # Save some per-component stats + stats_fname = f'stats_{timestamp}.csv' + run_env = get_platform_info() + + with open(stats_fname, 'w') as f: + writer = csv.writer(f) + writer.writerow(['instance', 'executable', 'hostname', 'pid', 'core', 'start', 'end']) + writer.writerow([instance_id, sys.argv[0], run_env['hostname'], run_env['pid'], run_env['core_id'], start, time()]) + + try: + self.services.add_analysis_data_files([data_fname, stats_fname], timestamp) + except Exception: + print('did not add data files to portal, check logs') + + self.services.info(f'{instance_id}: End of step of instance component.') diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_driver.py b/examples-proposed/024-aggregated-compute-ensemble/instance_driver.py new file mode 100644 index 00000000..33f1954b --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_driver.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +""" +Driver component for instances +""" + +from ipsframework import Component + + +class InstanceDriver(Component): + """ + Instance driver component that steps the main component + """ + + def step(self, timestamp: float = 0.0, **keywords): + instance_component = self.services.get_port('WORKER') + + self.services.call(instance_component, 'step', 0.0) diff --git a/examples-proposed/024-aggregated-compute-ensemble/platform.conf 
b/examples-proposed/024-aggregated-compute-ensemble/platform.conf new file mode 100644 index 00000000..7bd5585d --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/platform.conf @@ -0,0 +1,9 @@ +# Platform config to run this example on localhost +MPIRUN = eval +NODE_DETECTION = manual +PROCS_PER_NODE = 8 +CORES_PER_NODE = 8 +SOCKETS_PER_NODE = 1 +NODE_ALLOCATION_MODE = shared +HOST = localhost +SCRATCH = \ No newline at end of file diff --git a/examples-proposed/024-aggregated-compute-ensemble/template.conf b/examples-proposed/024-aggregated-compute-ensemble/template.conf new file mode 100644 index 00000000..7703984c --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/template.conf @@ -0,0 +1,40 @@ +SIM_NAME = simpleensembleinstance +SIM_ROOT = $PWD +LOG_FILE = log +LOG_LEVEL = INFO +SIMULATION_MODE = NORMAL + +[PORTS] + NAMES = DRIVER WORKER + [[DRIVER]] + IMPLEMENTATION = instance_driver + + [[WORKER]] + IMPLEMENTATION = instance_component + +[instance_driver] + CLASS = DRIVER + SUB_CLASS = + NAME = InstanceDriver + NPROC = 1 + BIN_PATH = + INPUT_FILES = + OUTPUT_FILES = + SCRIPT = $PWD/instance_driver.py + MODULE = + +[instance_component] + CLASS = WORKER + SUB_CLASS = + NAME = InstanceComponent + NPROC = 1 + BIN_PATH = + INPUT_FILES = + OUTPUT_FILES = + SCRIPT = $PWD/instance_component.py + MODULE = + # These are the variables that will have values substituted for each instance. + # The ? are important to indicate that these are to be filled in. + base_x = ? + base_y = ? + word = ? 
From 479abd8d0cdd6814ea0ea9d68e5c2e5cc866fac9 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 14 Nov 2025 14:24:22 -0500 Subject: [PATCH 03/34] Added basic slurm script for new example --- .../perlmutter.slurm | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm diff --git a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm new file mode 100644 index 00000000..c889a15c --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Set up the IPS ensemble example run. Note that you must have a setup +# python environment on Perlmutter in which IPS is already installed for +# this example to work. In this case, the conda environment is named "ips", but you're +# free to create your own environment with a different name; but, be sure to +# change the "conda activate" line below accordingly if you do so. +# +#SBATCH --account=atom # REPLACE WITH YOUR PROJECT ID +#SBATCH --constraint=cpu +#SBATCH --nodes=1 +#SBATCH --time=5 +#SBATCH -p debug + +module load PrgEnv-gnu openmpi python + +# Again, this assumes that there exists a conda environment named "ips". +conda activate ips + +# This sets the absolute path to the mpi_stats.py script, which is used by IPS to +export MPI_STATS_EXEC=$(realpath ./mpi_stats.py) + +echo "Using MPI_STATS_EXEC: ${MPI_STATS_EXEC}" + +# The 2>&1 binds stderr to stdout so that both are captured in the tee log file. The +# `tee` command allows you to see the output on the terminal as well as save it. 
+ips.py --simulation=ensemble.conf --platform=perlmutter.conf \ + 2>&1 | tee ${SLURM_JOBID}_ips.log + From 6a9b350534273204915edac243303b5eed0c141c Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 14 Nov 2025 14:26:12 -0500 Subject: [PATCH 04/34] Start of CSV input; this will undergo significant changes as the example is implemented. --- examples-proposed/024-aggregated-compute-ensemble/values.csv | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 examples-proposed/024-aggregated-compute-ensemble/values.csv diff --git a/examples-proposed/024-aggregated-compute-ensemble/values.csv b/examples-proposed/024-aggregated-compute-ensemble/values.csv new file mode 100644 index 00000000..e337af07 --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/values.csv @@ -0,0 +1,4 @@ +instance_component:A, instance_component:B, instance_component:C +3, 2.34, bar +2, 5.82, baz +4, 0.1, quux From 8fad23643ccf54e74db8528f94c9ada98579cefd Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 13:22:25 -0500 Subject: [PATCH 05/34] Checkpoint commit --- .../024-aggregated-compute-ensemble/driver.py | 42 ++++++++++++------- .../instance_component.py | 13 ++++-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/driver.py b/examples-proposed/024-aggregated-compute-ensemble/driver.py index 070fefa5..5bc453e8 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/driver.py +++ b/examples-proposed/024-aggregated-compute-ensemble/driver.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """ -Simple ensemble driver that just dispatches an IPS ensemble. +Simple ensemble driver that just dispatches an IPS ensemble for an example +compute application.
""" from pathlib import Path @@ -12,36 +13,47 @@ class EnsembleDriver(Component): """Kicks off a simple ensemble""" def init(self, timestamp=0.0): - NOTEBOOK_TEMPLATE = 'notebook.ipynb' - self.services.stage_input_files([NOTEBOOK_TEMPLATE]) - try: - self.services.initialize_jupyter_notebook(NOTEBOOK_TEMPLATE) - except Exception: - print('did not add notebook to portal') + # TODO temporarily commenting this out until the actual + # example is ready to consider adding a notebook to the portal. + + # NOTEBOOK_TEMPLATE = 'notebook.ipynb' + # self.services.stage_input_files([NOTEBOOK_TEMPLATE]) + # try: + # self.services.initialize_jupyter_notebook(NOTEBOOK_TEMPLATE) + # except Exception: + # print('did not add notebook to portal') def step(self, timestamp=0.0): - # FIXME get a hold of INPUT FILES for - variables = params_from_csv('../../../variables.csv') + # This CSV file contains the parameters used for the + # different instances. + variables = params_from_csv(self.config['PARAMETER_FILE']) # This is the IPS configuration file for the instances that looks like - # a regular configuration file except there are slots for the 'base_x', 'base_y', + # a regular configuration file except there are slots for the + # 'base_x', 'base_y', # and 'word' for variable substitution. 'TEMPLATE' is specified in the # config file section for this driver. template = Path(self.config['TEMPLATE']) self.services.info(f'Using template config file {template}') if not template.exists(): - raise RuntimeError(f'{template} config template file does not exist') + raise RuntimeError( + f'{template} config template file does not exist') # Now spin up and run the instances. This function will return a list # with each list element corresponding to an instance. You can use # this information to find the specific instance run directory for a # given set of variables. 
E.g., the instance corresponding to - # {'base_x' : 2, 'base_y' : 5.82, 'word' : 'baz'} is probably found in the - # `my_simple_ensemble1` subdirectory. + # {'base_x' : 2, 'base_y' : 5.82, 'word' : 'baz'} is probably found + # in the `INSTANCE_` subdirectory. # - # The "name" parameter must be unique for each ensemble within a run, and will be used as an identifier on the Portal. - mapping = self.services.run_ensemble(template, variables, run_dir=Path('.').absolute(), name='my_simple_ensemble', num_nodes=1, cores_per_instance=1) + # The "name" parameter must be unique for each ensemble within a run, + # and will be used as an identifier on the Portal. + mapping = self.services.run_ensemble(template, variables, + run_dir=Path('.').absolute(), + name='INSTANCE_', + num_nodes=1, cores_per_instance=1) + # Print each mapping of instance name to what variable values were used. for instance in mapping: self.services.info(f'{instance!s}') diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 14197c41..8ab5c3e4 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -15,6 +15,8 @@ def generate_fake_data(timestamp: float, base_x: float, base_y: float, word: str) -> dict[str, Any]: + # From code Lance originally wrote for a different example and will + # be replaced. 
x_data = [] y_data = [] @@ -66,9 +68,12 @@ def step(self, timestamp: float = 0.0, **keywords): writer.writerow(['instance', 'executable', 'hostname', 'pid', 'core', 'start', 'end']) writer.writerow([instance_id, sys.argv[0], run_env['hostname'], run_env['pid'], run_env['core_id'], start, time()]) - try: - self.services.add_analysis_data_files([data_fname, stats_fname], timestamp) - except Exception: - print('did not add data files to portal, check logs') + # TODO temporarily commenting this out until the actual + # example is ready to consider adding data files to the portal. This + # originally came from code Lance wrote in a previous example. + # try: + # self.services.add_analysis_data_files([data_fname, stats_fname], timestamp) + # except Exception: + # print('did not add data files to portal, check logs') self.services.info(f'{instance_id}: End of step of instance component.') From ca0676ed14dc365f54453c46a52e7d15c9d555ae Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 15:26:07 -0500 Subject: [PATCH 06/34] get_platform_info() now returns CPU affinity if that is supported on the current platform --- ipsframework/resourceHelper.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ipsframework/resourceHelper.py b/ipsframework/resourceHelper.py index f9867065..3850cc59 100644 --- a/ipsframework/resourceHelper.py +++ b/ipsframework/resourceHelper.py @@ -416,9 +416,12 @@ def get_platform_info(): properly for a given system. 
:returns: A dictionary containing hostname, cpu count, cpu core id for - current running process, and available GPU devices if set + current running process, and available GPU devices if set; if the + platform is supported it will also return CPU affinity """ - result = {'hostname': platform.node(), 'cpu_count': psutil.cpu_count(), 'pid': os.getpid()} + result = {'hostname': platform.node(), + 'cpu_count': psutil.cpu_count(), + 'pid': os.getpid()} if 'CUDA_VISIBLE_DEVICES' in os.environ: result['cuda_visible_devices'] = os.environ['CUDA_VISIBLE_DEVICES'] @@ -429,6 +432,10 @@ def get_platform_info(): p = psutil.Process() with p.oneshot(): result['core_id'] = p.cpu_num() + if hasattr(p, 'cpu_affinity'): + # Not all platforms support `cpu_affinity`, which is why + # we check first. + result['affinity'] = p.cpu_affinity() except Exception: # cpu_num() only available on linux (and BSD systems), so this will # throw an exception on other platforms From 6cbf49156206eb634d29514001000156ec7d1c73 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 15:28:25 -0500 Subject: [PATCH 07/34] Let's be more explicit about psutil ops that may be unsupported on some platforms --- ipsframework/resourceHelper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ipsframework/resourceHelper.py b/ipsframework/resourceHelper.py index 3850cc59..eed408fb 100644 --- a/ipsframework/resourceHelper.py +++ b/ipsframework/resourceHelper.py @@ -436,9 +436,12 @@ def get_platform_info(): # Not all platforms support `cpu_affinity`, which is why # we check first.
result['affinity'] = p.cpu_affinity() + else: + result['affinity'] = 'Unsupported Op' except Exception: # cpu_num() only available on linux (and BSD systems), so this will # throw an exception on other platforms - pass + result['core_id'] = 'Unsupported Op' + result['affinity'] = 'Unsupported Op' return result From a2e66d661f81b316ecae2948d24fd4caeeeeacf3 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 15:58:11 -0500 Subject: [PATCH 08/34] Added variables to template.conf --- .../024-aggregated-compute-ensemble/template.conf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/template.conf b/examples-proposed/024-aggregated-compute-ensemble/template.conf index 7703984c..4e3b9909 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/template.conf +++ b/examples-proposed/024-aggregated-compute-ensemble/template.conf @@ -34,7 +34,8 @@ SIMULATION_MODE = NORMAL SCRIPT = $PWD/instance_component.py MODULE = # These are the variables that will have values substituted for each instance. - # The ? are important to indicate that these are to be filled in. - base_x = ? - base_y = ? - word = ? 
+ alpha = + L = + T_final = + Nx = + Nt = From 0299888549a83abd0d1aa164f27a1885c82f791a Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 15:58:36 -0500 Subject: [PATCH 09/34] Added randomly generated input to CSV --- .../024-aggregated-compute-ensemble/values.csv | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/values.csv b/examples-proposed/024-aggregated-compute-ensemble/values.csv index e337af07..c1d838d8 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/values.csv +++ b/examples-proposed/024-aggregated-compute-ensemble/values.csv @@ -1,4 +1,12 @@ -instance_component:A, instance_component:B, instance_component:C -3, 2.34, bar -2, 5.82, baz -4, 0.1, quux +instance_component:alpha, instance_component:L, instance_component:T_final, Nx, Nt +0.1, 1.0, 1.0, 50, 100 +0.14, 1.70, 5.38, 95, 85 +0.98, 2.56, 5.74, 57, 60 +0.66, 4.84, 2.64, 95, 91 +0.68, 4.72, 2.76, 78, 71 +0.96, 5.91, 3.44, 70, 68 +0.11, 3.89, 6.76, 73, 64 +0.15, 4.06, 1.69, 35, 50 +0.80, 9.50, 9.16, 2, 91 +0.07, 6.77, 5.88, 14, 30 +0.61, 9.36, 6.56, 91, 71 From 785bef241989526aadca70878c34485d59b2c808 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Thu, 4 Dec 2025 15:59:06 -0500 Subject: [PATCH 10/34] Now write out instance parameter values to CSV --- .../instance_component.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 8ab5c3e4..6b5516eb 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -14,7 +14,7 @@ from ipsframework.resourceHelper import get_platform_info -def generate_fake_data(timestamp: float, base_x: float, base_y: float, word: str) -> dict[str, Any]: +def 
generate_synthetic_data(timestamp: float, base_x: float, base_y: float, word: str) -> dict[str, Any]: # From code Lance originally wrote for a different example and will # be replaced. x_data = [] @@ -55,7 +55,7 @@ def step(self, timestamp: float = 0.0, **keywords): # generate some fake data and save it data_fname = f'generated_{timestamp}.json' - data = generate_fake_data(timestamp, float(self.base_x), float(self.base_y), self.word) + data = generate_synthetic_data(timestamp, float(self.base_x), float(self.base_y), self.word) with open(data_fname, 'w') as fd: json.dump(data, fd) @@ -64,9 +64,19 @@ def step(self, timestamp: float = 0.0, **keywords): run_env = get_platform_info() with open(stats_fname, 'w') as f: + # Write run-time stats to a CSV as well as the runtime parameters + # specific to this instance. writer = csv.writer(f) - writer.writerow(['instance', 'executable', 'hostname', 'pid', 'core', 'start', 'end']) - writer.writerow([instance_id, sys.argv[0], run_env['hostname'], run_env['pid'], run_env['core_id'], start, time()]) + writer.writerow( + ['instance', 'executable', 'hostname', 'pid', 'core', + 'affinity', + 'alpha', 'L', 'T_final', 'Nx', 'Nt', + 'start', 'end']) + writer.writerow([instance_id, sys.argv[0], run_env['hostname'], + run_env['pid'], run_env['core_id'], + run_env['affinity'], + self.alpha, self.L, self.T_final, self.Nx, self.Nt, + start, time()]) # TODO temporarily commenting this out until the actual # example is ready to consider adding data files to the portal. 
This From 8d24bcf16e60b4cfde8401ddc7ae2ddb354bcbb3 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:03:16 -0500 Subject: [PATCH 11/34] Correcting platform file --- .../024-aggregated-compute-ensemble/perlmutter.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm index c889a15c..b2e3127a 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm +++ b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm @@ -24,6 +24,6 @@ echo "Using MPI_STATS_EXEC: ${MPI_STATS_EXEC}" # The 2>&1 binds stderr to stdout so that both are captured in the tee log file. The # `tee` command allows you to see the output on the terminal as well as save it. -ips.py --simulation=ensemble.conf --platform=perlmutter.conf \ +ips.py --simulation=ensemble.conf --platform=platform.conf \ 2>&1 | tee ${SLURM_JOBID}_ips.log From e0ccf96f4d9c95a02cdf13ce4e222d71eab021bb Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:07:15 -0500 Subject: [PATCH 12/34] Entirely new synthetic data function --- .../024-aggregated-compute-ensemble/driver.py | 1 + .../instance_component.py | 81 +++++++++++++------ .../values.csv | 22 ++--- 3 files changed, 70 insertions(+), 34 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/driver.py b/examples-proposed/024-aggregated-compute-ensemble/driver.py index 5bc453e8..7081d6e0 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/driver.py +++ b/examples-proposed/024-aggregated-compute-ensemble/driver.py @@ -13,6 +13,7 @@ class EnsembleDriver(Component): """Kicks off a simple ensemble""" def init(self, timestamp=0.0): + pass # TODO temporarily commenting this out until the actual # example is ready to consider adding a notebook to the portal. 
diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 6b5516eb..4bd9c254 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -10,33 +10,65 @@ from time import time from typing import Any +import numpy as np +import matplotlib.pyplot as plt + from ipsframework import Component from ipsframework.resourceHelper import get_platform_info -def generate_synthetic_data(timestamp: float, base_x: float, base_y: float, word: str) -> dict[str, Any]: - # From code Lance originally wrote for a different example and will - # be replaced. - x_data = [] - y_data = [] +def generate_synthetic_data(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any]: + """ Generate synthetic data to emulate an actual simulation or complex + calculation. + + As a side-effect it will save a plot to the current working directory with + the name `solution.png`. + + :param alpha: thermal diffusivity + :param L: domain length + :param T_final: final time + :param Nx: number of spatial grid points + :param Nt: number of time steps + :returns: x, y, where x is the steps and u the corresponding values + """ + # Discretization + dx = L / (Nx - 1) + dt = T_final / Nt + r = alpha * dt / (dx ** 2) + # + # # Check stability condition for explicit method + if r > 0.5: + print("Warning: Stability condition r <= 0.5 is not met. " + "Results may be inaccurate.") + + # Initialize solution array + u = np.zeros(Nx) + + # Initial condition (e.g., a sine wave) + x = np.linspace(0, L, Nx) + u = np.sin(np.pi * x) + + # Boundary conditions (Dirichlet, e.g., u(0,t) = 0, u(L,t) = 0) These are + # already handled by the initial setup of u=0 at boundaries if the + # initial condition is 0 there. If non-zero, they would be set within the + # time loop. 
+ + # Time evolution + for n in range(Nt): + u_new = np.copy(u) # Create a copy for updating + for i in range(1, Nx - 1): + u_new[i] = u[i] + r * (u[i + 1] - 2 * u[i] + u[i - 1]) + u = u_new - for idx, perm in enumerate(itertools.permutations(word)): - x = timestamp + (idx + 1) * base_x - y = timestamp + (idx + 1) * base_y - for ch_idx, character in enumerate(perm): - shrink_factor = 1 if ch_idx % 2 == 0 else -1 - x = abs(x + ((ord(character) + ch_idx + 1) * shrink_factor)) - y = abs(y + ((ord(character) * (ch_idx + 1)) * shrink_factor)) - x_data.append(x) - y_data.append(y) + # Plotting the result + plt.plot(x, u) + plt.xlabel("Position (x)") + plt.ylabel("Temperature (u)") + plt.title("Solution of 1D Heat Equation") + plt.grid(True) + plt.savefig("solution.png") - return { - 'base_x': base_x, - 'base_y': base_y, - 'word': word, - 'x_data': x_data, - 'y_data': y_data, - } + return {'x': x, 'u': u} class InstanceComponent(Component): @@ -51,11 +83,14 @@ def step(self, timestamp: float = 0.0, **keywords): self.services.info(f'{instance_id}: Start of step of instance component.') # Echo the parameters we're expecting, A, B, and C - self.services.info(f'{instance_id}: instance component parameters: base_x={self.base_x}, base_y={self.base_y}, word={self.word}') + self.services.info(f'{instance_id}: instance component parameters: ' + f'alpha={self.alpha}, L={self.L}, ' + f'T_final={self.T_final}, Nx={self.Nx}, ' + f'Nt={self.Nt}') # generate some fake data and save it data_fname = f'generated_{timestamp}.json' - data = generate_synthetic_data(timestamp, float(self.base_x), float(self.base_y), self.word) + data = generate_synthetic_data(self.alpha, self.L, self.T_final, self.Nx, self.Nt) with open(data_fname, 'w') as fd: json.dump(data, fd) diff --git a/examples-proposed/024-aggregated-compute-ensemble/values.csv b/examples-proposed/024-aggregated-compute-ensemble/values.csv index c1d838d8..134534c8 100644 --- 
a/examples-proposed/024-aggregated-compute-ensemble/values.csv +++ b/examples-proposed/024-aggregated-compute-ensemble/values.csv @@ -1,12 +1,12 @@ instance_component:alpha, instance_component:L, instance_component:T_final, Nx, Nt -0.1, 1.0, 1.0, 50, 100 -0.14, 1.70, 5.38, 95, 85 -0.98, 2.56, 5.74, 57, 60 -0.66, 4.84, 2.64, 95, 91 -0.68, 4.72, 2.76, 78, 71 -0.96, 5.91, 3.44, 70, 68 -0.11, 3.89, 6.76, 73, 64 -0.15, 4.06, 1.69, 35, 50 -0.80, 9.50, 9.16, 2, 91 -0.07, 6.77, 5.88, 14, 30 -0.61, 9.36, 6.56, 91, 71 +0.1, 1.0, 1.0, 50, 1000 +0.14, 1.70, 5.38, 95,850 +0.32, 2.56, 5.74, 57,600 +0.23, 4.84, 2.64, 95,910 +0.28, 4.72, 2.76, 78,710 +0.16, 5.91, 3.44, 70,689 +0.11, 3.89, 6.76, 73,640 +0.15, 4.06, 1.69, 35,500 +0.19, 9.50, 9.16, 2,910 +0.07, 6.77, 5.88, 14,300 +0.26, 9.36, 6.56, 91,710 From b632ba4dda6cb3808c13a621f6f19bd7f0526682 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:10:24 -0500 Subject: [PATCH 13/34] Import params_from_csv utility in EnsembleDriver --- examples-proposed/024-aggregated-compute-ensemble/driver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples-proposed/024-aggregated-compute-ensemble/driver.py b/examples-proposed/024-aggregated-compute-ensemble/driver.py index 7081d6e0..44df756d 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/driver.py +++ b/examples-proposed/024-aggregated-compute-ensemble/driver.py @@ -7,6 +7,7 @@ from pathlib import Path from ipsframework import Component +from ipsframework.ipsutil import params_from_csv class EnsembleDriver(Component): From 8b2b9e88b929ab1787a8e79f4fe39c9982b9b653 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:24:59 -0500 Subject: [PATCH 14/34] Repaired header --- examples-proposed/024-aggregated-compute-ensemble/values.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/values.csv b/examples-proposed/024-aggregated-compute-ensemble/values.csv index 
134534c8..e70a30a9 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/values.csv +++ b/examples-proposed/024-aggregated-compute-ensemble/values.csv @@ -1,4 +1,4 @@ -instance_component:alpha, instance_component:L, instance_component:T_final, Nx, Nt +instance_component:alpha, instance_component:L, instance_component:T_final,instance_component:Nx,instance_column:Nt 0.1, 1.0, 1.0, 50, 1000 0.14, 1.70, 5.38, 95,850 0.32, 2.56, 5.74, 57,600 From 8de6630c1e3bbf1facbcd70184d9a846f4e56260 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 09:27:29 -0800 Subject: [PATCH 15/34] Fixed column header --- examples-proposed/022-tasks-and-ensembles/ensemble.conf | 2 +- .../022-tasks-and-ensembles/instance_component.py | 7 ++++++- examples-proposed/022-tasks-and-ensembles/template.conf | 2 +- .../024-aggregated-compute-ensemble/values.csv | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples-proposed/022-tasks-and-ensembles/ensemble.conf b/examples-proposed/022-tasks-and-ensembles/ensemble.conf index 19ca57b5..78755433 100644 --- a/examples-proposed/022-tasks-and-ensembles/ensemble.conf +++ b/examples-proposed/022-tasks-and-ensembles/ensemble.conf @@ -13,7 +13,7 @@ SIMULATION_MODE = NORMAL CLASS = DRIVER SUB_CLASS = NAME = EnsembleDriver - NPROC = 1 + NPROC = 20 BIN_PATH = INPUT_FILES = OUTPUT_FILES = diff --git a/examples-proposed/022-tasks-and-ensembles/instance_component.py b/examples-proposed/022-tasks-and-ensembles/instance_component.py index e1a5cc65..1eb8c32d 100644 --- a/examples-proposed/022-tasks-and-ensembles/instance_component.py +++ b/examples-proposed/022-tasks-and-ensembles/instance_component.py @@ -11,6 +11,11 @@ class InstanceComponent(Component): def step(self, timestamp: float = 0.0, **keywords): + if 'HWLOC_XMLFILE' in os.environ: + self.services.warning(f'HWLOC_XMLfile still set!') + else: + self.services.info('HWLOC_XMLFILE is not set') + # ENSEMBLE_INSTANCE is a special IPS variable that contains the # string 
uniquely identifying this instance. Each instance will have # the `run_ensemble()` `name` argument prepended to a unique number @@ -37,7 +42,7 @@ def step(self, timestamp: float = 0.0, **keywords): '-o', 'stats.csv'] cmd = str(mpi_executable) + ' ' + ' '.join(args) try: - run_id = self.services.launch_task(nproc=1, + run_id = self.services.launch_task(nproc=5, working_dir=working_dir, binary=cmd) except Exception as e: diff --git a/examples-proposed/022-tasks-and-ensembles/template.conf b/examples-proposed/022-tasks-and-ensembles/template.conf index 146b16e5..fabf351b 100644 --- a/examples-proposed/022-tasks-and-ensembles/template.conf +++ b/examples-proposed/022-tasks-and-ensembles/template.conf @@ -27,7 +27,7 @@ SIMULATION_MODE = NORMAL CLASS = WORKER SUB_CLASS = NAME = InstanceComponent - NPROC = 1 + NPROC = 2 BIN_PATH = INPUT_FILES = OUTPUT_FILES = diff --git a/examples-proposed/024-aggregated-compute-ensemble/values.csv b/examples-proposed/024-aggregated-compute-ensemble/values.csv index e70a30a9..82ced66b 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/values.csv +++ b/examples-proposed/024-aggregated-compute-ensemble/values.csv @@ -1,4 +1,4 @@ -instance_component:alpha, instance_component:L, instance_component:T_final,instance_component:Nx,instance_column:Nt +instance_component:alpha, instance_component:L, instance_component:T_final,instance_component:Nx,instance_component:Nt 0.1, 1.0, 1.0, 50, 1000 0.14, 1.70, 5.38, 95,850 0.32, 2.56, 5.74, 57,600 From 093366eb1733e676c092dc9002aeb239b928d29f Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:41:13 -0500 Subject: [PATCH 16/34] Ensuring types are correct for test func --- .../024-aggregated-compute-ensemble/instance_component.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 4bd9c254..8128179b 100644 
--- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -90,7 +90,11 @@ def step(self, timestamp: float = 0.0, **keywords): # generate some fake data and save it data_fname = f'generated_{timestamp}.json' - data = generate_synthetic_data(self.alpha, self.L, self.T_final, self.Nx, self.Nt) + data = generate_synthetic_data(float(self.alpha), + float(self.L), + float(self.T_final), + int(self.Nx), + int(self.Nt)) with open(data_fname, 'w') as fd: json.dump(data, fd) From d7e2de71e53f0cd1d5ddbd8814e58fbfc6e35cd4 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 12:46:02 -0500 Subject: [PATCH 17/34] Convert numpy arrays to lists in return statement --- .../024-aggregated-compute-ensemble/instance_component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 8128179b..d03e1720 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -68,7 +68,7 @@ def generate_synthetic_data(alpha: float, L:float, T_final:float, Nx:int, Nt:int plt.grid(True) plt.savefig("solution.png") - return {'x': x, 'u': u} + return {'x': x.tolist(), 'u': u.tolist()} class InstanceComponent(Component): From cde2913e2471629a1c8a212eec7972ce140d170b Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 13:57:38 -0500 Subject: [PATCH 18/34] Enabling portal support --- .../024-aggregated-compute-ensemble/ensemble.conf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf b/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf index 4b01f9a1..b01655f0 100644 --- 
a/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf +++ b/examples-proposed/024-aggregated-compute-ensemble/ensemble.conf @@ -6,7 +6,8 @@ SIMULATION_MODE = NORMAL INPUT_DIR = $PWD/input_dir/ -PORTAL_URL=https://lb.ipsportal.development.svc.spin.nersc.org +USE_PORTAL = True +PORTAL_URL = https://lb.ipsportal.development.svc.spin.nersc.org # do not commit actual PORTAL_API_KEY value to version control, best to set as an environment variable #PORTAL_API_KEY=changeme From c1d2aa86ce8b00929070da2228b0ebf64b0fc037 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 14:33:17 -0500 Subject: [PATCH 19/34] Cleaning up dead comments and code. --- .../024-aggregated-compute-ensemble/driver.py | 13 +++++-------- .../instance_component.py | 2 +- .../perlmutter.slurm | 5 ----- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/driver.py b/examples-proposed/024-aggregated-compute-ensemble/driver.py index 44df756d..dd0011bc 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/driver.py +++ b/examples-proposed/024-aggregated-compute-ensemble/driver.py @@ -30,11 +30,10 @@ def step(self, timestamp=0.0): # different instances. variables = params_from_csv(self.config['PARAMETER_FILE']) - # This is the IPS configuration file for the instances that looks like - # a regular configuration file except there are slots for the - # 'base_x', 'base_y', - # and 'word' for variable substitution. 'TEMPLATE' is specified in the - # config file section for this driver. + # This is the IPS configuration file for the instances that looks + # like a regular configuration file except there are slots for the + # variables (e.g., 'alpha', 'T_final', etc.). 'TEMPLATE' is + # specified in the config file section for this driver. 
template = Path(self.config['TEMPLATE']) self.services.info(f'Using template config file {template}') @@ -45,9 +44,7 @@ def step(self, timestamp=0.0): # Now spin up and run the instances. This function will return a list # with each list element corresponding to an instance. You can use # this information to find the specific instance run directory for a - # given set of variables. E.g., the instance corresponding to - # {'base_x' : 2, 'base_y' : 5.82, 'word' : 'baz'} is probably found - # in the `INSTANCE_` subdirectory. + # given set of variables. # # The "name" parameter must be unique for each ensemble within a run, # and will be used as an identifier on the Portal. diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index d03e1720..6e85dc09 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -35,7 +35,7 @@ def generate_synthetic_data(alpha: float, L:float, T_final:float, Nx:int, Nt:int dx = L / (Nx - 1) dt = T_final / Nt r = alpha * dt / (dx ** 2) - # + # # Check stability condition for explicit method if r > 0.5: print("Warning: Stability condition r <= 0.5 is not met. " diff --git a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm index b2e3127a..558e3d2a 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm +++ b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm @@ -17,11 +17,6 @@ module load PrgEnv-gnu openmpi python # Again, this assumes that there exists a conda environment named "ips". 
conda activate ips -# This sets the absolute path to the mpi_stats.py script, which is used by IPS to -export MPI_STATS_EXEC=$(realpath ./mpi_stats.py) - -echo "Using MPI_STATS_EXEC: ${MPI_STATS_EXEC}" - # The 2>&1 binds stderr to stdout so that both are captured in the tee log file. The # `tee` command allows you to see the output on the terminal as well as save it. ips.py --simulation=ensemble.conf --platform=platform.conf \ From bcac255d8f7064bf7ab63652503b69b050ea5a52 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Fri, 5 Dec 2025 14:38:44 -0500 Subject: [PATCH 20/34] Added matplotlib as an *optional* dependency since now at least one example generates figures. --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e0cbd6f9..32ae57af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,11 +45,13 @@ dependencies = [ "dask", #"dask==2022.10.0", #"dask==2023.12.1", # TODO need a version compatible with 3.8 - "distributed", + "distributed" ] [project.optional-dependencies] -docs = ["sphinx", "sphinx_rtd_theme"] +# sphinx for making docs locally; matplotlib because some of the +# examples generate figures. 
+docs = ["sphinx", "sphinx_rtd_theme", "matplotlib"] [project.scripts] "ips.py" = "ipsframework.ips:main" From bdcae0bd9e13cc31943cb6125b6762074a687b3e Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 13:57:10 -0500 Subject: [PATCH 21/34] Move synthetic data generation to standalone executable --- .../gen_data.py | 81 ++++++++++++++++++ .../instance_component.py | 84 ++++++------------- 2 files changed, 106 insertions(+), 59 deletions(-) create mode 100644 examples-proposed/024-aggregated-compute-ensemble/gen_data.py diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py new file mode 100644 index 00000000..7c6d1895 --- /dev/null +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" + Used to generate synthetic data as an example. +""" +import argparse +from typing import Any +import json +import numpy as np +import matplotlib.pyplot as plt + + +def main(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any]: + """ Generate synthetic data to emulate an actual simulation or complex + calculation. + + As a side-effect it will save a plot to the current working directory with + the name `solution.png`. + + :param alpha: thermal diffusivity + :param L: domain length + :param T_final: final time + :param Nx: number of spatial grid points + :param Nt: number of time steps + :returns: x, y, where x is the steps and u the corresponding values + """ + # Discretization + dx = L / (Nx - 1) + dt = T_final / Nt + r = alpha * dt / (dx ** 2) + + # # Check stability condition for explicit method + if r > 0.5: + print("Warning: Stability condition r <= 0.5 is not met. 
" + "Results may be inaccurate.") + + # Initialize solution array + u = np.zeros(Nx) + + # Initial condition (e.g., a sine wave) + x = np.linspace(0, L, Nx) + u = np.sin(np.pi * x) + + # Boundary conditions (Dirichlet, e.g., u(0,t) = 0, u(L,t) = 0) These are + # already handled by the initial setup of u=0 at boundaries if the + # initial condition is 0 there. If non-zero, they would be set within the + # time loop. + + # Time evolution + for n in range(Nt): + u_new = np.copy(u) # Create a copy for updating + for i in range(1, Nx - 1): + u_new[i] = u[i] + r * (u[i + 1] - 2 * u[i] + u[i - 1]) + u = u_new + + # Plotting the result + plt.plot(x, u) + plt.xlabel("Position (x)") + plt.ylabel("Temperature (u)") + plt.title("Solution of 1D Heat Equation") + plt.grid(True) + plt.savefig("solution.png") + + return {'x': x.tolist(), 'u': u.tolist()} + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate synthetic data to ' + 'emulate an actual simulation or complex') + parser.add_argument('--alpha', type=float, default=1.0,) + parser.add_argument('--L', type=float, default=1.0,) + parser.add_argument('--T_final', type=float, default=1.0,) + parser.add_argument('--Nx', type=int, default=100,) + parser.add_argument('--Nt', type=int, default=100,) + + args = parser.parse_args() + + data = main(args.alpha, args.L, args.T_final, args.Nx, args.Nt) + + with open('solution.json', 'w') as f: + json.dump(data, f) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 6e85dc09..c5723527 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -4,71 +4,28 @@ """ import csv -import itertools -import json import sys from time import time from typing import Any - -import numpy as np -import matplotlib.pyplot as plt +from pathlib import Path 
from ipsframework import Component from ipsframework.resourceHelper import get_platform_info -def generate_synthetic_data(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any]: - """ Generate synthetic data to emulate an actual simulation or complex - calculation. - - As a side-effect it will save a plot to the current working directory with - the name `solution.png`. +def create_cmd(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> list[Any]: + """ create the command to run the external data generator :param alpha: thermal diffusivity :param L: domain length :param T_final: final time :param Nx: number of spatial grid points :param Nt: number of time steps - :returns: x, y, where x is the steps and u the corresponding values + :returns: list of command line arguments to be executed in step() """ - # Discretization - dx = L / (Nx - 1) - dt = T_final / Nt - r = alpha * dt / (dx ** 2) - - # # Check stability condition for explicit method - if r > 0.5: - print("Warning: Stability condition r <= 0.5 is not met. " - "Results may be inaccurate.") - - # Initialize solution array - u = np.zeros(Nx) - - # Initial condition (e.g., a sine wave) - x = np.linspace(0, L, Nx) - u = np.sin(np.pi * x) - - # Boundary conditions (Dirichlet, e.g., u(0,t) = 0, u(L,t) = 0) These are - # already handled by the initial setup of u=0 at boundaries if the - # initial condition is 0 there. If non-zero, they would be set within the - # time loop. 
- - # Time evolution - for n in range(Nt): - u_new = np.copy(u) # Create a copy for updating - for i in range(1, Nx - 1): - u_new[i] = u[i] + r * (u[i + 1] - 2 * u[i] + u[i - 1]) - u = u_new - - # Plotting the result - plt.plot(x, u) - plt.xlabel("Position (x)") - plt.ylabel("Temperature (u)") - plt.title("Solution of 1D Heat Equation") - plt.grid(True) - plt.savefig("solution.png") - - return {'x': x.tolist(), 'u': u.tolist()} + cmd = ['gen_data.py', '--alpha', alpha, '--L', L, '--T_final', T_final, + '--Nx', Nx, '--Nt', Nt] + return cmd class InstanceComponent(Component): @@ -88,15 +45,24 @@ def step(self, timestamp: float = 0.0, **keywords): f'T_final={self.T_final}, Nx={self.Nx}, ' f'Nt={self.Nt}') - # generate some fake data and save it - data_fname = f'generated_{timestamp}.json' - data = generate_synthetic_data(float(self.alpha), - float(self.L), - float(self.T_final), - int(self.Nx), - int(self.Nt)) - with open(data_fname, 'w') as fd: - json.dump(data, fd) + cmd = create_cmd() + + working_dir = str(Path('.').absolute()) + self.services.info(f'{instance_id}: Launching executable ' + f'in {working_dir}') + run_id = None + try: + run_id = self.services.launch_task(nproc=2, + working_dir=working_dir, + binary=cmd) + except Exception as e: + self.services.critical(f'{instance_id}: Unable to launch ' + f'executable in {working_dir}') + + self.services.wait_task(run_id) # block until done + + self.services.info(f'{instance_id}: Completed MPI executable.') + # Save some per-component stats stats_fname = f'stats_{timestamp}.csv' From e23eeee09c2f0b901a6d12e34ac9bdf446530e68 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:12:58 -0500 Subject: [PATCH 22/34] It helps to pass in the actual parameters --- .../024-aggregated-compute-ensemble/instance_component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py 
b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index c5723527..bae58713 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -45,7 +45,7 @@ def step(self, timestamp: float = 0.0, **keywords): f'T_final={self.T_final}, Nx={self.Nx}, ' f'Nt={self.Nt}') - cmd = create_cmd() + cmd = create_cmd(self.alpha, self.L, self.T_final, self.Nx, self.Nt) working_dir = str(Path('.').absolute()) self.services.info(f'{instance_id}: Launching executable ' From 8e9b73bb1449433db0e9211a166bd6427aa7a619 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:13:15 -0500 Subject: [PATCH 23/34] Expanding README.md signposts to include new examples dirs --- examples-proposed/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples-proposed/README.md b/examples-proposed/README.md index 810cd23e..52f4afff 100644 --- a/examples-proposed/README.md +++ b/examples-proposed/README.md @@ -19,4 +19,9 @@ Note that each example will explicitly need to be installed into its own virtual - `009-task-pool-sync` - showcases a simple example utilizing `dask` to simulate parallelism - `010-adios-example` - meant to be a hello-world example for utilizing ADIOS files with the IPS Portal API. - `020-simple-ensemble` - simple example of how to run an ensemble of simulations -- `020-simple-ensemble-with-portal` - same as the `020-simple-ensemble` example, but with IPS Portal integration. \ No newline at end of file +- `021-ensembles-from-CSV` - +- `022-tasks-and-ensembles` - +- `023-simple-ensemble-with-portal` - +- `024-aggregated-compute-ensemble` - +- `023-simple-ensemble-with-portal` - same as the `020-simple-ensemble` + example, but with IPS Portal integration. 
\ No newline at end of file From b3db297f957125f4c536ea671fa210ca86ccc0b5 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:17:23 -0500 Subject: [PATCH 24/34] Helps to specify the python interpreter if there is no executable set on the standalone --- .../024-aggregated-compute-ensemble/instance_component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index bae58713..4e45a80a 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -23,7 +23,7 @@ def create_cmd(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> list[Any :param Nt: number of time steps :returns: list of command line arguments to be executed in step() """ - cmd = ['gen_data.py', '--alpha', alpha, '--L', L, '--T_final', T_final, + cmd = ['python3', 'gen_data.py', '--alpha', alpha, '--L', L, '--T_final', T_final, '--Nx', Nx, '--Nt', Nt] return cmd From 92e8ca9efa0039a001f3d72d202cdb074115285d Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:20:11 -0500 Subject: [PATCH 25/34] launching tasks expect single command string instead of subprocess-style lists --- .../024-aggregated-compute-ensemble/instance_component.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 4e45a80a..c2fc7ff1 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -52,6 +52,7 @@ def step(self, timestamp: float = 0.0, **keywords): f'in {working_dir}') run_id = None try: + cmd = ' '.join(cmd) # need one big ole string for executing tasks run_id = 
self.services.launch_task(nproc=2, working_dir=working_dir, binary=cmd) From 8532988695bdcf8b64925a12dffe64fb5aa720ea Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:33:18 -0500 Subject: [PATCH 26/34] Reducing resource allocation --- .../024-aggregated-compute-ensemble/instance_component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index c2fc7ff1..19f8a149 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -53,7 +53,7 @@ def step(self, timestamp: float = 0.0, **keywords): run_id = None try: cmd = ' '.join(cmd) # need one big ole string for executing tasks - run_id = self.services.launch_task(nproc=2, + run_id = self.services.launch_task(nproc=1, working_dir=working_dir, binary=cmd) except Exception as e: From 15fbdafc784e181d1892471be475d2551e46dcff Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 14:50:56 -0500 Subject: [PATCH 27/34] Fixed signpost descriptions in README.md --- examples-proposed/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples-proposed/README.md b/examples-proposed/README.md index 52f4afff..85e4bd67 100644 --- a/examples-proposed/README.md +++ b/examples-proposed/README.md @@ -19,9 +19,9 @@ Note that each example will explicitly need to be installed into its own virtual - `009-task-pool-sync` - showcases a simple example utilizing `dask` to simulate parallelism - `010-adios-example` - meant to be a hello-world example for utilizing ADIOS files with the IPS Portal API. 
- `020-simple-ensemble` - simple example of how to run an ensemble of simulations -- `021-ensembles-from-CSV` - -- `022-tasks-and-ensembles` - -- `023-simple-ensemble-with-portal` - -- `024-aggregated-compute-ensemble` - +- `021-ensembles-from-CSV` - ensembles with parameters read from CSV file +- `022-tasks-and-ensembles` - ensembles that submit tasks - `023-simple-ensemble-with-portal` - same as the `020-simple-ensemble` - example, but with IPS Portal integration. \ No newline at end of file + example, but with IPS Portal integration. +- `024-aggregated-compute-ensemble` - ensembles that submit tasks and + perform analytics per instance and for aggregated instances via the portal From 7fc122814fec03f5bb25acca97598acfd18f9795 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Mon, 8 Dec 2025 15:03:55 -0500 Subject: [PATCH 28/34] Add instance parameter to data generation and update output files --- .../gen_data.py | 45 ++++++++++++++---- .../instance_component.py | 47 +++++++------------ 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py index 7c6d1895..e9a29946 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -5,24 +5,28 @@ import argparse from typing import Any import json +import csv import numpy as np import matplotlib.pyplot as plt +from ipsframework.resourceHelper import get_platform_info -def main(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any]: +def main(instance: str, + alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any]: """ Generate synthetic data to emulate an actual simulation or complex calculation. As a side-effect it will save a plot to the current working directory with the name `solution.png`. 
+ :param instance: instance name :param alpha: thermal diffusivity :param L: domain length :param T_final: final time :param Nx: number of spatial grid points :param Nt: number of time steps :returns: x, y, where x is the steps and u the corresponding values - """ + """ # Discretization dx = L / (Nx - 1) dt = T_final / Nt @@ -33,9 +37,6 @@ def main(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any] print("Warning: Stability condition r <= 0.5 is not met. " "Results may be inaccurate.") - # Initialize solution array - u = np.zeros(Nx) - # Initial condition (e.g., a sine wave) x = np.linspace(0, L, Nx) u = np.sin(np.pi * x) @@ -60,13 +61,36 @@ def main(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any] plt.grid(True) plt.savefig("solution.png") + # Save some per-component stats + stats_fname = f'{instance}_stats_{timestamp}.csv' + run_env = get_platform_info() + + with open(stats_fname, 'w') as f: + # Write run-time stats to a CSV as well as the runtime parameters + # specific to this instance. 
+ writer = csv.writer(f) + writer.writerow( + ['instance', 'hostname', 'pid', 'core', + 'affinity', + 'alpha', 'L', 'T_final', 'Nx', 'Nt', + 'start', 'end']) + + writer.writerow([instance_id, run_env['hostname'], + run_env['pid'], run_env['core_id'], + run_env['affinity'], + alpha, L, T_final, Nx, Nt, + start, time()]) + return {'x': x.tolist(), 'u': u.tolist()} if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate synthetic data to ' - 'emulate an actual simulation or complex') + 'emulate an actual simulation ' + 'or complex') + parser.add_argument('--instance', type=str, + help='instance name') parser.add_argument('--alpha', type=float, default=1.0,) parser.add_argument('--L', type=float, default=1.0,) parser.add_argument('--T_final', type=float, default=1.0,) @@ -75,7 +99,12 @@ def main(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> dict[str, Any] args = parser.parse_args() - data = main(args.alpha, args.L, args.T_final, args.Nx, args.Nt) + data = main(args.instance, + args.alpha, args.L, args.T_final, args.Nx, args.Nt) + + file_name = f'{args.instance}_solution.json' + + print(f'Writing to {file_name}') - with open('solution.json', 'w') as f: + with open(file_name, 'w') as f: json.dump(data, f) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index 19f8a149..e0f16191 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -1,21 +1,23 @@ #!/usr/bin/env python3 """ -Component to be stepped in instance -""" +Component to be stepped in instance. -import csv -import sys +This should generate a PNG image, a JSON file, and a CSV file. The first +two are from synthetic data generated from `gen_data.py`. The latter is +also generated from provenance data captured in `gen_data.py`, too. 
+""" +from pathlib import Path from time import time from typing import Any -from pathlib import Path from ipsframework import Component -from ipsframework.resourceHelper import get_platform_info -def create_cmd(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> list[Any]: +def create_cmd(instance: str, alpha: float, L:float, T_final:float, + Nx:int, Nt:int) -> list[Any]: """ create the command to run the external data generator + :parma instance: instance name :param alpha: thermal diffusivity :param L: domain length :param T_final: final time @@ -23,7 +25,8 @@ def create_cmd(alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> list[Any :param Nt: number of time steps :returns: list of command line arguments to be executed in step() """ - cmd = ['python3', 'gen_data.py', '--alpha', alpha, '--L', L, '--T_final', T_final, + cmd = ['python3', 'gen_data.py', '--instance', instance, + '--alpha', alpha, '--L', L, '--T_final', T_final, '--Nx', Nx, '--Nt', Nt] return cmd @@ -45,7 +48,8 @@ def step(self, timestamp: float = 0.0, **keywords): f'T_final={self.T_final}, Nx={self.Nx}, ' f'Nt={self.Nt}') - cmd = create_cmd(self.alpha, self.L, self.T_final, self.Nx, self.Nt) + cmd = create_cmd(instance_id, + self.alpha, self.L, self.T_final, self.Nx, self.Nt) working_dir = str(Path('.').absolute()) self.services.info(f'{instance_id}: Launching executable ' @@ -60,29 +64,10 @@ def step(self, timestamp: float = 0.0, **keywords): self.services.critical(f'{instance_id}: Unable to launch ' f'executable in {working_dir}') - self.services.wait_task(run_id) # block until done - - self.services.info(f'{instance_id}: Completed MPI executable.') - - - # Save some per-component stats - stats_fname = f'stats_{timestamp}.csv' - run_env = get_platform_info() + return_value = self.services.wait_task(run_id) # block until done - with open(stats_fname, 'w') as f: - # Write run-time stats to a CSV as well as the runtime parameters - # specific to this instance. 
- writer = csv.writer(f) - writer.writerow( - ['instance', 'executable', 'hostname', 'pid', 'core', - 'affinity', - 'alpha', 'L', 'T_final', 'Nx', 'Nt', - 'start', 'end']) - writer.writerow([instance_id, sys.argv[0], run_env['hostname'], - run_env['pid'], run_env['core_id'], - run_env['affinity'], - self.alpha, self.L, self.T_final, self.Nx, self.Nt, - start, time()]) + self.services.info(f'{instance_id}: Completed MPI executable with ' + f'return value: {return_value}.') # TODO temporarily commenting this out until the actual # example is ready to consider adding data files to the portal. This From 5a14afaa078d7137a1b95a3165a8716df9e8185b Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Tue, 9 Dec 2025 12:14:53 -0500 Subject: [PATCH 29/34] Add error handling to data generation script --- .../gen_data.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py index e9a29946..e6bbac98 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -6,6 +6,7 @@ from typing import Any import json import csv +from traceback import print_exc import numpy as np import matplotlib.pyplot as plt @@ -86,25 +87,32 @@ def main(instance: str, if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Generate synthetic data to ' - 'emulate an actual simulation ' - 'or complex') - parser.add_argument('--instance', type=str, - help='instance name') - parser.add_argument('--alpha', type=float, default=1.0,) - parser.add_argument('--L', type=float, default=1.0,) - parser.add_argument('--T_final', type=float, default=1.0,) - parser.add_argument('--Nx', type=int, default=100,) - parser.add_argument('--Nt', type=int, default=100,) + try: + parser = argparse.ArgumentParser(description='Generate synthetic data to ' + 'emulate an 
actual simulation ' + 'or complex') + parser.add_argument('--instance', type=str, + help='instance name') + parser.add_argument('--alpha', type=float, default=1.0,) + parser.add_argument('--L', type=float, default=1.0,) + parser.add_argument('--T_final', type=float, default=1.0,) + parser.add_argument('--Nx', type=int, default=100,) + parser.add_argument('--Nt', type=int, default=100,) - args = parser.parse_args() + args = parser.parse_args() - data = main(args.instance, - args.alpha, args.L, args.T_final, args.Nx, args.Nt) + data = main(args.instance, + args.alpha, args.L, args.T_final, args.Nx, args.Nt) - file_name = f'{args.instance}_solution.json' - print(f'Writing to {file_name}') + file_name = f'{args.instance}_solution.json' - with open(file_name, 'w') as f: - json.dump(data, f) + print(f'Writing to {file_name}') + + with open(file_name, 'w') as f: + json.dump(data, f) + + except Exception as e: + print(f'Encountered error: {e}') + print(f'Encountered error: {e}', file=gen_data_error.txt) + print_exc() From 39f7d0c3d44a028329e97fd320b5b4c13fbce224 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Tue, 9 Dec 2025 12:15:54 -0500 Subject: [PATCH 30/34] Repaired indentation --- .../024-aggregated-compute-ensemble/gen_data.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py index e6bbac98..005d9c40 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -101,14 +101,11 @@ def main(instance: str, args = parser.parse_args() - data = main(args.instance, - args.alpha, args.L, args.T_final, args.Nx, args.Nt) - + data = main(args.instance, + args.alpha, args.L, args.T_final, args.Nx, args.Nt) file_name = f'{args.instance}_solution.json' - print(f'Writing to {file_name}') - with open(file_name, 'w') as f: json.dump(data, f) From 
c28e664f2e23b40c5ff38750ba5efecd16335403 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Tue, 9 Dec 2025 12:17:54 -0500 Subject: [PATCH 31/34] Fixed a number of errors. --- .../024-aggregated-compute-ensemble/gen_data.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py index 005d9c40..a60a1efe 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -6,6 +6,7 @@ from typing import Any import json import csv +from time import time from traceback import print_exc import numpy as np import matplotlib.pyplot as plt @@ -28,6 +29,8 @@ def main(instance: str, :param Nt: number of time steps :returns: x, y, where x is the steps and u the corresponding values """ + start = time() + # Discretization dx = L / (Nx - 1) dt = T_final / Nt @@ -63,7 +66,7 @@ def main(instance: str, plt.savefig("solution.png") # Save some per-component stats - stats_fname = f'{instance}_stats_{timestamp}.csv' + stats_fname = f'{instance}_stats.csv' run_env = get_platform_info() with open(stats_fname, 'w') as f: @@ -76,7 +79,7 @@ def main(instance: str, 'alpha', 'L', 'T_final', 'Nx', 'Nt', 'start', 'end']) - writer.writerow([instance_id, run_env['hostname'], + writer.writerow([instance, run_env['hostname'], run_env['pid'], run_env['core_id'], run_env['affinity'], alpha, L, T_final, Nx, Nt, @@ -111,5 +114,6 @@ def main(instance: str, except Exception as e: print(f'Encountered error: {e}') - print(f'Encountered error: {e}', file=gen_data_error.txt) + with open('gen_data_error.txt', 'w') as err_file: + print(f'Encountered error: {e}', file=err_file) print_exc() From 311ac854bf7b530fb8ca1ca89fc9836401bb3e15 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Tue, 9 Dec 2025 12:50:56 -0500 Subject: [PATCH 32/34] Specifying path to data generator script. 
--- .../instance_component.py | 10 ++++++---- .../024-aggregated-compute-ensemble/template.conf | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py index e0f16191..3e1940b7 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/instance_component.py +++ b/examples-proposed/024-aggregated-compute-ensemble/instance_component.py @@ -13,11 +13,12 @@ from ipsframework import Component -def create_cmd(instance: str, alpha: float, L:float, T_final:float, +def create_cmd(instance: str, path: Path, alpha: float, L:float, T_final:float, Nx:int, Nt:int) -> list[Any]: """ create the command to run the external data generator - :parma instance: instance name + :param instance: instance name + :param path: path to data generator script directory :param alpha: thermal diffusivity :param L: domain length :param T_final: final time @@ -25,7 +26,8 @@ def create_cmd(instance: str, alpha: float, L:float, T_final:float, :param Nt: number of time steps :returns: list of command line arguments to be executed in step() """ - cmd = ['python3', 'gen_data.py', '--instance', instance, + executable = f'{path!s}/gen_data.py' + cmd = ['python3', executable, '--instance', instance, '--alpha', alpha, '--L', L, '--T_final', T_final, '--Nx', Nx, '--Nt', Nt] return cmd @@ -48,7 +50,7 @@ def step(self, timestamp: float = 0.0, **keywords): f'T_final={self.T_final}, Nx={self.Nx}, ' f'Nt={self.Nt}') - cmd = create_cmd(instance_id, + cmd = create_cmd(instance_id, Path(self.BIN_PATH), self.alpha, self.L, self.T_final, self.Nx, self.Nt) working_dir = str(Path('.').absolute()) diff --git a/examples-proposed/024-aggregated-compute-ensemble/template.conf b/examples-proposed/024-aggregated-compute-ensemble/template.conf index 4e3b9909..1e699128 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/template.conf +++ 
b/examples-proposed/024-aggregated-compute-ensemble/template.conf @@ -28,7 +28,7 @@ SIMULATION_MODE = NORMAL SUB_CLASS = NAME = InstanceComponent NPROC = 1 - BIN_PATH = + BIN_PATH = $PWD INPUT_FILES = OUTPUT_FILES = SCRIPT = $PWD/instance_component.py From dbf10977160a9b6ea751a5ba0a83685e4b845475 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Tue, 9 Dec 2025 13:03:14 -0500 Subject: [PATCH 33/34] Name the output figure consistently with other output file names --- examples-proposed/024-aggregated-compute-ensemble/gen_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py index a60a1efe..f8bd70b3 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/gen_data.py +++ b/examples-proposed/024-aggregated-compute-ensemble/gen_data.py @@ -63,7 +63,7 @@ def main(instance: str, plt.ylabel("Temperature (u)") plt.title("Solution of 1D Heat Equation") plt.grid(True) - plt.savefig("solution.png") + plt.savefig(f"{instance}_solution.png") # Save some per-component stats stats_fname = f'{instance}_stats.csv' From d72d230d466e4ea94a236bc84576288f42e56a64 Mon Sep 17 00:00:00 2001 From: Mark Coletti Date: Wed, 10 Dec 2025 14:26:20 -0500 Subject: [PATCH 34/34] Just a minor comment tweak --- .../024-aggregated-compute-ensemble/perlmutter.slurm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm index 558e3d2a..b99cf266 100644 --- a/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm +++ b/examples-proposed/024-aggregated-compute-ensemble/perlmutter.slurm @@ -6,7 +6,7 @@ # free to create your own environment with a different name; but, be sure to # change the "conda activate" line below accordingly if you do so. 
# -#SBATCH --account=atom # REPLACE WITH YOUR PROJECT ID +#SBATCH --account=atom # REPLACE "atom" WITH YOUR PROJECT ID #SBATCH --constraint=cpu #SBATCH --nodes=1 #SBATCH --time=5 @@ -17,6 +17,9 @@ module load PrgEnv-gnu openmpi python # Again, this assumes that there exists a conda environment named "ips". conda activate ips +# Set the API key and portal URL +source /global/common/software/atom/ips-portal/credentials/ips-portal-development + # The 2>&1 binds stderr to stdout so that both are captured in the tee log file. The # `tee` command allows you to see the output on the terminal as well as save it. ips.py --simulation=ensemble.conf --platform=platform.conf \