diff --git a/.gitignore b/.gitignore
index 0df53f80e..50d1f64c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ __pycache__
 /*
 # except
+!ci
 !cmake
 !docs
 !docs_src
diff --git a/.python_package/.gitignore b/.python_package/.gitignore
index c0c05eb2b..287461ae6 100644
--- a/.python_package/.gitignore
+++ b/.python_package/.gitignore
@@ -4,3 +4,4 @@ dist/
 setup.cfg
 *.egg-info/
 src/gridtools_cpp/data
+build/
diff --git a/ci/base.Dockerfile b/ci/base.Dockerfile
new file mode 100644
index 000000000..6682fe564
--- /dev/null
+++ b/ci/base.Dockerfile
@@ -0,0 +1,63 @@
+ARG UBUNTU_VERSION=24.04
+ARG CUDA_VERSION
+FROM docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -qq && \
+    apt-get install -qq -y --no-install-recommends \
+    gfortran \
+    g++ \
+    gcc \
+    strace \
+    build-essential \
+    tar \
+    wget \
+    curl \
+    cmake \
+    ca-certificates \
+    zlib1g-dev \
+    libssl-dev \
+    libbz2-dev \
+    libsqlite3-dev \
+    llvm \
+    libncurses5-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libffi-dev \
+    liblzma-dev \
+    libreadline-dev \
+    python3-dev \
+    python3-pip \
+    git \
+    rustc \
+    htop && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG MPICH_VERSION=3.3.2
+ARG MPICH_PATH=/usr/local
+RUN wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \
+    tar -xzf mpich-${MPICH_VERSION}.tar.gz && \
+    cd mpich-${MPICH_VERSION} && \
+    ./configure \
+        --disable-fortran \
+        --prefix=$MPICH_PATH && \
+    make install -j32 && \
+    rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION}
+RUN echo "${MPICH_PATH}/lib" >> /etc/ld.so.conf.d/cscs.conf && ldconfig
+
+ENV CXX=${MPICH_PATH}/bin/mpicxx
+ENV CC=${MPICH_PATH}/bin/mpicc
+
+RUN wget --quiet https://archives.boost.io/release/1.85.0/source/boost_1_85_0.tar.gz && \
+    echo be0d91732d5b0cc6fbb275c7939974457e79b54d6f07ce2e3dfdd68bef883b0b boost_1_85_0.tar.gz > boost_hash.txt && \
+    sha256sum -c boost_hash.txt && \
+    tar xzf boost_1_85_0.tar.gz && \
+    mv boost_1_85_0/boost /usr/local/include/ && \
+    rm boost_1_85_0.tar.gz boost_hash.txt
+ENV BOOST_ROOT /usr/local/
+
+ENV CUDA_HOME /usr/local/cuda
+ENV CUDA_ARCH=${CUDA_ARCH}
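The Boost archive above is pinned by its SHA-256 digest and verified with `sha256sum -c` before unpacking. For reference only, a minimal Python sketch of the same verification, reusing the file name and digest from the Dockerfile (the helper name `sha256_of` is illustrative and not part of this change):

```python
import hashlib

EXPECTED_SHA256 = "be0d91732d5b0cc6fbb275c7939974457e79b54d6f07ce2e3dfdd68bef883b0b"  # from the Dockerfile

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in chunks so large tarballs do not need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

if __name__ == "__main__":
    actual = sha256_of("boost_1_85_0.tar.gz")
    if actual != EXPECTED_SHA256:
        raise SystemExit(f"boost_1_85_0.tar.gz: checksum mismatch ({actual})")
    print("boost_1_85_0.tar.gz: OK")
```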
diff --git a/ci/build.Dockerfile b/ci/build.Dockerfile
new file mode 100644
index 000000000..e49c74f03
--- /dev/null
+++ b/ci/build.Dockerfile
@@ -0,0 +1,23 @@
+ARG BASE_IMAGE
+FROM $BASE_IMAGE
+
+COPY . /gridtools
+
+ARG BUILD_TYPE
+
+ENV GTRUN_BUILD_COMMAND='make -j 32'
+ENV GTCMAKE_Boost_NO_BOOST_CMAKE=ON
+ENV GTCMAKE_Boost_NO_SYSTEM_PATHS=ON
+ENV GTCMAKE_GT_TESTS_REQUIRE_FORTRAN_COMPILER=ON
+ENV GTCMAKE_GT_TESTS_REQUIRE_C_COMPILER=ON
+ENV GTCMAKE_GT_TESTS_REQUIRE_OpenMP=ON
+ENV GTCMAKE_GT_TESTS_REQUIRE_GPU=ON
+ENV GTCMAKE_GT_TESTS_MPI_WITH_MPI_EXECUTABLE=OFF
+ENV GTCMAKE_GT_TESTS_REQUIRE_Python=ON
+ENV GT_ENABLE_STENCIL_DUMP=ON
+ENV GTCMAKE_CMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON
+
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:${PATH}"
+
+RUN uv run /gridtools/pyutils/driver.py -v build -b ${BUILD_TYPE} -o build -i install -t install
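The build image configures everything through `GTCMAKE_*` environment variables; `cmake_args()` in `pyutils/pyutils/env.py` (further down in this diff) strips the prefix and turns each variable into a `-D` flag for CMake. A standalone sketch of that mapping, assuming only what the diff shows (the wrapper name `gtcmake_to_cmake_args` and the sample values are illustrative):

```python
import os

def gtcmake_to_cmake_args(environ: dict[str, str] | None = None) -> list[str]:
    """Strip the GTCMAKE_ prefix and emit -D<name>:<type>=<value> flags."""
    environ = os.environ if environ is None else environ
    args = []
    for key, value in environ.items():
        if not key.startswith("GTCMAKE_"):
            continue
        name = key[len("GTCMAKE_"):]
        kind = "BOOL" if value.strip().upper() in ("ON", "OFF") else "STRING"
        args.append(f"-D{name}:{kind}={value}")
    return args

if __name__ == "__main__":
    example = {
        "GTCMAKE_GT_TESTS_REQUIRE_GPU": "ON",
        "GTCMAKE_CMAKE_BUILD_TYPE": "Release",
    }
    print(gtcmake_to_cmake_args(example))
    # ['-DGT_TESTS_REQUIRE_GPU:BOOL=ON', '-DCMAKE_BUILD_TYPE:STRING=Release']
```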
diff --git a/ci/cscs.yml b/ci/cscs.yml
new file mode 100644
index 000000000..8e01c9efa
--- /dev/null
+++ b/ci/cscs.yml
@@ -0,0 +1,81 @@
+include:
+  - remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml"
+
+stages:
+  - baseimage
+  - build
+  - test
+
+.build_baseimage:
+  stage: baseimage
+  # We create a tag that depends on the SHA of ci/base.Dockerfile, so that
+  # a new base image is only built when this file changes.
+  # If more dependency files should influence the tag name of the base container
+  # image, they can be added here as well.
+  # Since the base image name is runtime-dependent, we carry its value over to
+  # the following jobs via a dotenv file.
+  before_script:
+    # Include the build arguments in the hash since we use a parameterized Dockerfile.
+    - DOCKER_TAG=`echo "$(cat $DOCKERFILE) $DOCKER_BUILD_ARGS" | sha256sum | head -c 16`
+    - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/$ARCH/base/gridtools-ci:$DOCKER_TAG
+    - echo "BASE_IMAGE=$PERSIST_IMAGE_NAME" >> build.env
+  artifacts:
+    reports:
+      dotenv: build.env
+  variables:
+    DOCKERFILE: ci/base.Dockerfile
+    # Change to 'always' to force a rebuild even if the target tag already exists ('if-not-exists' is the default, so the variable could also be omitted).
+    CSCS_REBUILD_POLICY: if-not-exists
+    DOCKER_BUILD_ARGS: '["CUDA_VERSION=$CUDA_VERSION", "UBUNTU_VERSION=$UBUNTU_VERSION"]'
+build_baseimage_aarch64:
+  extends: [.container-builder-cscs-gh200, .build_baseimage]
+  variables:
+    CUDA_VERSION: 12.6.2
+    CUDA_ARCH: sm_90
+    UBUNTU_VERSION: 24.04
+    SLURM_TIMELIMIT: 10
+
+
+.build_image:
+  stage: build
+  variables:
+    # Make sure we use a unique name here; otherwise we could create a race condition when multiple pipelines
+    # are running.
+    PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/gridtools/gridtools-ci:$CI_COMMIT_SHA
+    DOCKERFILE: ci/build.Dockerfile
+    DOCKER_BUILD_ARGS: '["BASE_IMAGE=${BASE_IMAGE}", "BUILD_TYPE=release"]'
+build_image_aarch64:
+  extends: [.container-builder-cscs-gh200, .build_image]
+  variables:
+    SLURM_TIMELIMIT: 10
+
+.test_helper:
+  stage: test
+  image: $CSCS_REGISTRY_PATH/public/$ARCH/gridtools/gridtools-ci:$CI_COMMIT_SHA
+  variables:
+    GTRUN_WITH_SLURM: False # since we are already in a SLURM job
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_TIMELIMIT: 10
+    CSCS_CUDA_MPS: 0
+
+test_aarch64:
+  extends: [.container-runner-daint-gh200, .test_helper]
+  script:
+    - cd /build && ctest -LE mpi --output-on-failure
+  variables:
+    SLURM_NTASKS: 1
+
+test_aarch64_mpi:
+  extends: [.container-runner-daint-gh200, .test_helper]
+  script:
+    - export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
+    - export LD_PRELOAD=/usr/lib64/libmpi_gtl_cuda.so
+    - cd /build && ctest -L mpi --output-on-failure
+  variables:
+    NVIDIA_VISIBLE_DEVICES: all
+    SLURM_NTASKS: 4
+    SLURM_GPUS_PER_TASK: 1
+    MPICH_GPU_SUPPORT_ENABLED: 1
+    USE_MPI: "YES"
+    SLURM_MPI_TYPE: cray_shasta
+    CSCS_ADDITIONAL_MOUNTS: '["/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/lib/libmpi.so:/usr/local/lib/libmpi.so.12.1.8", "/opt/cray/pe/lib64/libpmi.so.0:/usr/lib64/libpmi.so.0", "/opt/cray/pe/lib64/libpmi2.so.0:/usr/lib64/libpmi2.so.0", "/opt/cray/pals/1.4/lib/libpals.so.0:/usr/lib64/libpals.so.0", "/usr/lib64/libgfortran.so.5:/usr/lib64/libgfortran.so.5", "/opt/cray/pe/mpich/8.1.28/gtl/lib/libmpi_gtl_cuda.so:/usr/lib64/libmpi_gtl_cuda.so"]'
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3ec4eddc8..0012538d9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -36,17 +36,6 @@ if(GT_INSTALL_EXAMPLES)
 
     install_example(DIRECTORY boundaries SOURCES boundaries boundaries_provided)
 
-    configure_file(c_bindings/CMakeLists.txt.in
-        ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/c_bindings/CMakeLists.txt @ONLY)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/c_bindings/CMakeLists.txt
-        DESTINATION ${GT_INSTALL_EXAMPLES_PATH}/c_bindings)
-    install(
-        DIRECTORY c_bindings
-        DESTINATION ${GT_INSTALL_EXAMPLES_PATH}
-        PATTERN "CMakeLists.txt.in" EXCLUDE
-    )
-    list(APPEND enabled_examples c_bindings)
-
     configure_file(cmake_skeletons/CMakeLists.txt.driver.in
         ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeLists.txt @ONLY)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeLists.txt DESTINATION ${GT_INSTALL_EXAMPLES_PATH})
diff --git a/pyutils/driver.py b/pyutils/driver.py
index 803c5cd12..3d5fd14a8 100755
--- a/pyutils/driver.py
+++ b/pyutils/driver.py
@@ -1,5 +1,13 @@
 #!/usr/bin/env python3
 
+# /// script
+# dependencies = [
+#     "matplotlib",
+#     "numpy",
+#     "python-dateutil",
+# ]
+# ///
+
 import json
 import os
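The comment block added to `pyutils/driver.py` above is PEP 723 inline script metadata; `uv run` (installed in `ci/build.Dockerfile`) reads it to provide `matplotlib`, `numpy`, and `python-dateutil` before executing the script. A simplified sketch of how such a block can be parsed (this is not uv's implementation; `read_script_metadata` is an illustrative helper):

```python
import tomllib  # Python 3.11+

def read_script_metadata(path: str) -> dict:
    """Parse a '# /// script' ... '# ///' metadata block (PEP 723), simplified."""
    with open(path, encoding="utf-8") as f:
        lines = f.read().splitlines()
    try:
        start = lines.index("# /// script")
        end = lines.index("# ///", start + 1)
    except ValueError:
        return {}
    body = "\n".join(
        line[2:] if line.startswith("# ") else line[1:]  # strip the comment prefix
        for line in lines[start + 1 : end]
    )
    return tomllib.loads(body)

if __name__ == "__main__":
    meta = read_script_metadata("pyutils/driver.py")
    print(meta.get("dependencies", []))  # expected: matplotlib, numpy, python-dateutil
```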
+ ) + + def load(envfile): if not os.path.exists(envfile): raise FileNotFoundError(f'Could find environment file "{envfile}"') - env['GTCMAKE_PYUTILS_ENVFILE'] = os.path.abspath(envfile) + env["GTCMAKE_PYUTILS_ENVFILE"] = os.path.abspath(envfile) envdir, envfile = os.path.split(envfile) output = runtools.run( - ['bash', '-c', f'set -e && source {envfile} && env -0'], - cwd=envdir).strip('\0') - env.update(line.split('=', 1) for line in output.split('\0')) + ["bash", "-c", f"set -e && source {envfile} && env -0"], cwd=envdir + ).strip("\0") + env.update(line.split("=", 1) for line in output.split("\0")) - log.info(f'Loaded environment from {os.path.join(envdir, envfile)}') - log.debug(f'New environment', - '\n'.join(f'{k}={v}' for k, v in sorted(env.items()))) + log.info(f"Loaded environment from {os.path.join(envdir, envfile)}") + log.debug(f"New environment", "\n".join(f"{k}={v}" for k, v in sorted(env.items()))) try: @@ -36,39 +52,43 @@ def load(envfile): def _items_with_tag(tag): - return {k[len(tag):]: v for k, v in env.items() if k.startswith(tag)} + return {k[len(tag) :]: v for k, v in env.items() if k.startswith(tag)} def cmake_args(): args = [] - for k, v in _items_with_tag('GTCMAKE_').items(): - if v.strip().upper() in ('ON', 'OFF'): - k += ':BOOL' + for k, v in _items_with_tag("GTCMAKE_").items(): + if v.strip().upper() in ("ON", "OFF"): + k += ":BOOL" else: - k += ':STRING' - args.append(f'-D{k}={v}') + k += ":STRING" + args.append(f"-D{k}={v}") return args def set_cmake_arg(arg, value): if isinstance(value, bool): - value = 'ON' if value else 'OFF' - env['GTCMAKE_' + arg] = value + value = "ON" if value else "OFF" + env["GTCMAKE_" + arg] = value def sbatch_options(mpi): - options = _items_with_tag('GTRUN_SBATCH_') + options = _items_with_tag("GTRUN_SBATCH_") if mpi: - options.update(_items_with_tag('GTRUNMPI_SBATCH_')) + options.update(_items_with_tag("GTRUNMPI_SBATCH_")) return [ - '--' + k.lower().replace('_', '-') + ('=' + v if v else '') + "--" + k.lower().replace("_", "-") + ("=" + v if v else "") for k, v in options.items() ] def build_command(): - return env.get('GTRUN_BUILD_COMMAND', 'make').split() + return env.get("GTRUN_BUILD_COMMAND", "make").split() + + +def run_with_slurm() -> bool: + return env_flag_to_bool("GTRUN_WITH_SLURM", True) def hostname(): @@ -90,9 +110,10 @@ def clustername(): 'kesch' """ try: - output = runtools.run(['scontrol', 'show', 'config']) - m = re.compile(r'.*ClusterName\s*=\s*(\S*).*', - re.MULTILINE | re.DOTALL).match(output) + output = runtools.run(["scontrol", "show", "config"]) + m = re.compile(r".*ClusterName\s*=\s*(\S*).*", re.MULTILINE | re.DOTALL).match( + output + ) if m: return m.group(1) except FileNotFoundError: diff --git a/pyutils/pyutils/runtools.py b/pyutils/pyutils/runtools.py index 65c0ca5df..763fab8f6 100644 --- a/pyutils/pyutils/runtools.py +++ b/pyutils/pyutils/runtools.py @@ -15,26 +15,25 @@ async def _run_async(command, log_output, **kwargs): stderr=asyncio.subprocess.PIPE, env=env.env, limit=2**24, - **kwargs) + **kwargs, + ) async def read_output(stream): buffer = io.StringIO() async for line in stream: line = line.decode() buffer.write(line) - log_output(command[0], line.strip('\n')) + log_output(command[0], line.strip("\n")) buffer.seek(0) return buffer.read() returncode, stdout, stderr = await asyncio.gather( - process.wait(), read_output(process.stdout), - read_output(process.stderr)) + process.wait(), read_output(process.stdout), read_output(process.stderr) + ) if returncode != 0: - commstr = ' '.join(f'"{c}"' 
diff --git a/pyutils/pyutils/runtools.py b/pyutils/pyutils/runtools.py
index 65c0ca5df..763fab8f6 100644
--- a/pyutils/pyutils/runtools.py
+++ b/pyutils/pyutils/runtools.py
@@ -15,26 +15,25 @@ async def _run_async(command, log_output, **kwargs):
         stderr=asyncio.subprocess.PIPE,
         env=env.env,
         limit=2**24,
-        **kwargs)
+        **kwargs,
+    )
 
     async def read_output(stream):
         buffer = io.StringIO()
         async for line in stream:
             line = line.decode()
             buffer.write(line)
-            log_output(command[0], line.strip('\n'))
+            log_output(command[0], line.strip("\n"))
         buffer.seek(0)
         return buffer.read()
 
     returncode, stdout, stderr = await asyncio.gather(
-        process.wait(), read_output(process.stdout),
-        read_output(process.stderr))
+        process.wait(), read_output(process.stdout), read_output(process.stderr)
+    )
     if returncode != 0:
-        commstr = ' '.join(f'"{c}"' for c in command)
-        log.error(
-            f'{commstr} finished with exit code {returncode} and message',
-            stderr)
+        commstr = " ".join(f'"{c}"' for c in command)
+        log.error(f"{commstr} finished with exit code {returncode} and message", stderr)
         raise RuntimeError(f'{commstr} failed with message "{stderr}"')
     return stdout
 
 
@@ -42,41 +41,41 @@ async def read_output(stream):
 def run(command, log_output=None, **kwargs):
     if not command:
-        raise ValueError('No command provided')
+        raise ValueError("No command provided")
     if log_output is None:
         log_output = log.debug
 
-    log.info('Invoking', ' '.join(f'"{c}"' for c in command))
+    log.info("Invoking", " ".join(f'"{c}"' for c in command))
     start = time.time()
 
     loop = asyncio.get_event_loop()
     output = loop.run_until_complete(_run_async(command, log_output, **kwargs))
 
     end = time.time()
-    log.info(f'{command[0]} finished in {end - start:.2f}s')
+    log.info(f"{command[0]} finished in {end - start:.2f}s")
     return output
 
 
 @functools.lru_cache()
 def _slurm_available():
     try:
-        run(['srun', '--version'])
-        log.info('Using SLURM')
+        run(["srun", "--version"])
+        log.info("Using SLURM")
         return True
     except FileNotFoundError:
-        log.info('SLURM not found: invoking commands directly')
+        log.info("SLURM not found: invoking commands directly")
         return False
 
 
 def srun(command, use_mpi_config=False, **kwargs):
-    if _slurm_available():
-        command = ['srun'] + env.sbatch_options(use_mpi_config) + command
+    if env.run_with_slurm() and _slurm_available():
+        command = ["srun"] + env.sbatch_options(use_mpi_config) + command
     return run(command, **kwargs)
 
 
 def salloc(command, use_mpi_config=False, **kwargs):
-    if _slurm_available():
-        command = ['salloc'] + env.sbatch_options(use_mpi_config) + command
+    if env.run_with_slurm() and _slurm_available():
+        command = ["salloc"] + env.sbatch_options(use_mpi_config) + command
     return run(command, **kwargs)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 143943db0..a64311eb3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -81,6 +81,7 @@ target_link_libraries(GridToolsTest INTERFACE Threads::Threads gtest)
 
 add_subdirectory(src)
 
+option(GT_TESTS_MPI_WITH_MPI_EXECUTABLE "Use MPI executable for MPI tests" ON)
 function(gridtools_add_mpi_test arch tgt)
     set(options)
     set(one_value_args)
@@ -91,11 +92,18 @@ function(gridtools_add_mpi_test arch tgt)
         LIBRARIES ${ARGS_LIBRARIES} mpi_gtest_main_${arch} gcl_${arch})
     set(nproc 4)
     set(labels ${ARGS_LABELS} mpi gcl ${arch})
-    # Note: We use MPITEST_ instead of MPIEXEC_ because our own MPI_TEST_-variables are slurm-aware
-    add_test(
-        NAME ${tgt}
-        COMMAND ${MPITEST_EXECUTABLE} ${MPITEST_NUMPROC_FLAG} ${nproc} ${MPITEST_PREFLAGS} $<TARGET_FILE:${tgt}> ${MPITEST_POSTFLAGS}
-    )
+    if(GT_TESTS_MPI_WITH_MPI_EXECUTABLE)
+        # Note: We use MPITEST_ instead of MPIEXEC_ because our own MPI_TEST_-variables are slurm-aware
+        add_test(
+            NAME ${tgt}
+            COMMAND ${MPITEST_EXECUTABLE} ${MPITEST_NUMPROC_FLAG} ${nproc} ${MPITEST_PREFLAGS} $<TARGET_FILE:${tgt}> ${MPITEST_POSTFLAGS}
+        )
+    else()
+        add_test(
+            NAME ${tgt}
+            COMMAND $<TARGET_FILE:${tgt}>
+        )
+    endif()
     set_tests_properties(${tgt} PROPERTIES LABELS "${labels}")
     set_tests_properties(${tgt} PROPERTIES PROCESSORS ${nproc})
 endfunction()
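With `GT_TESTS_MPI_WITH_MPI_EXECUTABLE` switched OFF (as the CI does via `GTCMAKE_GT_TESTS_MPI_WITH_MPI_EXECUTABLE=OFF` in `ci/build.Dockerfile`), the MPI tests are registered as bare executables and the enclosing job supplies the ranks (`SLURM_NTASKS: 4` in `ci/cscs.yml`). A sketch contrasting the two command lines; the launcher, flag, and test names (`srun`, `-n`, `./gcl_test`) are placeholders, not values taken from the build:

```python
def mpi_test_command(test_exe: str, nproc: int, use_mpi_executable: bool,
                     mpiexec: str = "srun", numproc_flag: str = "-n") -> list[str]:
    """Return the test command registered in each mode (illustrative values)."""
    if use_mpi_executable:
        # ON: wrap the test executable in an MPI launcher.
        return [mpiexec, numproc_flag, str(nproc), test_exe]
    # OFF: register the bare executable; the surrounding job provides the ranks.
    return [test_exe]

print(mpi_test_command("./gcl_test", 4, use_mpi_executable=True))   # ['srun', '-n', '4', './gcl_test']
print(mpi_test_command("./gcl_test", 4, use_mpi_executable=False))  # ['./gcl_test']
```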
diff --git a/tests/unit_tests/fn/test_fn_sid_neighbor_table.cu b/tests/unit_tests/fn/test_fn_sid_neighbor_table.cu
index cac45338e..1c6a7784b 100644
--- a/tests/unit_tests/fn/test_fn_sid_neighbor_table.cu
+++ b/tests/unit_tests/fn/test_fn_sid_neighbor_table.cu
@@ -32,6 +32,7 @@ namespace gridtools::fn {
             return neighbor_table::neighbors(table, index);
         }
 
+#if defined(__NVCC__) && defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ > 8))
         TEST(sid_neighbor_table, correctness_cuda) {
             constexpr std::size_t num_elements = 3;
             constexpr std::size_t num_neighbors = 2;
@@ -63,5 +64,6 @@ namespace gridtools::fn {
             EXPECT_EQ(n20, 20);
             EXPECT_EQ(n21, 21);
         }
+#endif
     } // namespace
 } // namespace gridtools::fn
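The preprocessor guard added above builds `correctness_cuda` only for nvcc releases before 12.0 or after 12.8, presumably to sidestep an nvcc issue in that range. A small sketch that evaluates the same condition for a few toolkit versions (the version list is illustrative):

```python
def correctness_cuda_enabled(major: int, minor: int) -> bool:
    """Evaluate the preprocessor condition from test_fn_sid_neighbor_table.cu."""
    return major < 12 or (major == 12 and minor > 8)

for major, minor in [(11, 8), (12, 0), (12, 6), (12, 8), (12, 9)]:
    state = "built" if correctness_cuda_enabled(major, minor) else "skipped"
    print(f"nvcc {major}.{minor}: correctness_cuda {state}")
```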