From 5a66450fd165fbc473441a0a2a76767ee1b1780e Mon Sep 17 00:00:00 2001
From: "C. Carouge"
Date: Fri, 17 Dec 2021 09:48:26 +1100
Subject: [PATCH 1/2] first draft

---
 posts/2021-12-15-read-AWS.ipynb | 2712 +++++++++++++++++++++++++++++++
 1 file changed, 2712 insertions(+)
 create mode 100644 posts/2021-12-15-read-AWS.ipynb

diff --git a/posts/2021-12-15-read-AWS.ipynb b/posts/2021-12-15-read-AWS.ipynb
new file mode 100644
index 0000000..b9ae4ff
--- /dev/null
+++ b/posts/2021-12-15-read-AWS.ipynb
@@ -0,0 +1,2712 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b38e8e0e-4112-4d6e-b1f2-132e6632794b",
+   "metadata": {},
+   "source": [
+    "# Read CAFE dataset directly from AWS datastore\n",
+    "\n",
+    "The CAFE dataset produced by CSIRO is stored as on Open Dataset on AWS. It is possible to directly read the data from the AWS store from Python. Here is an example on how to do so.\n",
+    "\n",
+    "The data is organised by realm: atmos_hybrid, atmos_isobaric, ice, land, ocean, ocean_bgc, ocean_force, ocean_scalar\n",
+    "\n",
+    "The list of variables in each realm is given in the [CAFE documentation](https://data.csiro.au/dap/ws/v2/collections/49803/support/4029)\n",
+    "\n",
+    "To name of the AWS filesystem is S3. Some Python packages have been developed to access this filesystem. Here we are going to use s3fs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c4c3e78d-cb34-44ad-86f6-873ef34ae489",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import s3fs\n",
+    "import xarray as xr\n",
+    "import climtas.nci\n",
+    "import kerchunk.hdf\n",
+    "import kerchunk.combine\n",
+    "import dask\n",
+    "import ujson\n",
+    "from glob import glob"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85a6b11f-4ae7-430e-8b48-116419326755",
+   "metadata": {},
+   "source": [
+    "## Start with an example"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9cb162fb-3d6c-4cfb-9af5-c4a218dc7acd",
+   "metadata": {},
+   "source": [
+    "First, you will need to open an anonymous connection to the filesystem. The `anon` option stands for anonymous."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "5b5b1de6-ee03-4d1f-bdde-64433ae11888",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs = s3fs.S3FileSystem(anon=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bad80ad-4dbb-4573-8df5-03aea28fa11c",
+   "metadata": {},
+   "source": [
+    "Let's find the path to one of the CAFE files using [the AWS Explorer](https://cafe60-reanalysis-dataset-aws-open-data.s3.amazonaws.com/index.html) and see how to open it. You can see the path in the top banner of the Explorer.\n",
+    "\n",
+    "Xarray can open the files for you, but instead of the usual file path you should give it a file object, as returned by the `open()` operation."
+   ]
+  },
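The same anonymous connection can also browse the bucket directly, without going through the web Explorer. A small sketch using only `s3fs` calls already shown in this post (the paths follow the bucket layout described above):

```python
import s3fs

fs = s3fs.S3FileSystem(anon=True)

# One "directory" per realm at the top of the bucket
print(fs.ls("cafe60-reanalysis-dataset-aws-open-data"))

# The first few files of one realm
print(fs.ls("cafe60-reanalysis-dataset-aws-open-data/atmos_isobaric")[:5])
```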
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:      (ensemble: 96, lat: 90, level: 21, lon: 144, time: 31, nv: 2)\n",
+       "Coordinates:\n",
+       "  * ensemble     (ensemble) int32 1 2 3 4 5 6 7 8 9 ... 89 90 91 92 93 94 95 96\n",
+       "  * lat          (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n",
+       "  * level        (level) float32 1.0 5.0 10.0 20.0 ... 700.0 850.0 925.0 1e+03\n",
+       "  * lon          (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8\n",
+       "  * time         (time) object 1960-01-01 12:00:00 ... 1960-01-31 12:00:00\n",
+       "Dimensions without coordinates: nv\n",
+       "Data variables:\n",
+       "    temp         (time, ensemble, level, lat, lon) float32 ...\n",
+       "    time_bounds  (time, nv) timedelta64[ns] 58440 days 58441 days ... 58471 days\n",
+       "Attributes: (12/27)\n",
+       "    title:                      AccessOcean-AM2\n",
+       "    grid_type:                  regular\n",
+       "    grid_tile:                  N/A\n",
+       "    metadata_description:       \\n\\tEach of the metadata keys added via the C...\n",
+       "    institution:                CSIRO CAFE\n",
+       "    further_info_url:           https://research.csiro.au/dfp/\n",
+       "    ...                         ...\n",
+       "    run_variant_name:           data assimilation\n",
+       "    history_of_appended_files:  Tue May 28 12:31:55 2019: Appended file /OSM/...\n",
+       "    NCO:                        netCDF Operators version 4.7.8 (Homepage = ht...\n",
+       "    comment:                    pressure level interpolator, version 3.0, pre...\n",
+       "    ens_member_number:          1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,...\n",
+       "    filename:                   temp.atmos_isobaric.daily.CAFE60.19600101-196...
" + ], + "text/plain": [ + "\n", + "Dimensions: (ensemble: 96, lat: 90, level: 21, lon: 144, time: 31, nv: 2)\n", + "Coordinates:\n", + " * ensemble (ensemble) int32 1 2 3 4 5 6 7 8 9 ... 89 90 91 92 93 94 95 96\n", + " * lat (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n", + " * level (level) float32 1.0 5.0 10.0 20.0 ... 700.0 850.0 925.0 1e+03\n", + " * lon (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8\n", + " * time (time) object 1960-01-01 12:00:00 ... 1960-01-31 12:00:00\n", + "Dimensions without coordinates: nv\n", + "Data variables:\n", + " temp (time, ensemble, level, lat, lon) float32 ...\n", + " time_bounds (time, nv) timedelta64[ns] ...\n", + "Attributes: (12/27)\n", + " title: AccessOcean-AM2\n", + " grid_type: regular\n", + " grid_tile: N/A\n", + " metadata_description: \\n\\tEach of the metadata keys added via the C...\n", + " institution: CSIRO CAFE\n", + " further_info_url: https://research.csiro.au/dfp/\n", + " ... ...\n", + " run_variant_name: data assimilation\n", + " history_of_appended_files: Tue May 28 12:31:55 2019: Appended file /OSM/...\n", + " NCO: netCDF Operators version 4.7.8 (Homepage = ht...\n", + " comment: pressure level interpolator, version 3.0, pre...\n", + " ens_member_number: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,...\n", + " filename: temp.atmos_isobaric.daily.CAFE60.19600101-196..." + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_path = \"cafe60-reanalysis-dataset-aws-open-data/atmos_isobaric/temp.atmos_isobaric.daily.CAFE60.19600101-19600131.nc\"\n", + "file_obj = fs.open(file_path) # We use fs.open() as the file is on S3\n", + "ds = xr.open_dataset(file_obj)\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "3e5c39c9-32f8-498d-9fa7-acd221b22ab5", + "metadata": {}, + "source": [ + "You can then use the data as usual but the operation will be longer than usual:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e67e432b-3df6-4699-b733-40c0bf2bf302", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'temp' (ensemble: 96, level: 21, lat: 90, lon: 144)>\n",
+       "array([[[[260.67108, 260.67444, 260.67786, ..., 260.6621 , 260.66476,\n",
+       "          260.6677 ],\n",
+       "         [260.72556, 260.72397, 260.72308, ..., 260.73322, 260.73138,\n",
+       "          260.7293 ],\n",
+       "         [260.50287, 260.50003, 260.50076, ..., 260.52402, 260.51575,\n",
+       "          260.50876],\n",
+       "         ...,\n",
+       "         [215.56755, 215.54845, 215.53064, ..., 215.63278, 215.60954,\n",
+       "          215.58792],\n",
+       "         [215.83043, 215.8165 , 215.80365, ..., 215.87881, 215.8616 ,\n",
+       "          215.84552],\n",
+       "         [216.449  , 216.4476 , 216.44633, ..., 216.4532 , 216.45187,\n",
+       "          216.45032]],\n",
+       "\n",
+       "        [[240.55174, 240.55838, 240.56522, ..., 240.53372, 240.53911,\n",
+       "          240.54498],\n",
+       "         [240.6599 , 240.65944, 240.6593 , ..., 240.66197, 240.66132,\n",
+       "          240.66058],\n",
+       "         [240.42894, 240.4416 , 240.45589, ..., 240.397  , 240.4067 ,\n",
+       "          240.41745],\n",
+       "...\n",
+       "         [253.26424, 253.28755, 253.32199, ..., 253.2713 , 253.2629 ,\n",
+       "          253.25793],\n",
+       "         [252.70592, 252.64792, 252.59059, ..., 252.87955, 252.81825,\n",
+       "          252.76454],\n",
+       "         [252.51163, 252.51167, 252.51173, ..., 252.51149, 252.51146,\n",
+       "          252.51158]],\n",
+       "\n",
+       "        [[260.2997 , 260.32236, 260.3459 , ..., 260.23724, 260.2565 ,\n",
+       "          260.27704],\n",
+       "         [261.9009 , 261.89325, 261.8795 , ..., 261.9365 , 261.91452,\n",
+       "          261.90445],\n",
+       "         [262.28613, 262.12006, 261.96677, ..., 262.74078, 262.5768 ,\n",
+       "          262.43826],\n",
+       "         ...,\n",
+       "         [253.76541, 253.74757, 253.73332, ..., 253.90256, 253.84274,\n",
+       "          253.79501],\n",
+       "         [253.86533, 253.8579 , 253.85526, ..., 253.91324, 253.89287,\n",
+       "          253.8761 ],\n",
+       "         [253.7368 , 253.73196, 253.72743, ..., 253.75186, 253.74701,\n",
+       "          253.74153]]]], dtype=float32)\n",
+       "Coordinates:\n",
+       "  * ensemble  (ensemble) int32 1 2 3 4 5 6 7 8 9 ... 88 89 90 91 92 93 94 95 96\n",
+       "  * lat       (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n",
+       "  * level     (level) float32 1.0 5.0 10.0 20.0 30.0 ... 700.0 850.0 925.0 1e+03\n",
+       "  * lon       (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8
" + ], + "text/plain": [ + "\n", + "array([[[[260.67108, 260.67444, 260.67786, ..., 260.6621 , 260.66476,\n", + " 260.6677 ],\n", + " [260.72556, 260.72397, 260.72308, ..., 260.73322, 260.73138,\n", + " 260.7293 ],\n", + " [260.50287, 260.50003, 260.50076, ..., 260.52402, 260.51575,\n", + " 260.50876],\n", + " ...,\n", + " [215.56755, 215.54845, 215.53064, ..., 215.63278, 215.60954,\n", + " 215.58792],\n", + " [215.83043, 215.8165 , 215.80365, ..., 215.87881, 215.8616 ,\n", + " 215.84552],\n", + " [216.449 , 216.4476 , 216.44633, ..., 216.4532 , 216.45187,\n", + " 216.45032]],\n", + "\n", + " [[240.55174, 240.55838, 240.56522, ..., 240.53372, 240.53911,\n", + " 240.54498],\n", + " [240.6599 , 240.65944, 240.6593 , ..., 240.66197, 240.66132,\n", + " 240.66058],\n", + " [240.42894, 240.4416 , 240.45589, ..., 240.397 , 240.4067 ,\n", + " 240.41745],\n", + "...\n", + " [253.26424, 253.28755, 253.32199, ..., 253.2713 , 253.2629 ,\n", + " 253.25793],\n", + " [252.70592, 252.64792, 252.59059, ..., 252.87955, 252.81825,\n", + " 252.76454],\n", + " [252.51163, 252.51167, 252.51173, ..., 252.51149, 252.51146,\n", + " 252.51158]],\n", + "\n", + " [[260.2997 , 260.32236, 260.3459 , ..., 260.23724, 260.2565 ,\n", + " 260.27704],\n", + " [261.9009 , 261.89325, 261.8795 , ..., 261.9365 , 261.91452,\n", + " 261.90445],\n", + " [262.28613, 262.12006, 261.96677, ..., 262.74078, 262.5768 ,\n", + " 262.43826],\n", + " ...,\n", + " [253.76541, 253.74757, 253.73332, ..., 253.90256, 253.84274,\n", + " 253.79501],\n", + " [253.86533, 253.8579 , 253.85526, ..., 253.91324, 253.89287,\n", + " 253.8761 ],\n", + " [253.7368 , 253.73196, 253.72743, ..., 253.75186, 253.74701,\n", + " 253.74153]]]], dtype=float32)\n", + "Coordinates:\n", + " * ensemble (ensemble) int32 1 2 3 4 5 6 7 8 9 ... 88 89 90 91 92 93 94 95 96\n", + " * lat (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n", + " * level (level) float32 1.0 5.0 10.0 20.0 30.0 ... 700.0 850.0 925.0 1e+03\n", + " * lon (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[\"temp\"].mean(dim=\"time\")" + ] + }, + { + "cell_type": "markdown", + "id": "2701278f-f8b6-4e0b-92b9-17e217dabd8d", + "metadata": {}, + "source": [ + "## Generalisation\n", + "From the path to the CAFE file used previously, we see that all the files following this naming pattern:\n", + "\n", + "`\"cafe60-reanalysis-dataset-aws-open-data//...CAFE60.-.nc\"`\n", + "\n", + "With this information, we can write a function that will return the paths to all the files for a given variable, realm and temporal resolution." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dda041d5-bb77-4f5a-8d20-d0643014cc79", + "metadata": {}, + "outputs": [], + "source": [ + "def find_cafe_files(realm, varname, time_res):\n", + " \"\"\" Return a list of all the files for a given variable and a temporal resolution in a realm\n", + " realm: str, one of the realms for the CAFE variables\n", + " varname: str, name of one of the CAFE variables\n", + " time_res: 'daily'|'month', temporal resolution for the variable\"\"\"\n", + " \n", + " root = \"cafe60-reanalysis-dataset-aws-open-data\"\n", + " path = f\"{root}/{realm}\"\n", + " \n", + " my_list = fs.glob(f\"{path}/{varname}.{realm}*\")\n", + " \n", + " # Stop and return an error message if no files found.\n", + " assert len(my_list)!=0, \"There is no such variable in that realm or for that temporal resolution. 
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "dda041d5-bb77-4f5a-8d20-d0643014cc79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_cafe_files(realm, varname, time_res):\n",
+    "    \"\"\" Return a list of all the files for a given variable and a temporal resolution in a realm\n",
+    "    realm: str, one of the realms for the CAFE variables\n",
+    "    varname: str, name of one of the CAFE variables\n",
+    "    time_res: 'daily'|'month', temporal resolution for the variable\"\"\"\n",
+    "    \n",
+    "    root = \"cafe60-reanalysis-dataset-aws-open-data\"\n",
+    "    path = f\"{root}/{realm}\"\n",
+    "    \n",
+    "    # Include the temporal resolution in the pattern so it actually filters the files.\n",
+    "    my_list = fs.glob(f\"{path}/{varname}.{realm}.{time_res}*\")\n",
+    "    \n",
+    "    # Stop and return an error message if no files found.\n",
+    "    assert len(my_list)!=0, \"There is no such variable in that realm or for that temporal resolution. Please check the CAFE documentation.\"\n",
+    "    return my_list "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ee25f63d-9872-4797-9ac4-b1ea21bc1371",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "temp_files = find_cafe_files(\"atmos_isobaric\",\"temp\",\"daily\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c3a22a7-7e4a-4eb8-bf1c-e3b3144452a3",
+   "metadata": {},
+   "source": [
+    "Let's try to read in 100 and 250 files and see how long it takes. This will allow us to assess if this simple approach can work in this case."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6218be31-5bc9-4d34-8530-c8261bf884c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Open all the files and then read them with open_mfdataset\n",
+    "temp_files_ob = [ fs.open(tt) for tt in temp_files ]\n",
+    "ds = xr.open_mfdataset(temp_files_ob[0:100],\n",
+    "                       combine='nested',\n",
+    "                       concat_dim='time', \n",
+    "                       join='override',\n",
+    "                       coords='minimal',\n",
+    "                       compat='override',\n",
+    "                       chunks={\"time\":1}, \n",
+    "# parallel=True\n",
+    "                      )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea5f5b53-79ec-48ee-94f9-0b6709cc7b50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Open all the files and then read them with open_mfdataset\n",
+    "temp_files_ob = [ fs.open(tt) for tt in temp_files ]\n",
+    "ds = xr.open_mfdataset(temp_files_ob[0:250],\n",
+    "                       combine='nested',\n",
+    "                       concat_dim='time', \n",
+    "                       join='override',\n",
+    "                       coords='minimal',\n",
+    "                       compat='override',\n",
+    "                       chunks={\"time\":1}, \n",
+    "# parallel=True\n",
+    "                      )\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9d167342-8a60-4066-ad5c-a7118d43180a",
+   "metadata": {},
+   "source": [
+    "We see the time to read 250 files is much more than 2.5 times the time it takes to read 100 files. This tells us we can't use this simple way to read in the data to read in the whole timeseries of 726 files."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "148a8de1-949d-40d9-b3a9-aaa37b9e6169",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Let's try to optimise this\n",
+    "Cloud is much better at reading a format called zarr instead of netcdf. Luckily, it's possible to fake a Zarr file from a netcdf file. Let's see if it speeds things up. We can use what was done here: https://gist.github.com/rsignell-usgs/ef435a53ac530a2843ce7e1d59f96e22\n",
+    "\n",
+    "For this, we need to save some data about the chunking in the netcdf files in JSON files."
+   ]
+  },
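To make the idea concrete before bringing in Dask, a hedged single-file sketch of what gets saved: `kerchunk.hdf.SingleHdf5ToZarr` scans one netcdf file and records where each chunk lives, and the result is written out with `ujson`. The output filename here is illustrative; the `gen_json` function defined below does the same for every file, in parallel:

```python
import kerchunk.hdf
import ujson

url = "s3://cafe60-reanalysis-dataset-aws-open-data/atmos_isobaric/temp.atmos_isobaric.daily.CAFE60.19600101-19600131.nc"

with fs.open(url) as infile:
    # Maps every HDF5 chunk of the file to an (offset, length) byte range
    refs = kerchunk.hdf.SingleHdf5ToZarr(infile, url, inline_threshold=300).translate()

with open("temp.19600101-19600131.json", "w") as f:
    f.write(ujson.dumps(refs))
```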
\n", + "
\n", + "
\n", + "

Client

\n", + "

Client-ab88d262-5ec0-11ec-9402-fa163e6c9e3b

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", + " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + "
\n", + "\n", + " \n", + "
\n", + "

Cluster Info

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

LocalCluster

\n", + "

bc03a1e1

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + " \n", + " Workers: 8\n", + "
\n", + " Total threads: 8\n", + " \n", + " Total memory: 22.46 GiB\n", + "
Status: runningUsing processes: True
\n", + "\n", + "
\n", + " \n", + "

Scheduler Info

\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

Scheduler

\n", + "

Scheduler-9696b5a6-66c2-4a2c-ae77-3ff98b874a05

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Comm: tcp://127.0.0.1:39885\n", + " \n", + " Workers: 8\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + " \n", + " Total threads: 8\n", + "
\n", + " Started: Just now\n", + " \n", + " Total memory: 22.46 GiB\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "

Workers

\n", + "
\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 0

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:38873\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/36539/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:43071\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-94kq267v\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 1

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:42849\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/36301/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:32915\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-hyunmlo_\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 2

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:37031\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/36069/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:33071\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-7mepsonn\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 3

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:36105\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/44779/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:40469\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-axla5c2a\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 4

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:44531\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/35897/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:46811\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-eofiugs7\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 5

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:37401\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/38933/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:40261\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-n_94cyka\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 6

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:40447\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/42171/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:42405\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-kq_bvquj\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 7

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:35993\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn9/39373/proxy/45281/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:40817\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-9lsegioa\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = climtas.nci.Client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "77aa0183-4e5d-4e5e-a7b7-87aa47d38c21", + "metadata": {}, + "outputs": [], + "source": [ + "so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')\n", + "\n", + "def gen_json(u, json_dir):\n", + " with fs.open(u, **so) as infile:\n", + " h5chunks = kerchunk.hdf.SingleHdf5ToZarr(infile, u, inline_threshold=300)\n", + " p = u.split('/')\n", + " fname = p[4]\n", + " outf = f'{json_dir}/{fname}.json'\n", + " with open(outf, 'wb') as f:\n", + " f.write(ujson.dumps(h5chunks.translate()).encode());" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e3bdf75-2589-4103-ae09-40a1225dcf32", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's define the URLs and a path to save the JSON files\n", + "urls = [\"s3://\" + f for f in temp_files]\n", + "json_path = f\"/g/data/w35/ccc561/CAFE60/json/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f9523e2f-a07e-411e-ad38-76ac5e57f390", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 37s, sys: 14.4 s, total: 1min 52s\n", + "Wall time: 7min 56s\n" + ] + }, + { + "data": { + "text/plain": [ + "(None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " 
+ " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None,\n", + " None)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "dask.compute(*[dask.delayed(gen_json)(u, json_path) for u in urls], retries=10);" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2af7000e-b263-4177-88ec-b9108b02a0b7", + "metadata": {}, + "outputs": [], + "source": [ + "json_list = sorted(glob(f\"{json_path}/temp*.json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d7c93337-6577-4632-ad49-b1f7cf4a04b8", + "metadata": {}, + "outputs": [], + "source": [ + "mzz = kerchunk.combine.MultiZarrToZarr(json_list[0:50], \n", + " remote_protocol='s3',\n", + " remote_options={'anon' : 'True'}, #JSON files \n", + " xarray_open_kwargs={\n", + " 'decode_cf' : False,\n", + " 'mask_and_scale' : False,\n", + " 'decode_times' : False,\n", + " 'use_cftime' : False,\n", + " 'drop_variables': ['time_bounds'],\n", + " 'decode_coords' : False\n", + " },\n", + " xarray_concat_args={\n", + " \"data_vars\": \"minimal\",\n", + " \"coords\": \"minimal\",\n", + " \"compat\": \"override\",\n", + " \"join\": \"override\",\n", + " \"combine_attrs\": \"override\",\n", + " \"dim\": \"time\"\n", + " }\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cae02cb2-f5fd-490e-bb81-8045a9ba8601", + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/local/w35/ccc561/tmp/ipykernel_3691522/3306629662.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmzz\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranslate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{json_path}/temp.atmos_isobaric.daily.CAFE60.json\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/kerchunk/combine.py\u001b[0m in \u001b[0;36mtranslate\u001b[0;34m(self, outpath, template_count)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfss\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_determine_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_consolidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemplate_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemplate_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/kerchunk/combine.py\u001b[0m in \u001b[0;36m_build_output\u001b[0;34m(self, ds, ds0, fss)\u001b[0m\n\u001b[1;32m 172\u001b[0m consolidated=False) # fills in metadata&coords\n\u001b[1;32m 173\u001b[0m \u001b[0mz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzarr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen_group\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 174\u001b[0;31m \u001b[0maccum_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat_dims\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextra_dims\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# only ever one dim for now\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0macc_len\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_coord\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccum_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "mzz.translate(f\"{json_path}/temp.atmos_isobaric.daily.CAFE60.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ef83aa5-9e99-4dce-b015-cbbc34ee9fcb", + "metadata": {}, + "outputs": [], + "source": [ + "temp_files[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e97994e-0076-40e6-bd51-d3bc444fe392", + "metadata": {}, + "outputs": [], + "source": [ + "len(json_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c889c116-7bf4-4aca-9fee-1f2bfd880227", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:analysis3-21.10]", + "language": "python", + "name": "conda-env-analysis3-21.10-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 11081f2e3364ee0ac46d401691eeca29eb7caa89 Mon Sep 17 00:00:00 2001 From: "C. Carouge" Date: Fri, 17 Dec 2021 12:53:22 +1100 Subject: [PATCH 2/2] With reading the data + speed test --- posts/2021-12-15-read-AWS.ipynb | 3417 ++++++++++++++++++++++--------- 1 file changed, 2443 insertions(+), 974 deletions(-) diff --git a/posts/2021-12-15-read-AWS.ipynb b/posts/2021-12-15-read-AWS.ipynb index b9ae4ff..411bf8f 100644 --- a/posts/2021-12-15-read-AWS.ipynb +++ b/posts/2021-12-15-read-AWS.ipynb @@ -7,13 +7,13 @@ "source": [ "# Read CAFE dataset directly from AWS datastore\n", "\n", - "The CAFE dataset produced by CSIRO is stored as on Open Dataset on AWS. It is possible to directly read the data from the AWS store from Python. Here is an example on how to do so.\n", + "The CAFE dataset produced by CSIRO is stored on Open Dataset on AWS. It is possible to directly read the data from the AWS store from Python. Here is an example on how to do so.\n", "\n", "The data is organised by realm: atmos_hybrid, atmos_isobaric, ice, land, ocean, ocean_bgc, ocean_force, ocean_scalar\n", "\n", "The list of variables in each realm is given in the [CAFE documentation](https://data.csiro.au/dap/ws/v2/collections/49803/support/4029)\n", "\n", - "To name of the AWS filesystem is S3. Some Python packages have been developed to access this filesystem. Here we are going to use s3fs." + "The name of the AWS filesystem is S3. Some Python packages have been developed to access this filesystem. Here we are going to use s3fs." ] }, { @@ -30,7 +30,9 @@ "import kerchunk.combine\n", "import dask\n", "import ujson\n", - "from glob import glob" + "import fsspec\n", + "from glob import glob\n", + "from tqdm import tqdm" ] }, { @@ -456,12 +458,12 @@ " NCO: netCDF Operators version 4.7.8 (Homepage = ht...\n", " comment: pressure level interpolator, version 3.0, pre...\n", " ens_member_number: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,...\n", - " filename: temp.atmos_isobaric.daily.CAFE60.19600101-196..." ], "text/plain": [ "\n", @@ -1189,7 +1191,7 @@ "metadata": {}, "source": [ "## Generalisation\n", - "From the path to the CAFE file used previously, we see that all the files following this naming pattern:\n", + "From the path to the CAFE file used previously, we see that all the files are following this naming pattern:\n", "\n", "`\"cafe60-reanalysis-dataset-aws-open-data//...CAFE60.-.nc\"`\n", "\n", @@ -1239,10 +1241,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "6218be31-5bc9-4d34-8530-c8261bf884c7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 56.7 s, sys: 4.74 s, total: 1min 1s\n", + "Wall time: 2min 22s\n" + ] + } + ], "source": [ "%%time\n", "# Open all the files and then read them with open_mfdataset\n", @@ -1254,7 +1265,6 @@ " coords='minimal',\n", " compat='override',\n", " chunks={\"time\":1}, \n", - "# parallel=True\n", " )" ] }, @@ -1275,7 +1285,6 @@ " coords='minimal',\n", " compat='override',\n", " chunks={\"time\":1}, \n", - "# parallel=True\n", " )\n" ] }, @@ -1284,7 +1293,7 @@ "id": "9d167342-8a60-4066-ad5c-a7118d43180a", "metadata": {}, "source": [ - "We see the time to read 250 files is much more than 2.5 times the time it takes to read 100 files. 
This tells us we can't use this simple way to read in the data to read in the whole timeseries of 726 files."
+    "The time to read 250 files is actually more than 2.5 times the time it takes to read 100 files. This tells us we can't use this simple approach to read in the whole timeseries of 726 files."
    ]
   },
@@ -1295,14 +1304,14 @@
    },
    "source": [
     "## Let's try to optimise this\n",
-    "Cloud is much better at reading a format called zarr instead of netcdf. Luckily, it's possible to fake a Zarr file from a netcdf file. Let's see if it speeds things up. We can use what was done here: https://gist.github.com/rsignell-usgs/ef435a53ac530a2843ce7e1d59f96e22\n",
+    "Cloud is much better at reading a format called zarr instead of netcdf. Luckily, it's possible to fake a zarr file from a netcdf file. Let's see if it speeds things up. We can use what was done here: https://gist.github.com/rsignell-usgs/ef435a53ac530a2843ce7e1d59f96e22\n",
     "\n",
-    "For this, we need to save some data about the chunking in the netcdf files in JSON files."
+    "For this, we need to save some data about the chunking in the netcdf files in JSON files. The `kerchunk` package was created especially for this purpose. Those JSON files only need to be created once and can be shared between users. And it's much faster to use Dask for parallelisation at this point."
    ]
   },
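The subject of this patch mentions reading the data back. A minimal sketch of how one of these reference JSONs is typically opened, using fsspec's `reference://` protocol; this follows the pattern from the gist linked above rather than a cell shown in this patch, and the JSON filename matches what `gen_json` writes:

```python
import fsspec
import xarray as xr

json_path = "/g/data/w35/ccc561/CAFE60/json"

# Present the byte-range index as a zarr-like mapping; only the chunks
# that are actually used get fetched from S3.
m = fsspec.get_mapper(
    "reference://",
    fo=f"{json_path}/temp.atmos_isobaric.daily.CAFE60.19600101-19600131.nc.json",
    remote_protocol="s3",
    remote_options={"anon": True},
)
ds = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
```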
\n", "
\n", "

Client

\n", - "

Client-ab88d262-5ec0-11ec-9402-fa163e6c9e3b

\n", + "

Client-9ebc4eb5-5ed8-11ec-8c04-fa163e98a918

\n", " \n", "\n", " \n", @@ -1328,7 +1337,7 @@ " \n", " \n", " \n", " \n", " \n", @@ -1344,22 +1353,22 @@ " \n", "
\n", "

LocalCluster

\n", - "

bc03a1e1

\n", + "

e91c4f2d

\n", "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/8787/status\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1381,22 +1390,22 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-9696b5a6-66c2-4a2c-ae77-3ff98b874a05

\n", + "

Scheduler-e70ef03d-e078-4023-8c7b-9f89df575b1f

\n", "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/8787/status\n", " \n", - " Workers: 8\n", + " Workers: 16\n", "
\n", - " Total threads: 8\n", + " Total threads: 16\n", " \n", - " Total memory: 22.46 GiB\n", + " Total memory: 44.92 GiB\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1404,7 +1413,7 @@ " Started: Just now\n", " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:39885\n", + " Comm: tcp://127.0.0.1:34737\n", " \n", - " Workers: 8\n", + " Workers: 16\n", "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/8787/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/8787/status\n", " \n", - " Total threads: 8\n", + " Total threads: 16\n", "
\n", - " Total memory: 22.46 GiB\n", + " Total memory: 44.92 GiB\n", "
\n", @@ -1427,7 +1436,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1472,7 +1481,7 @@ "
\n", - " Comm: tcp://127.0.0.1:38873\n", + " Comm: tcp://127.0.0.1:46287\n", " \n", " Total threads: 1\n", @@ -1435,7 +1444,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/36539/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/33447/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1443,13 +1452,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:43071\n", + " Nanny: tcp://127.0.0.1:46723\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-94kq267v\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-zzp4i_6r\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1517,7 +1526,7 @@ "
\n", - " Comm: tcp://127.0.0.1:42849\n", + " Comm: tcp://127.0.0.1:43787\n", " \n", " Total threads: 1\n", @@ -1480,7 +1489,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/36301/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/39611/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1488,13 +1497,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:32915\n", + " Nanny: tcp://127.0.0.1:39415\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-hyunmlo_\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-h5k85thr\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1562,7 +1571,7 @@ "
\n", - " Comm: tcp://127.0.0.1:37031\n", + " Comm: tcp://127.0.0.1:46095\n", " \n", " Total threads: 1\n", @@ -1525,7 +1534,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/36069/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/33539/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1533,13 +1542,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:33071\n", + " Nanny: tcp://127.0.0.1:36095\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-7mepsonn\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-h58vexyw\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1607,7 +1616,7 @@ "
\n", - " Comm: tcp://127.0.0.1:36105\n", + " Comm: tcp://127.0.0.1:38261\n", " \n", " Total threads: 1\n", @@ -1570,7 +1579,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/44779/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/36037/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1578,13 +1587,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:40469\n", + " Nanny: tcp://127.0.0.1:38413\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-axla5c2a\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-3bsojucu\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1652,7 +1661,7 @@ "
\n", - " Comm: tcp://127.0.0.1:44531\n", + " Comm: tcp://127.0.0.1:35603\n", " \n", " Total threads: 1\n", @@ -1615,7 +1624,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/35897/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/38869/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1623,13 +1632,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:46811\n", + " Nanny: tcp://127.0.0.1:42293\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-eofiugs7\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-pzfx2tpy\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1697,7 +1706,7 @@ "
\n", - " Comm: tcp://127.0.0.1:37401\n", + " Comm: tcp://127.0.0.1:43693\n", " \n", " Total threads: 1\n", @@ -1660,7 +1669,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/38933/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/36399/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1668,13 +1677,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:40261\n", + " Nanny: tcp://127.0.0.1:46081\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-n_94cyka\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-bnt85vbc\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1742,7 +1751,7 @@ "
\n", - " Comm: tcp://127.0.0.1:40447\n", + " Comm: tcp://127.0.0.1:40979\n", " \n", " Total threads: 1\n", @@ -1705,7 +1714,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/42171/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/46217/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1713,13 +1722,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:42405\n", + " Nanny: tcp://127.0.0.1:43345\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-kq_bvquj\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-uy_ty452\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -1777,915 +1786,2375 @@ " \n", " \n", " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 8

\n", + "
\n", + "
\n", - " Comm: tcp://127.0.0.1:35993\n", + " Comm: tcp://127.0.0.1:40139\n", " \n", " Total threads: 1\n", @@ -1750,7 +1759,7 @@ "
\n", - " Dashboard: /node/ood-vn9/39373/proxy/45281/status\n", + " Dashboard: /node/ood-vn17/6784/proxy/43841/status\n", " \n", " Memory: 2.81 GiB\n", @@ -1758,13 +1767,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:40817\n", + " Nanny: tcp://127.0.0.1:33231\n", "
\n", - " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-9lsegioa\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-0zi0n_ae\n", "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - "\n", + " \n", "\n", - " \n", - " \n", - "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:46631\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/42163/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:37799\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-_n8hfc07\n", + "
\n", " \n", + "
\n", + " \n", " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 9

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "\n", - " \n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "client = climtas.nci.Client()\n", - "client" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "77aa0183-4e5d-4e5e-a7b7-87aa47d38c21", - "metadata": {}, - "outputs": [], - "source": [ - "so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')\n", - "\n", - "def gen_json(u, json_dir):\n", - " with fs.open(u, **so) as infile:\n", - " h5chunks = kerchunk.hdf.SingleHdf5ToZarr(infile, u, inline_threshold=300)\n", - " p = u.split('/')\n", - " fname = p[4]\n", - " outf = f'{json_dir}/{fname}.json'\n", - " with open(outf, 'wb') as f:\n", - " f.write(ujson.dumps(h5chunks.translate()).encode());" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3e3bdf75-2589-4103-ae09-40a1225dcf32", - "metadata": {}, - "outputs": [], - "source": [ - "# Let's define the URLs and a path to save the JSON files\n", - "urls = [\"s3://\" + f for f in temp_files]\n", - "json_path = f\"/g/data/w35/ccc561/CAFE60/json/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f9523e2f-a07e-411e-ad38-76ac5e57f390", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1min 37s, sys: 14.4 s, total: 1min 52s\n", - "Wall time: 7min 56s\n" - ] - }, - { - "data": { - "text/plain": [ - "(None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", 
- " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " 
None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - 
" None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None,\n", - " None)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "dask.compute(*[dask.delayed(gen_json)(u, json_path) for u in urls], retries=10);" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2af7000e-b263-4177-88ec-b9108b02a0b7", - "metadata": {}, - "outputs": [], - "source": [ - "json_list = sorted(glob(f\"{json_path}/temp*.json\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "d7c93337-6577-4632-ad49-b1f7cf4a04b8", - "metadata": {}, - "outputs": [], - "source": [ - "mzz = kerchunk.combine.MultiZarrToZarr(json_list[0:50], \n", - " remote_protocol='s3',\n", - " remote_options={'anon' : 'True'}, #JSON files \n", - " xarray_open_kwargs={\n", - " 'decode_cf' : False,\n", - " 'mask_and_scale' : False,\n", - " 'decode_times' : False,\n", - " 'use_cftime' : False,\n", - " 'drop_variables': ['time_bounds'],\n", - " 'decode_coords' : False\n", - " },\n", - " xarray_concat_args={\n", - " \"data_vars\": \"minimal\",\n", - " \"coords\": \"minimal\",\n", - " \"compat\": \"override\",\n", - " \"join\": \"override\",\n", - " \"combine_attrs\": \"override\",\n", - " \"dim\": \"time\"\n", - " }\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "cae02cb2-f5fd-490e-bb81-8045a9ba8601", - "metadata": {}, - "outputs": [ - { - "ename": "IndexError", - "evalue": "list index out of range", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/local/w35/ccc561/tmp/ipykernel_3691522/3306629662.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmzz\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranslate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{json_path}/temp.atmos_isobaric.daily.CAFE60.json\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/kerchunk/combine.py\u001b[0m in \u001b[0;36mtranslate\u001b[0;34m(self, outpath, template_count)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m 
\u001b[0mds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_determine_dims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mds0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_consolidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemplate_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemplate_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/kerchunk/combine.py\u001b[0m in \u001b[0;36m_build_output\u001b[0;34m(self, ds, ds0, fss)\u001b[0m\n\u001b[1;32m 172\u001b[0m consolidated=False) # fills in metadata&coords\n\u001b[1;32m 173\u001b[0m \u001b[0mz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzarr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen_group\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 174\u001b[0;31m \u001b[0maccum_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat_dims\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextra_dims\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;31m# only ever one dim for now\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0macc_len\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_coord\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccum_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range" - ] - } - ], - "source": [ - "mzz.translate(f\"{json_path}/temp.atmos_isobaric.daily.CAFE60.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ef83aa5-9e99-4dce-b015-cbbc34ee9fcb", - "metadata": {}, - "outputs": [], - "source": [ - "temp_files[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e97994e-0076-40e6-bd51-d3bc444fe392", - "metadata": {}, - "outputs": [], - "source": [ - "len(json_list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c889c116-7bf4-4aca-9fee-1f2bfd880227", - "metadata": {}, - "outputs": [], - "source": [] + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:33829\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/37321/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:45867\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-ssluhfdg\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 10

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:46037\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/39309/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:39601\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-5nz7kp7g\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 11

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:34915\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/33617/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:45801\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-7m7i56_k\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 12

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:43039\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/41555/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:40035\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-vo_kug8q\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 13

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:34791\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/41359/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:38311\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-bm6rpcos\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 14

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:44509\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/44255/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:41601\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-9memqrre\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 15

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:38733\n", + " \n", + " Total threads: 1\n", + "
\n", + " Dashboard: /node/ood-vn17/6784/proxy/41813/status\n", + " \n", + " Memory: 2.81 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:35471\n", + "
\n", + " Local directory: /local/w35/ccc561/tmp/dask-worker-space/worker-3l6anh6_\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = climtas.nci.Client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77aa0183-4e5d-4e5e-a7b7-87aa47d38c21", + "metadata": {}, + "outputs": [], + "source": [ + "so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')\n", + "\n", + "def gen_json(u, json_dir):\n", + " with fs.open(u, **so) as infile:\n", + " h5chunks = kerchunk.hdf.SingleHdf5ToZarr(infile, u, inline_threshold=300)\n", + " p = u.split('/')\n", + " fname = p[4]\n", + " outf = f'{json_dir}/{fname}.json'\n", + " with open(outf, 'wb') as f:\n", + " f.write(ujson.dumps(h5chunks.translate()).encode());" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e3bdf75-2589-4103-ae09-40a1225dcf32", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's define the URLs and a path to save the JSON files\n", + "urls = [\"s3://\" + f for f in temp_files]\n", + "json_path = f\"/g/data/w35/ccc561/CAFE60/json/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9523e2f-a07e-411e-ad38-76ac5e57f390", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%time\n", + "dask.compute(*[dask.delayed(gen_json)(u, json_path) for u in urls], retries=10);" + ] + }, + { + "cell_type": "markdown", + "id": "f50676f5-a358-4ced-8b2a-faf383a47623", + "metadata": {}, + "source": [ + "Now that we have encoded the data we need in JSON files, we can use those to read in the entire dataset. We first need to create a mapping between the JSON files and the data on S3. Then we use `xarray.open_mfdataset()` on this mapping. Note, we then need to specify the `zarr` engine for reading the data." 
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "2af7000e-b263-4177-88ec-b9108b02a0b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get a list of the json files we need\n",
+    "json_path = \"/g/data/w35/ccc561/CAFE60/json/\"\n",
+    "json_list = sorted(glob(f\"{json_path}/temp*.json\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "7e21d146-7f34-44d8-bdab-eb9c32dcaac3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 726/726 [00:09<00:00, 78.28it/s] \n"
+     ]
+    }
+   ],
+   "source": [
+    "import fsspec\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# Get the mapping: each reference JSON becomes a Zarr-like store whose\n",
+    "# chunks point back to byte ranges in the original netCDF files on S3\n",
+    "m_list = []\n",
+    "for js in tqdm(json_list):\n",
+    "    with open(js) as f:\n",
+    "        m_list.append(fsspec.get_mapper(\"reference://\",\n",
+    "                                        fo=ujson.load(f),\n",
+    "                                        remote_protocol=\"s3\",\n",
+    "                                        remote_options={\"anon\": True}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "7dd10915-572b-4f28-b248-89a700653baa",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 20.7 s, sys: 3.45 s, total: 24.1 s\n",
+      "Wall time: 26.5 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# Open the data with the zarr engine from the mapping objects m_list\n",
+    "ds1 = xr.open_mfdataset(m_list,\n",
+    "                        combine=\"nested\",\n",
+    "                        concat_dim=\"time\",\n",
+    "                        engine=\"zarr\",\n",
+    "                        coords=\"minimal\",\n",
+    "                        data_vars=\"minimal\",\n",
+    "                        compat=\"override\",\n",
+    "                        parallel=True\n",
+    "                        )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb5eac79-92cc-471b-b1fa-6f9eb8d942a3",
+   "metadata": {},
+   "source": [
+    "We get a fair number of warnings, but all the files in the time series are opened in less than a minute."
+   ]
+  },
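+  {
+   "cell_type": "markdown",
+   "id": "0f3a9c2e-1b7d-4e6a-9c1d-2a5b8e7f4d31",
+   "metadata": {},
+   "source": [
+    "A single mapping can also be opened on its own, which is handy for checking one file before working with the whole series. The cell below is a minimal sketch: `ds_check` is just an illustrative name, and `m_list[0]` is assumed to correspond to the first file of the time series."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d8b7a16-3c2f-4b9e-8d5a-6e1f9c0b2a47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Open a single file from its kerchunk reference as a quick check;\n",
+    "# only the metadata is read, the data itself stays on S3\n",
+    "ds_check = xr.open_dataset(m_list[0], engine=\"zarr\")\n",
+    "ds_check"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "c889c116-7bf4-4aca-9fee-1f2bfd880227",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [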
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:      (latb: 133, lonb: 145, nv: 2, ensemble: 96, lat: 90, level: 21, lon: 144, time: 22097)\n",
+       "Coordinates:\n",
+       "  * latb         (latb) float64 -90.0 -88.99 -86.97 -84.94 ... 86.97 88.99 90.0\n",
+       "  * lonb         (lonb) float64 0.0 2.5 5.0 7.5 10.0 ... 352.5 355.0 357.5 360.0\n",
+       "  * nv           (nv) float64 1.0 2.0\n",
+       "  * ensemble     (ensemble) float64 1.0 2.0 3.0 4.0 5.0 ... 93.0 94.0 95.0 96.0\n",
+       "  * lat          (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n",
+       "  * level        (level) float32 1.0 5.0 10.0 20.0 ... 700.0 850.0 925.0 1e+03\n",
+       "  * lon          (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8\n",
+       "  * time         (time) object 1960-01-01 12:00:00 ... 2020-11-30 12:00:00\n",
+       "Data variables:\n",
+       "    temp         (time, ensemble, level, lat, lon) float32 dask.array<chunksize=(1, 96, 7, 31, 49), meta=np.ndarray>\n",
+       "    time_bounds  (time, nv) timedelta64[ns] dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
+       "Attributes: (12/27)\n",
+       "    NCO:                        netCDF Operators version 4.7.8 (Homepage = ht...\n",
+       "    calendar:                    julian\n",
+       "    cm-enkf_source:             commit 0141830a243704acbf6d8c8c843b161a39ec39fa\n",
+       "    comment:                    pressure level interpolator, version 3.0, pre...\n",
+       "    contact_name:               Decadal Activity 1 - Data Assimilation\n",
+       "    control_name:               c5\n",
+       "    ...                         ...\n",
+       "    model_source:               commit 2abb29f4384e68777721fa657850587f28efd85a\n",
+       "    nominal_resoltuion:         Atmosphere delta lat = 2.02degrees ; Atmosphe...\n",
+       "    perturbation_name:          not applicable\n",
+       "    references:                 OKane, T.J., Sandery, P.A., Monselesan, D.P.,...\n",
+       "    run_variant_name:           data assimilation\n",
+       "    title:                      AccessOcean-AM2
" + ], + "text/plain": [ + "\n", + "Dimensions: (latb: 133, lonb: 145, nv: 2, ensemble: 96, lat: 90, level: 21, lon: 144, time: 22097)\n", + "Coordinates:\n", + " * latb (latb) float64 -90.0 -88.99 -86.97 -84.94 ... 86.97 88.99 90.0\n", + " * lonb (lonb) float64 0.0 2.5 5.0 7.5 10.0 ... 352.5 355.0 357.5 360.0\n", + " * nv (nv) float64 1.0 2.0\n", + " * ensemble (ensemble) float64 1.0 2.0 3.0 4.0 5.0 ... 93.0 94.0 95.0 96.0\n", + " * lat (lat) float64 -89.49 -87.98 -85.96 -83.93 ... 85.96 87.98 89.49\n", + " * level (level) float32 1.0 5.0 10.0 20.0 ... 700.0 850.0 925.0 1e+03\n", + " * lon (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8\n", + " * time (time) object 1960-01-01 12:00:00 ... 2020-11-30 12:00:00\n", + "Data variables:\n", + " temp (time, ensemble, level, lat, lon) float32 dask.array\n", + " time_bounds (time, nv) timedelta64[ns] dask.array\n", + "Attributes: (12/27)\n", + " NCO: netCDF Operators version 4.7.8 (Homepage = ht...\n", + " calendar: julian\n", + " cm-enkf_source: commit 0141830a243704acbf6d8c8c843b161a39ec39fa\n", + " comment: pressure level interpolator, version 3.0, pre...\n", + " contact_name: Decadal Activity 1 - Data Assimilation\n", + " control_name: c5\n", + " ... ...\n", + " model_source: commit 2abb29f4384e68777721fa657850587f28efd85a\n", + " nominal_resoltuion: Atmosphere delta lat = 2.02degrees ; Atmosphe...\n", + " perturbation_name: not applicable\n", + " references: OKane, T.J., Sandery, P.A., Monselesan, D.P.,...\n", + " run_variant_name: data assimilation\n", + " title: AccessOcean-AM2" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds1" + ] + }, + { + "cell_type": "markdown", + "id": "9e940022-359a-4690-86d4-7f4ab922c126", + "metadata": {}, + "source": [ + "Because Python will need to go and read the data on the cloud for each computation, it is slower than with local data. Below is an example of the time it would take to create an ensemble mean of half the timeseries, at one level roughly over Australia." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3f108330-2dba-4ac9-91c6-0607a7feb9dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'temp' (time: 10000, lat: 20, lon: 12)>\n",
+       "dask.array<mean_agg-aggregate, shape=(10000, 20, 12), dtype=float32, chunksize=(1, 12, 11), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float64 -43.48 -41.46 -39.44 -37.42 ... -9.101 -7.079 -5.056\n",
+       "    level    float32 1.0\n",
+       "  * lon      (lon) float64 121.2 123.8 126.2 128.8 ... 141.2 143.8 146.2 148.8\n",
+       "  * time     (time) object 1960-01-01 12:00:00 ... 1987-05-18 12:00:00
" + ], + "text/plain": [ + "\n", + "dask.array\n", + "Coordinates:\n", + " * lat (lat) float64 -43.48 -41.46 -39.44 -37.42 ... -9.101 -7.079 -5.056\n", + " level float32 1.0\n", + " * lon (lon) float64 121.2 123.8 126.2 128.8 ... 141.2 143.8 146.2 148.8\n", + " * time (time) object 1960-01-01 12:00:00 ... 1987-05-18 12:00:00" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp1 = ds1[\"temp\"].isel(level=0,time=slice(0,10000)).sel(lon=slice(120,150), lat=slice(-45,-5))\n", + "temp1_mean = temp1.mean(dim=\"ensemble\")\n", + "temp1_mean" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fca5b2cc-230b-4dc1-bec9-16cec18ae637", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8min 53s, sys: 1min 19s, total: 10min 13s\n", + "Wall time: 12min 38s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'temp' (time: 10000, lat: 20, lon: 12)>\n",
+       "array([[[235.98643, 236.0106 , 236.02881, ..., 236.09534, 236.07268,\n",
+       "         236.04564],\n",
+       "        [237.92964, 237.92836, 237.92194, ..., 237.76111, 237.70045,\n",
+       "         237.63062],\n",
+       "        [239.72624, 239.70036, 239.67493, ..., 239.3267 , 239.25848,\n",
+       "         239.17676],\n",
+       "        ...,\n",
+       "        [258.3558 , 258.3583 , 258.33548, ..., 257.89304, 257.83365,\n",
+       "         257.88174],\n",
+       "        [258.9962 , 259.01123, 259.00974, ..., 258.38364, 258.42105,\n",
+       "         258.45718],\n",
+       "        [259.57407, 259.62177, 259.62582, ..., 258.81683, 258.80463,\n",
+       "         258.77777]],\n",
+       "\n",
+       "       [[240.83653, 240.89354, 240.95435, ..., 241.2726 , 241.29076,\n",
+       "         241.30333],\n",
+       "        [242.51823, 242.5636 , 242.60957, ..., 242.74242, 242.72713,\n",
+       "         242.70874],\n",
+       "        [244.10077, 244.12251, 244.14238, ..., 244.10237, 244.08421,\n",
+       "         244.04791],\n",
+       "...\n",
+       "        [259.68243, 259.63416, 259.61905, ..., 259.52643, 259.5286 ,\n",
+       "         259.53375],\n",
+       "        [260.20752, 260.18234, 260.2126 , ..., 260.10892, 260.13065,\n",
+       "         260.2048 ],\n",
+       "        [260.702  , 260.718  , 260.7333 , ..., 260.57236, 260.62637,\n",
+       "         260.73703]],\n",
+       "\n",
+       "       [[234.87727, 234.89783, 234.90929, ..., 234.8871 , 234.8867 ,\n",
+       "         234.85811],\n",
+       "        [236.40094, 236.39925, 236.39099, ..., 236.32133, 236.31029,\n",
+       "         236.2804 ],\n",
+       "        [237.8614 , 237.83423, 237.8133 , ..., 237.77417, 237.80426,\n",
+       "         237.74947],\n",
+       "        ...,\n",
+       "        [259.54483, 259.52542, 259.5184 , ..., 259.49985, 259.4794 ,\n",
+       "         259.4852 ],\n",
+       "        [260.0524 , 260.05325, 260.06937, ..., 260.08627, 260.08524,\n",
+       "         260.17264],\n",
+       "        [260.4982 , 260.53256, 260.55963, ..., 260.5532 , 260.60464,\n",
+       "         260.73898]]], dtype=float32)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float64 -43.48 -41.46 -39.44 -37.42 ... -9.101 -7.079 -5.056\n",
+       "    level    float32 1.0\n",
+       "  * lon      (lon) float64 121.2 123.8 126.2 128.8 ... 141.2 143.8 146.2 148.8\n",
+       "  * time     (time) object 1960-01-01 12:00:00 ... 1987-05-18 12:00:00
" + ], + "text/plain": [ + "\n", + "array([[[235.98643, 236.0106 , 236.02881, ..., 236.09534, 236.07268,\n", + " 236.04564],\n", + " [237.92964, 237.92836, 237.92194, ..., 237.76111, 237.70045,\n", + " 237.63062],\n", + " [239.72624, 239.70036, 239.67493, ..., 239.3267 , 239.25848,\n", + " 239.17676],\n", + " ...,\n", + " [258.3558 , 258.3583 , 258.33548, ..., 257.89304, 257.83365,\n", + " 257.88174],\n", + " [258.9962 , 259.01123, 259.00974, ..., 258.38364, 258.42105,\n", + " 258.45718],\n", + " [259.57407, 259.62177, 259.62582, ..., 258.81683, 258.80463,\n", + " 258.77777]],\n", + "\n", + " [[240.83653, 240.89354, 240.95435, ..., 241.2726 , 241.29076,\n", + " 241.30333],\n", + " [242.51823, 242.5636 , 242.60957, ..., 242.74242, 242.72713,\n", + " 242.70874],\n", + " [244.10077, 244.12251, 244.14238, ..., 244.10237, 244.08421,\n", + " 244.04791],\n", + "...\n", + " [259.68243, 259.63416, 259.61905, ..., 259.52643, 259.5286 ,\n", + " 259.53375],\n", + " [260.20752, 260.18234, 260.2126 , ..., 260.10892, 260.13065,\n", + " 260.2048 ],\n", + " [260.702 , 260.718 , 260.7333 , ..., 260.57236, 260.62637,\n", + " 260.73703]],\n", + "\n", + " [[234.87727, 234.89783, 234.90929, ..., 234.8871 , 234.8867 ,\n", + " 234.85811],\n", + " [236.40094, 236.39925, 236.39099, ..., 236.32133, 236.31029,\n", + " 236.2804 ],\n", + " [237.8614 , 237.83423, 237.8133 , ..., 237.77417, 237.80426,\n", + " 237.74947],\n", + " ...,\n", + " [259.54483, 259.52542, 259.5184 , ..., 259.49985, 259.4794 ,\n", + " 259.4852 ],\n", + " [260.0524 , 260.05325, 260.06937, ..., 260.08627, 260.08524,\n", + " 260.17264],\n", + " [260.4982 , 260.53256, 260.55963, ..., 260.5532 , 260.60464,\n", + " 260.73898]]], dtype=float32)\n", + "Coordinates:\n", + " * lat (lat) float64 -43.48 -41.46 -39.44 -37.42 ... -9.101 -7.079 -5.056\n", + " level float32 1.0\n", + " * lon (lon) float64 121.2 123.8 126.2 128.8 ... 141.2 143.8 146.2 148.8\n", + " * time (time) object 1960-01-01 12:00:00 ... 1987-05-18 12:00:00" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "temp1_mean.load()" + ] + }, + { + "cell_type": "markdown", + "id": "440c4081-312a-42aa-a050-185ffebd0247", + "metadata": {}, + "source": [ + "## Conclusion\n", + "We can store the JSON files for the required variables centrally on ua8. Then users can access the data in this way:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e99cb7e-72eb-4984-8924-af2d1770cd28", + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "import xarray as xr\n", + "import ujson\n", + "\n", + "# Get a list of the json files we need\n", + "json_path = f\"/g/data/w35/ccc561/CAFE60/json/\" # Update the path to the central location\n", + "json_list = sorted(glob(f\"{json_path}/temp*.json\")) # Update the path to the variable you need.\n", + "\n", + "# Get the mapping\n", + "m_list = []\n", + "for js in json_list:\n", + " with open(js) as f:\n", + " m_list.append(fsspec.get_mapper(\"reference://\",\n", + " fo=ujson.load(f),\n", + " remote_protocol=\"s3\",\n", + " remote_options={\"anon\":True}))\n", + "ds1 = xr.open_mfdataset(m_list,\n", + " combine=\"nested\",\n", + " concat_dim=\"time\",\n", + " engine=\"zarr\",\n", + " coords=\"minimal\",\n", + " data_vars=\"minimal\",\n", + " compat=\"override\",\n", + " parallel=True\n", + " )" + ] } ], "metadata": {