From be22083143edb797ee8e4e6fbf2698627b425dca Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Feb 2025 22:48:23 +0800 Subject: [PATCH 001/129] added objid functions --- pyproject.toml | 7 +- src/h5json/__init__.py | 8 + src/h5json/hdf5db.py | 21 +- src/h5json/objid.py | 485 ++++++++++++++++++++++++++++++++++++++++ test/unit/objid_test.py | 199 +++++++++++++++++ 5 files changed, 707 insertions(+), 13 deletions(-) create mode 100644 src/h5json/objid.py create mode 100755 test/unit/objid_test.py diff --git a/pyproject.toml b/pyproject.toml index bcba8205..5ddb024f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,17 +19,18 @@ authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] requires-python = ">=3.8" dependencies = [ - "h5py >=3.10", + "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", "numpy >=1.20,<2.0.0; python_version=='3.8'", ] + dynamic = ["version"] [project.urls] -Homepage = "https://hdf5-json.readthedocs.io" -Documentation = "https://hdf5-json.readthedocs.io" +Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" +Documentation = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" Source = "https://github.com/HDFGroup/hdf5-json" "Bug Reports" = "https://github.com/HDFGroup/hdf5-json/issues" Social = "https://twitter.com/hdf5" diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index 704d2411..d4a7f781 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -21,6 +21,14 @@ from .hdf5dtype import getTypeResponse from .hdf5dtype import getItemSize from .hdf5dtype import createDataType +from .objid import createObjId +from .objid import getCollectionForId +from .objid import isObjId +from .objid import isS3ObjKey +from .objid import getS3Key +from .objid import getObjId +from .objid import isSchema2Id +from .objid import isRootObjId from .hdf5db 
import Hdf5db from . import _version diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 27f20946..676dbef5 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -19,6 +19,7 @@ import json import logging from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .objid import createObjId from .apiversion import _apiver @@ -561,7 +562,7 @@ def initFile(self): self.log.info("initializing file") if not self.root_uuid: - self.root_uuid = str(uuid.uuid1()) + self.root_uuid = createObjId() self.dbGrp.attrs["rootUUID"] = self.root_uuid self.dbGrp.create_group("{groups}") self.dbGrp.create_group("{datasets}") @@ -593,21 +594,21 @@ def visit(self, path, obj): msg = "Unknown object type: " + __name__ + " found during scan of HDF5 file" self.log.error(msg) raise IOError(errno.EIO, msg) - uuid1 = uuid.uuid1() # create uuid - id = str(uuid1) + obj_id = createObjId() # create uuid + addrGrp = self.dbGrp["{addr}"] if not self.readonly: # storing db in the file itself, so we can link to the object directly - col[id] = obj.ref # save attribute ref to object + col[obj_id] = obj.ref # save attribute ref to object else: # store path to object - col[id] = obj.name + col[obj_id] = obj.name addr = h5py.h5o.get_info(obj.id).addr # store reverse map as an attribute - addrGrp.attrs[str(addr)] = id + addrGrp.attrs[str(addr)] = obj_id # - # Get Datset creation properties + # Get Dataset creation properties # def getDatasetCreationProps(self, dset_uuid): prop_list = {} @@ -1087,7 +1088,7 @@ def createCommittedType(self, datatype, obj_uuid=None): raise IOError(errno.EPERM, msg) datatypes = self.dbGrp["{datatypes}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = self.createTypeFromItem(datatype) datatypes[obj_uuid] = dt @@ -2715,7 +2716,7 @@ def createDataset( raise IOError(errno.EPERM, msg) datasets = self.dbGrp["{datasets}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = None item = {} fillvalue = 
None @@ -3490,7 +3491,7 @@ def createGroup(self, obj_uuid=None): raise IOError(errno.EPERM, msg) groups = self.dbGrp["{groups}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() newGroup = groups.create_group(obj_uuid) # store reverse map as an attribute addr = h5py.h5o.get_info(newGroup.id).addr diff --git a/src/h5json/objid.py b/src/h5json/objid.py new file mode 100644 index 00000000..7a98a5b7 --- /dev/null +++ b/src/h5json/objid.py @@ -0,0 +1,485 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HDF (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# objID: +# id (uuid) related functions +# + + +import hashlib +import uuid + +S3_URI = "s3://" +FILE_URI = "file://" +AZURE_URI = "blob.core.windows.net/" # preceded with "https://" +UUID_LEN = 36 # length for uuid strings + + + +def _getStorageProtocol(uri): + """ returns 's3://', 'file://', or 'https://...net/' prefix if present. 
+ If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer + (references Azure blob storage), return: https://myaccount.blob.core.windows.net/ + otherwise None """ + + if not uri: + protocol = None + elif uri.startswith(S3_URI): + protocol = S3_URI + elif uri.startswith(FILE_URI): + protocol = FILE_URI + elif uri.startswith("https://") and uri.find(AZURE_URI) > 0: + n = uri.find(AZURE_URI) + len(AZURE_URI) + protocol = uri[:n] + elif uri.find("://") >= 0: + raise ValueError(f"storage uri: {uri} not supported") + else: + protocol = None + return protocol + + +def _getBaseName(uri): + """ Return the part of the URI after the storage protocol (if any) """ + + protocol = _getStorageProtocol(uri) + if not protocol: + return uri + else: + return uri[len(protocol):] + +def _getPrefixForCollection(collection): + """ Return prefix character for given collection type """ + collection = collection.lower() + + if collection in ("group", "groups"): + return 'g' + elif collection in ("dataset", "datasets"): + return 'd' + elif collection in ("datatype", "datatypes"): + return 't' + elif collection in ("chunk", "chunks"): + return 'c' + else: + raise ValueError(f"unexpected collection type: {collection}") + + +def getIdHash(id): + """Return md5 prefix based on id value""" + m = hashlib.new("md5") + m.update(id.encode("utf8")) + hexdigest = m.hexdigest() + return hexdigest[:5] + + +def isSchema2Id(id): + """return true if this is a v2 id""" + # v1 ids are in the standard UUID format: 8-4-4-4-12 + # v2 ids are in the non-standard: 8-8-4-6-6 + parts = id.split("-") + if len(parts) != 6: + raise ValueError(f"Unexpected id formation for uuid: {id}") + if len(parts[2]) == 8: + return True + else: + return False + + +def getIdHexChars(id): + """get the hex chars of the given id""" + if id[0] == "c": + # don't include chunk index + index = id.index("_") + parts = id[0:index].split("-") + else: + parts = id.split("-") + if len(parts) != 6: + raise 
ValueError(f"Unexpected id format for uuid: {id}") + return "".join(parts[1:]) + + +def hexRot(ch): + """rotate hex character by 8""" + return format((int(ch, base=16) + 8) % 16, "x") + + +def isRootObjId(id): + """returns true if this is a root id (only for v2 schema)""" + if not isSchema2Id(id): + raise ValueError("isRootObjId can only be used with v2 ids") + validateUuid(id) # will throw ValueError exception if not a objid + if id[0] != "g": + return False # not a group + token = getIdHexChars(id) + # root ids will have last 16 chars rotated version of the first 16 + is_root = True + for i in range(16): + if token[i] != hexRot(token[i + 16]): + is_root = False + break + return is_root + + +def getRootObjId(id): + """returns root id for this objid if this is a root id + (only for v2 schema) + """ + if isRootObjId(id): + return id # this is the root id + token = list(getIdHexChars(id)) + # root ids will have last 16 chars rotated version of the first 16 + for i in range(16): + token[i + 16] = hexRot(token[i]) + token = "".join(token) + root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20] + root_id += "-" + token[20:26] + "-" + token[26:32] + + return root_id + + +def createObjId(obj_type=None, root_id=None): + """ create a new objid + + if obj_type is None, return just a bare uuid. + Otherwise a hsds v2 schema obj_id will be created. + In this case obj_type should be one of "groups", + "datasets", "datatypes", "chunks". If rootid is + None, a root group obj_id will be created. Otherwise the + obj_id will be a an id that has root_id as it's root. 
""" + + + prefix = None + if obj_type is None: + # just return a regular uuid + objid = str(uuid.uuid4()) + else: + + prefix = _getPrefixForCollection(obj_type) + # schema v2 + salt = uuid.uuid4().hex + # take a hash to randomize the uuid + token = list(hashlib.sha256(salt.encode()).hexdigest()) + + if root_id: + # replace first 16 chars of token with first 16 chars of root id + root_hex = getIdHexChars(root_id) + token[0:16] = root_hex[0:16] + else: + if obj_type != "groups": + raise ValueError("expected 'groups' obj_type for root group id") + # use only 16 chars, but make it look a 32 char id + for i in range(16): + token[16 + i] = hexRot(token[i]) + # format as a string + token = "".join(token) + objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-" + objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32] + + return objid + + +def getS3Key(id): + """Return s3 key for given id. + + For schema v1: + A md5 prefix is added to the front of the returned key to better + distribute S3 objects. + For schema v2: + The id is converted to the pattern: "db/{rootid[0:16]}" for rootids and + "db/id[0:16]/{prefix}/id[16-32]" for other ids + Chunk ids have the chunk index added after the slash: + "db/id[0:16]/d/id[16:32]/x_y_z + + For domain id's: + Return a key with the .domain suffix and no preceding slash. + For non-default buckets, use the format: /s3_key + If the id has a storage specifier ("s3://", "file://", etc.) + include that along with the bucket name. e.g.: "s3://mybucket/a_folder/a_file.h5" + """ + + base_id = _getBaseName(id) # strip any s3://, etc. 
+ if base_id.find("/") > 0: + # a domain id + domain_suffix = ".domain.json" + index = base_id.find("/") + 1 + key = base_id[index:] + if not key.endswith(domain_suffix): + if key[-1] != "/": + key += "/" + key += domain_suffix + else: + if isSchema2Id(id): + # schema v2 id + hexid = getIdHexChars(id) + prefix = id[0] # one of g, d, t, c + if prefix not in ("g", "d", "t", "c"): + raise ValueError(f"Unexpected id: {id}") + + if isRootObjId(id): + key = f"db/{hexid[0:8]}-{hexid[8:16]}" + else: + partition = "" + if prefix == "c": + # use 'g' so that chunks will show up under their dataset + s3col = "d" + n = id.find("-") + if n > 1: + # extract the partition index if present + partition = "p" + id[1:n] + else: + s3col = prefix + key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}" + key += f"-{hexid[20:26]}-{hexid[26:32]}" + if prefix == "c": + if partition: + key += "/" + key += partition + # add the chunk coordinate + index = id.index("_") # will raise ValueError if not found + n = index + 1 + coord = id[n:] + key += "/" + key += coord + elif prefix == "g": + # add key suffix for group + key += "/.group.json" + elif prefix == "d": + # add key suffix for dataset + key += "/.dataset.json" + else: + # add key suffix for datatype + key += "/.datatype.json" + else: + # v1 id + # schema v1 id + idhash = getIdHash(id) + key = f"{idhash}-{id}" + + return key + + +def getObjId(s3key): + """Return object id given valid s3key""" + if all( + ( + len(s3key) >= 44 and s3key[0:5].isalnum(), + len(s3key) >= 44 and s3key[5] == "-", + len(s3key) >= 44 and s3key[6] in ("g", "d", "c", "t"), + ) + ): + # v1 obj keys + objid = s3key[6:] + elif s3key.endswith("/.domain.json"): + objid = "/" + s3key[: -(len("/.domain.json"))] + elif s3key.startswith("db/"): + # schema v2 object key + parts = s3key.split("/") + chunk_coord = "" # used only for chunk ids + partition = "" # likewise + token = [] + for ch in parts[1]: + if ch != "-": + token.append(ch) + + if len(parts) == 3: + # 
root id + # last part should be ".group.json" + if parts[2] != ".group.json": + raise ValueError(f"unexpected S3Key: {s3key}") + # add 16 more chars using rotated version of first 16 + for i in range(16): + token.append(hexRot(token[i])) + prefix = "g" + elif len(parts) == 5: + # group, dataset, or datatype or chunk + for ch in parts[3]: + if ch != "-": + token.append(ch) + + if parts[2] == "g" and parts[4] == ".group.json": + prefix = "g" # group json + elif parts[2] == "t" and parts[4] == ".datatype.json": + prefix = "t" # datatype json + elif parts[2] == "d": + if parts[4] == ".dataset.json": + prefix = "d" # dataset json + else: + # chunk object + prefix = "c" + chunk_coord = "_" + parts[4] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + elif len(parts) == 6: + # chunk key with partitioning + for ch in parts[3]: + if ch != "-": + token.append(ch) + if parts[2][0] != "d": + raise ValueError(f"unexpected S3Key: {s3key}") + prefix = "c" + partition = parts[4] + if partition[0] != "p": + raise ValueError(f"unexpected S3Key: {s3key}") + partition = partition[1:] # strip off the p + chunk_coord = "_" + parts[5] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + + token = "".join(token) + objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16] + objid += "-" + token[16:20] + "-" + token[20:26] + "-" + objid += token[26:32] + chunk_coord + else: + msg = f"unexpected S3Key: {s3key}" + raise ValueError(msg) + return objid + + +def isS3ObjKey(s3key): + """ return True if this is a storage key """ + valid = False + try: + objid = getObjId(s3key) + if objid: + valid = True + except KeyError: + pass # ignore + except ValueError: + pass # ignore + return valid + + +def getCollectionForId(obj_id): + """return groups/datasets/datatypes based on id""" + if not isinstance(obj_id, str): + raise ValueError("invalid object id") + collection = None + if obj_id.startswith("g-"): + collection = "groups" + elif obj_id.startswith("d-"): + collection = 
"datasets" + elif obj_id.startswith("t-"): + collection = "datatypes" + else: + raise ValueError("not a collection id") + return collection + + +def validateUuid(id, obj_class=None): + """ verify the UUID is well-formed + schema can be: + None: expecting ordinary UUID + "v1": expecting HSDS v1 format + "v2": expecting HSDS v2 format + if set obj_class can be one of "groups", "datasets", "datatypes" + """ + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) < UUID_LEN: + raise ValueError("id is too short to be an object identifier") + if len(id) == UUID_LEN: + if obj_class: + # expected a prefix + raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}") + else: + # does this have a v1 schema hash tag? + # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + if id[:5].isalnum() and id[5] == '-': + id = id[6:] # trim off the hash tag + # validate prefix + if id[0] not in ("g", "d", "t", "c"): + raise ValueError("Unexpected prefix") + if id[0] != "c" and id[1] != "-": + # chunk ids may have a partition index following the c + raise ValueError("Unexpected prefix") + if obj_class is not None: + obj_class = obj_class.lower() + if id[0] != _getPrefixForCollection(obj_class): + raise ValueError(f"unexpected object id {id} for collection: {obj_class}") + if id[0] == "c": + # trim the type char and any partition id + n = id.find("-") + if n == -1: + raise ValueError("Invalid chunk id") + + # trim the chunk index for chunk ids + m = id.find("_") + if m == -1: + raise ValueError("Invalid chunk id") + n += 1 + id = "c-" + id[n:m] + id = id[2:] + if len(id) != UUID_LEN: + # id should be 36 now + raise ValueError("Unexpected id length") + + for ch in id: + if ch.isalnum(): + continue + if ch == "-": + continue + raise ValueError(f"Unexpected character in uuid: {ch}") + + +def isValidUuid(id, obj_class=None): + try: + validateUuid(id, obj_class) + return True + except ValueError: + return False + + +def isValidChunkId(id): + if not 
isValidUuid(id): + return False + if id[0] != "c": + return False + return True + + +def getClassForObjId(id): + """return domains/chunks/groups/datasets/datatypes based on id""" + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) == 0: + raise ValueError("Empty string") + if id[0] == "/": + return "domains" + if isValidChunkId(id): + return "chunks" + else: + return getCollectionForId(id) + + +def isObjId(id): + """return true if uuid or domain""" + if not isinstance(id, str) or len(id) == 0: + return False + if id.find("/") > 0: + # domain id is any string in the form / + return True + return isValidUuid(id) + + +def getUuidFromId(id): + """strip off the type prefix ('g-' or 'd-', or 't-') + and return the uuid part""" + if len(id) == UUID_LEN: + # just a uuid + return id + elif len(id) == UUID_LEN + 2: + # 'g-', 'd-', or 't-' prefix + return id[2:] + else: + raise ValueError(f"Unexpected obj_id: {id}") + + + \ No newline at end of file diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py new file mode 100755 index 00000000..7c02482f --- /dev/null +++ b/test/unit/objid_test.py @@ -0,0 +1,199 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import sys + +from h5json.objid import isRootObjId, isValidUuid, validateUuid +from h5json.objid import createObjId, getCollectionForId +from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id + + +class IdUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IdUtilTest, self).__init__(*args, **kwargs) + # main + + def testCreateObjId(self): + id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") + ids = set() # we'll use this to verify we always get a unique id + # create just a plain uuid... + id = createObjId() + self.assertEqual(len(id) + 2, id_len) + # create a v2 root_id + root_id = createObjId(obj_type="groups") + self.assertEqual(len(root_id), id_len) + for obj_type in ("groups", "datasets", "datatypes", "chunks"): + for i in range(100): + id = createObjId(obj_type=obj_type, root_id=root_id) + self.assertEqual(len(id), id_len) + self.assertTrue(id[0] in ("g", "d", "t", "c")) + self.assertEqual(id[1], "-") + ids.add(id) + + self.assertEqual(len(ids), 400) + try: + createObjId(obj_type="bad_class") + self.assertTrue(False) # should throw exception + except ValueError: + pass # expected + + def testIsValidUuid(self): + group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema + group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" + root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" + dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema + dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" + ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema + ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" + chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema + chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" + domain_id = "mybucket/bob/mydata.h5" + s3_domain_id = "s3://mybucket/bob/mydata.h5" + file_domain_id = "file://mybucket/bob/mydata.h5" + 
azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" + valid_id_map = { + group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", + dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", + dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", + ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", + ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", + chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", + chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", + domain_id: "bob/mydata.h5/.domain.json", + s3_domain_id: "bob/mydata.h5/.domain.json", + file_domain_id: "bob/mydata.h5/.domain.json", + azure_domain_id: "bob/mydata.h5/.domain.json", } + + bad_ids = ("g-1e76d862", "/bob/mydata.h5") + + self.assertTrue(isValidUuid(group1_id)) + self.assertFalse(isSchema2Id(group1_id)) + self.assertTrue(isValidUuid(group1_id, obj_class="Group")) + self.assertTrue(isValidUuid(group1_id, obj_class="group")) + self.assertTrue(isValidUuid(group1_id, obj_class="groups")) + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isValidUuid(root_id, obj_class="Group")) + self.assertTrue(isValidUuid(root_id, obj_class="group")) + self.assertTrue(isValidUuid(root_id, obj_class="groups")) + self.assertTrue(isRootObjId(root_id)) + self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) + self.assertFalse(isSchema2Id(dataset1_id)) + self.assertTrue(isValidUuid(ctype1_id, obj_class="datatypes")) + self.assertFalse(isSchema2Id(ctype1_id)) + self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) + self.assertFalse(isSchema2Id(chunk1_id)) + self.assertTrue(isValidUuid(group2_id)) + self.assertTrue(isSchema2Id(group2_id)) + self.assertTrue(isValidUuid(group2_id, obj_class="Group")) + self.assertTrue(isValidUuid(group2_id, obj_class="group")) + self.assertTrue(isValidUuid(group2_id, obj_class="groups")) + 
self.assertFalse(isRootObjId(group2_id)) + self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) + self.assertTrue(isSchema2Id(dataset2_id)) + self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) + self.assertTrue(isSchema2Id(ctype2_id)) + self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) + self.assertTrue(isSchema2Id(chunk2_id)) + validateUuid(group1_id) + try: + isRootObjId(group1_id) + self.assertTrue(False) + except ValueError: + # only works for v2 schema + pass # expected + + for item in valid_id_map: + self.assertTrue(isObjId(item)) + s3key = getS3Key(item) + self.assertTrue(s3key[0] != "/") + self.assertTrue(isS3ObjKey(s3key)) + expected = valid_id_map[item] + self.assertEqual(s3key, expected) + if item.find("/") > 0: + continue # bucket name gets lost when domain ids get converted to s3keys + objid = getObjId(s3key) + self.assertEqual(objid, item) + for item in bad_ids: + self.assertFalse(isValidUuid(item)) + self.assertFalse(isObjId(item)) + + def testGetCollection(self): + group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" + dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" + ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" + bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + try: + getCollectionForId(bad_id) + self.assertTrue(False) + except ValueError: + pass # expected + try: + getCollectionForId(None) + self.assertTrue(False) + except ValueError: + pass # expected + + def testSchema2Id(self): + root_id = createObjId("groups") + group_id = createObjId("groups", root_id=root_id) + dataset_id = createObjId("datasets", root_id=root_id) + ctype_id = createObjId("datatypes", root_id=root_id) + + self.assertEqual(getCollectionForId(root_id), "groups") + self.assertEqual(getCollectionForId(group_id), "groups") + 
self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + chunk_id = "c" + dataset_id[1:] + "_1_2" + chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" + + for id in (chunk_id, chunk_partition_id): + try: + getCollectionForId(id) + self.assertTrue(False) + except ValueError: + pass # expected + valid_ids = ( + group_id, + dataset_id, + ctype_id, + chunk_id, + chunk_partition_id, + root_id, + ) + s3prefix = getS3Key(root_id) + self.assertTrue(s3prefix.endswith("/.group.json")) + s3prefix = s3prefix[: -(len(".group.json"))] + for oid in valid_ids: + self.assertTrue(len(oid) >= 38) + parts = oid.split("-") + self.assertEqual(len(parts), 6) + self.assertTrue(oid[0] in ("g", "d", "t", "c")) + self.assertTrue(isSchema2Id(oid)) + if oid == root_id: + self.assertTrue(isRootObjId(oid)) + else: + self.assertFalse(isRootObjId(oid)) + + s3key = getS3Key(oid) + self.assertTrue(s3key.startswith(s3prefix)) + self.assertEqual(getObjId(s3key), oid) + self.assertTrue(isS3ObjKey(s3key)) + + +if __name__ == "__main__": + # setup test files + + unittest.main() From 28dcfc6e744b376e51b8bbf4521150eced7d4bc6 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Feb 2025 23:02:09 +0800 Subject: [PATCH 002/129] fix flake8 errors --- src/h5json/hdf5db.py | 3 +-- src/h5json/objid.py | 18 +++++++----------- test/unit/objid_test.py | 1 - 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 676dbef5..f23dc3a4 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -13,7 +13,6 @@ import time import h5py import numpy as np -import uuid import os.path as op import os import json @@ -1902,7 +1901,7 @@ def listToRef(self, data): # object reference should be in the form: / for prefix in ("datasets", "groups", "datatypes"): if data.startswith(prefix): - uuid_ref = data[len(prefix) :] + uuid_ref = data[len(prefix):] if len(uuid_ref) == (UUID_LEN + 1) and 
uuid_ref.startswith("/"): obj = self.getObjectByUuid(prefix, uuid_ref[1:]) if obj: diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 7a98a5b7..598790e0 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -24,7 +24,6 @@ UUID_LEN = 36 # length for uuid strings - def _getStorageProtocol(uri): """ returns 's3://', 'file://', or 'https://...net/' prefix if present. If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer @@ -55,7 +54,8 @@ def _getBaseName(uri): return uri else: return uri[len(protocol):] - + + def _getPrefixForCollection(collection): """ Return prefix character for given collection type """ collection = collection.lower() @@ -146,16 +146,15 @@ def getRootObjId(id): def createObjId(obj_type=None, root_id=None): - """ create a new objid - + """ create a new objid + if obj_type is None, return just a bare uuid. Otherwise a hsds v2 schema obj_id will be created. In this case obj_type should be one of "groups", "datasets", "datatypes", "chunks". If rootid is - None, a root group obj_id will be created. Otherwise the + None, a root group obj_id will be created. Otherwise the obj_id will be a an id that has root_id as it's root. """ - prefix = None if obj_type is None: # just return a regular uuid @@ -374,7 +373,7 @@ def getCollectionForId(obj_id): def validateUuid(id, obj_class=None): - """ verify the UUID is well-formed + """ verify the UUID is well-formed schema can be: None: expecting ordinary UUID "v1": expecting HSDS v1 format @@ -388,7 +387,7 @@ def validateUuid(id, obj_class=None): if len(id) == UUID_LEN: if obj_class: # expected a prefix - raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}") + raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}") else: # does this have a v1 schema hash tag? 
# e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", @@ -480,6 +479,3 @@ def getUuidFromId(id): return id[2:] else: raise ValueError(f"Unexpected obj_id: {id}") - - - \ No newline at end of file diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py index 7c02482f..af4ac21e 100755 --- a/test/unit/objid_test.py +++ b/test/unit/objid_test.py @@ -10,7 +10,6 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import unittest -import sys from h5json.objid import isRootObjId, isValidUuid, validateUuid from h5json.objid import createObjId, getCollectionForId From 54c83d574b703981c3b60fda72ce09c92895a71c Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 8 Feb 2025 18:38:50 +0800 Subject: [PATCH 003/129] merge hsds hdf5dtype changes --- data/json/bool_attr.json | 2 +- data/json/bool_dset.json | 2 +- data/json/enum_attr.json | 2 +- data/json/enum_dset.json | 2 +- src/h5json/hdf5db.py | 44 ++- src/h5json/hdf5dtype.py | 743 ++++++++++++++++++++++++++---------- test/unit/hdf5db_test.py | 2 - test/unit/hdf5dtype_test.py | 259 ++++++++++--- 8 files changed, 801 insertions(+), 255 deletions(-) mode change 100755 => 100644 src/h5json/hdf5dtype.py diff --git a/data/json/bool_attr.json b/data/json/bool_attr.json index ff092b9a..6d4d24da 100644 --- a/data/json/bool_attr.json +++ b/data/json/bool_attr.json @@ -20,7 +20,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "members": [ + "mapping": [ { "name": "FALSE", "value": 0 diff --git a/data/json/bool_dset.json b/data/json/bool_dset.json index 29e46d80..11f19e01 100644 --- a/data/json/bool_dset.json +++ b/data/json/bool_dset.json @@ -24,7 +24,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "members": [ + "mapping": [ { "name": "FALSE", "value": 0 diff --git a/data/json/enum_attr.json b/data/json/enum_attr.json index 9e9d94a9..e39425ef 100644 --- a/data/json/enum_attr.json +++ b/data/json/enum_attr.json @@ -21,7 +21,7 @@ "class": 
"H5T_INTEGER" }, "class": "H5T_ENUM", - "members": [ + "mapping": [ { "name": "GAS", "value": 2 diff --git a/data/json/enum_dset.json b/data/json/enum_dset.json index d2afcd4a..08291696 100644 --- a/data/json/enum_dset.json +++ b/data/json/enum_dset.json @@ -25,7 +25,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "members": [ + "mapping": [ { "name": "GAS", "value": 2 diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index f23dc3a4..112fb867 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -17,7 +17,7 @@ import os import json import logging -from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .hdf5dtype import getTypeItem, createDataType, getItemSize, Reference, RegionReference from .objid import createObjId from .apiversion import _apiver @@ -73,6 +73,43 @@ _H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") +def convert_dtype(srcdt): + """Return a dtype based on input dtype, converting any Reference types from + h5json style to h5py. + """ + + if len(srcdt) > 0: + fields = [] + for name in srcdt.fields: + item = srcdt.fields[name] + # item is a tuple of dtype and integer offset + field_dt = convert_dtype(item[0]) + fields.append((name, field_dt)) + tgt_dt = np.dtype(fields) + else: + # check if this a "special dtype" + if srcdt.metadata and "ref" in srcdt.metadata: + if srcdt.metadata['ref'] is Reference: + tgt_dt = h5py.special_dtype(ref=h5py.Reference) + elif srcdt.metadata['ref'] is RegionReference: + tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) + else: + raise TypeError(f"Unexpected ref type: {srcdt}") + elif srcdt.metadata and "vlen" in srcdt.metadata: + src_vlen = srcdt.metadata["vlen"] + if isinstance(src_vlen, np.dtype): + tgt_base = convert_dtype(src_vlen) + else: + tgt_base = src_vlen + tgt_dt = h5py.special_dtype(vlen=tgt_base) + elif srcdt.kind == "U": + # use vlen for unicode strings + tgt_dt = h5py.special_dtype(vlen=str) + else: + tgt_dt = srcdt # no conversion needed + return tgt_dt + + def 
visitObj(path, obj): hdf5db = _db[obj.file.filename] hdf5db.visit(path, obj) @@ -1476,6 +1513,7 @@ def makeAttribute(self, obj, attr_name, shape, attr_type, value): self.makeNullTermStringAttribute(obj, attr_name, strLength, value) else: typeItem = getTypeItem(dt) + dt = convert_dtype(dt) value = self.toRef(rank, typeItem, value) # create numpy array @@ -1725,6 +1763,7 @@ def toNumPyValue(self, typeItem, src, des): baseType = typeItem["base"] dt = self.createTypeFromItem(baseType) + dt = convert_dtype(dt) des = np.array(src, dtype=dt) elif typeClass == "H5T_REFERENCE": @@ -2193,7 +2232,8 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): raise IOError(errno.EIO, msg) if isinstance(slices, (list, tuple)) and len(slices) != rank: - msg = "Unexpected error: getDatasetValuesByUuid: number of dims in selection not same as rank" + msg = "Unexpected error: getDatasetValuesByUuid: " + msg += "number of dims in selection not same as rank" self.log.error(msg) raise IOError(errno.EIO, msg) diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py old mode 100755 new mode 100644 index 9f867f27..fecf38f0 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -2,37 +2,199 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. 
# ############################################################################## -""" -This class is used to map between HDF5 type representations and numpy types - -""" +import weakref import numpy as np -from h5py.h5t import special_dtype -from h5py.h5t import check_dtype -from h5py.h5r import Reference -from h5py.h5r import RegionReference + + +class Reference: + """ + Represents an HDF5 object reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + # TBD: this is not consistent with hsds or h5py... + if not isinstance(self._id.id, str): + raise TypeError("Expected string id") + item = None + + collection_type = self._id.collection_type + item = f"{collection_type}/{self._id.id}" + return item + + def tolist(self): + if type(self._id.id) is not str: + raise TypeError("Expected string id") + if self._id.objtype_code == "d": + return [ + ("datasets/" + self._id.id), + ] + elif self._id.objtype_code == "g": + return [ + ("groups/" + self._id.id), + ] + elif self._id.objtype_code == "t": + return [ + ("datatypes/" + self._id.id), + ] + else: + raise TypeError("Unexpected id type") + + +class RegionReference: + """ + Represents an HDF5 region reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + return "" + + +def 
special_dtype(**kwds): + """Create a new h5py "special" type. Only one keyword may be given. + + Legal keywords are: + + vlen = basetype + Base type for HDF5 variable-length datatype. This can be Python + str type or instance of np.dtype. + Example: special_dtype( vlen=str ) + + enum = (basetype, values_dict) + Create a NumPy representation of an HDF5 enumerated type. Provide + a 2-tuple containing an (integer) base dtype and a dict mapping + string names to integer values. + + ref = Reference | RegionReference + Create a NumPy representation of an HDF5 object or region reference + type.""" + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, val = kwds.popitem() + + if name == "vlen": + + return np.dtype("O", metadata={"vlen": val}) + + if name == "enum": + + try: + dt, enum_vals = val + except TypeError: + msg = "Enums must be created from a 2-tuple " + msg += "(basetype, values_dict)" + raise TypeError(msg) + + dt = np.dtype(dt) + if dt.kind not in "iu": + raise TypeError("Only integer types can be used as enums") + + return np.dtype(dt, metadata={"enum": enum_vals}) + + if name == "ref": + dt = None + if val is Reference: + dt = np.dtype("S48", metadata={"ref": Reference}) + elif val is RegionReference: + dt = np.dtype("S48", metadata={"ref": RegionReference}) + else: + raise ValueError("Ref class must be Reference or RegionReference") + + return dt + + raise TypeError(f'Unknown special type "{name}"') + + +def check_dtype(**kwds): + """Check a dtype for h5py special type "hint" information. Only one + keyword may be given. + + vlen = dtype + If the dtype represents an HDF5 vlen, returns the Python base class. + Currently only builting string vlens (str) are supported. Returns + None if the dtype does not represent an HDF5 vlen. + + enum = dtype + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. 
Returns None if the dtype does + not represent an HDF5 enumerated type. + + ref = dtype + If the dtype represents an HDF5 reference type, returns the reference + class (either Reference or RegionReference). Returns None if the dtype + does not represent an HDF5 reference type. + """ + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, dt = kwds.popitem() + + if name not in ("vlen", "enum", "ref"): + raise TypeError('Unknown special type "%s"' % name) + + try: + return dt.metadata[name] + except TypeError: + return None + except KeyError: + return None def getTypeResponse(typeItem): """ Convert the given type item to a predefined type string for - predefined integer and floating point types ("H5T_STD_I64LE", et. al). - For compound types, recursively iterate through the typeItem and do same - conversion for fields of the compound type. - """ + predefined integer and floating point types ("H5T_STD_I64LE", et. al). + For compound types, recursively iterate through the typeItem and do + same conversion for fields of the compound type.""" response = None if "uuid" in typeItem: # committed type, just return uuid response = "datatypes/" + typeItem["uuid"] - elif typeItem["class"] == "H5T_INTEGER" or typeItem["class"] == "H5T_FLOAT": + elif typeItem["class"] in ("H5T_INTEGER", "H5T_FLOAT"): # just return the class and base for pre-defined types response = {} response["class"] = typeItem["class"] @@ -52,7 +214,7 @@ def getTypeResponse(typeItem): for field in typeItem["fields"]: fieldItem = {} fieldItem["name"] = field["name"] - fieldItem["type"] = getTypeResponse(field["type"]) # recursive call + fieldItem["type"] = getTypeResponse(field["type"]) # recurse call fieldList.append(fieldItem) response["fields"] = fieldList else: @@ -60,7 +222,7 @@ def getTypeResponse(typeItem): for k in typeItem.keys(): if k == "base": if isinstance(typeItem[k], dict): - response[k] = getTypeResponse(typeItem[k]) # recursive call + response[k] = 
getTypeResponse(typeItem[k]) # recurse call else: response[k] = typeItem[k] # predefined type elif k not in ("size", "base_size"): @@ -68,112 +230,12 @@ def getTypeResponse(typeItem): return response -def getItemSize(typeItem): - """ - Get size of an item in bytes. - For variable length types (e.g. variable length strings), - return the string "H5T_VARIABLE" +def getTypeItem(dt, metadata=None): """ - # handle the case where we are passed a primitive type first - if isinstance(typeItem, bytes): - typeItem = typeItem.decode("ascii") - if isinstance(typeItem, str): - for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): - if typeItem.startswith(type_prefix): - num_bits = typeItem[len(type_prefix) :] - if num_bits[-2:] in ("LE", "BE"): - num_bits = num_bits[:-2] - try: - return int(num_bits) // 8 - except ValueError: - raise TypeError("Invalid Type") - # none of the expect primative types mathched - raise TypeError("Invalid Type") - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - item_size = 0 - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - item_size = typeItem["length"] - - elif typeClass == "H5T_VLEN": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_OPAQUE": - if "size" not in typeItem: - raise KeyError("'size' not provided") - item_size = int(typeItem["size"]) - - elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = 
getItemSize(typeItem["base"]) - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("'base' must be provided for enum types") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_REFERENCE": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if type(fields) is not list: - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - # add up the size of each sub-field - for field in fields: - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "type" not in field: - raise KeyError("'type' missing from field") - subtype_size = getItemSize(field["type"]) # recursive call - if subtype_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" - break # don't need to look at the rest - - item_size += subtype_size - else: - raise TypeError("Invalid type class") - - # calculate array type - if "dims" in typeItem and type(item_size) is int: - dims = typeItem["dims"] - for dim in dims: - item_size *= dim - - return item_size - - -""" Return type info. 
For primitive types, return string with typename For compound types return array of dictionary items -""" - - -def getTypeItem(dt): - + """ predefined_int_types = { "int8": "H5T_STD_I8", "uint8": "H5T_STD_U8", @@ -184,10 +246,16 @@ def getTypeItem(dt): "int64": "H5T_STD_I64", "uint64": "H5T_STD_U64", } - predefined_float_types = {"float32": "H5T_IEEE_F32", "float64": "H5T_IEEE_F64"} + predefined_float_types = { + "float16": "H5T_IEEE_F16", + "float32": "H5T_IEEE_F32", + "float64": "H5T_IEEE_F64", + } + if not metadata and dt.metadata: + metadata = dt.metadata type_info = {} - if len(dt) > 1 or dt.names: + if len(dt) > 1: # compound type names = dt.names type_info["class"] = "H5T_COMPOUND" @@ -204,15 +272,22 @@ def getTypeItem(dt): # array type type_info["dims"] = dt.shape type_info["class"] = "H5T_ARRAY" - type_info["base"] = getTypeItem(dt.base) + type_info["base"] = getTypeItem(dt.base, metadata=metadata) elif dt.kind == "O": # vlen string or data # # check for h5py variable length extension - vlen_check = check_dtype(vlen=dt.base) - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) - ref_check = check_dtype(ref=dt.base) + vlen_check = None + if metadata and "vlen" in metadata: + vlen_check = metadata["vlen"] + if vlen_check is not None and not isinstance(vlen_check, np.dtype): + vlen_check = np.dtype(vlen_check) + + if metadata and "ref" in metadata: + ref_check = metadata["ref"] + else: + ref_check = check_dtype(ref=dt.base) + if vlen_check == bytes: type_info["class"] = "H5T_STRING" type_info["length"] = "H5T_VARIABLE" @@ -229,15 +304,15 @@ def getTypeItem(dt): type_info["size"] = "H5T_VARIABLE" type_info["base"] = getTypeItem(vlen_check) elif vlen_check is not None: - # unknown vlen type + # unknown vlen type raise TypeError("Unknown h5py vlen type: " + str(vlen_check)) elif ref_check is not None: # a reference type type_info["class"] = "H5T_REFERENCE" - if ref_check is Reference: + if ref_check.__name__ 
== "Reference": type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: + elif ref_check.__name__ == "RegionReference": type_info["base"] = "H5T_STD_REF_DSETREG" # region ref else: raise TypeError("unexpected reference type") @@ -249,14 +324,40 @@ def getTypeItem(dt): type_info["size"] = dt.itemsize type_info["tag"] = "" # todo - determine tag elif dt.base.kind == "S": - # Fixed length string type - type_info["class"] = "H5T_STRING" - type_info["charSet"] = "H5T_CSET_ASCII" + # check for object reference + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + # a reference type + type_info["class"] = "H5T_REFERENCE" + + if ref_check is Reference: + type_info["base"] = "H5T_STD_REF_OBJ" # objref + elif ref_check is RegionReference: + type_info["base"] = "H5T_STD_REF_DSETREG" # region ref + else: + raise TypeError("unexpected reference type") + else: + # Fixed length string type + type_info["class"] = "H5T_STRING" type_info["length"] = dt.itemsize + type_info["charSet"] = "H5T_CSET_ASCII" type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.base.kind == "U": # Fixed length unicode type - raise TypeError("Fixed length unicode type is not supported") + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + raise TypeError("unexpected reference type") + + # Fixed length string type with unicode support + type_info["class"] = "H5T_STRING" + + # this can be problematic if the encoding of the string is not valid, + # or reqires too many bytes. 
Use variable length strings to handle all + # UTF8 strings correctly + type_info["charSet"] = "H5T_CSET_UTF8" + # convert from UTF32 length to a fixed length + type_info["length"] = dt.itemsize + type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.kind == "b": # boolean type - h5py stores as enum @@ -265,13 +366,12 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" # this mapping is an h5py convention for boolean support - members = [{"name": "FALSE", "value": 0}, {"name": "TRUE", "value": 1}] + mapping = {"FALSE": 0, "TRUE": 1} type_info["class"] = "H5T_ENUM" - type_info["members"] = members + type_info["mapping"] = mapping base_info = {"class": "H5T_INTEGER"} base_info["base"] = "H5T_STD_I8" + byteorder type_info["base"] = base_info - elif dt.kind == "f": # floating point type type_info["class"] = "H5T_FLOAT" @@ -280,7 +380,8 @@ def getTypeItem(dt): byteorder = "BE" if dt.name in predefined_float_types: # maps to one of the HDF5 predefined types - type_info["base"] = predefined_float_types[dt.base.name] + byteorder + float_type = predefined_float_types[dt.base.name] + type_info["base"] = float_type + byteorder else: raise TypeError("Unexpected floating point type: " + dt.name) elif dt.kind == "i" or dt.kind == "u": @@ -291,14 +392,13 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" - # numpy integer type - but check to see if this is the h5py + # numpy integer type - but check to see if this is the hypy # enum extension - mapping = check_dtype(enum=dt) - - if mapping: + if metadata and "enum" in metadata: # yes, this is an enum! 
+ mapping = metadata["enum"] type_info["class"] = "H5T_ENUM" - type_info["members"] = [{"name": n, "value": v} for n, v in mapping.items()] + type_info["mapping"] = mapping if dt.name not in predefined_int_types: raise TypeError("Unexpected integer type: " + dt.name) # maps to one of the HDF5 predefined types @@ -316,11 +416,146 @@ def getTypeItem(dt): else: # unexpected kind - raise TypeError("unexpected dtype kind: " + dt.kind) + raise TypeError(f"unexpected dtype kind: {dt.kind}") return type_info +def getItemSize(typeItem): + """ + Get size of an item in bytes. + For variable length types (e.g. variable length strings), + return the string "H5T_VARIABLE" + """ + # handle the case where we are passed a primitive type first + if isinstance(typeItem, str) or isinstance(typeItem, bytes): + for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): + if typeItem.startswith(type_prefix): + nlen = len(type_prefix) + num_bits = typeItem[nlen:] + if num_bits[-2:] in ("LE", "BE"): + num_bits = num_bits[:-2] + try: + return int(num_bits) // 8 + except ValueError: + raise TypeError("Invalid Type") + # none of the expect primative types mathched + raise TypeError("Invalid Type") + if not isinstance(typeItem, dict): + raise TypeError("invalid type") + + item_size = 0 + if "class" not in typeItem: + raise KeyError("'class' not provided") + typeClass = typeItem["class"] + + if typeClass == "H5T_INTEGER": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_FLOAT": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_STRING": + if "length" not in typeItem: + raise KeyError("'length' not provided") + item_size = typeItem["length"] + + elif typeClass == "H5T_VLEN": + item_size = "H5T_VARIABLE" + elif typeClass == "H5T_OPAQUE": + if "size" not in typeItem: + raise KeyError("'size' not provided") + item_size 
= int(typeItem["size"]) + + elif typeClass == "H5T_ARRAY": + if "dims" not in typeItem: + raise KeyError("'dims' must be provided for array types") + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_ENUM": + if "base" not in typeItem: + raise KeyError("'base' must be provided for enum types") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_REFERENCE": + if "length" in typeItem: + item_size = typeItem["length"] + elif "base" in typeItem and typeItem["base"] == "H5T_STD_REF_OBJ": + # obj ref values are in the form: "groups/" or + # "datasets/" or "datatypes/" + item_size = 48 + else: + item_size = 80 # tb: just take a guess at this for now + elif typeClass == "H5T_COMPOUND": + if "fields" not in typeItem: + raise KeyError("'fields' not provided for compound type") + fields = typeItem["fields"] + if not isinstance(fields, list): + raise TypeError("Type Error: expected list type for 'fields'") + if not fields: + raise KeyError("no 'field' elements provided") + # add up the size of each sub-field + for field in fields: + if not isinstance(field, dict): + raise TypeError("Expected dictionary type for field") + if "type" not in field: + raise KeyError("'type' missing from field") + subtype_size = getItemSize(field["type"]) # recursive call + if subtype_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + break # don't need to look at the rest + + item_size += subtype_size + else: + raise TypeError("Invalid type class") + + # calculate array type + if "dims" in typeItem and isinstance(item_size, int): + dims = typeItem["dims"] + for dim in dims: + item_size *= dim + + return item_size + + +def getDtypeItemSize(dtype): + """ Return size of dtype in bytes + For variable length types (e.g. 
variable length strings), + return the string "H5T_VARIABLE + """ + item_size = 0 + if len(dtype) > 0: + # compound dtype + for i in range(len(dtype)): + sub_dt = dtype[i] + sub_dt_size = getDtypeItemSize(sub_dt) + if sub_dt_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" # return variable if any component is variable + break + item_size += sub_dt_size + else: + # primitive type + if dtype.shape: + base_size = getDtypeItemSize(dtype.base) + if base_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + else: + nelements = np.prod(dtype.shape) + item_size = base_size * nelements + else: + if dtype.metadata and "vlen" in dtype.metadata: + item_size = "H5T_VARIABLE" + else: + item_size = dtype.itemsize + return item_size + + def getNumpyTypename(hdf5TypeName, typeClass=None): predefined_int_types = { "H5T_STD_I8": "i1", @@ -332,7 +567,11 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): "H5T_STD_I64": "i8", "H5T_STD_U64": "u8", } - predefined_float_types = {"H5T_IEEE_F32": "f4", "H5T_IEEE_F64": "f8"} + predefined_float_types = { + "H5T_IEEE_F16": "f2", + "H5T_IEEE_F32": "f4", + "H5T_IEEE_F64": "f8", + } if len(hdf5TypeName) < 3: raise Exception("Type Error: invalid typename: ") @@ -356,7 +595,6 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): def createBaseDataType(typeItem): - dtRet = None if isinstance(typeItem, str): # should be one of the predefined types @@ -371,20 +609,32 @@ def createBaseDataType(typeItem): raise KeyError("'class' not provided") typeClass = typeItem["class"] + dims = "" + if "dims" in typeItem: + if typeClass != "H5T_ARRAY": + raise TypeError("'dims' only supported for integer types") + + dims = None + if isinstance(typeItem["dims"], int): + dims = typeItem["dims"] # make into a tuple + elif not isinstance(typeItem["dims"], list) and not isinstance( + typeItem["dims"], tuple + ): + raise TypeError("expected list or integer for dims") + else: + dims = typeItem["dims"] + dims = str(tuple(dims)) + if typeClass == "H5T_INTEGER": if 
"base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for integer types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_INTEGER") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_FLOAT": if "base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for floating point types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_FLOAT") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_STRING": if "length" not in typeItem: raise KeyError("'length' not provided") @@ -392,8 +642,9 @@ def createBaseDataType(typeItem): raise KeyError("'charSet' not provided") if typeItem["length"] == "H5T_VARIABLE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for variable types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if typeItem["charSet"] == "H5T_CSET_ASCII": dtRet = special_dtype(vlen=bytes) elif typeItem["charSet"] == "H5T_CSET_UTF8": @@ -408,20 +659,25 @@ def createBaseDataType(typeItem): if typeItem["charSet"] == "H5T_CSET_ASCII": type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": - raise TypeError("fixed-width unicode strings are not supported") + # use the same type_code as ascii strings + # (othewise, numpy will reserve bytes for UTF32 representation) + type_code = "S" else: raise TypeError("unexpected 'charSet' value") - dtRet = np.dtype(type_code + str(nStrSize)) # fixed size string + # a fixed size string + dtRet = np.dtype(dims + type_code + str(nStrSize)) elif typeClass == "H5T_VLEN": - if "dims" in typeItem: - raise TypeError("'dims' not supported for vlen types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if "base" not in typeItem: raise KeyError("'base' not provided") baseType = 
createBaseDataType(typeItem["base"]) dtRet = special_dtype(vlen=np.dtype(baseType)) elif typeClass == "H5T_OPAQUE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for opaque types") + if dims: + msg = "Opaque Type is not supported for variable len types" + raise TypeError(msg) if "size" not in typeItem: raise KeyError("'size' not provided") nSize = int(typeItem["size"]) @@ -429,26 +685,19 @@ def createBaseDataType(typeItem): raise TypeError("'size' must be non-negative") dtRet = np.dtype("V" + str(nSize)) elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: + if not dims: raise KeyError("'dims' must be provided for array types") if "base" not in typeItem: raise KeyError("'base' not provided") arrayBaseType = typeItem["base"] - if type(arrayBaseType) is dict: + if isinstance(arrayBaseType, dict): if "class" not in arrayBaseType: raise KeyError("'class' not provided for array base type") - if arrayBaseType["class"] not in ( - "H5T_INTEGER", - "H5T_FLOAT", - "H5T_STRING", - "H5T_COMPOUND", - ): - raise TypeError( - f"{arrayBaseType['class']}: H5T_ARRAY base type not supported." 
- ) - - dt_base = createDataType(arrayBaseType) - + type_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_COMPOUND", "H5T_ARRAY") + if arrayBaseType["class"] not in type_classes: + msg = "Array Type base type must be integer, float, string, compound or array" + raise TypeError(msg) + baseType = createDataType(arrayBaseType) if isinstance(typeItem["dims"], int): dims = typeItem["dims"] # make into a tuple elif type(typeItem["dims"]) not in (list, tuple): @@ -457,11 +706,17 @@ def createBaseDataType(typeItem): dims = typeItem["dims"] # create an array type of the base type - dtRet = np.dtype((dt_base, dims)) - + dtRet = np.dtype((baseType, dims)) + """ + metadata = None + if baseType.metadata: + metadata = dict(baseType.metadata) + dtRet = np.dtype(dims + baseType.str, metadata=metadata) + else: + dtRet = np.dtype(dims + baseType.str) + return dtRet # return predefined type + """ elif typeClass == "H5T_REFERENCE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for reference types") if "base" not in typeItem: raise KeyError("'base' not provided") if typeItem["base"] == "H5T_STD_REF_OBJ": @@ -470,6 +725,7 @@ def createBaseDataType(typeItem): dtRet = special_dtype(ref=RegionReference) else: raise TypeError("Invalid base type for reference type") + elif typeClass == "H5T_ENUM": if "base" not in typeItem: raise KeyError("Expected 'base' to be provided for enum type") @@ -477,21 +733,32 @@ def createBaseDataType(typeItem): if "class" not in base_json: raise KeyError("Expected class field in base type") if base_json["class"] != "H5T_INTEGER": - raise TypeError("Only integer base types can be used with enum type") - if "members" not in typeItem: - raise KeyError("'members' not provided for enum type") - members = typeItem["members"] - if len(members) == 0: - raise KeyError("empty enum members") + msg = "Only integer base types can be used with enum type" + raise TypeError(msg) + if "mapping" not in typeItem: + raise KeyError("'mapping' not provided 
for enum type") + mapping = typeItem["mapping"] + if len(mapping) == 0: + raise KeyError("empty enum map") dt = createBaseDataType(base_json) - values_dict = dict((m["name"], m["value"]) for m in members) - if ( - dt.kind == "i" - and dt.name == "int8" - and len(members) == 2 - and "TRUE" in values_dict - and "FALSE" in values_dict + if isinstance(mapping, list): + # convert to a dictionary + values_dict = dict((m["name"], m["value"]) for m in mapping) + elif isinstance(mapping, dict): + # just use as is + values_dict = mapping + else: + raise TypeError("Expected dict or list mapping for enum type") + + if all( + ( + dt.kind == "i", + dt.name == "int8", + len(mapping) == 2, + "TRUE" in values_dict, + "FALSE" in values_dict, + ) ): # convert to numpy boolean type dtRet = np.dtype("bool") @@ -505,14 +772,12 @@ def createBaseDataType(typeItem): return dtRet -""" -Create a numpy datatype given a json type -""" - - def createDataType(typeItem): + """ + Create a numpy datatype given a json type + """ dtRet = None - if isinstance(typeItem, (str, bytes)): + if type(typeItem) in (str, bytes): # should be one of the predefined types dtName = getNumpyTypename(typeItem) dtRet = np.dtype(dtName) @@ -543,20 +808,90 @@ def createDataType(typeItem): if "type" not in field: raise KeyError("'type' missing from field") field_name = field["name"] - if isinstance(field_name, str): - # verify the field name is ascii - try: - field_name.encode("ascii") - except UnicodeDecodeError: - raise TypeError("non-ascii field name not allowed") + if not isinstance(field_name, str): + raise TypeError("field names must be strings") + # verify the field name is ascii + try: + field_name.encode("ascii") + except UnicodeEncodeError: + raise TypeError("non-ascii field name not allowed") dt = createDataType(field["type"]) # recursive call if dt is None: raise Exception("unexpected error") - subtypes.append((field_name, dt)) # append tuple + subtypes.append((field["name"], dt)) # append tuple dtRet = 
np.dtype(subtypes) - else: dtRet = createBaseDataType(typeItem) # create non-compound dt return dtRet + + +def validateTypeItem(typeItem): + """ + Validate a json type - call createDataType and if no exception, + it's valid + """ + createDataType(typeItem) + # throws KeyError, TypeError, or ValueError + + +def getBaseTypeJson(type_name): + """ + Return JSON representation of a predefined type string + """ + predefined_int_types = ( + "H5T_STD_I8", + "H5T_STD_U8", + "H5T_STD_I16", + "H5T_STD_U16", + "H5T_STD_I32", + "H5T_STD_U32", + "H5T_STD_I64", + "H5T_STD_U64", + ) + predefined_float_types = ("H5T_IEEE_F16", "H5T_IEEE_F32", "H5T_IEEE_F64") + type_json = {} + # predefined typenames start with 'H5T' and end with "LE" or "BE" + if all( + ( + type_name.startswith("H5T_"), + type_name[-1] == "E", + type_name[-2] in ("L", "B"), + ) + ): + # trime of the "BE/"LE" + type_prefix = type_name[:-2] + if type_prefix in predefined_int_types: + type_json["class"] = "H5T_INTEGER" + type_json["base"] = type_name + elif type_prefix in predefined_float_types: + type_json["class"] = "H5T_FLOAT" + type_json["base"] = type_name + else: + raise TypeError("Invalid type name") + else: + raise TypeError("Invalid type name") + return type_json + + +def getSubType(dt_parent, fields): + """ Return a dtype that is a compound type composed of + the fields given in the field_names list + """ + if len(dt_parent) == 0: + raise TypeError("getSubType - parent must be compound type") + if not fields: + raise TypeError("null field specification") + if isinstance(fields, str): + fields = [fields,] # convert to a list + + field_names = set(dt_parent.names) + dt_items = [] + for field in fields: + if field not in field_names: + raise TypeError(f"field: {field} is not defined in parent type") + dt_items.append((field, dt_parent[field])) + dt = np.dtype(dt_items) + + return dt diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 6a310c60..9ac6578d 100755 --- a/test/unit/hdf5db_test.py 
+++ b/test/unit/hdf5db_test.py @@ -841,7 +841,6 @@ def testCreateReferenceAttribute(self): ] db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) item = db.getAttributeItem("groups", root_uuid, "A1") - attr_type = item["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") @@ -1275,7 +1274,6 @@ def testGetEvalStr(self): for query in queries.keys(): eval_str = db._getEvalStr(query, fields) self.assertEqual(eval_str, queries[query]) - # print(query, "->", eval_str) def testBadQuery(self): queries = ( diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index 0f67d7bf..7101286a 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -2,8 +2,8 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. 
If you do not have access to this file, you may # @@ -12,11 +12,12 @@ import unittest import logging import numpy as np -from h5py import special_dtype -from h5py import check_dtype -from h5py import Reference -from h5py import RegionReference + from h5json import hdf5dtype +from h5json.hdf5dtype import special_dtype +from h5json.hdf5dtype import check_dtype +from h5json.hdf5dtype import Reference +from h5json.hdf5dtype import RegionReference class Hdf5dtypeTest(unittest.TestCase): @@ -26,6 +27,31 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.INFO) + def testGetBaseTypeJson(self): + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F64LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F64LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F16LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F16LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_STD_I32LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_INTEGER") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_STD_I32LE") + + try: + hdf5dtype.getBaseTypeJson("foobar") + self.assertTrue(False) + except TypeError: + pass # expected + def testBaseIntegerTypeItem(self): dt = np.dtype(" Date: Sat, 8 Feb 2025 18:46:29 +0800 Subject: [PATCH 004/129] patch flake8 error --- test/unit/hdf5dtype_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index 7101286a..a645dc07 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -682,10 +682,10 @@ def testCreateCompoundArrayVlenType(self): "charSet": "H5T_CSET_ASCII", "strPad": 
"H5T_STR_NULLTERM", "length": "H5T_VARIABLE" - } + } # noqa: E126 }, "name": "VALUE3"} - ], + ], # noqa: E123 "class": "H5T_COMPOUND" } typeSize = hdf5dtype.getItemSize(typeItem) From 133e962c1f25a02f63b170c39747d263ca740187 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 8 Feb 2025 18:47:35 +0800 Subject: [PATCH 005/129] patch flake8 error --- test/unit/hdf5dtype_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index a645dc07..dbc806bb 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -682,10 +682,10 @@ def testCreateCompoundArrayVlenType(self): "charSet": "H5T_CSET_ASCII", "strPad": "H5T_STR_NULLTERM", "length": "H5T_VARIABLE" - } # noqa: E126 + } # noqa: E126 }, "name": "VALUE3"} - ], # noqa: E123 + ], # noqa: E123 "class": "H5T_COMPOUND" } typeSize = hdf5dtype.getItemSize(typeItem) From 856ee6502641d71c0dae799b44275f3a9df38c24 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 9 Feb 2025 09:38:19 +0800 Subject: [PATCH 006/129] keep backward compatibility for enum members key --- data/json/bool_attr.json | 2 +- data/json/bool_dset.json | 2 +- data/json/enum_attr.json | 2 +- data/json/enum_dset.json | 2 +- src/h5json/hdf5db.py | 2 +- src/h5json/hdf5dtype.py | 8 ++++++-- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/data/json/bool_attr.json b/data/json/bool_attr.json index 6d4d24da..ff092b9a 100644 --- a/data/json/bool_attr.json +++ b/data/json/bool_attr.json @@ -20,7 +20,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "mapping": [ + "members": [ { "name": "FALSE", "value": 0 diff --git a/data/json/bool_dset.json b/data/json/bool_dset.json index 11f19e01..29e46d80 100644 --- a/data/json/bool_dset.json +++ b/data/json/bool_dset.json @@ -24,7 +24,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "mapping": [ + "members": [ { "name": "FALSE", "value": 0 diff --git a/data/json/enum_attr.json b/data/json/enum_attr.json 
index e39425ef..9e9d94a9 100644 --- a/data/json/enum_attr.json +++ b/data/json/enum_attr.json @@ -21,7 +21,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "mapping": [ + "members": [ { "name": "GAS", "value": 2 diff --git a/data/json/enum_dset.json b/data/json/enum_dset.json index 08291696..d2afcd4a 100644 --- a/data/json/enum_dset.json +++ b/data/json/enum_dset.json @@ -25,7 +25,7 @@ "class": "H5T_INTEGER" }, "class": "H5T_ENUM", - "mapping": [ + "members": [ { "name": "GAS", "value": 2 diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 112fb867..db48eda3 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -797,7 +797,7 @@ def getObjByPath(self, path): def getObjectByUuid(self, col_type, obj_uuid): # col_type should be either "datasets", "groups", or "datatypes" if col_type not in ("datasets", "groups", "datatypes"): - msg = "Unexpectd error, invalid col_type: [" + col_type + "]" + msg = "Unexpected error, invalid col_type: [" + col_type + "]" self.log.error(msg) raise IOError(errno.EIO, msg) if col_type == "groups" and obj_uuid == self.dbGrp.attrs["rootUUID"]: diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index fecf38f0..9c565ce0 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -735,9 +735,13 @@ def createBaseDataType(typeItem): if base_json["class"] != "H5T_INTEGER": msg = "Only integer base types can be used with enum type" raise TypeError(msg) - if "mapping" not in typeItem: + if "mapping" in typeItem: + mapping = typeItem["mapping"] + elif "members" in typeItem: + mapping = typeItem["members"] # backward-compatibility for hdf5-json + else: raise KeyError("'mapping' not provided for enum type") - mapping = typeItem["mapping"] + if len(mapping) == 0: raise KeyError("empty enum map") From eec4efce6706e89ffbad4e6cc9d500f71cfa8216 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 12 Feb 2025 22:39:11 +0800 Subject: [PATCH 007/129] first pass at abstrct db class --- src/h5json/array_util.py | 
730 +++++++ src/h5json/dset_util.py | 114 + src/h5json/hdf5db.py | 3942 ++++------------------------------ src/h5json/hdf5dtype.py | 77 +- src/h5json/objid.py | 2 + test/unit/array_util_test.py | 1021 +++++++++ test/unit/hdf5db_test.py | 1449 +++---------- 7 files changed, 2642 insertions(+), 4693 deletions(-) create mode 100644 src/h5json/array_util.py create mode 100644 src/h5json/dset_util.py create mode 100644 test/unit/array_util_test.py diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py new file mode 100644 index 00000000..bef4587e --- /dev/null +++ b/src/h5json/array_util.py @@ -0,0 +1,730 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import math +import base64 +import binascii +import numpy as np + +MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million + + +def bytesArrayToList(data): + """ + Convert list that may contain bytes type elements to list of string elements + + TBD: Need to deal with non-string byte data (hexencode?) 
+ """ + if type(data) in (bytes, str): + is_list = False + elif isinstance(data, (np.ndarray, np.generic)): + if len(data.shape) == 0: + is_list = False + data = data.tolist() # tolist will return a scalar in this case + if type(data) in (list, tuple): + is_list = True + else: + is_list = False + else: + is_list = True + elif type(data) in (list, tuple): + is_list = True + else: + is_list = False + + if is_list: + out = [] + for item in data: + try: + rec_item = bytesArrayToList(item) # recursive call + out.append(rec_item) + except ValueError as err: + raise err + elif type(data) is bytes: + try: + out = data.decode("utf-8") + except UnicodeDecodeError as err: + raise ValueError(err) + else: + out = data + + return out + + +def toTuple(rank, data): + """ + Convert a list to a tuple, recursively. + Example. [[1,2],[3,4]] -> ((1,2),(3,4)) + """ + if type(data) in (list, tuple): + if rank > 0: + return list(toTuple(rank - 1, x) for x in data) + else: + return tuple(toTuple(rank - 1, x) for x in data) + else: + if isinstance(data, str): + data = data.encode("utf8") + return data + + +def getArraySize(arr): + """ + Get size in bytes of a numpy array. + """ + nbytes = arr.dtype.itemsize + for n in arr.shape: + nbytes *= n + return nbytes + + +def getNumElements(dims): + """ + Get num elements defined by a shape + """ + num_elements = 0 + if isinstance(dims, int): + num_elements = dims + elif isinstance(dims, (list, tuple)): + num_elements = 1 + for dim in dims: + num_elements *= dim + else: + raise ValueError("Unexpected argument") + return num_elements + + +def isVlen(dt): + """ + Return True if the type contains variable length elements + """ + is_vlen = False + if len(dt) > 1: + names = dt.names + for name in names: + if isVlen(dt[name]): + is_vlen = True + break + else: + if dt.metadata and "vlen" in dt.metadata: + is_vlen = True + return is_vlen + + +def jsonToArray(data_shape, data_dtype, data_json): + """ + Return numpy array from the given json array. 
+ """ + def fillVlenArray(rank, data, arr, index): + for i in range(len(data)): + if rank > 1: + index = fillVlenArray(rank - 1, data[i], arr, index) + else: + arr[index] = data[i] + index += 1 + return index + + if data_json is None: + return np.array([]).astype(data_dtype) + + if isinstance(data_json, (list, tuple)): + if None in data_json: + return np.array([]).astype(data_dtype) + + # need some special conversion for compound types -- + # each element must be a tuple, but the JSON decoder + # gives us a list instead. + if len(data_dtype) > 1 and not isinstance(data_json, (list, tuple)): + raise TypeError("expected list data for compound data type") + npoints = getNumElements(data_shape) + np_shape_rank = len(data_shape) + + if type(data_json) in (list, tuple): + converted_data = [] + if npoints == 1 and len(data_json) == len(data_dtype): + converted_data.append(toTuple(0, data_json)) + else: + converted_data = toTuple(np_shape_rank, data_json) + data_json = converted_data + else: + if isinstance(data_json, str): + data_json = data_json.encode("utf8") + data_json = [data_json,] # listify + + if isVlen(data_dtype): + arr = np.zeros((npoints,), dtype=data_dtype) + fillVlenArray(np_shape_rank, data_json, arr, 0) + else: + try: + arr = np.array(data_json, dtype=data_dtype) + except UnicodeEncodeError as ude: + msg = "Unable to encode data" + raise ValueError(msg) from ude + # raise an exception of the array shape doesn't match the selection shape + # allow if the array is a scalar and the selection shape is one element, + # numpy is ok with this + if arr.size != npoints: + msg = "Input data doesn't match selection number of elements" + msg += f" Expected {npoints}, but received: {arr.size}" + raise ValueError(msg) + if arr.shape != data_shape: + arr = arr.reshape(data_shape) # reshape to match selection + + return arr + + +def getElementSize(e, dt): + """ + Get number of byte needed to given element as a bytestream + """ + # print(f"getElementSize - e: {e} dt: {dt} 
metadata: {dt.metadata}") + if len(dt) > 1: + count = 0 + for name in dt.names: + field_dt = dt[name] + field_val = e[name] + count += getElementSize(field_val, field_dt) + elif not dt.metadata or "vlen" not in dt.metadata: + count = dt.itemsize # fixed size element + else: + # variable length element + vlen = dt.metadata["vlen"] + if isinstance(e, int): + if e == 0: + count = 4 # non-initialized element + else: + raise ValueError("Unexpected value: {}".format(e)) + elif isinstance(e, bytes): + count = len(e) + 4 + elif isinstance(e, str): + count = len(e.encode("utf-8")) + 4 + elif isinstance(e, np.ndarray): + nElements = math.prod(e.shape) + if e.dtype.kind != "O": + count = e.dtype.itemsize * nElements + else: + arr1d = e.reshape((nElements,)) + count = 0 + for item in arr1d: + count += getElementSize(item, dt) + count += 4 # byte count + elif isinstance(e, list) or isinstance(e, tuple): + if not e: + # empty list, just add byte count + count = 4 + else: + # not sure how to deal with this + count = len(e) * vlen.itemsize + 4 # +4 for byte count + else: + raise TypeError("unexpected type: {}".format(type(e))) + return count + + +def getByteArraySize(arr): + """ + Get number of bytes needed to store given numpy array as a bytestream + """ + if not isVlen(arr.dtype): + return arr.itemsize * math.prod(arr.shape) + nElements = math.prod(arr.shape) + # reshape to 1d for easier iteration + arr1d = arr.reshape((nElements,)) + dt = arr1d.dtype + count = 0 + for e in arr1d: + count += getElementSize(e, dt) + return count + + +def copyBuffer(src, des, offset): + """ + Copy to buffer at given offset + """ + # print(f"copyBuffer - src: {src} offset: {offset}") + # TBD: just do: des[offset:] = src[:] ? 
+ for i in range(len(src)): + des[i + offset] = src[i] + + # print("returning:", offset + len(src)) + return offset + len(src) + + +def copyElement(e, dt, buffer, offset): + """ + Copy element to bytearray + """ + # print(f"copyElement - dt: {dt} offset: {offset}") + if len(dt) > 1: + for name in dt.names: + field_dt = dt[name] + field_val = e[name] + offset = copyElement(field_val, field_dt, buffer, offset) + elif not dt.metadata or "vlen" not in dt.metadata: + # print(f"e vlen: {e} type: {type(e)} itemsize: {dt.itemsize}") + e_buf = e.tobytes() + # print("tobytes:", e_buf) + if len(e_buf) < dt.itemsize: + # extend the buffer for fixed size strings + # print("extending buffer") + e_buf_ex = bytearray(dt.itemsize) + for i in range(len(e_buf)): + e_buf_ex[i] = e_buf[i] + e_buf = bytes(e_buf_ex) + + # print("length:", len(e_buf)) + offset = copyBuffer(e_buf, buffer, offset) + else: + # variable length element + vlen = dt.metadata["vlen"] + # print("copyBuffer vlen:", vlen) + if isinstance(e, int): + # print("copyBuffer int") + if e == 0: + # write 4-byte integer 0 to buffer + offset = copyBuffer(b"\x00\x00\x00\x00", buffer, offset) + else: + raise ValueError("Unexpected value: {}".format(e)) + elif isinstance(e, bytes): + # print("copyBuffer bytes") + count = np.int32(len(e)) + if count > MAX_VLEN_ELEMENT: + raise ValueError("vlen element too large") + offset = copyBuffer(count.tobytes(), buffer, offset) + offset = copyBuffer(e, buffer, offset) + elif isinstance(e, str): + # print("copyBuffer, str") + text = e.encode("utf-8") + count = np.int32(len(text)) + if count > MAX_VLEN_ELEMENT: + raise ValueError("vlen element too large") + offset = copyBuffer(count.tobytes(), buffer, offset) + offset = copyBuffer(text, buffer, offset) + + elif isinstance(e, np.ndarray): + nElements = math.prod(e.shape) + # print("copyBuffer ndarray, nElements:", nElements) + + if e.dtype.kind != "O": + count = np.int32(e.dtype.itemsize * nElements) + # print("copyBuffeer got vlen count:", 
def getElementCount(buffer, offset=0):
    """
    Get the count value from a persisted vlen array.

    Reads the four bytes at buffer[offset:offset + 4] as a little-endian
    signed 32-bit integer, i.e. the byte count that precedes a serialized
    variable-length element.

    Parameters:
        buffer (bytes or bytearray): serialized data to read from.
        offset (int): position in buffer at which the count starts.

    Returns:
        int: the byte count of the element that follows.

    Raises:
        ValueError: fewer than four bytes are available at offset, the
            count is negative, or the count exceeds MAX_VLEN_ELEMENT.
        TypeError: the count bytes could not be interpreted.
    """
    # NOTE(review): the middle of this function was lost in the text this
    # was recovered from; the "<i4" read and the negative-count check are
    # a reconstruction - confirm against the upstream file.
    n = offset
    m = offset + 4
    count_bytes = bytes(buffer[n:m])
    if len(count_bytes) != 4:
        # a truncated buffer would otherwise surface as an opaque numpy
        # error (or IndexError when no bytes are left at all)
        raise ValueError("Unexpected end of buffer reading variable length element count")

    try:
        count = int(np.frombuffer(count_bytes, dtype="<i4")[0])
    except TypeError as e:
        msg = f"Unexpected error reading count value for variable length element: {e}"
        raise TypeError(msg) from e
    if count < 0:
        # shouldn't be negative
        raise ValueError("Unexpected count value for variable length element")
    if count > MAX_VLEN_ELEMENT:
        # expect variable length element to be between 0 and 1mb
        raise ValueError("varlen element size expected to be less than 1MB")
    return count
+ """ + if len(dt) > 1: + e = arr[index] + for name in dt.names: + field_dt = dt[name] + offset = readElement(buffer, offset, e, name, field_dt) + elif not dt.metadata or "vlen" not in dt.metadata: + count = dt.itemsize + n = offset + m = offset + count + e_buffer = buffer[n:m] + offset += count + try: + e = np.frombuffer(bytes(e_buffer), dtype=dt) + arr[index] = e[0] + except ValueError: + print(f"ERROR: ValueError setting {e_buffer} and dtype: {dt}") + raise + else: + # variable length element + vlenBaseType = dt.metadata["vlen"] + e = arr[index] + + if isinstance(e, np.ndarray): + nelements = math.prod(dt.shape) + e.reshape((nelements,)) + for i in range(nelements): + offset = readElement(buffer, offset, e, i, dt) + e.reshape(dt.shape) + else: + # total number of bytes in the vlen sequence/variable-length string + count = getElementCount(buffer, offset=offset) + offset += 4 + n = offset + m = offset + count + if count > 0: + e_buffer = buffer[n:m] + offset += count + + if vlenBaseType is bytes: + arr[index] = bytes(e_buffer) + elif vlenBaseType is str: + s = e_buffer.decode("utf-8") + arr[index] = s + else: + try: + e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) + except ValueError: + msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" + raise ValueError(msg) + arr[index] = e + return offset + + +def encodeData(data, encoding="base64"): + """ Encode given data """ + if encoding != "base64": + raise ValueError("only base64 encoding is supported") + try: + if isinstance(data, str): + data = data.encode("utf8") + except UnicodeEncodeError: + raise ValueError("can not encode string value") + if not isinstance(data, bytes): + msg = "Expected str or bytes type to encodeData, " + msg += f"but got: {type(data)}" + raise TypeError(msg) + try: + encoded_data = base64.b64encode(data) + except Exception as e: + # TBD: what exceptions can be raised? 
+ raise ValueError(f"Unable to encode: {e}") + return encoded_data + + +def decodeData(data, encoding="base64"): + if encoding != "base64": + raise ValueError("only base64 decoding is supported") + try: + decoded_data = base64.b64decode(data) + except Exception as e: + # TBD: catch actual exception + raise ValueError(f"Unable to decode: {e}") + return decoded_data + + +def arrayToBytes(arr, encoding=None): + """ + Return byte representation of numpy array + """ + if isVlen(arr.dtype): + nSize = getByteArraySize(arr) + buffer = bytearray(nSize) + offset = 0 + nElements = math.prod(arr.shape) + arr1d = arr.reshape((nElements,)) + for e in arr1d: + # print("arrayToBytes:", e) + offset = copyElement(e, arr1d.dtype, buffer, offset) + data = bytes(buffer) + else: + # fixed length type + data = arr.tobytes() + + if encoding: + data = encodeData(data) + return data + +def bytesToArray(data, dt, shape, encoding=None): + """ + Create numpy array based on byte representation + """ + if encoding: + # decode the data + # will raise ValueError if non-decodeable + data = decodeData(data) + if not isVlen(dt): + # regular numpy from string + arr = np.frombuffer(data, dtype=dt) + else: + nelements = getNumElements(shape) + + arr = np.zeros((nelements,), dtype=dt) + offset = 0 + for index in range(nelements): + offset = readElement(data, offset, arr, index, dt) + if shape is not None: + arr = arr.reshape(shape) + # check that we can update the array if needed + # Note: this seems to have been required starting with numpuy v 1.17 + # Setting the flag directly is not recommended. 
def getNumpyValue(value, dt=None, encoding=None):
    """
    Return value as a numpy type for the given dtype and encoding.

    Parameters:
        value: the value to convert.  Must be a str whenever encoding is
            set.
        dt (numpy.dtype): target dtype.  dt.base is used for the plain
            conversion and dt.shape for the base64 path, so a dtype
            object is required in practice even though the parameter
            defaults to None.
        encoding: expected to be None or "base64".

    Returns:
        The converted value, i.e. element [()] of the resulting array.

    Raises:
        ValueError: encoding is set but value is not a str, or value is
            not valid base64 data.
    """
    if encoding and not isinstance(value, str):
        msg = "Expected value to be string to use encoding"
        raise ValueError(msg)

    if encoding == "base64":
        try:
            data = base64.decodebytes(value.encode("utf-8"))
        except binascii.Error as err:
            # f-string so the offending value is actually reported
            msg = f"Unable to decode base64 string: {value}"
            raise ValueError(msg) from err
        arr = bytesToArray(data, dt, dt.shape)
    else:
        if isinstance(value, list):
            # convert to tuple
            value = tuple(value)
        elif dt.kind == "f" and isinstance(value, str) and value == "nan":
            # JSON has no NaN literal, so it arrives as the string "nan"
            value = np.nan
        arr = np.asarray(value, dtype=dt.base)
    return arr[()]
+ Just return input if no 1-extent dimensions + + Note: only works with ndarrays (for now at least) + """ + if not isinstance(data, np.ndarray): + raise TypeError("expected ndarray") + if len(data.shape) <= 1: + return data + can_reduce = True + for extent in data.shape: + if extent == 1: + can_reduce = True + break + if can_reduce: + data = data.squeeze() + return data + + +class IndexIterator(object): + """ + Class to iterate through list of chunks of a given dataset + """ + + def __init__(self, shape, sel=None): + self._shape = shape + self._rank = len(self._shape) + self._stop = False + + if self._rank < 1: + raise ValueError("IndexIterator can not be used on arrays of zero rank") + + if sel is None: + # select over entire dataset + slices = [] + for dim in range(self._rank): + slices.append(slice(0, self._shape[dim])) + self._sel = tuple(slices) + else: + if isinstance(sel, slice): + self._sel = (sel,) + else: + self._sel = sel + if len(self._sel) != self._rank: + raise ValueError("Invalid selection - selection region must have same rank as shape") + self._index = [] + for dim in range(self._rank): + s = self._sel[dim] + if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: + raise ValueError( + "Invalid selection - selection region must be within dataset space" + ) + self._index.append(s.start) + + def __iter__(self): + return self + + def __next__(self): + if self._stop: + raise StopIteration() + # bump up the last index and carry forward if we run outside the selection + dim = self._rank - 1 + ret_index = self._index.copy() + while True: + s = self._sel[dim] + if s.step: + step = s.step + else: + step = 1 + self._index[dim] += step + + if self._index[dim] < s.stop: + # we still have room to extend along this dimensions + break + + # reset to the start and continue iterating with higher dimension + self._index[dim] = s.start + dim -= 1 + if dim < 0: + # ran past last index, stop iteration on next run + self._stop = True + + return 
def ndarray_compare(arr1, arr2):
    """
    Compare two values that may be numpy arrays, numpy voids, or scalars.

    A null (falsy non-array) value and an empty array are treated as
    the same.  Arrays must agree in shape and dtype; arrays with
    variable-length elements are compared element by element.

    Returns:
        True if the two values are the same, False otherwise.  When
        neither argument is an ndarray or np.void, the result of
        arr1 == arr2 is returned as is.
    """
    # TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized
    arr1_is_array = isinstance(arr1, np.ndarray)
    arr2_is_array = isinstance(arr2, np.ndarray)

    if not arr1_is_array and not arr2_is_array:
        arr1_is_void = isinstance(arr1, np.void)
        arr2_is_void = isinstance(arr2, np.void)
        if not arr1_is_void and not arr2_is_void:
            return arr1 == arr2
        if arr1_is_void and not arr2_is_void:
            # same only if the void is empty and the other value is null
            return arr1.size == 0 and not arr2
        if not arr1_is_void and arr2_is_void:
            return not arr1 and arr2.size == 0
        # both np.voids - compare field by field
        if arr1.size != arr2.size:
            return False
        if len(arr1) != len(arr2):
            return False
        for i in range(len(arr1)):
            if not ndarray_compare(arr1[i], arr2[i]):
                return False
        return True

    if arr1_is_array and not arr2_is_array:
        # same only if arr1 is empty and arr2 is null
        return arr1.size == 0 and not arr2
    if not arr1_is_array and arr2_is_array:
        # same only if arr1 is null and arr2 is empty
        return not arr1 and arr2.size == 0

    # two ndarrays...
    if arr1.shape != arr2.shape:
        return False
    # compare the dtypes of *both* arrays (comparing arr2.dtype with
    # itself never detects a mismatch)
    if arr1.dtype != arr2.dtype:
        return False

    if isVlen(arr1.dtype):
        # need to compare element by element; .size is an int even for
        # zero-dimensional arrays, so it is always a valid reshape extent
        nElements = arr1.size
        arr1 = arr1.reshape((nElements,))
        arr2 = arr2.reshape((nElements,))
        for i in range(nElements):
            if not ndarray_compare(arr1[i], arr2[i]):
                return False
        return True

    # fixed-size elements: numpy can compare the whole arrays at once
    return np.array_equal(arr1, arr2)
+ # If non exists return None + + if np.prod(mshape) == element_count: + return None + + if element_count == 1: + # this always works + return [1,] + + bcshape = [] + rank = len(mshape) + for n in range(rank - 1): + bcshape.insert(0, mshape[rank - n - 1]) + if element_count == np.prod(bcshape): + return bcshape # have a match + + return None # no broadcast found diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py new file mode 100644 index 00000000..75854212 --- /dev/null +++ b/src/h5json/dset_util.py @@ -0,0 +1,114 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## + +import time +from .hdf5dtype import getTypeItem + +""" +# standard compress filters +_HDF_FILTERS = { + 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, + 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, + 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, + 4: { + "class": "H5Z_FILTER_SZIP", + "alias": "szip", + "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], + }, + 5: {"class": "H5Z_FILTER_NBIT"}, + 6: { + "class": "H5Z_FILTER_SCALEOFFSET", + "alias": "scaleoffset", + "options": ["scaleType", "scaleOffset"], + }, + 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, +} + +_HDF_FILTER_OPTION_ENUMS = { + "coding": { + h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", + h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", + }, + "scaleType": { + h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", + h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", + h5py.h5z.SO_INT: "H5Z_SO_INT", + }, +} + +# h5py supported filters +_H5PY_FILTERS = { + "gzip": 1, + "shuffle": 2, + "fletcher32": 3, + "szip": 4, + "scaleoffset": 6, + "lzf": 32000, +} + +_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") +""" + +def make_new_dset( + shape=None, + dtype=None, + chunks=None, + compression=None, + shuffle=None, + maxshape=None, + compression_opts=None, + fillvalue=None, + cpl=None + ): + + type_json = getTypeItem(dtype) + if shape == "H5S_NULL": + shape_json = {"class": "H5S_NULL"} + else: + shape_json = {"class": "H5S_SIMPLE"} + shape_json["dims"] = list(shape) + + if maxshape: + shape_json["maxshape"] = maxshape + if cpl is None: + cpl = {} + if chunks: + cpl["chunks"] = chunks + if compression: + cpl["compression"] = compression + if shuffle: + cpl["shuffle"] = shuffle + if compression_opts: + cpl["compression_opts"] = compression_opts + if fillvalue: + cpl["fillvalue"] = fillvalue + + + # TBD - other properties + dset_json 
def resize_dataset(dset_json, shape):
    """
    Update a dataset's JSON description in place with a new extent.

    Parameters:
        dset_json (dict): dataset description holding a "shape" item
            with "class" and "dims" keys.
        shape (sequence of int): new extent; must have the same rank as
            the dataset's current "dims".

    Returns:
        None.  dset_json["shape"]["dims"] is replaced and
        dset_json["modified"] is set to the current time.

    Raises:
        TypeError: the dataset's shape class is not "H5S_SIMPLE".
        ValueError: shape has a different rank than the dataset.
    """
    shape_json = dset_json["shape"]
    shape_class = shape_json["class"]
    if shape_class != "H5S_SIMPLE":
        raise TypeError(f"dataset with shape class: {shape_class} cannot be resized")
    # the current extent is stored in shape_json; shape_class is only the
    # class-name string and cannot be indexed by "dims"
    if len(shape_json["dims"]) != len(shape):
        raise ValueError("Resize shape parameter doesn't match dataset's rank")
    # TBD: validate shape
    shape_json["dims"] = list(shape)
    dset_json["modified"] = time.time()
"options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], - }, - 5: {"class": "H5Z_FILTER_NBIT"}, - 6: { - "class": "H5Z_FILTER_SCALEOFFSET", - "alias": "scaleoffset", - "options": ["scaleType", "scaleOffset"], - }, - 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, -} - -_HDF_FILTER_OPTION_ENUMS = { - "coding": { - h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", - h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", - }, - "scaleType": { - h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", - h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", - h5py.h5z.SO_INT: "H5Z_SO_INT", - }, -} - -# h5py supported filters -_H5PY_FILTERS = { - "gzip": 1, - "shuffle": 2, - "fletcher32": 3, - "szip": 4, - "scaleoffset": 6, - "lzf": 32000, -} - -_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") - - -def convert_dtype(srcdt): - """Return a dtype based on input dtype, converting any Reference types from - h5json style to h5py. - """ - - if len(srcdt) > 0: - fields = [] - for name in srcdt.fields: - item = srcdt.fields[name] - # item is a tuple of dtype and integer offset - field_dt = convert_dtype(item[0]) - fields.append((name, field_dt)) - tgt_dt = np.dtype(fields) - else: - # check if this a "special dtype" - if srcdt.metadata and "ref" in srcdt.metadata: - if srcdt.metadata['ref'] is Reference: - tgt_dt = h5py.special_dtype(ref=h5py.Reference) - elif srcdt.metadata['ref'] is RegionReference: - tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) - else: - raise TypeError(f"Unexpected ref type: {srcdt}") - elif srcdt.metadata and "vlen" in srcdt.metadata: - src_vlen = srcdt.metadata["vlen"] - if isinstance(src_vlen, np.dtype): - tgt_base = convert_dtype(src_vlen) - else: - tgt_base = src_vlen - tgt_dt = h5py.special_dtype(vlen=tgt_base) - elif srcdt.kind == "U": - # use vlen for unicode strings - tgt_dt = h5py.special_dtype(vlen=str) - else: - tgt_dt = srcdt # no conversion needed - return tgt_dt - - -def visitObj(path, obj): - hdf5db = 
class Hdf5db:
    """
    This class is used to manage lookup tables for primary HDF objects (Groups, Datasets,
    and Datatypes), keyed by object id. By default all data is held in-memory.
    Initialize with h5_reader to read from an HDF5 compatible storage pool,
    and/or h5_writer to write to an HDF5 compatible storage pool.
    """

    @staticmethod
    def getVersionInfo():
        """Return a dict holding the hdf5-json API version."""
        versionInfo = {}
        versionInfo["hdf5-json-version"] = _apiver
        return versionInfo

    def __init__(
        self,
        h5_reader=None,
        h5_writer=None,
        app_logger=None,
    ):
        """
        Set up the in-memory object table and its root group.

        h5_reader - optional object providing get_objid(path) and
            get_obj(obj_id, include_attrs=..., include_links=...); when given,
            the root group is loaded from it
        h5_writer - optional object providing flush() and close(); both are
            called when the context manager exits
        app_logger - logger to use (defaults to the root logger)
        """
        if app_logger:
            self.log = app_logger
        else:
            self.log = logging.getLogger()

        self._db = {}  # obj_id -> object JSON

        self._reader = h5_reader
        self._writer = h5_writer

        if self._reader:
            root_id = self._reader.get_objid("/")
            kwargs = {"include_attrs": True, "include_links": True}
            group_json = self._reader.get_obj(root_id, **kwargs)
        else:
            # create a root group
            group_json = {"links": {}, "attributes": {}, "cpl": {}}
            group_json["created"] = time.time()
            root_id = createObjId(obj_type="groups")
        self._db[root_id] = group_json

        self._root_id = root_id

    def __enter__(self):
        """ Context manager entry - returns this instance """
        self.log.info("Hdf5db __enter")
        return self

    def __exit__(self, type, value, traceback):
        """ Context manager exit - flush and close the writer, if any """
        self.log.info("Hdf5db __exit")
        if self._writer:
            try:
                self._writer.flush()
            finally:
                # close even when the flush fails so the writer isn't leaked
                self._writer.close()

    def getObjectById(self, obj_id):
        """ Return object JSON for the given id.

        Objects not yet in memory are fetched from the reader (when one was
        supplied) and cached.  Raises KeyError if the id is unknown and there
        is no reader.
        """
        if obj_id not in self._db:
            if self._reader:
                # load the obj from the reader
                kwargs = {"include_attrs": True, "include_links": True}
                obj_json = self._reader.get_obj(obj_id, **kwargs)
                self._db[obj_id] = obj_json
            else:
                raise KeyError(f"obj_id: {obj_id} not found")
        obj_json = self._db[obj_id]

        return obj_json

    def getObjectIdByPath(self, h5path, parent_id=None):
        """ Return id for the given link path starting from parent_id if set,
        otherwise the root_id.

        Only hard links are followed.  Raises KeyError if the parent or any
        link in the path is not found, if a path component is a soft or
        external link, or if the path revisits an object already traversed.
        """

        if h5path == "/":
            return self._root_id  # just return root id

        if parent_id is None:
            parent_id = self._root_id
        self.log.debug(f"getObjectIdByPath(h5path: {h5path} parent_id: {parent_id})")

        obj_json = self.getObjectById(parent_id)
        if obj_json is None:
            self.log.warning("getObjectIdByPath - parent_id not found")
            raise KeyError(f"parent_id: {parent_id} not found")

        obj_id = parent_id
        # ids visited so far; must be a one-element set holding the id itself
        # (set(obj_id) would split the id string into single characters)
        searched_ids = {obj_id}

        link_names = h5path.split('/')
        self.log.debug(f"link_names: {link_names}")
        for link_name in link_names:
            if not link_name:
                continue  # skip empty components from leading/doubled slashes
            self.log.debug(f"link_name: {link_name}")
            if not obj_id:
                # previous component was a link that can't be followed
                break
            if 'links' not in obj_json:
                self.log.error(f"expected to find links key in: {obj_json}")
                raise KeyError(h5path)
            links = obj_json['links']
            self.log.debug(f"links: {links}")
            if link_name not in links:
                self.log.warning(f"link: {link_name} not found in {obj_id}")
                self.log.debug(f"links: {links}")
                raise KeyError(h5path)
            link_tgt = links[link_name]
            self.log.debug(f"link_tgt: {link_tgt}")
            link_class = link_tgt['class']
            obj_id = None
            obj_json = None
            if link_class == 'H5L_TYPE_HARD':
                # hard link
                obj_id = link_tgt['id']
                if obj_id in searched_ids:
                    self.log.warning(f"circular reference using path: {h5path}")
                    raise KeyError(h5path)
                obj_json = self.getObjectById(obj_id)
                searched_ids.add(obj_id)
            elif link_class == 'H5L_TYPE_SOFT':
                self.log.warning("getObjectIdByPath can't follow soft links")
            elif link_class == 'H5L_TYPE_EXTERNAL':
                self.log.warning("getObjectIdByPath can't follow external links")
            else:
                self.log.error(f"link type: {link_class} not supported")

        if not obj_id:
            self.log.warning(f"getObjectIdByPath {h5path} not found")
            raise KeyError(h5path)
        return obj_id

    def getObjectByPath(self, path):
        """ Get Object JSON at given path (resolved from the root group) """
        obj_id = self.getObjectIdByPath(path)
        obj_json = self.getObjectById(obj_id)
        return obj_json

    def getDtype(self, obj_id):
        """ Return numpy data type for given object id.

        Raises KeyError if the object is not found and TypeError if the
        object has no "type" key.
        """
        # go through getObjectById so objects not yet cached are loaded
        # from the reader, as the other accessors do
        obj_json = self.getObjectById(obj_id)
        if obj_json is None:
            raise KeyError(f"{obj_id} not found")
        if "type" not in obj_json:
            # group id?
            raise TypeError(f"{obj_id} does not have a datatype")
        type_json = obj_json["type"]

        # TBD: what about datasets using a committed type?
        dtype = createDataType(type_json)
        return dtype

    def createCommittedType(self, datatype, cpl=None):
        """
        createCommittedType - creates new named datatype
            datatype - numpy dtype, or a type description accepted by createDataType
            cpl - optional creation property dict (defaults to an empty dict)
        Returns id of the new committed type
        """
        self.log.info("createCommittedType")
        if cpl is None:
            cpl = {}

        ctype_id = createObjId(obj_type="datatypes", root_id=self._root_id)
        if isinstance(datatype, np.dtype):
            dt = datatype
        else:
            dt = createDataType(datatype)

        type_json = getTypeItem(dt)  # get canonical json description of datatype

        ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl}
        ctype_json["created"] = time.time()
        ctype_json["modified"] = None
        self._db[ctype_id] = ctype_json
        return ctype_id
item["mtime"] = self.getModifiedTime(obj_uuid) + type_json = getTypeItem(dt) # get canonical json description of datatype - return item + ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} + ctype_json["created"] = time.time() + ctype_json["modified"] = None + self._db[ctype_id] = ctype_json + return ctype_id + - def getAttributeItemByObj(self, obj, name, includeData=True): + def getAttribute(self, obj_id, name, includeData=True): """ - Get attribute given an object and name + Get attribute given an object id and name returns: JSON object """ - if name not in obj.attrs: - msg = "Attribute: [" + name + "] not found in object: " + obj.name + + obj_json = self.getObjectById(obj_id) + attrs = obj_json["attributes"] + + if name not in attrs: + msg = f"Attribute: [{name }] not found in object: {obj_id}" self.log.info(msg) return None - - # get the attribute! - attrObj = h5py.h5a.open(obj.id, np.bytes_(name)) - attr = None - - item = {"name": name} - - # check if the dataset is using a committed type - typeid = attrObj.get_type() - typeItem = None - if h5py.h5t.TypeID.committed(typeid): - type_uuid = None - addr = h5py.h5o.get_info(typeid).addr - type_uuid = self.getUUIDByAddress(addr) - committedType = self.getCommittedTypeItemByUuid(type_uuid) - typeItem = committedType["type"] - typeItem["uuid"] = type_uuid - else: - typeItem = getTypeItem(attrObj.dtype) - item["type"] = typeItem - # todo - don't include data for OPAQUE until JSON serialization - # issues are addressed - - if isinstance(typeItem, dict) and typeItem["class"] in ("H5T_OPAQUE"): - includeData = False - - shape_json = self.getShapeItemByAttrObj(attrObj) - item["shape"] = shape_json + + attr_json = attrs[name] + + if includeData and "value" not in attr_json: + # Reader may not have pre-loaded large attributes + # fetch it now + if not self._reader: + raise RuntimeError(f"Expected to find value for attribute {name} of {obj_id}") + attr_json = self._reader.get_attribute(obj_id, name) + 
attrs[name] = attr_json # this will update the _db + + return attr_json + + def getAttributeValue(self, obj_id, name): + """ Return NDArray of the given attribute value """ + attr_json = self.getAttribute(obj_id, name) + shape_json = attr_json["shape"] if shape_json["class"] == "H5S_NULL": - includeData = False - if includeData: - try: - attr = obj.attrs[name] # returns a numpy array - except TypeError: - self.log.warning("type error reading attribute") - - if includeData and attr is not None: - if shape_json["class"] == "H5S_SCALAR": - data = self.getDataValue(typeItem, attr) - else: - dims = shape_json["dims"] - rank = len(dims) - # convert numpy object to python list - # values = self.toList(typeItem, attr) - data = self.toList(rank, typeItem, attr) - # data = self.bytesToString(data) - item["value"] = data - # timestamps will be added by getAttributeItem() - return item - - def getAttributeItems(self, col_type, obj_uuid, marker=None, limit=0): - self.log.info("db.getAttributeItems(" + obj_uuid + ")") - if marker: - self.log.info("...marker: " + marker) - if limit: - self.log.info("...limit: " + str(limit)) - - self.initFile() - obj = self.getObjectByUuid(col_type, obj_uuid) - if obj is None: - msg = "Object: " + obj_uuid + " could not be loaded" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - items = [] - gotMarker = True - if marker is not None: - gotMarker = False - count = 0 - for name in obj.attrs: - if not gotMarker: - if name == marker: - gotMarker = True - continue # start filling in result on next pass - else: - continue # keep going! 
- item = self.getAttributeItemByObj(obj, name, False) - # mix-in timestamps - if self.update_timestamps: - item["ctime"] = self.getCreateTime( - obj_uuid, objType="attribute", name=name - ) - item["mtime"] = self.getModifiedTime( - obj_uuid, objType="attribute", name=name - ) - - items.append(item) - count += 1 - if limit > 0 and count == limit: - break # return what we got - return items - - def getAttributeItem(self, col_type, obj_uuid, name): - self.log.info( - "getAttributeItemByUuid(" + col_type + ", " + obj_uuid + ", " + name + ")" - ) - self.initFile() - obj = self.getObjectByUuid(col_type, obj_uuid) - if obj is None: - msg = "Parent object: " + obj_uuid + " of attribute not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) + # no value for empty shape attributes return None - item = self.getAttributeItemByObj(obj, name) - if item is None: - if self.getModifiedTime( - obj_uuid, objType="attribute", name=name, useRoot=False - ): - # attribute has been removed - msg = ( - "Attribute: [" - + name - + "] of object: " - + obj_uuid - + " has been previously deleted" - ) - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - msg = "Attribute: [" + name + "] of object: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - # mix-in timestamps - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid, objType="attribute", name=name) - item["mtime"] = self.getModifiedTime( - obj_uuid, objType="attribute", name=name - ) - - return item - - def isDimensionList(self, attr_name, attr_type): - """ - isDimensionList - return True if this attribute json looks like a dimension list - """ - if attr_name != "DIMENSION_LIST": - return False - if type(attr_type) is not dict: - return False - if attr_type["class"] != "H5T_VLEN": - return False - base_type = attr_type["base"] - if base_type["class"] != "H5T_REFERENCE": - return False - return True + elif shape_json["class"] == "H5S_SCALAR": + dims = () + else: + dims = 
shape_json["dims"] + dtype = createDataType(attr_json["type"]) + value = attr_json["value"] + arr = jsonToArray(dims, dtype, value) + return arr + + + def createAttribute(self, obj_id, name, value, shape=None, dtype=None): + """ + create an attribute - will override any existing attributes + """ + + # TBD: if dtype is a committed ref type, fetch it first + # TBD: also, check special case for complex types + + if isinstance(dtype, str) and dtype.startswith("datatypes/"): + ctype_id = dtype[len("datatypes/"):] + if getCollectionForId(ctype_id) != "datatypes": + raise TypeError(f"unexpected dtype value for createAttribute: {dtype}") + if ctype_id not in self._db: + raise KeyError(f"ctype: {ctype_id} not found") + ctype_json = self.getObjectById(ctype_id) + type_json = ctype_json["type"] + dtype = createDataType(type_json) + + # First, make sure we have a NumPy array. + if isinstance(value, Reference) and dtype is None: + dtype = special_dtype(ref=Reference) + if shape == "H5S_NULL": + if value: + raise ValueError("Value can't be set for Null space attributes") + if dtype is None: + raise ValueError("Dtype must be set for Null space attributes") + else: + dtype = np.dtype(dtype) + else: + value = np.asarray(value, dtype=dtype, order='C') + if dtype is None: + dtype = value.dtype + else: + dtype = np.dtype(dtype) # In case a string, e.g. 'i8' is passed + + # Where a top-level array type is requested, we have to do some + # fiddling around to present the data as a smaller array of + # sub-arrays. + if value is not None: + if dtype.subdtype is not None: + subdtype, subshape = dtype.subdtype + + # Make sure the subshape matches the last N axes' sizes. 
+ if shape[-len(subshape):] != subshape: + raise ValueError(f"Array dtype shape {subshape} is incompatible with data shape {shape}") + + # New "advertised" shape and dtype + shape = shape[0:len(shape) - len(subshape)] + dtype = subdtype + + # Not an array type; make sure to check the number of elements + # is compatible, and reshape if needed. + else: + if isinstance(shape, tuple): + if np.prod(shape) != np.prod(value.shape): + raise ValueError("Shape of new attribute conflicts with shape of data") - def isReferenceList(self, attr_name, attr_type): - """ - isReferenceList - return True if this attribute json looks like a reference list - """ - if attr_name != "REFERENCE_LIST": - return False - if type(attr_type) is not dict: - return False - if attr_type["class"] != "H5T_COMPOUND": - return False + if shape is not None and shape != value.shape: + value = value.reshape(shape) - return True + # We need this to handle special string types. + value = np.asarray(value, dtype=dtype) + value_json = bytesArrayToList(value) + else: + value_json = None - def makeDimensionList(self, obj, shape, value): - """ - makeDimensionList - work-around for h5py problems saving dimension list - - types which are vlen's of references are not working directly, so use dim_scale api - Note: this is a work-around for h5py issue: - https://github.com/h5py/h5py/issues/553 + if shape is None: + shape = value.shape + if shape == "H5S_NULL": + shape_json = {"class": "H5S_NULL"} + elif len(shape) == 0: + shape_json = {"class": "H5S_SCALAR"} + else: + shape_json = {"class": "H5S_SIMPLE"} + shape_json["dims"] = list(shape) + + obj_json = self.getObjectById(obj_id) + attrs_json = obj_json["attributes"] + if name in attrs_json: + # replace, update modified timestamp + created = attrs_json[name]["created"] + modified = time.time() + else: + created = time.time() + modified = None + type_json = getTypeItem(dtype) + # finally put it all together... 
+ attr_json = {"shape": shape_json, "type": type_json, "value": value_json} + attr_json["created"] = created + attr_json["modified"] = modified + + # slot into the obj_json["attrs"] + attrs_json[name] = attr_json + + + def deleteAttribute(self, obj_id, name): + """ delete the given attribute """ + obj_json = self.getObjectById(obj_id) + attrs_json = obj_json["attributes"] + if name not in attrs_json: + raise KeyError(f"attribute [{name}] not found in {obj_id}") + del attrs_json[name] + + + def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. """ - dset_refs = self.listToRef(value) - for i in range(len(dset_refs)): - refs = dset_refs[i] - if type(refs) not in (list, tuple): - msg = "Invalid dimension list value" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - for j in range(len(refs)): - scale_obj = self.f[refs[j]] - if scale_obj is None: - self.log.warning( - "dimension list, missing obj reference: " + value[i] - ) - continue - if "CLASS" not in scale_obj.attrs: - self.log.warning("dimension list, no scale obj") - continue - if scale_obj.attrs["CLASS"] != b"DIMENSION_SCALE": - self.log.warning("dimension list, invalid class for scale obj") - continue + self.log.info(f"getDatasetValues obj_id: {obj_id}, slices: {slices} format: {format}") + #TBD + - try: - h5py.h5ds.attach_scale(obj.id, scale_obj.id, i) - except RuntimeError: - self.log.error("got runtime error attaching scale") - - def writeNdArrayToAttribute(self, attrs, attr_name, npdata, shape, dt): - """ - writeNdArrayToAttribute - create an attribute given numpy array - """ - attrs.create(attr_name, npdata, shape=shape, dtype=dt) - - def makeNullTermStringAttribute(self, obj, attr_name, strLength, value): - """ - create a scalar string attribute using nullterm padding - """ - self.log.info( - "make nullterm, length: " + 
str(strLength) + " value:" + str(value) - ) - value = str(value) - if strLength < len(value): - self.log.warning( - "makeNullTermStringAttribute: value string longer than length" - ) - # value = value[:strLength] # truncate to length - - if isinstance(attr_name, str): - try: - attr_name = attr_name.encode("ascii") - except UnicodeDecodeError: - raise TypeError("non-ascii attribute name not allowed") - - # create the attribute - tid = h5py.h5t.TypeID.copy(h5py.h5t.C_S1) - tid.set_size(strLength) - tid.set_strpad(h5py.h5t.STR_NULLTERM) - sid = h5py.h5s.create(h5py.h5s.SCALAR) - aid = h5py.h5a.create(obj.id, attr_name, tid, sid) - # write the value - dtype_code = "S" + str(strLength) - ndarr = np.array(value, dtype=np.dtype(dtype_code)) - aid.write(ndarr) - - def makeAttribute(self, obj, attr_name, shape, attr_type, value): - """ - makeAttribute - create an attribute (except for dimension list - attribute) - """ - is_committed_type = False - if isinstance(attr_type, str) and len(attr_type) == UUID_LEN: - # assume attr_type is a uuid of a named datatype - is_committed_type = True - - dt = self.createTypeFromItem(attr_type) - - if shape is None: - self.log.info("shape is null - will create null space attribute") - # create null space attribute - # null space datasets/attributes not supported in h5py yet: - # See: https://github.com/h5py/h5py/issues/279 - # work around this by using low-level interface. 
- # first create a temp scalar dataset so we can pull out the typeid - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - tmpGrp.attrs.create(attr_name, 0, shape=(), dtype=dt) - b_attr_name = attr_name.encode("utf-8") - tmpAttr = h5py.h5a.open(tmpGrp.id, name=b_attr_name) - if not tmpAttr: - msg = "Unexpected error creating datatype for nullspace attribute" - self.log.error(msg) - raise IOError(errno.EIO, msg) - tid = tmpAttr.get_type() - sid = sid = h5py.h5s.create(h5py.h5s.NULL) - # now create the permanent attribute - if attr_name in obj.attrs: - self.log.info("deleting attribute: " + attr_name) - del obj.attrs[attr_name] - attr_id = h5py.h5a.create(obj.id, b_attr_name, tid, sid) - # delete the temp attribute - del tmpGrp.attrs[attr_name] - if not attr_id: - msg = "Unexpected error creating nullspace attribute" - self.log.error(msg) - raise IOError(errno.EIO, msg) - else: - if type(value) is tuple: - value = list(value) - if type(shape) is list: - shape = tuple(shape) - if not is_committed_type: - # apparently committed types can not be used as reference types - # todo - verify why that is - - rank = len(shape) - # convert python list to numpy object - strPad = None - strLength = 0 - if ( - isinstance(attr_type, dict) - and attr_type["class"] == "H5T_STRING" - and "strPad" in attr_type - ): - strPad = attr_type["strPad"] - strLength = attr_type["length"] - - if ( - rank == 0 - and isinstance(strLength, int) - and strPad == "H5T_STR_NULLTERM" - ): - self.makeNullTermStringAttribute(obj, attr_name, strLength, value) - else: - typeItem = getTypeItem(dt) - dt = convert_dtype(dt) - value = self.toRef(rank, typeItem, value) - - # create numpy array - npdata = np.zeros(shape, dtype=dt) - - if rank == 0: - npdata[()] = self.toNumPyValue(attr_type, value, npdata[()]) - else: - self.toNumPyArray(rank, attr_type, value, npdata) - - self.writeNdArrayToAttribute( - obj.attrs, attr_name, npdata, 
shape, dt - ) - - """ - createAttribute - create an attribute - """ - - def createAttribute(self, col_name, obj_uuid, attr_name, shape, attr_type, value): - self.log.info("createAttribute: [" + attr_name + "]") - - self.initFile() - if self.readonly: - msg = "Unable to create attribute (updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - obj = self.getObjectByUuid(col_name, obj_uuid) - if not obj: - msg = "Object with uuid: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if self.isDimensionList(attr_name, attr_type): - self.makeDimensionList(obj, shape, value) - elif self.isReferenceList(attr_name, attr_type): - pass # Skip since reference list will be created by attach scale - else: - self.makeAttribute(obj, attr_name, shape, attr_type, value) - - now = time.time() - self.setCreateTime(obj_uuid, objType="attribute", name=attr_name, timestamp=now) - self.setModifiedTime( - obj_uuid, objType="attribute", name=attr_name, timestamp=now - ) - self.setModifiedTime(obj_uuid, timestamp=now) # owner entity is modified - - def deleteAttribute(self, col_name, obj_uuid, attr_name): - self.initFile() - if self.readonly: - msg = "Unable to delete attribute (updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - obj = self.getObjectByUuid(col_name, obj_uuid) - - if attr_name not in obj.attrs: - msg = ( - "Attribute with name: [" - + attr_name - + "] of object: " - + obj_uuid - + " not found" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - del obj.attrs[attr_name] - now = time.time() - self.setModifiedTime( - obj_uuid, objType="attribute", name=attr_name, timestamp=now - ) - - return True - - """ - Return a json-serializable representation of the numpy value - """ - - def getDataValue(self, typeItem, value, dimension=0, dims=None): - if dimension > 0: - if type(dims) not in (list, tuple): - msg = "unexpected type for type array dimensions" - self.log.error(msg) - raise 
IOError(errno.EIO, msg) - out = [] - rank = len(dims) - if dimension > rank: - msg = "unexpected dimension for type array" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nElements = dims[rank - dimension] - for i in range(nElements): - item_value = self.getDataValue( - typeItem, value[i], dimension=(dimension - 1), dims=dims - ) - out.append(item_value) - return out # done for array case - - out = None - typeClass = typeItem["class"] - if isinstance(value, (np.ndarray, np.generic)): - value = value.tolist() # convert numpy object to list - if typeClass == "H5T_COMPOUND": - if type(value) not in (list, tuple): - msg = "Unexpected type for compound value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - fields = typeItem["fields"] - if len(fields) != len(value): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - out = [] - for i in range(nFields): - field = fields[i] - item_value = self.getDataValue(field["type"], value[i]) - out.append(item_value) - elif typeClass == "H5T_VLEN": - if type(value) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - out = [] - nElements = len(value) - for i in range(nElements): - item_value = self.getDataValue(baseType, value[i]) - out.append(item_value) - elif typeClass == "H5T_REFERENCE": - out = self.refToList(value) - elif typeClass == "H5T_OPAQUE": - out = "???" 
# todo - elif typeClass == "H5T_ARRAY": - type_dims = typeItem["dims"] - if type(type_dims) not in (list, tuple): - msg = "unexpected type for type array dimensions" - self.log.error(msg) - raise IOError(errno.EIO, msg) - rank = len(type_dims) - baseType = typeItem["base"] - out = self.getDataValue(baseType, value, dimension=rank, dims=type_dims) - - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - out = value # just copy value - elif typeClass == "H5T_STRING": - if "charSet" in typeItem: - charSet = typeItem["charSet"] - else: - charSet = "H5T_CSET_ASCII" - if charSet == "H5T_CSET_ASCII" and isinstance(value, bytes): - out = value.decode("utf-8") - else: - out = value - else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - return out - - def getRefValue(self, typeItem: dict, value: list): - """ - Return a numpy value based on json representation - """ - out = None - typeClass = typeItem["class"] - if typeClass == "H5T_COMPOUND": - if not isinstance(value, (list, tuple)): - msg = f"Unexpected type for compound value: {type(value)}" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - fields = typeItem["fields"] - if len(fields) != len(value): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - out = [] - for i in range(nFields): - field = fields[i] - item_value = self.getRefValue(field["type"], value[i]) - out.append(item_value) - elif typeClass == "H5T_VLEN": - if type(value) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - out = [] - nElements = len(value) - for i in range(nElements): - item_value = self.getRefValue(baseType, value[i]) - out.append(item_value) - elif typeClass == "H5T_REFERENCE": - out = self.listToRef(value) - elif typeClass == "H5T_OPAQUE": - out = "???" 
# todo - elif typeClass == "H5T_ARRAY": - out = self.toRef(len(typeItem["dims"]), typeItem["base"], value) - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - out = value # just copy value - elif typeClass == "H5T_STRING": - if typeItem["charSet"] == "H5T_CSET_UTF8": - # out = value.encode('utf-8') - out = value - else: - out = value.encode() - else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - - if isinstance(out, list): - out = tuple(out) # convert to tuple - return out - - """ - Return a numpy value based on json representation - """ - - def toNumPyValue(self, typeItem, src, des): - typeClass = "H5T_INTEGER" # default to int type - if type(typeItem) is dict: - typeClass = typeItem["class"] - if typeClass == "H5T_COMPOUND": - fields = typeItem["fields"] - if len(fields) != len(src): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - - for i in range(nFields): - field = fields[i] - field_name = field["name"] - des[field_name] = src[i] - - elif typeClass == "H5T_VLEN": - if type(src) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - - dt = self.createTypeFromItem(baseType) - dt = convert_dtype(dt) - des = np.array(src, dtype=dt) - - elif typeClass == "H5T_REFERENCE": - des = src # self.listToRef(src) - - elif typeClass == "H5T_OPAQUE": - des = "???" 
# todo - elif typeClass == "H5T_ARRAY": - des = src - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - des = src # just copy value - elif typeClass == "H5T_STRING": - if typeItem["charSet"] == "H5T_CSET_UTF8": - des = src # src.encode('utf-8') - else: - if type(src) is str: - try: - src.encode("ascii") - except UnicodeDecodeError: - raise TypeError( - "non-ascii value not allowed with H5T_CSET_ASCII" - ) - des = src - - else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - return des - - """ - copy src data to numpy array - """ - - def toNumPyArray(self, rank, typeItem, src, des): - if rank == 0: - msg = "unexpected rank value" - self.log.error(msg) - raise IOError(errno.EIO, msg) # shouldn't be called with rank 0 - - for i in range(len(des)): - des_sec = des[i] # numpy slab - - src_sec = src[i] - - if rank > 1: - self.toNumPyArray(rank - 1, typeItem, src_sec, des_sec) - else: - rv = self.toNumPyValue(typeItem, src_sec, des_sec) - # if the numpy object is writeable, des_sec will be - # already updated. Otherwise, update the des by assignment - if not hasattr(des_sec, "flags") or not des_sec.flags["WRITEABLE"]: - des[i] = rv - - def toRef(self, rank, typeItem, data): - """ - Convert json list to h5py compatible values - """ - out = None - - if isinstance(typeItem, str): - # commited type - get json representation - committed_type_item = self.getCommittedTypeItemByUuid(typeItem) - typeItem = committed_type_item["type"] - - typeClass = typeItem["class"] - if typeClass in ("H5T_INTEGER", "H5T_FLOAT"): - out = data # just use as is - - elif rank == 0: - # scalar value - out = self.getRefValue(typeItem, data) - else: - out = [] - for item in data: - if rank > 1: - out_item = self.toRef(rank - 1, typeItem, item) - out.append(out_item) - else: - out_item = self.getRefValue(typeItem, item) - out.append(out_item) - - return out - - """ - Convert list to json serializable values. 
- """ - - def toList(self, rank, typeItem, data): - out = None - typeClass = typeItem["class"] - if typeClass in ("H5T_INTEGER", "H5T_FLOAT"): - out = data.tolist() # just use as is - - elif rank == 0: - # scalar value - out = self.getDataValue(typeItem, data) - else: - out = [] - for item in data: - if rank > 1: - out_item = self.toList(rank - 1, typeItem, item) - out.append(out_item) - else: - out_item = self.getDataValue(typeItem, item) - out.append(out_item) - - return out - - """ - Create ascii representation of vlen data object - """ - - def vlenToList(self, data): - # todo - verify that data is a numpy.ndarray - out = None - if len(data.shape) == 0: - out = [] - else: - try: - if data.dtype.kind != "O": - out = data.tolist() - else: - out = [] - for item in data: - out.append(self.vlenToList(item)) # recursive call - except AttributeError: - # looks like this is not a numpy ndarray, just return the value - out = data - return out - - """ - Create ascii representation of ref data object - """ - - def refToList(self, data): - # todo - verify that data is a numpy.ndarray - out = None - if type(data) is h5py.h5r.Reference: - if bool(data): - grpref = self.f[data] - addr = h5py.h5o.get_info(grpref.id).addr - uuid = self.getUUIDByAddress(addr) - if self.getGroupObjByUuid(uuid): - out = "groups/" + uuid - elif self.getDatasetObjByUuid(uuid): - out = "datasets/" + uuid - elif self.getCommittedTypeObjByUuid(uuid): - out = "datatypes/" + uuid - else: - self.log.warning("uuid in region ref not found: [" + uuid + "]") - return None - else: - out = "null" - elif type(data) is h5py.h5r.RegionReference: - out = self.getRegionReference(data) - else: - out = [] - for item in data: - out.append(self.refToList(item)) # recursive call - return out - - """ - Convert ascii representation of data references to data ref - """ - - def listToRef(self, data): - out = None - if not data: - # null reference - out = self.getNullReference() - elif isinstance(data, (bytes, str)): - obj_ref 
= None - # object reference should be in the form: / - for prefix in ("datasets", "groups", "datatypes"): - if data.startswith(prefix): - uuid_ref = data[len(prefix):] - if len(uuid_ref) == (UUID_LEN + 1) and uuid_ref.startswith("/"): - obj = self.getObjectByUuid(prefix, uuid_ref[1:]) - if obj: - obj_ref = obj.ref - else: - msg = ( - "Invalid object reference value: [" - + uuid_ref - + "] not found" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - break - if not obj_ref: - msg = "Invalid object reference value: [" + data + "]" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - else: - out = obj_ref - - elif isinstance(data, (list, tuple)): - out = [] - for item in data: - out.append(self.listToRef(item)) # recursive call - elif isinstance(data, dict): - # assume region ref - out = self.createRegionReference(data) - else: - msg = "Invalid object reference value type: [" + str(type(data)) + "]" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - return out - - def bytesArrayToList(self, data): - """ - Convert list that may contain bytes type elements to list of string elements - """ - if isinstance(data, (bytes, str)): - is_list = False - elif isinstance(data, (np.ndarray, np.generic)): - if len(data.shape) == 0: - is_list = False - data = data.tolist() # tolist will return a scalar in this case - if isinstance(data, (list, tuple)): - is_list = True - else: - is_list = False - else: - is_list = True - elif isinstance(data, (list, tuple)): - is_list = True - else: - is_list = False - - if is_list: - out = [] - for item in data: - out.append(self.bytesArrayToList(item)) # recursive call - elif isinstance(data, bytes): - out = data.decode("utf-8") - else: - out = data - - return out - - def getRegionReference(self, regionRef): - """ - Get item description of region reference value - """ - selectionEnums = { - h5py.h5s.SEL_NONE: "H5S_SEL_NONE", - h5py.h5s.SEL_ALL: "H5S_SEL_ALL", - h5py.h5s.SEL_POINTS: "H5S_SEL_POINTS", - h5py.h5s.SEL_HYPERSLABS: 
"H5S_SEL_HYPERSLABS", - } - - item = {} - objid = h5py.h5r.dereference(regionRef, self.f.file.file.id) - if objid: - item["id"] = self.getUUIDByAddress(h5py.h5o.get_info(objid).addr) - else: - self.log.info("region reference unable to find item with objid: " + objid) - return item - - sel = h5py.h5r.get_region(regionRef, objid) - select_type = sel.get_select_type() - if select_type not in selectionEnums: - msg = "Unexpected selection type: " + regionRef.typecode - self.log.error(msg) - raise IOError(errno.EIO, msg) - item["select_type"] = selectionEnums[select_type] - pointlist = None - if select_type == h5py.h5s.SEL_POINTS: - # retrieve a numpy array of selection points - points = sel.get_select_elem_pointlist() - pointlist = points.tolist() - elif select_type == h5py.h5s.SEL_HYPERSLABS: - points = sel.get_select_hyper_blocklist() - if points is not None: - pointlist = points[...].tolist() - # bump up the second coordinate by one to match api spec - for point in pointlist: - coord2 = point[1] - for i in range(len(coord2)): - coord2[i] = coord2[i] + 1 - - item["selection"] = pointlist - - return item - - def createRegionReference(self, item): - """ - Create region reference from item description of region reference value - """ - selectionEnums = { - "H5S_SEL_NONE": h5py.h5s.SEL_NONE, - "H5S_SEL_ALL": h5py.h5s.SEL_ALL, - "H5S_SEL_POINTS": h5py.h5s.SEL_POINTS, - "H5S_SEL_HYPERSLABS": h5py.h5s.SEL_HYPERSLABS, - } - region_ref = None - - if "select_type" not in item: - msg = "select_type not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - select_type = item["select_type"] - if select_type not in selectionEnums.keys(): - msg = "selection type: [" + select_type + "] is not valid" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - dset = None - if select_type == "H5S_SEL_NONE": - if "id" not in item: - # select none on null dataset, return null ref - out = self.getNullReference() - return out - else: # select_type != 
'H5S_SEL_NONE' - if "id" not in item: - msg = "id not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - # Otherwise need to provide uuid of dataset - uuid_ref = item["id"] - if len(uuid_ref) != UUID_LEN: - msg = "uuid value: [" + uuid_ref + "] for region reference is not valid" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - obj = self.getObjectByUuid("datasets", uuid_ref) - if obj: - dset = obj - else: - msg = "Invalid region refence value: [" + uuid_ref + "] not found" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if select_type in ("H5S_SEL_POINTS", "H5S_SEL_HYPERSLABS"): - if "selection" not in item: - msg = "selection key not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - rank = len(dset.shape) - space_id = h5py.h5d.DatasetID.get_space(dset.id) - h5py.h5s.SpaceID.select_none(space_id) - - if select_type == "H4S_SEL_NONE": - pass # did select_none above - elif select_type == "H5S_SEL_ALL": - h5py.h5s.SpaceID.select_all(space_id) - elif select_type == "H5S_SEL_POINTS": - selection = item["selection"] - for point in selection: - if len(point) != rank: - msg = "point selection number of elements must mach rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - h5py.h5s.SpaceID.select_elements(space_id, selection) - elif select_type == "H5S_SEL_HYPERSLABS": - selection = item["selection"] - - for slab in selection: - # each item should be a two element array defining the hyperslab boundary - if len(slab) != 2: - msg = "selection value not valid (not a 2 element array)" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - start = slab[0] - if isinstance(start, list): - start = tuple(start) - if type(start) is not tuple or len(start) != rank: - msg = "selection value not valid, start element should have number " - msg += "elements equal to rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - 
stop = slab[1] - if isinstance(stop, list): - stop = tuple(stop) - if type(stop) is not tuple or len(stop) != rank: - msg = "selection value not valid, count element should have number " - msg += "elements equal to rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - count = [] - for i in range(rank): - if start[i] < 0: - msg = "start value for hyperslab selection must be non-negative" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if stop[i] <= start[i]: - msg = "stop value must be greater than start value for hyperslab selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - count.append(stop[i] - start[i]) - count = tuple(count) - - h5py.h5s.SpaceID.select_hyperslab( - space_id, start, count, op=h5py.h5s.SELECT_OR - ) - - # now that we've selected the desired region in the space, return a region reference - dset_name = dset.name.encode("utf-8") - region_ref = h5py.h5r.create( - self.f.id, dset_name, h5py.h5r.DATASET_REGION, space_id - ) - - return region_ref - - def toTuple(self, rank, data): - """ - Convert a list to a tuple, recursively. - Example. [[1,2],[3,4]] -> ((1,2),(3,4)) - """ - if isinstance(data, (list, tuple)): - if rank > 0: - return list(self.toTuple(rank - 1, x) for x in data) - else: - return tuple(self.toTuple(rank - 1, x) for x in data) - else: - return data - - def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): - """ - Get values from dataset identified by obj_uuid. - If a slices list or tuple is provided, it should have the same - number of elements as the rank of the dataset. 
- """ - dset = self.getDatasetObjByUuid(obj_uuid) - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - values = None - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset.shape is None: - # null space dataset (with h5py 2.6.0) - return None - - rank = len(dset.shape) - - if rank == 0: - # check for null dataspace - try: - val = dset[...] - except IOError: - # assume null dataspace, return none - return None - if val is None: - self.log.warning("no value returned from scalar dataset") - - if not isinstance(slices, (list, tuple)) and slices is not Ellipsis: - msg = "Unexpected error: getDatasetValuesByUuid: bad type for dim parameter" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - if isinstance(slices, (list, tuple)) and len(slices) != rank: - msg = "Unexpected error: getDatasetValuesByUuid: " - msg += "number of dims in selection not same as rank" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - if dt.kind == "O": - if format != "json": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - # numpy object type - could be a vlen string or generic vlen - h5t_check = h5py.h5t.check_dtype(vlen=dt) - if h5t_check is str or h5t_check is bytes: - values = self.bytesArrayToList(dset[slices]) - elif h5t_check is not None: - # other vlen data - values = self.vlenToList(dset[slices]) - else: - # check for reference type - h5t_check = h5py.h5t.check_dtype(ref=dt) - if h5t_check is not None: - # reference type - values = self.refToList(dset[slices]) - else: - msg = "Unexpected 
error, object type unknown" - self.log.error(msg) - raise IOError(errno.EIO, msg) - elif dt.kind == "V" and len(dt) <= 1 and len(dt.shape) == 0 and not dt.names: - # opaque type - skip for now - self.log.warning("unable to get opaque type values") - values = "????" - elif dt.kind == "S" and format == "json": - values = self.bytesArrayToList(dset[slices]) - elif len(dt) > 1 or dt.names: - # compound type - if format == "json": - values = self.bytesArrayToList(dset[slices]) - else: - values = dset[slices].tobytes() - else: - values = dset[slices] - - # just use tolist to dump - if format == "json": - values = values.tolist() - else: - # values = base64.b64encode(dset[slices].tobytes()) - values = values.tobytes() - - return values - - """ - doDatasetQueryByUuid: return rows based on query string - Return rows from a dataset that matches query string. - - Note: Only supported for compound_type/one-dimensional datasets - """ - - def doDatasetQueryByUuid( - self, obj_uuid, query, start=0, stop=-1, step=1, limit=None - ): - self.log.info("doQueryByUuid - uuid: " + obj_uuid + " query:" + query) - self.log.info( - "start: " - + str(start) - + " stop: " - + str(stop) - + " step: " - + str(step) - + " limit: " - + str(limit) - ) - dset = self.getDatasetObjByUuid(obj_uuid) - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - values = [] - dt = dset.dtype - typeItem = getTypeItem(dt) - # itemSize = getItemSize(typeItem) - if typeItem["class"] != "H5T_COMPOUND": - msg = "Only compound type datasets can be used as query target" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset.shape is None: - # null space dataset (with h5py 2.6.0) - return None - - rank = len(dset.shape) - if rank != 1: - msg = "One one-dimensional datasets can be used as query target" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - values = [] - indexes = [] - count = 0 - - num_elements = dset.shape[0] - if 
stop == -1: - stop = num_elements - elif stop > num_elements: - stop = num_elements - block_size = self._getBlockSize(dset) - self.log.info("block_size: " + str(block_size)) - - field_names = list(dset.dtype.fields.keys()) - eval_str = self._getEvalStr(query, field_names) - - while start < stop: - if limit and (count == limit): - break # no more rows for this batch - end = start + block_size - if end > stop: - end = stop - rows = dset[start:end] # read from dataset - where_result = np.where(eval(eval_str)) - index = where_result[0].tolist() - if len(index) > 0: - for i in index: - row = rows[i] - item = self.bytesArrayToList(row) - values.append(item) - indexes.append(start + i) - count += 1 - if limit and (count == limit): - break # no more rows for this batch - - start = end # go to next block - - # values = self.getDataValue(item_type, values, dimension=1, dims=(len(values),)) - - self.log.info("got " + str(count) + " query matches") - return (indexes, values) - - """ - _getBlockSize: Get number of rows to read from disk - - heurestic to get reasonable sized chunk of data to fetch. - make multiple of chunk_size if possible - """ - - def _getBlockSize(self, dset): - target_block_size = 256 * 1000 - if dset.chunks: - chunk_size = dset.chunks[0] - if chunk_size < target_block_size: - block_size = (target_block_size // chunk_size) * chunk_size - else: - block_size = target_block_size - else: - block_size = target_block_size - return block_size - - """ - _getEvalStr: Get eval string for given query - - Gets Eval string to use with numpy where method. 
- """ - - def _getEvalStr(self, query, field_names): - i = 0 - eval_str = "" - var_name = None - end_quote_char = None - var_count = 0 - paren_count = 0 - black_list = ("import",) # field names that are not allowed - self.log.info("getEvalStr(" + query + ")") - for item in black_list: - if item in field_names: - msg = "invalid field name" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - while i < len(query): - ch = query[i] - if (i + 1) < len(query): - ch_next = query[i + 1] - else: - ch_next = None - if var_name and not ch.isalnum(): - # end of variable - if var_name not in field_names: - # invalid - msg = "unknown field name" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - eval_str += "rows['" + var_name + "']" - var_name = None - var_count += 1 - - if end_quote_char: - if ch == end_quote_char: - # end of literal - end_quote_char = None - eval_str += ch - elif ch in ("'", '"'): - end_quote_char = ch - eval_str += ch - elif ch.isalpha(): - if ch == "b" and ch_next in ("'", '"'): - eval_str += "b" # start of a byte string literal - elif var_name is None: - var_name = ch # start of a variable - else: - var_name += ch - elif ch == "(" and end_quote_char is None: - paren_count += 1 - eval_str += ch - elif ch == ")" and end_quote_char is None: - paren_count -= 1 - if paren_count < 0: - msg = "Mismatched paren" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - eval_str += ch - else: - # just add to eval_str - eval_str += ch - i = i + 1 - if end_quote_char: - msg = "no matching quote character" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - if var_count == 0: - msg = "No field value" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - if paren_count != 0: - msg = "Mismatched paren" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - - return eval_str - - """ - Get values from dataset identified by obj_uuid using the given - point selection. 
- """ - - def getDatasetPointSelectionByUuid(self, obj_uuid, points): - dset = self.getDatasetObjByUuid(obj_uuid) - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - rank = len(dset.shape) - values = np.zeros(len(points), dtype=dset.dtype) - try: - i = 0 - for point in points: - if rank == 1: - values[i] = dset[[point]] - else: - values[i] = dset[tuple(point)] - i += 1 - except ValueError: - # out of range error - msg = "getDatasetPointSelection, out of range error" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - return values.tolist() - - """ - setDatasetValuesByUuid - update the given dataset values with supplied data - and optionally a hyperslab selection (slices) - """ - - def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): - dset = self.getDatasetObjByUuid(obj_uuid) - - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if format == "binary" and type(data) is not bytes: - msg = "data must be of type bytes for binary writing" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - rank = len(dset.shape) - arraySize = 1 - for extent in dset.shape: - arraySize *= arraySize - - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if slices is None: - slices = [] - # create selection that covers entire dataset - for dim in range(rank): - s = slice(0, dset.shape[dim], 1) - slices.append(s) - slices = tuple(slices) - - if not isinstance(slices, tuple): - msg = "setDatasetValuesByUuid: bad type for dim parameter" - self.log.error(msg) - raise 
IOError(errno.EIO, msg) - - if len(slices) != rank: - msg = "number of dims in selection not same as rank" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - npoints = 1 - np_shape = [] - for i in range(rank): - s = slices[i] - - if s.start < 0 or s.step <= 0 or s.stop < s.start: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if s.stop > dset.shape[i]: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - np_shape.append(s.stop - s.start) - - count = (s.stop - s.start) // s.step - if count <= 0: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - npoints *= count - - np_shape = tuple(np_shape) # for comparison with ndarray shape - - self.log.info("selection shape:" + str(np_shape)) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. - if format != "binary" and dset.dtype.names and isinstance(data, (list, tuple)): - data = self.toTuple(rank, data) - # for i in range(len(data)): - # converted_data.append(self.toTuple(data[i])) - # data = converted_data - else: - h5t_check = h5py.check_dtype(ref=dset.dtype) - if h5t_check in (h5py.Reference, h5py.RegionReference): - # convert data to data refs - if format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - data = self.listToRef(data) - - if format == "binary": - if npoints * itemSize != len(data): - msg = ( - "Expected: " - + str(npoints * itemSize) - + " bytes, but got: " - + str(len(data)) - ) - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if dset.dtype.shape == (): - arr = np.fromstring(data, dtype=dset.dtype) - arr = arr.reshape(np_shape) # conform to selection shape - else: - # tricy array type! 
- arr = np.empty(np_shape, dtype=dset.dtype) - base_arr = np.fromstring(data, dtype=dset.dtype.base) - base_shape = list(np_shape) - base_shape.extend(dset.dtype.shape) # add on the type dimensions - base_arr = base_arr.reshape(base_shape) - arr[...] = base_arr - else: - # data is json - if npoints == 1 and len(dset.dtype) > 1: - # convert to tuple for compound singleton writes - data = [ - tuple(data), - ] - - arr = np.array(data, dtype=dset.dtype) - # raise an exception of the array shape doesn't match the selection shape - # allow if the array is a scalar and the selection shape is one element, - # numpy is ok with this - np_index = 0 - for dim in range(len(arr.shape)): - data_extent = arr.shape[dim] - selection_extent = 1 - if np_index < len(np_shape): - selection_extent = np_shape[np_index] - if selection_extent == data_extent: - np_index += 1 - continue # good - if data_extent == 1: - continue # skip singleton selection - if selection_extent == 1: - np_index += 1 - continue # skip singleton selection - - # selection/data mismatch! 
- msg = "data shape doesn't match selection shape" - msg += "--data shape: " + str(arr.shape) - msg += "--selection shape: " + str(np_shape) - - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - # write temp numpy array to dataset - if rank == 1: - s = slices[0] - try: - dset[s] = arr - except TypeError as te: - self.log.info("h5py setitem exception: " + str(te)) - raise IOError(errno.EINVAL, str(te)) - else: - try: - dset[slices] = arr - except TypeError as te: - self.log.info("h5py setitem exception: " + str(te)) - raise IOError(errno.EINVAL, str(te)) - - # update modified time - self.setModifiedTime(obj_uuid) - return True - - """ - setDatasetValuesByPointSelection - Update the dataset values using the given - data and point selection - """ - - def setDatasetValuesByPointSelection(self, obj_uuid, data, points, format="json"): - dset = self.getDatasetObjByUuid(obj_uuid) - - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if format == "binary" and type(data) is not bytes: - msg = "data must be of type bytes for binary writing" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - rank = len(dset.shape) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. 
- if format == "json" and len(dset.dtype) > 1 and type(data) in (list, tuple): - raise NotImplementedError("need some special conversion for compound types") - # converted_data = self.toTuple(rank, data) - # for i in range(len(data)): - # converted_data.append(self.toTuple(data[i])) - # data = converted_data - - if format == "json": - try: - i = 0 - for point in points: - if rank == 1: - dset[[point]] = data[i] - else: - dset[tuple(point)] = data[i] - i += 1 - except ValueError: - # out of range error - msg = "setDatasetValuesByPointSelection, out of range error" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - else: - # binary - arr = np.fromstring(data, dtype=dset.dtype) - dset[points] = arr # coordinate write - - # update modified time - self.setModifiedTime(obj_uuid) - return True - - """ - createDataset - creates new dataset given shape and datatype - Returns item - """ + """ + createDataset - creates new dataset given shape and datatype + Returns obj_id + """ def createDataset( - self, datatype, datashape, max_shape=None, creation_props=None, obj_uuid=None + self, + shape=None, + dtype=None, + chunks=None, + compression=None, + shuffle=None, + maxshape=None, + compression_opts=None, + fillvalue=None, + cpl=None, ): - self.initFile() - if self.readonly: - msg = "Unable to create dataset (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - datasets = self.dbGrp["{datasets}"] - if not obj_uuid: - obj_uuid = createObjId() - dt = None - item = {} - fillvalue = None - - # h5py.createdataset fields - kwargs = {} # key word arguments for h5py dataset creation - - if creation_props is None: - creation_props = {} # create empty list for convience - - if creation_props: - if "fillValue" in creation_props: - fillvalue = creation_props["fillValue"] - if "trackTimes" in creation_props: - kwargs["track_times"] = creation_props["trackTimes"] - if "layout" in creation_props: - layout = creation_props["layout"] - if "dims" in layout: - 
kwargs["chunks"] = tuple(layout["dims"]) - if "filters" in creation_props: - filter_props = creation_props["filters"] - for filter_prop in filter_props: - if "id" not in filter_prop: - msg = "filter id not provided" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - filter_id = filter_prop["id"] - if filter_id not in _HDF_FILTERS: - self.log.info( - "unknown filter id: " + str(filter_id) + " ignoring" - ) - continue - - hdf_filter = _HDF_FILTERS[filter_id] - - self.log.info("got filter: " + str(filter_id)) - if "alias" not in hdf_filter: - self.log.info( - "unsupported filter id: " + str(filter_id) + " ignoring" - ) - continue - - filter_alias = hdf_filter["alias"] - if not h5py.h5z.filter_avail(filter_id): - self.log.info( - "compression filter not available, filter: " - + filter_alias - + " will be ignored" - ) - continue - if filter_alias in _H5PY_COMPRESSION_FILTERS: - if kwargs.get("compression"): - self.log.info( - "compression filter already set, filter: " - + filter_alias - + " will be ignored" - ) - continue - - kwargs["compression"] = filter_alias - self.log.info( - "setting compression filter to: " + kwargs["compression"] - ) - if filter_alias == "gzip": - # check for an optional compression value - if "level" in filter_prop: - kwargs["compression_opts"] = filter_prop["level"] - elif filter_alias == "szip": - bitsPerPixel = None - coding = "nn" - - if "bitsPerPixel" in filter_prop: - bitsPerPixel = filter_prop["bitsPerPixel"] - if "coding" in filter_prop: - if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": - coding = "ec" - elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": - coding = "nn" - else: - msg = "invalid szip option: 'coding'" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, - # so these options will be ignored - if "pixelsPerBlock" in filter_props: - self.log.info("ignoring szip option: 'pixelsPerBlock'") - if "pixelsPerScanline" in filter_props: - 
self.log.info( - "ignoring szip option: 'pixelsPerScanline'" - ) - if bitsPerPixel: - kwargs["compression_opts"] = (coding, bitsPerPixel) - else: - if filter_alias == "shuffle": - kwargs["shuffle"] = True - elif filter_alias == "fletcher32": - kwargs["fletcher32"] = True - elif filter_alias == "scaleoffset": - if "scaleOffset" not in filter_prop: - msg = "No scale_offset provided for scale offset filter" - self.log(msg) - raise IOError(errno.EINVAL, msg) - kwargs["scaleoffset"] = filter_prop["scaleOffset"] - else: - self.log.info( - "Unexpected filter name: " - + filter_alias - + " , ignoring" - ) - - dt_ref = self.createTypeFromItem(datatype) - if dt_ref is None: - msg = "Unexpected error, no type returned" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - dt = dt_ref - if hasattr(dt_ref, "dtype"): - # dt_ref is actualy a handle to a committed type - # get the dtype prop, but use dt_ref for the actual dataset creation - dt = dt_ref.dtype - - if fillvalue and len(dt) > 1 and type(fillvalue) in (list, tuple): - # for compound types, need to convert from list to dataset compatible element - - if len(dt) != len(fillvalue): - msg = "fillvalue has incorrect number of elements" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - ndscalar = np.zeros((), dtype=dt) - for i in range(len(fillvalue)): - field = dt.names[i] - ndscalar[field] = self.toTuple(0, fillvalue[i]) - fillvalue = ndscalar - + + kwds = {} + if chunks: + kwds["chunks"] = chunks + if compression: + kwds["compression"] = compression + if shuffle: + kwds["shuffle"] = shuffle + if compression_opts: + kwds["compression_opts"] = compression_opts + if maxshape: + kwds["maxshape"] = maxshape if fillvalue: - kwargs["fillvalue"] = fillvalue - - dataset_id = None - if datashape is None: - # create null space dataset - # null space datasets not supported in h5py yet: - # See: https://github.com/h5py/h5py/issues/279 - # work around this by using low-level interface. 
- # first create a temp scalar dataset so we can pull out the typeid - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - tmpDataset = tmpGrp.create_dataset(obj_uuid, shape=(1,), dtype=dt_ref) - tid = tmpDataset.id.get_type() - sid = sid = h5py.h5s.create(h5py.h5s.NULL) - # now create the permanent dataset - gid = datasets.id - b_obj_uuid = obj_uuid.encode("utf-8") - dataset_id = h5py.h5d.create(gid, b_obj_uuid, tid, sid) - # delete the temp dataset - del tmpGrp[obj_uuid] - else: - # create the dataset - try: - newDataset = datasets.create_dataset( - obj_uuid, - shape=datashape, - maxshape=max_shape, - dtype=dt_ref, - **kwargs, - ) - except ValueError as ve: - msg = "Unable to create dataset" - try: - msg += ": " + ve.message - except AttributeError: - pass # no message - self.log.info(msg) - raise IOError(errno.EINVAL, msg) # assume this is due to invalid params - - if newDataset: - dataset_id = newDataset.id - - if dataset_id is None: - msg = "Unexpected failure to create dataset" - self.log.error(msg) - raise IOError(errno.EIO, msg) - # store reverse map as an attribute - addr = h5py.h5o.get_info(dataset_id).addr - addrGrp = self.dbGrp["{addr}"] - addrGrp.attrs[str(addr)] = obj_uuid - - # save creation props if any - if creation_props: - self.setDatasetCreationProps(obj_uuid, creation_props) - - # set timestamp - now = time.time() - self.setCreateTime(obj_uuid, timestamp=now) - self.setModifiedTime(obj_uuid, timestamp=now) - - item["id"] = obj_uuid - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - item["attributeCount"] = 0 - return item - - """ - Resize existing Dataset - """ - - def resizeDataset(self, obj_uuid, shape): - self.log.info("resizeDataset(") # + obj_uuid + "): ") # + str(shape)) - self.initFile() - if self.readonly: - msg = "Unable to resize dataset (Updates are not allowed)" - 
self.log.info(msg) - raise IOError(errno.EACESS, msg) - dset = self.getDatasetObjByUuid(obj_uuid) # will throw exception if not found - if len(shape) != len(dset.shape): - msg = "Unable to resize dataset, shape has wrong number of dimensions" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - for i in range(len(shape)): - if shape[i] < dset.shape[i]: - msg = "Unable to resize dataset, cannot make extent smaller" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if dset.maxshape[i] is not None and shape[i] > dset.maxshape[i]: - msg = "Unable to resize dataset, max extent exceeded" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - dset.resize(shape) # resize - - # update modified time - self.setModifiedTime(obj_uuid) - - """ - Check if link points to given target (as a HardLink) - """ - - def isObjectHardLinked(self, parentGroup, targetGroup, linkName): - try: - linkObj = parentGroup.get(linkName, None, False, True) - linkClass = linkObj.__class__.__name__ - except TypeError: - # UDLink? 
Ignore for now - return False - if linkClass == "SoftLink": - return False - elif linkClass == "ExternalLink": - return False - elif linkClass == "HardLink": - if parentGroup[linkName] == targetGroup: - return True - else: - self.log.warning("unexpected linkclass: " + linkClass) - return False - - """ - Delete Dataset, Group or Datatype by UUID - """ - - def deleteObjectByUuid(self, objtype, obj_uuid): - if objtype not in ("group", "dataset", "datatype"): - msg = "unexpected objtype: " + objtype - self.log.error(msg) - raise IOError(errno.EIO, msg) - self.initFile() - self.log.info("delete uuid: " + obj_uuid) - if self.readonly: - msg = "Unable to delete object (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - if obj_uuid == self.dbGrp.attrs["rootUUID"] and objtype == "group": - # can't delete root group - msg = "Unable to delete group (root group may not be deleted)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - dbCol = None - tgt = None - if objtype == "dataset": - tgt = self.getDatasetObjByUuid(obj_uuid) - dbCol = self.dbGrp["{datasets}"] - elif objtype == "group": - tgt = self.getGroupObjByUuid(obj_uuid) - dbCol = self.dbGrp["{groups}"] - else: # datatype - tgt = self.getCommittedTypeObjByUuid(obj_uuid) - dbCol = self.dbGrp["{datatypes}"] - - if tgt is None: - msg = "Unable to delete " + objtype + ", uuid: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - # unlink from root (if present) - self.unlinkObject(self.f["/"], tgt) - - groups = self.dbGrp["{groups}"] - # iterate through each group in the file and unlink tgt if it is linked - # by the group. 
- # We'll store a list of links to be removed as we go, and then actually - # remove the links after the iteration is done (otherwise we can run into issues - # where the key has become invalid) - linkList = [] # this is our list - for uuidName in groups.attrs: - grpRef = groups.attrs[uuidName] - # de-reference handle - grp = self.f[grpRef] - for linkName in grp: - if self.isObjectHardLinked(grp, tgt, linkName): - linkList.append({"group": grp, "link": linkName}) - for item in linkList: - self.unlinkObjectItem(item["group"], tgt, item["link"]) - - addr = h5py.h5o.get_info(tgt.id).addr - addrGrp = self.dbGrp["{addr}"] - del addrGrp.attrs[str(addr)] # remove reverse map - dbRemoved = False - - # finally, remove the dataset from db - if obj_uuid in dbCol: - # should be here (now it is anonymous) - del dbCol[obj_uuid] - dbRemoved = True - - if not dbRemoved: - self.log.warning("did not find: " + obj_uuid + " in anonymous collection") - - if obj_uuid in dbCol.attrs: - self.log.info( - "removing: " + obj_uuid + " from non-anonymous collection" - ) - del dbCol.attrs[obj_uuid] - dbRemoved = True - - if not dbRemoved: - msg = "Unexpected Error, did not find reference to: " + obj_uuid - self.log.error(msg) - raise IOError(errno.EIO, msg) - - # note when the object was deleted - self.setModifiedTime(obj_uuid) - - return True - - def getGroupItemByUuid(self, obj_uuid): - self.initFile() - grp = self.getGroupObjByUuid(obj_uuid) - if grp is None: - if self.getModifiedTime(obj_uuid, useRoot=False): - msg = "Group with uuid: " + obj_uuid + " has been previously deleted" - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - else: - msg = "Group with uuid: " + obj_uuid + " was not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - linkCount = len(grp) - if "__db__" in grp: - linkCount -= 1 # don't include the db group - - item = {"id": obj_uuid} - alias = [] - if grp.name and not grp.name.startswith("/__db__"): - alias.append(grp.name) # just use the default h5py 
path for now - item["alias"] = alias - item["attributeCount"] = len(grp.attrs) - item["linkCount"] = linkCount - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - - return item - - """ - getLinkItemByObj - return info about a link - parent: reference to group - linkName: name of link - return: item dictionary with link attributes, or None if not found - """ - - def getLinkItemByObj(self, parent, link_name): - if link_name not in parent: - return None - - if link_name == "__db__": - return None # don't provide link to db group - # "http://somefile/#h5path(somepath)") - item = {"title": link_name} - # get the link object, one of HardLink, SoftLink, or ExternalLink - try: - linkObj = parent.get(link_name, None, False, True) - linkClass = linkObj.__class__.__name__ - except TypeError: - # UDLink? set class as 'user' - linkClass = "UDLink" # user defined links - item["class"] = "H5L_TYPE_USER_DEFINED" - if linkClass == "SoftLink": - item["class"] = "H5L_TYPE_SOFT" - item["h5path"] = linkObj.path - item["href"] = "#h5path(" + linkObj.path + ")" - elif linkClass == "ExternalLink": - item["class"] = "H5L_TYPE_EXTERNAL" - item["h5path"] = linkObj.path - item["file"] = linkObj.filename - item["href"] = "#h5path(" + linkObj.path + ")" - elif linkClass == "HardLink": - # Hardlink doesn't have any properties itself, just get the linked - # object - obj = parent[link_name] - addr = h5py.h5o.get_info(obj.id).addr - item["class"] = "H5L_TYPE_HARD" - item["id"] = self.getUUIDByAddress(addr) - class_name = obj.__class__.__name__ - if class_name == "Dataset": - item["href"] = "datasets/" + item["id"] - item["collection"] = "datasets" - elif class_name == "Group": - item["href"] = "groups/" + item["id"] - item["collection"] = "groups" - elif class_name == "Datatype": - item["href"] = "datatypes/" + item["id"] - item["collection"] = "datatypes" - else: - self.log.warning("unexpected object type: " + 
item["type"]) - - return item - - def getLinkItemByUuid(self, grpUuid, link_name): - self.log.info("db.getLinkItemByUuid(" + grpUuid + ", [" + link_name + "])") - if not link_name: - msg = "link_name not specified" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - self.initFile() - parent = self.getGroupObjByUuid(grpUuid) - if parent is None: - msg = "Parent group: " + grpUuid + " of link not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - item = self.getLinkItemByObj(parent, link_name) - # add timestamps - if item: - if self.update_timestamps: - item["ctime"] = self.getCreateTime( - grpUuid, objType="link", name=link_name - ) - item["mtime"] = self.getModifiedTime( - grpUuid, objType="link", name=link_name - ) - else: - self.log.info("link not found") - mtime = self.getModifiedTime( - grpUuid, objType="link", name=link_name, useRoot=False - ) - if mtime: - msg = ( - "Link [" - + link_name - + "] of: " - + grpUuid - + " has been previously deleted" - ) - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - else: - msg = "Link [" + link_name + "] of: " + grpUuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - return item - - def getLinkItems(self, grpUuid, marker=None, limit=0): - self.log.info("db.getLinkItems(" + grpUuid + ")") - if marker: - self.log.info("...marker: " + marker) - if limit: - self.log.info("...limit: " + str(limit)) - - self.initFile() - parent = self.getGroupObjByUuid(grpUuid) - if parent is None: - msg = "Parent group: " + grpUuid + " not found, no links returned" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - items = [] - gotMarker = True - if marker is not None: - gotMarker = False - count = 0 - for link_name in parent: - if link_name == "__db__": - continue - if not gotMarker: - if link_name == marker: - gotMarker = True - continue # start filling in result on next pass - else: - continue # keep going! 
- item = self.getLinkItemByObj(parent, link_name) - items.append(item) - - count += 1 - if limit > 0 and count == limit: - break # return what we got - return items - - def unlinkItem(self, grpUuid, link_name): - if self.readonly: - msg = "Unable to unlink item (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - grp = self.getGroupObjByUuid(grpUuid) - if grp is None: - msg = "Parent group: " + grpUuid + " not found, cannot remove link" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if link_name not in grp: - msg = ( - "Link: [" - + link_name - + "] of group: " - + grpUuid - + " not found, cannot remove link" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if link_name == "__db__": - # don't allow db group to be unlinked! - msg = "Unlinking of __db__ group not allowed" - raise IOError(errno.EPERM, msg) - - obj = None - try: - linkObj = grp.get(link_name, None, False, True) - linkClass = linkObj.__class__.__name__ - if linkClass == "HardLink": - # we can safely reference the object - obj = grp[link_name] - except TypeError: - # UDLink? 
Return false to indicate that we can not delete this - msg = "Unable to unlink user defined link" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - linkDeleted = False - if obj is not None: - linkDeleted = self.unlinkObjectItem(grp, obj, link_name) - else: - # SoftLink or External Link - we can just remove the key - del grp[link_name] - linkDeleted = True - - if linkDeleted: - # update timestamp - self.setModifiedTime(grpUuid, objType="link", name=link_name) - - return linkDeleted - - def getCollection(self, col_type, marker=None, limit=None): - self.log.info("db.getCollection(" + col_type + ")") - # col_type should be either "datasets", "groups", or "datatypes" - if col_type not in ("datasets", "groups", "datatypes"): - msg = "Unexpected col_type: [" + col_type + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - self.initFile() - col = None # Group, Dataset, or Datatype - if col_type == "datasets": - col = self.dbGrp["{datasets}"] - elif col_type == "groups": - col = self.dbGrp["{groups}"] - else: # col_type == "datatypes" - col = self.dbGrp["{datatypes}"] - - uuids = [] - count = 0 - # gather the non-anonymous ids first - for obj_uuid in col.attrs: - if marker: - if obj_uuid == marker: - marker = None # clear and pick up next item - continue - uuids.append(obj_uuid) - count += 1 - if limit is not None and limit > 0 and count == limit: - break - - if limit == 0 or (limit is not None and count < limit): - # grab any anonymous obj ids next - for obj_uuid in col: - if marker: - if obj_uuid == marker: - marker = None # clear and pick up next item - continue - uuids.append(obj_uuid) - count += 1 - if limit is not None and limit > 0 and count == limit: - break - - return uuids - - """ - Get the DB Collection names - """ - - def getDBCollections(self): - return ("{groups}", "{datasets}", "{datatypes}") - - """ - Return the db collection the uuid belongs to - """ - - def getDBCollection(self, obj_uuid): - dbCollections = self.getDBCollections() - for 
dbCollectionName in dbCollections: - col = self.dbGrp[dbCollectionName] - if obj_uuid in col or obj_uuid in col.attrs: - return col - return None - - def unlinkObjectItem(self, parentGrp, tgtObj, link_name): - if self.readonly: - msg = "Unexpected attempt to unlink object" - self.log.error(msg) - raise IOError(errno.EIO, msg) - if link_name not in parentGrp: - msg = "Unexpected: did not find link_name: [" + link_name + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - try: - linkObj = parentGrp.get(link_name, None, False, True) - except TypeError: - # user defined link? - msg = "Unable to remove link (user-defined link?)" - self.log.error(msg) - raise IOError(errno.EIO, msg) - linkClass = linkObj.__class__.__name__ - # only deal with HardLinks - linkDeleted = False - if linkClass == "HardLink": - obj = parentGrp[link_name] - if tgtObj is None or obj == tgtObj: - numlinks = self.getNumLinksToObject(obj) - if numlinks == 1: - # last link to this object - convert to anonymous object by - # creating link under {datasets} or {groups} or {datatypes} - # also remove the attribute UUID key - addr = h5py.h5o.get_info(obj.id).addr - obj_uuid = self.getUUIDByAddress(addr) - self.log.info("converting: " + obj_uuid + " to anonymous obj") - dbCol = self.getDBCollection(obj_uuid) - del dbCol.attrs[obj_uuid] # remove the object ref - dbCol[obj_uuid] = obj # add a hardlink - self.log.info( - "deleting link: [" + link_name + "] from: " + parentGrp.name - ) - del parentGrp[link_name] - linkDeleted = True - else: - self.log.info("unlinkObjectItem: link is not a hardlink, ignoring") - return linkDeleted - - def unlinkObject(self, parentGrp, tgtObj): - for name in parentGrp: - self.unlinkObjectItem(parentGrp, tgtObj, name) - return True - - def linkObject(self, parentUUID, childUUID, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - parentObj = 
self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - childObj = self.getDatasetObjByUuid(childUUID) - if childObj is None: - # maybe it's a group... - childObj = self.getGroupObjByUuid(childUUID) - if childObj is None: - # or maybe it's a committed datatype... - childObj = self.getCommittedTypeObjByUuid(childUUID) - if childObj is None: - msg = "Unable to link item, child UUID: " + childUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - self.unlinkObjectItem(parentObj, None, link_name) - parentObj[link_name] = childObj - - # convert this from an anonymous object to ref if needed - dbCol = self.getDBCollection(childUUID) - if childUUID in dbCol: - # convert to a ref - del dbCol[childUUID] # remove hardlink - dbCol.attrs[childUUID] = childObj.ref # create a ref - - # set link timestamps - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - return True - - def createSoftLink(self, parentUUID, linkPath, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - parentObj = self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - del parentObj[link_name] # delete old link - parentObj[link_name] = h5py.SoftLink(linkPath) - - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - 
self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - - return True - - def createExternalLink(self, parentUUID, extPath, linkPath, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - parentObj = self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - del parentObj[link_name] # delete old link - parentObj[link_name] = h5py.ExternalLink(extPath, linkPath) - - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - - return True - - def createGroup(self, obj_uuid=None): - self.initFile() - if self.readonly: - msg = "Unable to create group (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - groups = self.dbGrp["{groups}"] - if not obj_uuid: - obj_uuid = createObjId() - newGroup = groups.create_group(obj_uuid) - # store reverse map as an attribute - addr = h5py.h5o.get_info(newGroup.id).addr - addrGrp = self.dbGrp["{addr}"] - addrGrp.attrs[str(addr)] = obj_uuid - - # set timestamps - now = time.time() - self.setCreateTime(obj_uuid, timestamp=now) - self.setModifiedTime(obj_uuid, timestamp=now) - - return obj_uuid - - def getNumberOfGroups(self): - self.initFile() - count = 0 - groups = self.dbGrp["{groups}"] - count += len(groups) # anonymous groups - count += len(groups.attrs) # linked groups - count += 1 # add of for root group - - return count - - def getNumberOfDatasets(self): - self.initFile() - count = 0 - datasets = self.dbGrp["{datasets}"] - count += len(datasets) # anonymous datasets - count += len(datasets.attrs) # linked 
datasets - return count - - def getNumberOfDatatypes(self): - self.initFile() - count = 0 - datatypes = self.dbGrp["{datatypes}"] - count += len(datatypes) # anonymous datatypes - count += len(datatypes.attrs) # linked datatypes - return count + kwds["fillvalue"] = fillvalue + if cpl: + kwds["cpl"] = cpl + dset_json = make_new_dset(shape=shape, dtype=dtype, **kwds) + + dset_id = createObjId("datasets", root_id=self._root_id) + self._db[dset_id] = dset_json + return dset_id + + + def resizeDataset(self, dset_id, shape): + """ + Resize existing Dataset + """ + self.log.info(f"resizeDataset {dset_id}, {shape}") + + dset_json = self.getObjectById(dset_id) # will throw exception if not found + resize_dataset(dset_json, shape) + + + def deleteObject(self, obj_id): + """ Delete the given object """ + self.log.info(f"deleteObject: {obj_id}") + if obj_id not in self._db: + raise KeyError(f"Object {obj_id} not found for deletion") + if obj_id == self._root_id: + raise KeyError("Root group cannot be deleted") + del self._db[obj_id] + # TBD: add to pending deleted items + + def getLinks(self, grp_id): + """ Get the links for the given group """ + grp_json = self.getObjectById(grp_id) + if "links" not in grp_json: + raise KeyError(f"No links - {grp_id} not a group?") + return grp_json["links"] + + def getLink(self, grp_id, name): + """ Get the given link """ + + links = self.getLinks(grp_id) + if name not in links: + raise KeyError(f"Link [{name}] not found in {grp_id}") + return links[name] + + def createHardLink(self, grp_id, name, tgt_id): + """ Create a new hardlink """ + links = self.getLinks(grp_id) + if name in links: + self.deleteLink(grp_id, name) + link_json = {"class": "H5L_TYPE_HARD", "id": tgt_id} + link_json["created"] = time.time() + links[name] = link_json + + def createSoftLink(self, grp_id, name, h5path): + """ Create a soft link """ + links = self.getLinks(grp_id) + if name in links: + self.deleteLink(grp_id, name) + link_json = {"class": "H5L_TYPE_SOFT", 
"h5path": h5path} + link_json["created"] = time.time() + links[name] = link_json + + def createCustomLink(self, grp_id, name, link_json): + """ create a custom link """ + links = self.getLinks(grp_id) + if name in links: + self.deleteLink(grp_id, name) + if link_json.get("class") != "H5L_TYPE_USER_DEFINED": + link_json["class"] = "H5L_TYPE_USER_DEFINED" + link_json["created"] = time.time() + links[name] = link_json + + + def createExternalLink(self, grp_id, name, h5path, filepath): + """ Create an external link """ + links = self.getLinks(grp_id) + if name in links: + self.deleteLink(grp_id, name) + link_json = {"class": "H5L_TYPE_EXTERNAL", "h5path": h5path, "file": filepath} + link_json["created"] = time.time() + links[name] = link_json + + def deleteLink(self, grp_id, name): + """ Delete the given link """ + grp_json = self.getObjectById(grp_id) + if "links" not in grp_json: + raise KeyError(f"No links - {grp_id} not a group?") + links = self.getLinks(grp_id) + if name not in links: + raise KeyError(f"Link [{name}] not found in {grp_id}") + del links[name] + grp_json["modified"] = time.time() + + + def createGroup(self, cpl=None): + """ Create a new group """ + + grp_id = createObjId("groups", root_id=self._root_id) + group_json = {"attributes": {}, "links": {}} + if cpl: + group_json["cpl"] = cpl + else: + group_json["cpl"] = {} + group_json["created"] = time.time() + group_json["modified"] = None + self._db[grp_id] = group_json + return grp_id + + + def __len__(self): + # return the number of objects + return len(self._db) + + + def __iter__(self): + """ Iterate over object ids """ + + for obj_id in self._db: + yield obj_id + + + def __contains__(self, obj_id): + """ Test if a obj id exists """ + return obj_id in self._db diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index 9c565ce0..be1ffd62 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -14,6 +14,9 @@ import numpy as np +numpy_integer_types = (np.int8, np.uint8,
np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64) +numpy_float_types = (np.float16, np.float32, np.float64) + class Reference: """ Represents an HDF5 object reference @@ -148,6 +151,58 @@ def special_dtype(**kwds): raise TypeError(f'Unknown special type "{name}"') + +def find_item_type(data): + """Find the item type of a simple object or collection of objects. + + E.g. [[['a']]] -> str + + The focus is on collections where all items have the same type; we'll return + None if that's not the case. + + The aim is to treat numpy arrays of Python objects like normal Python + collections, while treating arrays with specific dtypes differently. + We're also only interested in array-like collections - lists and tuples, + possibly nested - not things like sets or dicts. + """ + if isinstance(data, np.ndarray): + if ( + data.dtype.kind == 'O' and not check_dtype(vlen=data.dtype) + ): + item_types = {type(e) for e in data.flat} + else: + return None + elif isinstance(data, (list, tuple)): + item_types = {find_item_type(e) for e in data} + else: + return type(data) + + if len(item_types) != 1: + return None + return item_types.pop() + +def guess_dtype(data): + """ Attempt to guess an appropriate dtype for the object, returning None + if nothing is appropriate (or if it should be left up to the array + constructor to figure out) + """ + + # todo - handle RegionReference, Reference + item_type = find_item_type(data) + if item_type is bytes: + return special_dtype(vlen=bytes) + if item_type is str: + return special_dtype(vlen=str) + + return None + +def is_float16_dtype(dt): + if dt is None: + return False + + dt = np.dtype(dt) # normalize strings -> np.dtype objects + return dt.kind == 'f' and dt.itemsize == 2 + def check_dtype(**kwds): """Check a dtype for h5py special type "hint" information. Only one keyword may be given.
@@ -222,7 +277,7 @@ def getTypeResponse(typeItem): for k in typeItem.keys(): if k == "base": if isinstance(typeItem[k], dict): - response[k] = getTypeResponse(typeItem[k]) # recurse call + response[k] = getTypeResponse(typeItem[k]) # recursive call else: response[k] = typeItem[k] # predefined type elif k not in ("size", "base_size"): @@ -251,6 +306,9 @@ def getTypeItem(dt, metadata=None): "float32": "H5T_IEEE_F32", "float64": "H5T_IEEE_F64", } + + dt = np.dtype(dt) # convert 'int32', np.int32, etc. to a dtype + if not metadata and dt.metadata: metadata = dt.metadata @@ -421,6 +479,23 @@ def getTypeItem(dt, metadata=None): return type_info +def isVlen(dt): + """ + Return True if the type contains variable length elements + """ + is_vlen = False + if len(dt) > 0: + names = dt.names + for name in names: + if isVlen(dt[name]): + is_vlen = True + break + else: + if dt.metadata and "vlen" in dt.metadata: + is_vlen = True + return is_vlen + + def getItemSize(typeItem): """ Get size of an item in bytes. diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 598790e0..8c62a752 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -84,6 +84,8 @@ def isSchema2Id(id): """return true if this is a v2 id""" # v1 ids are in the standard UUID format: 8-4-4-4-12 # v2 ids are in the non-standard: 8-8-4-6-6 + if not isValidUuid(id): + return False parts = id.split("-") if len(parts) != 6: raise ValueError(f"Unexpected id formation for uuid: {id}") diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py new file mode 100644 index 00000000..d37c7f5f --- /dev/null +++ b/test/unit/array_util_test.py @@ -0,0 +1,1021 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities.
The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import json +import numpy as np + +import base64 + +from h5json.array_util import bytesArrayToList +from h5json.array_util import toTuple +from h5json.array_util import getNumElements +from h5json.array_util import jsonToArray +from h5json.array_util import arrayToBytes +from h5json.array_util import bytesToArray +from h5json.array_util import getByteArraySize +from h5json.array_util import IndexIterator +from h5json.array_util import ndarray_compare +from h5json.array_util import getNumpyValue +from h5json.array_util import getBroadcastShape + +from h5json.hdf5dtype import special_dtype +from h5json.hdf5dtype import check_dtype +from h5json.hdf5dtype import createDataType + + +class ArrayUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(ArrayUtilTest, self).__init__(*args, **kwargs) + # main + + def testByteArrayToList(self): + data_items = ( + 42, + "foo", + b"foo", + [1, 2, 3], + (1, 2, 3), + ["A", "B", "C"], + [b"A", b"B", b"C"], + [["A", "B"], [b"a", b"b", b"c"]], + ) + for data in data_items: + json_data = bytesArrayToList(data) + # will throw TypeError if not able to convert + json.dumps(json_data) + + def testToTuple(self): + data0d = 42 # scalar + data1d1 = [1] # one dimensional, one element list + data1d = [1, 2, 3, 4, 5] # list + data2d1 = [ + [1, 2], + ] # two dimensional, one element + data2d = [[1, 0.1], [2, 0.2], [3, 0.3], [4, 0.4]] # list of two-element lists + data3d = [[[0, 0.0], [1, 0.1]], [[2, 0.2], [3, 0.3]]] # list of list of lists + out = toTuple(0, data0d) + self.assertEqual(data0d, out) + out = toTuple(1, data1d1) + 
self.assertEqual(data1d1, out) + out = toTuple(1, data1d) + self.assertEqual(data1d, out) + out = toTuple(2, data2d) + self.assertEqual(data2d, out) + out = toTuple(1, data2d1) + self.assertEqual([(1, 2)], out) + out = toTuple(3, data3d) + self.assertEqual(data3d, out) + out = toTuple(1, data2d) # treat input as 1d array of two-field compound types + self.assertEqual([(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4)], out) + out = toTuple(2, data3d) # treat input as 2d array of two-field compound types + self.assertEqual([[(0, 0.0), (1, 0.1)], [(2, 0.2), (3, 0.3)]], out) + out = toTuple(1, data3d) # treat input a 1d array of compound type of compound types + self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out) + + def testGetNumElements(self): + shape = (4,) + nelements = getNumElements(shape) + self.assertEqual(nelements, 4) + + shape = [10,] + nelements = getNumElements(shape) + self.assertEqual(nelements, 10) + + shape = (10, 8) + nelements = getNumElements(shape) + self.assertEqual(nelements, 80) + + def testJsonToArray(self): + dt = np.dtype("i4") + shape = [4, ] + data = [0, 2, 4, 6] + out = jsonToArray(shape, dt, data) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (4,)) + for i in range(4): + self.assertEqual(out[i], i * 2) + + # compound type + dt = np.dtype([("a", "i4"), ("b", "S5")]) + shape = [2, ] + data = [[4, "four"], [5, "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + self.assertEqual(out.shape, (2,)) + self.assertTrue(isinstance(out[0], np.void)) + e0 = out[0].tolist() + self.assertEqual(e0, (4, b"four")) + self.assertTrue(isinstance(out[1], np.void)) + e1 = out[1].tolist() + self.assertEqual(e1, (5, b"five")) + + shape = [1, ] + data = [ + [6, "six"], + ] + out = jsonToArray(shape, dt, data) + e0 = out[0].tolist() + self.assertEqual(e0, (6, b"six")) + + data = [6, "six"] + out = jsonToArray(shape, dt, data) + e0 = out[0].tolist() + self.assertEqual(e0, (6, 
b"six")) + + # test ascii chars >127 + dt = np.dtype("S26") + data = "extended ascii char 241: " + chr(241) + out = jsonToArray(shape, dt, data) + self.assertEqual(out[0], b'extended ascii char 241: \xc3') + + dt = np.dtype("S12") + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertEqual(out[0], b'eight: \xe5\x85\xab') + + # VLEN ascii + dt = special_dtype(vlen=bytes) + data = [b"one", b"two", b"three", b"four", b"five"] + shape = [5, ] + out = jsonToArray(shape, dt, data) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], bytes) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out.shape, (5,)) + # TBD: code does not actually enforce use of bytes vs. str, + # probably not worth the effort to fix + self.assertEqual(out[2], b"three") + self.assertEqual(out[3], b"four") + + # VLEN str + dt = special_dtype(vlen=str) + data = [ + [b"part 1 - section A", b"part 1 - section B"], + [b"part 2 - section A", b"part 2 - section B"], + ] + shape = [2,] + out = jsonToArray(shape, dt, data) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], str) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out.shape, (2,)) + self.assertEqual(out[0], tuple(data[0])) + self.assertEqual(out[1], tuple(data[1])) + + # VLEN Scalar str + dt = special_dtype(vlen=str) + data = "I'm a string!" 
+ shape = [1, ] + out = jsonToArray(shape, dt, data) + + # VLEN unicode + dt = special_dtype(vlen=bytes) + data = ["one", "two", "three", "four", "five"] + shape = [5, ] + out = jsonToArray(shape, dt, data) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], bytes) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out[2], b"three") + + # VLEN data + dt = special_dtype(vlen=np.dtype("int32")) + shape = [4, ] + data = [ + [1,], + [1, 2], + [1, 2, 3], + [1, 2, 3, 4], + ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + + self.assertEqual(out.shape, (4,)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + for i in range(4): + e = out[i] # .tolist() + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, tuple(range(1, i + 2))) + + # VLEN 2D data + dt = special_dtype(vlen=np.dtype("int32")) + shape = [2, 2] + data = [ + [ + [0,], + [1, 2], + ], + [ + [1,], + [2, 3], + ], + ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + + self.assertEqual(out.shape, (2, 2)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + for i in range(2): + for j in range(2): + e = out[i, j] # .tolist() + self.assertTrue(isinstance(e, tuple)) + + # create VLEN of obj ref's + ref_type = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} + vlen_type = {"class": "H5T_VLEN", "base": ref_type} + dt = createDataType(vlen_type) # np datatype + + id0 = b"g-a4f455b2-c8cf-11e7-8b73-0242ac110009" + id1 = b"g-a50af844-c8cf-11e7-8b73-0242ac110009" + id2 = b"g-a5236276-c8cf-11e7-8b73-0242ac110009" + + data = [ + [id0, ], + [id0, id1], + [id0, id1, id2], + ] + shape = [3, ] + out = jsonToArray(shape, dt, data) + 
self.assertTrue(isinstance(out, np.ndarray)) + base_type = check_dtype(vlen=out.dtype) + self.assertEqual(base_type.kind, "S") + self.assertEqual(base_type.itemsize, 48) + + self.assertEqual(out.shape, (3,)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("S48")) + + e = out[0] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0,)) + e = out[1] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0, id1)) + e = out[2] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0, id1, id2)) + + # compound type with array field + dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) + shape = [2, ] + data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + self.assertEqual(out.shape, (2,)) + self.assertTrue(isinstance(out[0], np.void)) + e0 = out[0] + self.assertEqual(len(e0), 2) + e0a = e0[0] + self.assertTrue(isinstance(e0a, np.ndarray)) + self.assertEqual(e0a[0], 4) + self.assertEqual(e0a[1], 8) + self.assertEqual(e0a[2], 12) + e0b = e0[1] + self.assertEqual(e0b, b"four") + self.assertTrue(isinstance(out[1], np.void)) + e1 = out[1] + self.assertEqual(len(e1), 2) + e1a = e1[0] + self.assertTrue(isinstance(e1a, np.ndarray)) + self.assertEqual(e1a[0], 5) + self.assertEqual(e1a[1], 10) + self.assertEqual(e1a[2], 15) + e1b = e1[1] + self.assertEqual(e1b, b"five") + + def testToBytes(self): + # Simple array + dt = np.dtype(" expected_num_bytes) + + # convert buffer back to arr + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(np.array_equal(arr, arr_copy)) + + # fixed length string + dt = np.dtype("S8") + arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (3,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # Compound non-vlen + dt 
= np.dtype([("x", "f8"), ("y", "i4")]) + arr = np.zeros((4,), dtype=dt) + arr[0] = (3.12, 42) + arr[3] = (1.28, 69) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # VLEN of int32's + dt = np.dtype("O", metadata={"vlen": np.dtype("int32")}) + arr = np.zeros((4,), dtype=dt) + arr[0] = np.int32([1, ]) + arr[1] = np.int32([1, 2]) + arr[2] = 0 # test un-intialized value + arr[3] = np.int32([1, 2, 3]) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # VLEN of strings + dt = np.dtype("O", metadata={"vlen": str}) + arr = np.zeros((5,), dtype=dt) + arr[0] = "one: \u4e00" + arr[1] = "two: \u4e8c" + arr[2] = "three: \u4e09" + arr[3] = "four: \u56db" + arr[4] = 0 + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + # VLEN of bytes + dt = np.dtype("O", metadata={"vlen": bytes}) + arr = np.zeros((5,), dtype=dt) + arr[0] = b"Parting" + arr[1] = b"is such" + arr[2] = b"sweet" + arr[3] = b"sorrow" + arr[4] = 0 + + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (5,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # + # Compound str vlen + # + dt_vstr = np.dtype("O", metadata={"vlen": str}) + dt = np.dtype([("x", "i4"), ("tag", dt_vstr), ("code", "S4")]) + arr = np.zeros((4,), dtype=dt) + arr[0] = (42, "Hello", "X1") + arr[3] = (84, "Bye", "XYZ") + count = getByteArraySize(arr) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # + # 
Compound int vlen + # + dt_vint = np.dtype("O", metadata={"vlen": "int32"}) + dt = np.dtype([("x", "int32"), ("tag", dt_vint)]) + arr = np.zeros((4,), dtype=dt) + arr[0] = (42, np.array((), dtype="int32")) + arr[3] = (84, np.array((1, 2, 3), dtype="int32")) + count = getByteArraySize(arr) + self.assertEqual(count, 44) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # + # VLEN utf string with array type + # + dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) + dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) + arr = np.zeros((4,), dtype=dt) + dt_str = np.dtype("O", metadata={"vlen": str}) + arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) + arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + + self.assertEqual(arr.dtype, arr_copy.dtype) + self.assertEqual(arr.shape, arr_copy.shape) + for i in range(4): + e = arr[i] + e_copy = arr_copy[i] + self.assertTrue(np.array_equal(e, e_copy)) + # + # VLEN ascii with array type + # + dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) + dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) + arr = np.zeros((4,), dtype=dt) + dt_str = np.dtype("O", metadata={"vlen": bytes}) + arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) + arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + def testArrayCompareInt(self): + # Simple array + dt = np.dtype(" time.time() - 1.0) + + db.createSoftLink(g2_id, "slink", "somewhere") + soft_link = db.getLink(g2_id, "slink") + self.assertEqual(soft_link["class"], "H5L_TYPE_SOFT") + 
self.assertEqual(soft_link["h5path"], "somewhere") + self.assertTrue(soft_link["created"] > time.time() - 1.0) + + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + ext_link = db.getLink(g2_id, "extlink") + self.assertEqual(ext_link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(ext_link["h5path"], "somewhere") + self.assertEqual(ext_link["file"], "someplace") + self.assertTrue(ext_link["created"] > time.time() - 1.0) + + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + cust_link = db.getLink(g2_id, "cust") + self.assertEqual(cust_link["class"], "H5L_TYPE_USER_DEFINED") + self.assertEqual(cust_link["foo"], "bar") + self.assertTrue(cust_link["created"] > time.time() - 1.0) + + links = db.getLinks(g2_id) + self.assertEqual(len(links), 3) + for title in "slink", "extlink", "cust": + self.assertTrue(title in links) + + db.deleteLink(g2_id, "cust") + links = db.getLinks(g2_id) + self.assertEqual(len(links), 2) + for title in "slink", "extlink": + self.assertTrue(title in links) - def testGetLinkItemsBatch(self): - # get test file - filepath = getFile("group100.h5", "getlinkitemsbatch.h5") - marker = None - count = 0 - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - while True: - # get items 13 at a time - batch = db.getLinkItems(rootUuid, marker=marker, limit=13) - if len(batch) == 0: - break # done! 
- count += len(batch) - lastItem = batch[len(batch) - 1] - marker = lastItem["title"] - self.assertEqual(count, 100) - - def testGetItemHardLink(self): - filepath = getFile("tall.h5", "getitemhardlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.1") - item = db.getLinkItemByUuid(grpUuid, "dset1.1.1") - self.assertTrue("id" in item) - self.assertEqual(item["title"], "dset1.1.1") - self.assertEqual(item["class"], "H5L_TYPE_HARD") - self.assertEqual(item["collection"], "datasets") - self.assertTrue("target" not in item) - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemSoftLink(self): - filepath = getFile("tall.h5", "getitemsoftlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2/g1.2.1") - item = db.getLinkItemByUuid(grpUuid, "slink") - self.assertTrue("id" not in item) - self.assertEqual(item["title"], "slink") - self.assertEqual(item["class"], "H5L_TYPE_SOFT") - self.assertEqual(item["h5path"], "somevalue") - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemExternalLink(self): - filepath = getFile("tall_with_udlink.h5", "getitemexternallink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2") - item = db.getLinkItemByUuid(grpUuid, "extlink") - self.assertTrue("uuid" not in item) - self.assertEqual(item["title"], "extlink") - self.assertEqual(item["class"], "H5L_TYPE_EXTERNAL") - self.assertEqual(item["h5path"], "somepath") - self.assertEqual(item["file"], "somefile") - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemUDLink(self): - filepath = getFile("tall_with_udlink.h5", "getitemudlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g2") - item = db.getLinkItemByUuid(grpUuid, "udlink") - self.assertTrue("uuid" not in item) - self.assertEqual(item["title"], "udlink") - 
self.assertEqual(item["class"], "H5L_TYPE_USER_DEFINED") - self.assertTrue("h5path" not in item) - self.assertTrue("file" not in item) - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetNumLinks(self): - filepath = getFile("tall.h5", "getnumlinks.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - g1 = db.getObjByPath("/g1") - numLinks = db.getNumLinksToObject(g1) - self.assertEqual(numLinks, 1) - - def testGetLinks(self): - g12_links = ("extlink", "g1.2.1") - hardLink = None - externalLink = None - filepath = getFile("tall_with_udlink.h5", "getlinks.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2") - items = db.getLinkItems(grpUuid) - self.assertEqual(len(items), 2) - for item in items: - self.assertTrue(item["title"] in g12_links) - if item["class"] == "H5L_TYPE_HARD": - hardLink = item - elif item["class"] == "H5L_TYPE_EXTERNAL": - externalLink = item - self.assertEqual(hardLink["collection"], "groups") - self.assertTrue("id" in hardLink) - self.assertTrue("id" not in externalLink) - self.assertEqual(externalLink["h5path"], "somepath") - self.assertEqual(externalLink["file"], "somefile") - - def testDeleteLink(self): - # get test file - filepath = getFile("tall.h5", "deletelink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 2) - db.unlinkItem(rootUuid, "g2") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 1) - - def testDeleteUDLink(self): - # get test file - filepath = getFile("tall_with_udlink.h5", "deleteudlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - g2Uuid = db.getUUIDByPath("/g2") - numG2Children = len(db.getLinkItems(g2Uuid)) - self.assertEqual(numG2Children, 3) - got_exception = False try: - db.unlinkItem(g2Uuid, "udlink") - except IOError as ioe: - got_exception = True - 
self.assertEqual(ioe.errno, errno.EPERM) - self.assertTrue(got_exception) - numG2Children = len(db.getLinkItems(g2Uuid)) - self.assertEqual(numG2Children, 3) - - def testReadOnlyGetUUID(self): - # get test file - filepath = getFile("tall.h5", "readonlygetuuid.h5", ro=True) - # remove db file! - removeFile("./out/." + "readonlygetuuid.h5") - g1Uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - g1Uuid = db.getUUIDByPath("/g1") - self.assertEqual(len(g1Uuid), UUID_LEN) - obj = db.getObjByPath("/g1") - self.assertEqual(obj.name, "/g1") - - # end of with will close file - # open again and verify we can get obj by name - with Hdf5db(filepath, app_logger=self.log) as db: - obj = db.getGroupObjByUuid(g1Uuid) - g1 = db.getObjByPath("/g1") - self.assertEqual(obj, g1) - g1links = db.getLinkItems(g1Uuid) - self.assertEqual(len(g1links), 2) - for item in g1links: - self.assertEqual(len(item["id"]), UUID_LEN) - - def testReadDataset(self): - filepath = getFile("tall.h5", "readdataset.h5") - d111_values = None - d112_values = None - with Hdf5db(filepath, app_logger=self.log) as db: - d111Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - self.assertEqual(len(d111Uuid), UUID_LEN) - d111_values = db.getDatasetValuesByUuid(d111Uuid) - self.assertTrue(type(d111_values) is list) - self.assertEqual(len(d111_values), 10) - for i in range(10): - arr = d111_values[i] - self.assertEqual(len(arr), 10) - for j in range(10): - self.assertEqual(arr[j], i * j) - - d112Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.2") - self.assertEqual(len(d112Uuid), UUID_LEN) - d112_values = db.getDatasetValuesByUuid(d112Uuid) - self.assertTrue(type(d112_values) is list) - self.assertEqual(len(d112_values), 20) - for i in range(20): - self.assertEqual(d112_values[i], i) - - def testReadDatasetBinary(self): - filepath = getFile("tall.h5", "readdatasetbinary.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - d111Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - self.assertEqual(len(d111Uuid), 
UUID_LEN) - d111_data = db.getDatasetValuesByUuid(d111Uuid, format="binary") - self.assertTrue(type(d111_data) is bytes) - self.assertEqual(len(d111_data), 400) # 10x10x(4 byte type) - - d112Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.2") - self.assertEqual(len(d112Uuid), UUID_LEN) - d112_data = db.getDatasetValuesByUuid(d112Uuid, format="binary") - self.assertEqual(len(d112_data), 80) # 20x(4 byte type) - - def testReadCompoundDataset(self): - filepath = getFile("compound.h5", "readcompound.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dset_uuid), UUID_LEN) - dset_values = db.getDatasetValuesByUuid(dset_uuid) - - self.assertEqual(len(dset_values), 72) - elem = dset_values[0] - self.assertEqual(elem[0], 24) - self.assertEqual(elem[1], "13:53") - self.assertEqual(elem[2], 63) - self.assertEqual(elem[3], 29.88) - self.assertEqual(elem[4], "SE 10") - - def testReadDatasetCreationProp(self): - filepath = getFile("compound.h5", "readdatasetcreationprop.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dset_uuid), UUID_LEN) - dset_item = db.getDatasetItemByUuid(dset_uuid) - self.assertTrue("creationProperties" in dset_item) - creationProp = dset_item["creationProperties"] - self.assertTrue("fillValue" in creationProp) - fillValue = creationProp["fillValue"] - - self.assertEqual(fillValue[0], 999) - self.assertEqual(fillValue[1], "99:90") - self.assertEqual(fillValue[2], 999) - self.assertEqual(fillValue[3], 999.0) - self.assertEqual(fillValue[4], "N") - - def testCreateScalarDataset(self): - creation_props = { - "allocTime": "H5D_ALLOC_TIME_LATE", - "fillTime": "H5D_FILL_TIME_IFSET", - "fillValue": "", - "layout": {"class": "H5D_CONTIGUOUS"}, - } - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 1, - "strPad": "H5T_STR_NULLPAD", - } - filepath = getFile("empty.h5", "createscalardataset.h5") - with 
Hdf5db(filepath, app_logger=self.log) as db: - dims = () # if no space in body, default to scalar - max_shape = None - - db.createDataset( - datatype, dims, max_shape=max_shape, creation_props=creation_props - ) - - def testCreate1dDataset(self): - datatype = "H5T_STD_I64LE" - dims = (10,) - filepath = getFile("empty.h5", "create1ddataset.h5") - dset_uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - rsp = db.createDataset(datatype, dims) - - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - self.assertEqual(item["attributeCount"], 0) - type_item = item["type"] - self.assertEqual(type_item["class"], "H5T_INTEGER") - self.assertEqual(type_item["base"], "H5T_STD_I64LE") - shape_item = item["shape"] - self.assertEqual(shape_item["class"], "H5S_SIMPLE") - self.assertEqual(shape_item["dims"], (10,)) - - def testCreate2dExtendableDataset(self): - datatype = "H5T_STD_I64LE" - dims = (10, 10) - max_shape = (None, 10) - filepath = getFile("empty.h5", "create2dextendabledataset.h5") - dset_uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - rsp = db.createDataset(datatype, dims, max_shape=max_shape) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - self.assertEqual(item["attributeCount"], 0) - type_item = item["type"] - self.assertEqual(type_item["class"], "H5T_INTEGER") - self.assertEqual(type_item["base"], "H5T_STD_I64LE") - shape_item = item["shape"] - self.assertEqual(shape_item["class"], "H5S_SIMPLE") - self.assertEqual(shape_item["dims"], (10, 10)) - self.assertTrue("maxdims" in shape_item) - self.assertEqual(shape_item["maxdims"], [0, 10]) - - def testCreateCommittedTypeDataset(self): - filepath = getFile("empty.h5", "createcommittedtypedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - } - 
item = db.createCommittedType(datatype) - type_uuid = item["id"] - - dims = () # if no space in body, default to scalar - rsp = db.createDataset(type_uuid, dims, max_shape=None, creation_props=None) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - type_item = item["type"] - self.assertTrue("uuid" in type_item) - self.assertEqual(type_item["uuid"], type_uuid) - - def testCreateCommittedCompoundTypeDataset(self): - filepath = getFile("empty.h5", "createcommittedcompoundtypedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - - datatype = {"class": "H5T_COMPOUND", "fields": []} - - type_fields = [] - type_fields.append({"name": "field_1", "type": "H5T_STD_I64BE"}) - type_fields.append({"name": "field_2", "type": "H5T_IEEE_F64BE"}) - - datatype["fields"] = type_fields - - creation_props = {"fillValue": [0, 0.0]} - - item = db.createCommittedType(datatype) - type_uuid = item["id"] - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - type_uuid, dims, max_shape=None, creation_props=creation_props - ) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - type_item = item["type"] - self.assertTrue("uuid" in type_item) - self.assertEqual(type_item["uuid"], type_uuid) + db.getObjectIdByPath("/g1/foo") + self.assertTrue(False) + except KeyError: + pass # expected - def testReadZeroDimDataset(self): - filepath = getFile("zerodim.h5", "readzerodeimdataset.h5") - - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dsetUuid), UUID_LEN) - dset_value = db.getDatasetValuesByUuid(dsetUuid) - self.assertEqual(dset_value, 42) - - def testReadNullSpaceDataset(self): - filepath = getFile("null_space_dset.h5", "readnullspacedataset.h5") - - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/DS1") - self.assertEqual(len(dsetUuid), UUID_LEN) - 
obj = db.getDatasetObjByUuid(dsetUuid) - shape_item = db.getShapeItemByDsetObj(obj) - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_NULL") - - def testReadScalarSpaceArrayDataset(self): - filepath = getFile("scalar_array_dset.h5", "readscalarspacearraydataset.h5") + try: + db.getLink(g2_id, "not_a_link") + self.assertTrue(False) + except KeyError: + pass # expected - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/DS1") - self.assertEqual(len(dsetUuid), UUID_LEN) - obj = db.getDatasetObjByUuid(dsetUuid) - shape_item = db.getShapeItemByDsetObj(obj) - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_SCALAR") - def testReadNullSpaceAttribute(self): - filepath = getFile("null_space_attr.h5", "readnullspaceattr.h5") + def testNullSpaceAttribute(self): - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - self.assertEqual(len(rootUuid), UUID_LEN) - item = db.getAttributeItem("groups", rootUuid, "attr1") + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") self.assertTrue("shape" in item) shape_item = item["shape"] self.assertTrue("class" in shape_item) self.assertEqual(shape_item["class"], "H5S_NULL") - - def testReadAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("tall.h5", "readattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - self.assertEqual(len(rootUuid), UUID_LEN) - item = db.getAttributeItem("groups", rootUuid, "attr1") - self.assertTrue(item is not None) - - def testWriteScalarAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writescalarattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") + 
self.assertTrue(item["created"] > time.time() - 1.0) + self.assertEqual(item["modified"], None) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + + def testScalarAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") dims = () - datatype = "H5T_STD_I32LE" value = 42 - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned self.assertEqual(item["value"], 42) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) shape = item["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") - self.assertEqual( - len(item_type.keys()), 2 - ) # just two keys should be returned + - def testWriteFixedStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writefixedstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLPAD", - "length": 13, - } + def testFixedStringAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = 
db.getObjectIdByPath("/") value = "Hello, world!" - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") + db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] - self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["class"], "H5T_STRING") self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - - def testWriteFixedNullTermStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writefixednulltermstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 13, - } + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + ret_value = db.getAttributeValue(root_id, "A1") + + + def testVlenAsciiAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + value = b"Hello, world!" 
+ dt = special_dtype(vlen=bytes) # write the attribute - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) + db.createAttribute(root_id, "A1", value, dtype=dt) # read it back - item = db.getAttributeItem("groups", root_uuid, "A1") - - self.assertEqual(item["name"], "A1") - # the following compare fails - see issue #34 - # self.assertEqual(item['value'], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] - self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["class"], "H5T_STRING") - # NULLTERM get's converted to NULLPAD since the numpy dtype does not - # support other padding conventions. - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - - def testWriteVlenStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writevlenstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - - # value = np.string_("Hello, world!") - value = "Hello, world!" 
- db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) - def testReadVlenStringDataset(self): - item = None - filepath = getFile("vlen_string_dset.h5", "vlen_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - # actual padding is SPACEPAD - See issue #32 - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - self.assertEqual(row, ["Parting"]) + def testVlenUtf8Attribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" 
+ dt = special_dtype(vlen=str) - def testReadVlenStringDataset_utc(self): - item = None - filepath = getFile("vlen_string_dset_utc.h5", "vlen_string_dset_utc.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/ds1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 2293) + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] self.assertEqual(item_type["class"], "H5T_STRING") self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") self.assertEqual(item_type["length"], "H5T_VARIABLE") - # next line throws conversion error - see issue #19 - # row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - - def testReadFixedStringDataset(self): - item = None - filepath = getFile("fixed_string_dset.h5", "fixed_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 7) - row = db.getDatasetValuesByUuid(dset_uuid) - self.assertEqual(row, ["Parting", "is such", "sweet", "sorrow."]) - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - self.assertEqual( - row, - [ - "Parting", - ], - ) - row = db.getDatasetValuesByUuid(dset_uuid, (slice(2, 3),)) - 
self.assertEqual( - row, - [ - "sweet", - ], - ) - - def testReadFixedStringDatasetBinary(self): - item = None - filepath = getFile("fixed_string_dset.h5", "fixed_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 7) - row = db.getDatasetValuesByUuid(dset_uuid, format="binary") - self.assertEqual(row, b"Partingis suchsweet\x00\x00sorrow.") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),), format="binary") - self.assertEqual(row, b"Parting") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(2, 3),), format="binary") - self.assertEqual(row, b"sweet\x00\x00") - - def testWriteVlenUnicodeAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writevlenunicodeattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_UTF8", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - value = "\u6b22\u8fce\u63d0\u4ea4\u5fae\u535a\u641c\u7d22\u4f7f\u7528\u53cd\u9988\uff0c\u8bf7\u76f4\u63a5" - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - - self.assertEqual(item["name"], "A1") - self.assertEqual(item["value"], value) - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - 
self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + + - def testWriteIntAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writeintattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = (5,) - datatype = "H5T_STD_I16LE" + def testIntAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") value = [2, 3, 5, 7, 11] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") self.assertEqual(item["value"], [2, 3, 5, 7, 11]) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I16LE") def testCreateReferenceAttribute(self): - filepath = getFile("empty.h5", "createreferencedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - 
db.linkObject(root_uuid, dset_uuid, "DS1") - - dims = (1,) - datatype = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} - ds1_ref = "datasets/" + dset_uuid - value = [ - ds1_ref, - ] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - attr_type = item["type"] + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + + dt = special_dtype(ref=Reference) + + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + + attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") attr_value = item["value"] @@ -849,149 +280,49 @@ def testCreateReferenceAttribute(self): self.assertEqual(attr_value[0], ds1_ref) def testCreateVlenReferenceAttribute(self): - filepath = getFile("empty.h5", "createreferenceattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - db.linkObject(root_uuid, dset_uuid, "DS1") - - dims = (1,) - datatype = { - "class": "H5T_VLEN", - "base": {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"}, - } - ds1_ref = "datasets/" + dset_uuid - value = [ - [ - ds1_ref, - ], - ] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - - attr_type = item["type"] - self.assertEqual(attr_type["class"], "H5T_VLEN") - base_type = attr_type["base"] - # todo - this should be H5T_REFERENCE, not H5T_OPAQUE - # See h5py issue: 
https://github.com/h5py/h5py/issues/553 - import h5py - - # test based on h5py version until we change install requirements - if h5py.version.version_tuple >= (2, 6, 0): - self.assertEqual(base_type["class"], "H5T_REFERENCE") - else: - self.assertEqual(base_type["class"], "H5T_OPAQUE") - - def testCreateReferenceListAttribute(self): - filepath = getFile("empty.h5", "createreferencelistattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = (10,) - - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - db.linkObject(root_uuid, dset_uuid, "dset") - - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - xscale_uuid = rsp["id"] - nullterm_string_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 16, - "strPad": "H5T_STR_NULLTERM", - } - scalar_dims = () - db.createAttribute( - "datasets", - xscale_uuid, - "CLASS", - scalar_dims, - nullterm_string_type, - "DIMENSION_SCALE", - ) - db.linkObject(root_uuid, xscale_uuid, "xscale") - - ref_dims = (1,) - datatype = { - "class": "H5T_VLEN", - "base": {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"}, - } - xscale_ref = "datasets/" + xscale_uuid - value = [ - (xscale_ref,), - ] - db.createAttribute( - "datasets", dset_uuid, "DIMENSION_LIST", ref_dims, datatype, value - ) - item = db.getAttributeItem("datasets", dset_uuid, "DIMENSION_LIST") - - attr_type = item["type"] - self.assertEqual(attr_type["class"], "H5T_VLEN") - base_type = attr_type["base"] - # todo - this should be H5T_REFERENCE, not H5T_OPAQUE - self.assertEqual(base_type["class"], "H5T_REFERENCE") + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = 
special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") - def testReadCommittedType(self): - filepath = getFile("committed_type.h5", "readcommitted_type.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - type_uuid = db.getUUIDByPath("/Sensor_Type") - item = db.getCommittedTypeItemByUuid(type_uuid) - self.assertTrue("type" in item) item_type = item["type"] - self.assertTrue(item_type["class"], "H5T_COMPOUND") - ds1_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(ds1_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertTrue("class" in item_type) - self.assertEqual(item_type["class"], "H5T_COMPOUND") - self.assertTrue("uuid" in item_type) - self.assertEqual(item_type["uuid"], type_uuid) - - item = db.getAttributeItem("groups", root_uuid, "attr1") - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertTrue("class" in item_type) - self.assertEqual(item_type["class"], "H5T_COMPOUND") - self.assertTrue("uuid" in item_type) - self.assertEqual(item_type["uuid"], type_uuid) - - def testWriteCommittedType(self): - filepath = getFile("empty.h5", "writecommittedtype.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - } - item = db.createCommittedType(datatype) - type_uuid = item["id"] - item = 
db.getCommittedTypeItemByUuid(type_uuid) - self.assertEqual(item["id"], type_uuid) - self.assertEqual(item["attributeCount"], 0) + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + + def testCommittedType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - self.assertEqual(len(item["alias"]), 0) # anonymous, so no alias + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) item_type = item["type"] @@ -1000,318 +331,56 @@ def testWriteCommittedType(self): self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") self.assertEqual(item_type["length"], 15) - def testWriteCommittedCompoundType(self): - filepath = getFile("empty.h5", "writecommittedcompoundtype.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") - datatype = {"class": "H5T_COMPOUND", "fields": []} - fixed_str_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - 
} + def testCommittedCompoundType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") - var_str_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": "H5T_VARIABLE", - "strPad": "H5T_STR_NULLTERM", - } - type_fields = [] - type_fields.append({"name": "field_1", "type": "H5T_STD_I64BE"}) - type_fields.append({"name": "field_2", "type": "H5T_IEEE_F64BE"}) - type_fields.append({"name": "field_3", "type": fixed_str_type}) - type_fields.append({"name": "field_4", "type": var_str_type}) - datatype["fields"] = type_fields + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) - item = db.createCommittedType(datatype) - type_uuid = item["id"] - item = db.getCommittedTypeItemByUuid(type_uuid) - self.assertEqual(item["id"], type_uuid) - self.assertEqual(item["attributeCount"], 0) + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - self.assertEqual(len(item["alias"]), 0) # anonymous, so no alias + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) item_type = item["type"] self.assertEqual(item_type["class"], "H5T_COMPOUND") fields = item_type["fields"] self.assertEqual(len(fields), 4) - # todo - the last field class should be H5T_STRING, but it is getting - # saved to HDF5 as Opaque - see: https://github.com/h5py/h5py/issues/613 - # this is fixed in h5py v. 2.6.0 - check the version until 2.6.0 becomes - # available via pip and anaconda. 
- import h5py - - if h5py.version.version_tuple >= (2, 6, 0): - field_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_STRING") - else: - field_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_OPAQUE") - for i in range(4): - field = fields[i] - self.assertEqual(field["name"], "field_" + str(i + 1)) - field_type = field["type"] - self.assertEqual(field_type["class"], field_classes[i]) - - def testToRef(self): - - filepath = getFile("empty.h5", "toref.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - type_item = { - "order": "H5T_ORDER_LE", - "base_size": 1, - "class": "H5T_INTEGER", - "base": "H5T_STD_I8LE", - "size": 1, - } - data_list = [2, 3, 5, 7, 11] - ref_value = db.toRef(1, type_item, data_list) - self.assertEqual(ref_value, data_list) - - type_item = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 8, - "strPad": "H5T_STR_NULLPAD", - } - data_list = ["Hypertext", "as", "engine", "of", "state"] - ref_value = db.toRef(1, type_item, data_list) - - def testToTuple(self): - filepath = getFile("empty.h5", "totuple.h5") - data1d = [1, 2, 3] - data2d = [[1, 2], [3, 4]] - data3d = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] - with Hdf5db(filepath, app_logger=self.log) as db: - self.assertEqual(db.toTuple(1, data1d), [1, 2, 3]) - self.assertEqual(db.toTuple(2, data2d), [[1, 2], [3, 4]]) - self.assertEqual(db.toTuple(1, data2d), [(1, 2), (3, 4)]) - self.assertEqual( - db.toTuple(3, data3d), [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] - ) - self.assertEqual( - db.toTuple(2, data3d), [[(1, 2), (3, 4)], [(5, 6), (7, 8)]] - ) - self.assertEqual( - db.toTuple(1, data3d), [((1, 2), (3, 4)), ((5, 6), (7, 8))] - ) - - def testBytesArrayToList(self): - filepath = getFile("empty.h5", "bytestostring.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - val = db.bytesArrayToList(b"Hello") - self.assertTrue(type(val) is str) - val = db.bytesArrayToList( - [ - b"Hello", - ] - ) - self.assertEqual(len(val), 1) - self.assertTrue(type(val[0]) is 
str) - self.assertEqual(val[0], "Hello") - - import numpy as np - - data = np.array([b"Hello"]) - val = db.bytesArrayToList(data) - self.assertEqual(len(val), 1) - self.assertTrue(type(val[0]) is str) - self.assertEqual(val[0], "Hello") - - def testGetDataValue(self): - # typeItem, value, dimension=0, dims=None): - filepath = getFile("empty.h5", "bytestostring.h5") - string_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - - with Hdf5db(filepath, app_logger=self.log) as db: - - import numpy as np - - data = np.array([b"Hello"]) - val = db.getDataValue(string_type, data, dimension=1, dims=(1,)) - self.assertTrue(type(val[0]) is str) - - def testGetAclDataset(self): - filepath = getFile("tall.h5", "getacldataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - acl_dset = db.getAclDataset(d111_uuid, create=True) - self.assertTrue(acl_dset.name.endswith(d111_uuid)) - self.assertEqual(len(acl_dset.dtype), 7) - self.assertEqual(len(acl_dset.shape), 1) - self.assertEqual(acl_dset.shape[0], 0) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - def testSetAcl(self): - filepath = getFile("tall.h5", "setacl.h5") - user1 = 123 - user2 = 456 - with Hdf5db(filepath, app_logger=self.log) as db: - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - # add read/write acl for user1 - acl_user1 = db.getAcl(d111_uuid, user1) - - self.assertEqual(acl_user1["userid"], 0) - acl_user1["userid"] = user1 - acl_user1["readACL"] = 0 - acl_user1["updateACL"] = 0 - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - db.setAcl(d111_uuid, acl_user1) - acl = db.getAcl(d111_uuid, user1) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 1) - - # add read-only 
acl for user2 - acl_user2 = db.getAcl(d111_uuid, user2) - self.assertEqual(acl_user2["userid"], 0) - acl_user2["userid"] = user2 - acl_user2["create"] = 0 - acl_user2["read"] = 1 - acl_user2["update"] = 0 - acl_user2["delete"] = 0 - acl_user2["readACL"] = 0 - acl_user2["updateACL"] = 0 - db.setAcl(d111_uuid, acl_user2) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 2) - - # fetch and verify acls - acl = db.getAcl(d111_uuid, user1) - self.assertEqual(acl["userid"], user1) - self.assertEqual(acl["create"], 1) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 1) - self.assertEqual(acl["delete"], 1) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - acl = db.getAcl(d111_uuid, user2) - self.assertEqual(acl["userid"], user2) - self.assertEqual(acl["create"], 0) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 0) - self.assertEqual(acl["delete"], 0) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 2) - - # get acl data_list - acls = db.getAcls(d111_uuid) - self.assertEqual(len(acls), 2) - - def testRootAcl(self): - filepath = getFile("tall.h5", "rootacl.h5") - user1 = 123 - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - # add read/write acl for user1 at root - acl_root = db.getAcl(root_uuid, 0) - self.assertEqual(acl_root["userid"], 0) - acl_root["create"] = 0 - acl_root["read"] = 1 - acl_root["update"] = 0 - acl_root["delete"] = 0 - acl_root["readACL"] = 0 - acl_root["updateACL"] = 0 - num_acls = db.getNumAcls(root_uuid) - self.assertEqual(num_acls, 0) - - db.setAcl(root_uuid, acl_root) - num_acls = db.getNumAcls(root_uuid) - self.assertEqual(num_acls, 1) - - acl = db.getAcl(d111_uuid, user1) - num_acls = 
db.getNumAcls(d111_uuid) # this will fetch the root acl - self.assertEqual(num_acls, 0) - self.assertEqual(acl["userid"], 0) - self.assertEqual(acl["create"], 0) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 0) - self.assertEqual(acl["delete"], 0) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - def testGetEvalStr(self): - queries = { - "date == 23": "rows['date'] == 23", - "wind == b'W 5'": "rows['wind'] == b'W 5'", - "temp > 61": "rows['temp'] > 61", - "(date >=22) & (date <= 24)": "(rows['date'] >=22) & (rows['date'] <= 24)", - "(date == 21) & (temp > 70)": "(rows['date'] == 21) & (rows['temp'] > 70)", - "(wind == b'E 7') | (wind == b'S 7')": "(rows['wind'] == b'E 7') | (rows['wind'] == b'S 7')", - } - - fields = ["date", "wind", "temp"] - filepath = getFile("empty.h5", "getevalstring.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - for query in queries.keys(): - eval_str = db._getEvalStr(query, fields) - self.assertEqual(eval_str, queries[query]) - - def testBadQuery(self): - queries = ( - "foobar", # no variable used - "wind = b'abc", # non-closed literal - "(wind = b'N') & (temp = 32", # missing paren - "foobar > 42", # invalid field name - "import subprocess; subprocess.call(['ls', '/'])", - ) # injection attack - - fields = ("date", "wind", "temp") - filepath = getFile("empty.h5", "badquery.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - for query in queries: - try: - eval_str = db._getEvalStr(query, fields) - self.log.error(f"got eval_str: {eval_str}") - self.assertTrue(False) # shouldn't get here - except IOError: - pass # ok - - def testInjectionBlock(self): - queries = ( - "import subprocess; subprocess.call(['ls', '/'])", - ) # injection attack - - fields = ("import", "subprocess", "call") - filepath = getFile("empty.h5", "injectionblock.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - for query in queries: - try: - eval_str = db._getEvalStr(query, fields) - 
self.log.error(f"got eval_str: {eval_str}") - self.assertTrue(False) # shouldn't get here - except IOError: - pass # ok + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + if __name__ == "__main__": # setup test files From 2f546b999e1f1e18491e69cea1327f47c8d645f3 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 18 Feb 2025 10:14:18 -0800 Subject: [PATCH 008/129] first pass at h5py reader --- src/h5json/h5py_reader.py | 273 ++++++++++++++++++++++++++++++++++ src/h5json/h5reader.py | 61 ++++++++ src/h5json/hdf5db.py | 49 +++--- test/unit/h5py_reader_test.py | 126 ++++++++++++++++ test/unit/hdf5db_test.py | 4 - 5 files changed, 488 insertions(+), 25 deletions(-) create mode 100644 src/h5json/h5py_reader.py create mode 100644 src/h5json/h5reader.py create mode 100644 test/unit/h5py_reader_test.py diff --git a/src/h5json/h5py_reader.py b/src/h5json/h5py_reader.py new file mode 100644 index 00000000..fc9bb07b --- /dev/null +++ b/src/h5json/h5py_reader.py @@ -0,0 +1,273 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. 
If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import logging + +import h5py +import numpy as np + +from .objid import createObjId +from .hdf5dtype import getTypeItem +from .array_util import bytesArrayToList +from .h5reader import H5Reader + + +class H5pyReader(H5Reader): + """ + This class can be used by HDF5DB to read content from an HDF5 file (using h5py) + """ + + def visit(self, path, obj): + name = obj.__class__.__name__ + self.log.info(f"visit: {path} name: {name}") + + obj_id = createObjId(obj_type=name, root_id=self._root_id) # create uuid + + self._id_map[obj_id] = obj + + addr = h5py.h5o.get_info(obj.id).addr + self._addr_map[addr] = obj_id + + + def __init__( + self, + filepath, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + self._id_map = {} + self._addr_map = {} + self._filepath = filepath + f = h5py.File(self._filepath) + self._f = f + self._root_id = createObjId(obj_type="groups") + self._id_map[self._root_id] = f + addr = h5py.h5o.get_info(f.id).addr + self._addr_map[addr] = self._root_id + f.visititems(self.visit) + + def close(self): + if self._f: + self._f.close() + self._f = None + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getAttribute(self, obj_id, name, include_data=True): + """ Return JSON for the given attribute """ + + obj = self._id_map[obj_id] + + if name not in obj.attrs: + msg = f"Attribute: [{name}] not found in object: {obj.name}" + self.log.info(msg) + return None + + # get the attribute! 
+ attrObj = h5py.h5a.open(obj.id, np.bytes_(name)) + + item = {} + + # check if the dataset is using a committed type + typeid = attrObj.get_type() + type_item = None + if h5py.h5t.TypeID.committed(typeid): + type_uuid = None + addr = h5py.h5o.get_info(typeid).addr + type_uuid = self.getObjIdByAddress(addr) + committedType = self.getCommittedTypeItemByUuid(type_uuid) + type_item = committedType["type"].copy() + type_item["id"] = type_uuid + else: + type_item = getTypeItem(attrObj.dtype) + item["type"] = type_item + + shape_item = {} + if attrObj.shape is None or attrObj.get_storage_size() == 0: + # If storage size is 0, assume this is a null space obj + # See: h5py issue https://github.com/h5py/h5py/issues/279 + shape_item["class"] = "H5S_NULL" + else: + if attrObj.shape: + shape_item["class"] = "H5S_SIMPLE" + shape_item["dims"] = attrObj.shape + else: + shape_item["class"] = "H5S_SCALAR" + + item["shape"] = shape_item + if shape_item["class"] == "H5S_NULL": + include_data = False + elif isinstance(type_item, dict) and type_item["class"] in ("H5T_OPAQUE"): + # TBD - don't include data for OPAQUE until JSON serialization + # issues are addressed + include_data = False + else: + pass # use include_data parameter + + if include_data: + try: + data = obj.attrs[name] + except TypeError: + self.log.warning("type error reading attribute") + + if include_data and data is not None: + item["value"] = bytesArrayToList(data) + + # timestamps will be added by getAttributeItem() + return item + + def getAttributes(self, obj_id, include_data=True): + h5obj = self._id_map[obj_id] + self.log.info(f"getAttributes: {obj_id} include_data={include_data}") + items = {} # with python 3.7+, this will maintain the attribute order we got from h5py + attrs = h5obj.attrs + for name in attrs: + item = self.getAttribute(obj_id, name, include_data=include_data) + items[name] = item + + return items + + def _getLink(self, parent, link_name): + if link_name not in parent: + return None + + item = 
{"title": link_name} + # get the link object, one of HardLink, SoftLink, or ExternalLink + try: + linkObj = parent.get(link_name, None, False, True) + linkClass = linkObj.__class__.__name__ + except TypeError: + # UDLink? set class as 'user' + linkClass = "UDLink" # user defined links + item["class"] = "H5L_TYPE_USER_DEFINED" + if linkClass == "SoftLink": + item["class"] = "H5L_TYPE_SOFT" + item["h5path"] = linkObj.path + elif linkClass == "ExternalLink": + item["class"] = "H5L_TYPE_EXTERNAL" + item["h5path"] = linkObj.path + item["file"] = linkObj.filename + elif linkClass == "HardLink": + # Hardlink doesn't have any properties itself, just get the linked + # object + obj = parent[link_name] + addr = h5py.h5o.get_info(obj.id).addr + item["class"] = "H5L_TYPE_HARD" + if addr not in self._addr_map: + self.log.error(f"expected to find addr for link {link_name} in addr_map") + item["id"] = None + else: + item["id"] = self._addr_map[addr] + + return item + + def _getLinks(self, grp): + items = {} # with python 3.7+, this will maintain the link order we got from h5py + for link_name in grp: + item = self._getLink(grp, link_name) + items[link_name] = item + return items + + def _getGroup(self, grp, include_links=True): + self.log.info("_getGroup alias: [{grp.name}]") + + item = {"alias": grp.name} + + if include_links: + links = self._getLinks(grp) + item["links"] = links + return item + + def _getDatatype(self, ctype, include_attrs=True): + self.log.info("getDatatype alias: ]{ctype.name}") + item = {"alias": ctype.name} + item["type"] = getTypeItem(ctype.dtype) + + return item + + + def _getDataset(self, dset): + self.log.info("getDataset alias: [{dset.name}]") + + item = {"alias": dset.name} + + typeid = dset.id.get_type() + if h5py.h5t.TypeID.committed(typeid): + type_uuid = None + addr = h5py.h5o.get_info(typeid).addr + type_uuid = self.getObjIdByAddress(addr) + committedType = self.getObjectByid(type_uuid) + typeItem = committedType["type"] + typeItem["id"] = 
type_uuid + else: + typeItem = getTypeItem(dset.dtype) + item["type"] = typeItem + + shapeItem = {} + if dset.shape is None: + # new with h5py 2.6, null space datasets will return None for shape + shapeItem["class"] = "H5S_NULL" + elif len(dset.shape) == 0: + shapeItem["class"] = "H5S_SCALAR" + else: + shapeItem["class"] = "H5S_SIMPLE" + shapeItem["dims"] = list(dset.shape) + maxshape = [] + include_maxdims = False + for i in range(len(dset.shape)): + extent = 0 + if len(dset.maxshape) > i: + extent = dset.maxshape[i] + if extent is None: + extent = 0 + if extent > dset.shape[i] or extent == 0: + include_maxdims = True + maxshape.append(extent) + if include_maxdims: + shapeItem["maxdims"] = maxshape + item["shape"] = shapeItem + + return item + + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + if obj_id not in self._id_map: + raise KeyError(f"{obj_id} not found") + h5obj = self._id_map[obj_id] + if isinstance(h5obj, h5py.Group): + obj_json = self._getGroup(h5obj, include_links=include_links) + elif isinstance(h5obj, h5py.Dataset): + obj_json = self._getDataset(h5obj) + elif isinstance(h5obj, h5py.Datatype): + obj_json = self._getDataType(h5obj) + else: + raise TypeError(f"unexpected object type: {type(h5obj)}") + + if include_attrs: + attributes = self.getAttributes(obj_id) + obj_json["attributes"] = attributes + + return obj_json + + + def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + pass + diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py new file mode 100644 index 00000000..6f504105 --- /dev/null +++ b/src/h5json/h5reader.py @@ -0,0 +1,61 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. 
# +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +from abc import ABC, abstractmethod + + + + +class H5Reader(ABC): + """ + This abstract class defines properties and methods that the Hdf5db class uses for reading from an HDF5 + compatible storage medium. + """ + + + def __init__( + self, + filepath + ): + self._filepath = filepath + + @abstractmethod + def get_root_id(self): + """ Return root id """ + pass + + @abstractmethod + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + pass + + @abstractmethod + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + pass + + @abstractmethod + def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. 
+ """ + pass + + @abstractmethod + def close(self): + """ close any open handles to the storage """ + pass + diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index e7ea8d9c..2dfd9374 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -17,11 +17,12 @@ from .dset_util import make_new_dset, resize_dataset from .objid import createObjId, getCollectionForId from .apiversion import _apiver +from .h5reader import H5Reader class Hdf5db: """ - This class is used to manage UUID lookup tables for primary HDF objects (Groups, Datasets, + This class is used to manage id lookup tables for primary HDF objects (Groups, Datasets, and Datatypes). By default all data is held in-memory. Initialize with h5_reader to read from an HDF5 compatible storage pool, and or, h5_writer to write to an HDF5 compatible storage pool. """ @@ -34,7 +35,7 @@ def getVersionInfo(): def __init__( self, - h5_reader = None, + h5_reader: H5Reader = None, h5_writer = None, app_logger = None, ): @@ -49,18 +50,28 @@ def __init__( self._writer = h5_writer if self._reader: - root_id = self._reader.get_objid("/") - kwargs = {"include_attrs": True, "include_links": True} - group_json = self._reader.get_obj(root_id, **kwargs) + root_id = self._reader.get_root_id() + group_json = self._reader.getObjectById(root_id) else: + root_id = createObjId(obj_type="groups") # create a root group group_json = {"links": {}, "attributes": {}, "cpl": {}} group_json["created"] = time.time() - root_id = createObjId(obj_type="groups") - self._db[root_id] = group_json + self._db[root_id] = group_json self._root_id = root_id + def close(self): + """ close reader and writer handles """ + self.log.info("Hdf5db __close") + if self._writer: + self._writer.flush() + self._writer.close() + if self._reader: + self._reader.close() + self._root_id = None + self._db = {} + def __enter__(self): """ called on package init """ self.log.info("Hdf5db __enter") @@ -69,18 +80,15 @@ def __enter__(self): def __exit__(self, type, 
value, traceback): """ called on package exit """ self.log.info("Hdf5db __exit") - if self._writer: - self._writer.flush() - self._writer.close() + self.close() def getObjectById(self, obj_id): - """ return objecct with given id """ + """ return object with given id """ if obj_id not in self._db: if self._reader: # load the obj from the reader - kwargs = {"include_attrs": True, "include_links": True} - obj_json = self._reader.get_obj(obj_id, **kwargs) + obj_json = self._reader.getObjectById(obj_id) self._db[obj_id] = obj_json else: raise KeyError(f"obj_id: {obj_id} not found") @@ -152,7 +160,7 @@ def getObjectIdByPath(self, h5path, parent_id=None): def getObjectByPath(self, path): """ Get Object JSON at given path """ - obj_id = self.getObjectDByPath(path) + obj_id = self.getObjectIDByPath(path) obj_json = self.getObjectById(obj_id) return obj_json @@ -166,7 +174,6 @@ def getDtype(self, obj_id): raise TypeError(f"{obj_id} does not have a datatype") type_json = obj_json["type"] - # TBD: what about datasets using a committed type? dtype = createDataType(type_json) return dtype @@ -253,7 +260,8 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): if ctype_id not in self._db: raise KeyError(f"ctype: {ctype_id} not found") ctype_json = self.getObjectById(ctype_id) - type_json = ctype_json["type"] + type_json = ctype_json["type"].copy() + type_json["id"] = ctype_id dtype = createDataType(type_json) # First, make sure we have a NumPy array. 
@@ -352,11 +360,6 @@ def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): #TBD - """ - createDataset - creates new dataset given shape and datatype - Returns obj_id - """ - def createDataset( self, shape=None, @@ -369,6 +372,10 @@ def createDataset( fillvalue=None, cpl=None, ): + """ + createDataset - creates new dataset given shape and datatype + Returns obj_id + """ kwds = {} if chunks: diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py new file mode 100644 index 00000000..420909ca --- /dev/null +++ b/test/unit/h5py_reader_test.py @@ -0,0 +1,126 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import os +import time +import errno +import os.path as op +import stat +import logging +import shutil +from h5json import Hdf5db +from h5json.h5py_reader import H5pyReader + + +def getFile(name, tgt, ro=False): + src = "data/hdf5/" + name + logging.info("copying file to this directory: " + src) + + filepath = "./out/" + tgt + + if op.isfile(filepath): + # make sure it's writable, before we copy over it + os.chmod(filepath, stat.S_IWRITE | stat.S_IREAD) + shutil.copyfile(src, filepath) + if ro: + logging.info("make read-only") + os.chmod(filepath, stat.S_IREAD) + return filepath + + +def removeFile(name): + try: + os.stat(name) + except OSError: + return + # file does not exist + os.remove(name) + + +class H5pyReaderTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5pyReaderTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.INFO) + handler = logging.FileHandler("./hdf5dbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = getFile("tall.h5", "tall.h5", ro=True) + kwargs = {"app_logger": self.log} + with Hdf5db(h5_reader=H5pyReader(filepath, **kwargs), **kwargs) as db: + root_id = db.getObjectIdByPath("/") + root_json = db.getObjectById(root_id) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + 
self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10,10]) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + + + + + +if __name__ == "__main__": + # setup test files + + unittest.main() + + + + diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index bee33014..2c2812dc 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -18,10 +18,6 @@ from h5json.hdf5dtype import special_dtype, Reference -UUID_LEN = 36 # length for uuid strings - - - class Hdf5dbTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(Hdf5dbTest, self).__init__(*args, **kwargs) From 4b9cb682988cbd55d9bb5ea775718b22e02755d2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 22 Feb 2025 20:03:56 -0800 Subject: [PATCH 009/129] added h5json_writer --- src/h5json/dset_util.py | 2 +- src/h5json/h5json_writer.py | 256 +++++++++++++++++++++++++ src/h5json/h5py_reader.py | 15 
+- src/h5json/h5reader.py | 9 +- src/h5json/h5writer.py | 59 ++++++ src/h5json/hdf5db.py | 36 +++- src/h5json/objid.py | 49 +++-- test/unit/h5json_writer_test.py | 323 ++++++++++++++++++++++++++++++++ test/unit/objid_test.py | 3 +- 9 files changed, 719 insertions(+), 33 deletions(-) create mode 100644 src/h5json/h5json_writer.py create mode 100644 src/h5json/h5writer.py create mode 100644 test/unit/h5json_writer_test.py diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 75854212..c5da3514 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -94,7 +94,7 @@ def make_new_dset( # TBD - other properties - dset_json = {"shape": shape_json, "type": type_json, "cpl": cpl} + dset_json = {"shape": shape_json, "type": type_json, "cpl": cpl, "attributes": {}} dset_json["created"] = time.time() dset_json["modified"] = None diff --git a/src/h5json/h5json_writer.py b/src/h5json/h5json_writer.py new file mode 100644 index 00000000..8add66bb --- /dev/null +++ b/src/h5json/h5json_writer.py @@ -0,0 +1,256 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import json + +from .h5writer import H5Writer +from .objid import stripId, getCollectionForId + +class H5JsonWriter(H5Writer): + """ + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. 
+ """ + + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + super().__init__(filepath, append=append, app_logger=app_logger) + self.alias_db = {} + self.json = {} + self._no_data = no_data + self._root_uuid = None + + def flush(self): + """ Write dirty items """ + # json writer doesn't support incremental updates, so we'll wait + # for close to write out database + self.log.info("flush") + + def close(self): + """ close storage handle """ + self.dumpFile() + + + def _setAlias(self, obj_id, id_set, h5path): + """ add the given h5path to the object's alias list + If the object is a group, recurse through each hard link """ + obj_json = self.db.getObjectById(obj_id) + alias_list = self.alias_db[obj_id] + if h5path in alias_list: + return # nothing to do + alias_list.append(h5path) + if getCollectionForId(obj_id) != "groups": + return # done + id_set.add(obj_id) # keep track of objects we've visited to avoid loops + links = obj_json["links"] + if h5path[-1] != '/': + h5path += '/' + + for link_name in links: + link_json = links[link_name] + if link_json["class"] == "H5L_TYPE_HARD": + tgt_id = link_json["id"] + if tgt_id in id_set: + self.log.info(f"_setAlias - circular loop found") + else: + self._setAlias(tgt_id, id_set, h5path+link_name) + id_set.remove(obj_id) + + def getAliasList(self): + """ update the alias list for each object """ + # clear exiting aliases + obj_ids = self.db.getCollection() + for obj_id in obj_ids: + self.alias_db[obj_id] = [] + + self._setAlias(self._root_uuid, set(), "/") + + + def dumpAttribute(self, obj_id, attr_name): + self.log.info(f"dumpAttribute: [{attr_name}]") + item = self.db.getAttribute(obj_id, attr_name) + response = {"name": attr_name} + response["type"] = item["type"] + response["shape"] = item["shape"] + if True: #not self.options.D: + if "value" not in item: + self.log.warning("no value key in attribute: " + attr_name) + else: + # dump values unless header -D was passed + 
response["value"] = item["value"] + return response + + def dumpAttributes(self, obj_id): + attrs = self.db.getAttributes(obj_id) + self.log.info(f"dumpAttributes: {obj_id}") + items = [] + for attr_name in attrs: + item = self.dumpAttribute(obj_id, attr_name) + items.append(item) + + return items + + def dumpLink(self, obj_id, name): + item = self.db.getLink(obj_id, name) + response = {"class": item["class"]} + if "id" in item: + tgt_id = item["id"] + response["collection"] = getCollectionForId(tgt_id) + response["id"] = stripId(tgt_id) + + for key in item: + if key in ("id", "created", "modified"): + continue + response[key] = item[key] + response["title"] = name + return response + + def dumpLinks(self, obj_id): + links = self.db.getLinks(obj_id) + items = [] + for link_name in links: + item = self.dumpLink(obj_id, link_name) + items.append(item) + return items + + def dumpGroup(self, obj_id): + item = self.db.getObjectById(obj_id) + response = {} + alias = self.alias_db[obj_id] + response["alias"] = alias + + if "cpl" in item: + item["creationProperties"] = item["cpl"] + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + links = self.dumpLinks(obj_id) + if links: + response["links"] = links + return response + + def dumpGroups(self): + groups = {} + item = self.dumpGroup(self._root_uuid) + root_uuid = stripId(self._root_uuid) + groups[root_uuid] = item + obj_ids = self.db.getCollection("groups") + for obj_id in obj_ids: + if obj_id == self._root_uuid: + continue + item = self.dumpGroup(obj_id) + obj_uuid = stripId(obj_id) + groups[obj_uuid] = item + + self.json["groups"] = groups + + def dumpDataset(self, obj_id): + response = {} + self.log.info("dumpDataset: " + obj_id) + item = self.db.getObjectById(obj_id) + if "alias" in item: + alias = item["alias"] + if alias: + self.log.info(f"dumpDataset alias: [{alias[0]}]") + response["alias"] = item["alias"] + + response["type"] = item["type"] + shapeItem = item["shape"] 
+ shape_rsp = {} + num_elements = 1 + shape_rsp["class"] = shapeItem["class"] + if "dims" in shapeItem: + shape_rsp["dims"] = shapeItem["dims"] + for dim in shapeItem["dims"]: + num_elements *= dim + if "maxdims" in shapeItem: + maxdims = [] + for dim in shapeItem["maxdims"]: + if dim == 0: + maxdims.append("H5S_UNLIMITED") + else: + maxdims.append(dim) + shape_rsp["maxdims"] = maxdims + response["shape"] = shape_rsp + + if "cpl" in item: + response["creationProperties"] = item["cpl"] + + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + + if not self._no_data: + if num_elements > 0: + value = self.db.getDatasetValues(obj_id) + response["value"] = value # dump values unless header flag was passed + else: + response["value"] = [] # empty list + return response + + def dumpDatasets(self): + obj_ids = self.db.getCollection("datasets") + if obj_ids: + datasets = {} + for obj_id in obj_ids: + item = self.dumpDataset(obj_id) + datasets[obj_id] = item + + self.json["datasets"] = datasets + + def dumpDatatype(self, obj_id): + response = {} + item = self.db.getObjectById(obj_id) + response["alias"] = item["alias"] + response["type"] = item["type"] + if "cpl" in item: + response["creationProperties"] = item["cpl"] + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + return response + + def dumpDatatypes(self): + obj_ids = self.db.getCollection("datatypes") + if obj_ids: + datatypes = {} + for obj_id in obj_ids: + item = self.dumpDatatype(obj_id) + datatypes[obj_id] = item + + self.json["datatypes"] = datatypes + + + def dumpFile(self): + self._root_uuid = self.db.getObjectIdByPath("/") + + db_version_info = self.db.getVersionInfo() + + self.json["apiVersion"] = db_version_info["hdf5-json-version"] + self.json["root"] = stripId(self._root_uuid) + self.getAliasList() # create alias_db with obj_id to alias list dict + self.dumpGroups() + + self.dumpDatasets() + + self.dumpDatatypes() 
+ + print(json.dumps(self.json, sort_keys=True, indent=4)) + + + diff --git a/src/h5json/h5py_reader.py b/src/h5json/h5py_reader.py index fc9bb07b..238c48e6 100644 --- a/src/h5json/h5py_reader.py +++ b/src/h5json/h5py_reader.py @@ -9,8 +9,6 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -import logging - import h5py import numpy as np @@ -42,13 +40,16 @@ def __init__( filepath, app_logger=None ): + self._id_map = {} + self._addr_map = {} + """ if app_logger: self.log = app_logger else: self.log = logging.getLogger() - self._id_map = {} - self._addr_map = {} self._filepath = filepath + """ + super().__init__(filepath, app_logger=app_logger) f = h5py.File(self._filepath) self._f = f self._root_id = createObjId(obj_type="groups") @@ -182,7 +183,7 @@ def _getLinks(self, grp): return items def _getGroup(self, grp, include_links=True): - self.log.info("_getGroup alias: [{grp.name}]") + self.log.info(f"_getGroup alias: [{grp.name}]") item = {"alias": grp.name} @@ -192,7 +193,7 @@ def _getGroup(self, grp, include_links=True): return item def _getDatatype(self, ctype, include_attrs=True): - self.log.info("getDatatype alias: ]{ctype.name}") + self.log.info(f"getDatatype alias: ]{ctype.name}") item = {"alias": ctype.name} item["type"] = getTypeItem(ctype.dtype) @@ -200,7 +201,7 @@ def _getDatatype(self, ctype, include_attrs=True): def _getDataset(self, dset): - self.log.info("getDataset alias: [{dset.name}]") + self.log.info(f"getDataset alias: [{dset.name}]") item = {"alias": dset.name} diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index 6f504105..6a37a07a 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -11,7 +11,7 @@ ############################################################################## from abc import ABC, abstractmethod - +import logging class H5Reader(ABC): @@ -23,9 +23,14 @@ class 
H5Reader(ABC): def __init__( self, - filepath + filepath, + app_logger=None ): self._filepath = filepath + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() @abstractmethod def get_root_id(self): diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py new file mode 100644 index 00000000..3aa77bb9 --- /dev/null +++ b/src/h5json/h5writer.py @@ -0,0 +1,59 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +from abc import ABC, abstractmethod +import weakref +import logging + + +class H5Writer(ABC): + """ + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. + """ + + + def __init__( + self, + filepath, + append=False, + app_logger=None + ): + self._filepath = filepath + self._append = append + self._filepath = filepath + self._db_ref = None + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + + def set_db(self, db): + #TBD - use weak ref? 
+ self._db_ref = weakref.ref(db) + + @property + def db(self): + if not self._db_ref: + raise ValueError("db not available") + return self._db_ref() + + @abstractmethod + def flush(self): + """ Write dirty items """ + pass + + @abstractmethod + def close(self): + """ close storage handle """ + pass + diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 2dfd9374..283b31fa 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -18,6 +18,7 @@ from .objid import createObjId, getCollectionForId from .apiversion import _apiver from .h5reader import H5Reader +from .h5writer import H5Writer class Hdf5db: @@ -57,15 +58,23 @@ def __init__( # create a root group group_json = {"links": {}, "attributes": {}, "cpl": {}} group_json["created"] = time.time() + + if self._writer: + self._writer.set_db(self) self._db[root_id] = group_json self._root_id = root_id + + def flush(self): + """ write out any changes """ + if self._writer: + self._writer.flush() def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") - if self._writer: - self._writer.flush() + self.flush() + if self._writer: self._writer.close() if self._reader: self._reader.close() @@ -228,6 +237,17 @@ def getAttribute(self, obj_id, name, includeData=True): return attr_json + def getAttributes(self, obj_id): + """ + Get attributes given an object id and name + returns: JSON object + """ + + obj_json = self.getObjectById(obj_id) + attrs = obj_json["attributes"] + + return attrs + def getAttributeValue(self, obj_id, name): """ Return NDArray of the given attribute value """ attr_json = self.getAttribute(obj_id, name) @@ -424,7 +444,8 @@ def getLinks(self, grp_id): grp_json = self.getObjectById(grp_id) if "links" not in grp_json: raise KeyError(f"No links - {grp_id} not a group?") - return grp_json["links"] + links = grp_json["links"] + return links def getLink(self, grp_id, name): """ Get the given link """ @@ -493,11 +514,18 @@ def createGroup(self, cpl=None): 
group_json["cpl"] = cpl else: group_json["cpl"] = {} - group_json["created"] = time.time + group_json["created"] = time.time() group_json["modified"] = None self._db[grp_id] = group_json return grp_id + + def getCollection(self, col_type=None): + obj_ids = [] + for obj_id in self._db: + if not col_type or getCollectionForId(obj_id) == col_type: + obj_ids.append(obj_id) + return obj_ids def __len__(self): # return the number of objects diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 8c62a752..e36e8a22 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -113,13 +113,42 @@ def hexRot(ch): return format((int(ch, base=16) + 8) % 16, "x") +def getCollectionForId(obj_id): + """return groups/datasets/datatypes based on id""" + if not isinstance(obj_id, str): + raise ValueError("invalid object id") + + collection = None + if obj_id.startswith("g-"): + collection = "groups" + elif obj_id.startswith("d-"): + collection = "datasets" + elif obj_id.startswith("t-"): + collection = "datatypes" + else: + raise ValueError(f"{obj_id} not a collection id") + return collection + +def stripId(obj_id): + """ return just the base id without any prefix (e.g. 
'g-') """ + if len(obj_id) == UUID_LEN: + return obj_id # just return as is + if len(obj_id) == UUID_LEN + 2: + return obj_id[2:] + else: + raise ValueError("unexpected obj_id: {obj_id}") + + def isRootObjId(id): """returns true if this is a root id (only for v2 schema)""" if not isSchema2Id(id): raise ValueError("isRootObjId can only be used with v2 ids") validateUuid(id) # will throw ValueError exception if not a objid - if id[0] != "g": - return False # not a group + try: + if getCollectionForId(id) != "groups": + return False # not a group + except ValueError: + return False token = getIdHexChars(id) # root ids will have last 16 chars rotated version of the first 16 is_root = True @@ -358,22 +387,6 @@ def isS3ObjKey(s3key): return valid -def getCollectionForId(obj_id): - """return groups/datasets/datatypes based on id""" - if not isinstance(obj_id, str): - raise ValueError("invalid object id") - collection = None - if obj_id.startswith("g-"): - collection = "groups" - elif obj_id.startswith("d-"): - collection = "datasets" - elif obj_id.startswith("t-"): - collection = "datatypes" - else: - raise ValueError("not a collection id") - return collection - - def validateUuid(id, obj_class=None): """ verify the UUID is well-formed schema can be: diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py new file mode 100644 index 00000000..710ffe16 --- /dev/null +++ b/test/unit/h5json_writer_test.py @@ -0,0 +1,323 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. 
If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import time +import logging +import numpy as np +from h5json import Hdf5db +from h5json.h5json_writer import H5JsonWriter +from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId +from h5json.hdf5dtype import special_dtype, Reference + + +class H5JsonWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5JsonWriterTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + # create logger + + handler = logging.FileHandler("./hdf5dbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + # self.log.propagate = False # prevent log out going to stdout + self.log.info("init!") + + + def testGroup(self): + + with Hdf5db(h5_writer=H5JsonWriter("/tmp/foo.json", no_data=True), app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "attr1", value=[1,2,3,4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10,10), dtype=np.int32) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() + + + + + def testNullSpaceAttribute(self): + + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + 
db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + self.assertEqual(item["modified"], None) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + + def testScalarAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + + + def testFixedStringAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + value = "Hello, world!" 
+ db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + ret_value = db.getAttributeValue(root_id, "A1") + + + def testVlenAsciiAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" + dt = special_dtype(vlen=bytes) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + + def testVlenUtf8Attribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" 
+ dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + + + + def testIntAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + value = [2, 3, 5, 7, 11] + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + + def testCreateReferenceAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + + dt = special_dtype(ref=Reference) + + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = item["value"] 
+ self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref) + + def testCreateVlenReferenceAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + + def testCommittedType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) + + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + + attr_type = 
attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + + + def testCommittedCompoundType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + self.assertEqual(item["modified"], None) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py index af4ac21e..1357c184 100755 --- a/test/unit/objid_test.py +++ b/test/unit/objid_test.py @@ -12,7 +12,7 @@ import unittest from h5json.objid import isRootObjId, isValidUuid, validateUuid -from h5json.objid import createObjId, getCollectionForId +from h5json.objid import createObjId, getCollectionForId, stripId from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id @@ 
-134,6 +134,7 @@ def testGetCollection(self): self.assertEqual(getCollectionForId(group_id), "groups") self.assertEqual(getCollectionForId(dataset_id), "datasets") self.assertEqual(getCollectionForId(ctype_id), "datatypes") + self.assertEqual(stripId(group_id), "314d61b8-9954-11e6-a733-3c15c2da029e") try: getCollectionForId(bad_id) self.assertTrue(False) From bad401207b724db71e6420fbebfc0ac14c6841bc Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 22 Feb 2025 20:22:54 -0800 Subject: [PATCH 010/129] create reader and writer packages --- pyproject.toml | 3 ++- src/h5json/hdf5db.py | 6 +++--- src/h5json/reader/__init__.py | 0 src/h5json/{ => reader}/h5py_reader.py | 8 ++++---- src/h5json/{ => reader}/h5reader.py | 0 src/h5json/writer/__init__.py | 0 src/h5json/{ => writer}/h5json_writer.py | 3 +-- src/h5json/{ => writer}/h5writer.py | 0 test/unit/h5json_writer_test.py | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 src/h5json/reader/__init__.py rename src/h5json/{ => reader}/h5py_reader.py (98%) rename src/h5json/{ => reader}/h5reader.py (100%) create mode 100644 src/h5json/writer/__init__.py rename src/h5json/{ => writer}/h5json_writer.py (99%) rename src/h5json/{ => writer}/h5writer.py (100%) diff --git a/pyproject.toml b/pyproject.toml index 5ddb024f..4ea50247 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ dependencies = [ "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", - "numpy >=1.20,<2.0.0; python_version=='3.8'", ] dynamic = ["version"] @@ -53,6 +52,8 @@ build-backend = "setuptools.build_meta" package-dir = { "" = "src" } packages = [ "h5json", + "h5json.reader", + "h5json.writer", "h5json.h5tojson", "h5json.jsontoh5", "h5json.schema", diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 283b31fa..39de3b60 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -17,8 +17,8 @@ from .dset_util import make_new_dset, resize_dataset from .objid import 
createObjId, getCollectionForId from .apiversion import _apiver -from .h5reader import H5Reader -from .h5writer import H5Writer +from .reader.h5reader import H5Reader +from .writer.h5writer import H5Writer class Hdf5db: @@ -37,7 +37,7 @@ def getVersionInfo(): def __init__( self, h5_reader: H5Reader = None, - h5_writer = None, + h5_writer: H5Writer = None, app_logger = None, ): if app_logger: diff --git a/src/h5json/reader/__init__.py b/src/h5json/reader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/h5json/h5py_reader.py b/src/h5json/reader/h5py_reader.py similarity index 98% rename from src/h5json/h5py_reader.py rename to src/h5json/reader/h5py_reader.py index 238c48e6..dc9220ae 100644 --- a/src/h5json/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -12,10 +12,10 @@ import h5py import numpy as np -from .objid import createObjId -from .hdf5dtype import getTypeItem -from .array_util import bytesArrayToList -from .h5reader import H5Reader +from ..objid import createObjId +from ..hdf5dtype import getTypeItem +from ..array_util import bytesArrayToList +from ..h5reader import H5Reader class H5pyReader(H5Reader): diff --git a/src/h5json/h5reader.py b/src/h5json/reader/h5reader.py similarity index 100% rename from src/h5json/h5reader.py rename to src/h5json/reader/h5reader.py diff --git a/src/h5json/writer/__init__.py b/src/h5json/writer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/h5json/h5json_writer.py b/src/h5json/writer/h5json_writer.py similarity index 99% rename from src/h5json/h5json_writer.py rename to src/h5json/writer/h5json_writer.py index 8add66bb..81f9b4f9 100644 --- a/src/h5json/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -13,7 +13,7 @@ import json from .h5writer import H5Writer -from .objid import stripId, getCollectionForId +from ..objid import stripId, getCollectionForId class H5JsonWriter(H5Writer): """ @@ -21,7 +21,6 @@ class H5JsonWriter(H5Writer): compatible storage 
medium. """ - def __init__( self, filepath, diff --git a/src/h5json/h5writer.py b/src/h5json/writer/h5writer.py similarity index 100% rename from src/h5json/h5writer.py rename to src/h5json/writer/h5writer.py diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index 710ffe16..47ff3b1e 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -14,7 +14,7 @@ import logging import numpy as np from h5json import Hdf5db -from h5json.h5json_writer import H5JsonWriter +from h5json.writer.h5json_writer import H5JsonWriter from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId from h5json.hdf5dtype import special_dtype, Reference From c5c28a42e9ff8e1957ad56e0805d590f84676400 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 25 Feb 2025 22:31:05 -0800 Subject: [PATCH 011/129] basic dataset read/write methods added --- src/h5json/dset_util.py | 4 +- src/h5json/hdf5db.py | 296 +++++++--- src/h5json/reader/h5py_reader.py | 15 +- src/h5json/reader/h5reader.py | 2 +- src/h5json/selections.py | 834 +++++++++++++++++++++++++++++ src/h5json/writer/h5json_writer.py | 24 +- test/unit/h5json_writer_test.py | 18 +- test/unit/hdf5db_test.py | 74 ++- 8 files changed, 1159 insertions(+), 108 deletions(-) create mode 100644 src/h5json/selections.py diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index c5da3514..7a3a7aa3 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -107,7 +107,9 @@ def resize_dataset(dset_json, shape): raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") if len(shape_class["dims"]) != len(shape): raise ValueError("Resize shape parameter doesn't match dataset's rank") - # TBD: validate shape + if shape_json["dims"] == list(shape): + # no change, just return + return shape_json["dims"] = list(shape) dset_json["modified"] = time.time() diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 39de3b60..991e7561 100644 --- 
a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -16,6 +16,7 @@ from .array_util import jsonToArray, bytesArrayToList from .dset_util import make_new_dset, resize_dataset from .objid import createObjId, getCollectionForId +from . import selections from .apiversion import _apiver from .reader.h5reader import H5Reader from .writer.h5writer import H5Writer @@ -49,6 +50,9 @@ def __init__( self._reader = h5_reader self._writer = h5_writer + + self._new_objects = set() # set of obj_id's + self._dirty_objects = set() # set of obj_id's if self._reader: root_id = self._reader.get_root_id() @@ -65,19 +69,70 @@ def __init__( self._db[root_id] = group_json self._root_id = root_id + @property + def db(self): + """ return object db dictionary """ + return self._db + + @property + def reader(self): + """ return reader instance """ + return self._reader + + @property + def writer(self): + """ return writer instance """ + return self._writer + + @property + def root_id(self): + """ return root uuid """ + return self._root_id + + def is_new(self, obj_id): + """ return true if this is a new object (has not been persisted) """ + return obj_id in self._new_objects + + def is_dirty(self, obj_id): + """ return true if this object has been modified """ + if self.is_new(obj_id): + return True + return obj_id in self._dirty_objects + + def make_dirty(self, obj_id): + """ Mark the object as dirty and update the lastModified timestamp """ + if self.is_new(obj_id): + # object hasn't been initially written yet, just return + return + if obj_id not in self.db: + self.log.error("make dirty called on deleted object") + raise KeyError(f"obj_id: {obj_id} not found") + if self.db[obj_id] is None: + # object deleted, just return + return + obj_json = self.db[obj_id] + now = time.time() + obj_json["lastModified"] = now + self._dirty_objects.add(obj_id) + + def flush(self): """ write out any changes """ - if self._writer: - self._writer.flush() + if not self.writer: + return # nothing to do + if 
self.writer.flush(): + # reset new and dirty sets + self._new_objects = set() + self._dirty_objects = set() def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") self.flush() - if self._writer: - self._writer.close() - if self._reader: - self._reader.close() + if self.writer: + self.writer.close() + if self.reader: + self.reader.close() self._root_id = None self._db = {} @@ -94,14 +149,14 @@ def __exit__(self, type, value, traceback): def getObjectById(self, obj_id): """ return object with given id """ - if obj_id not in self._db: - if self._reader: + if obj_id not in self.db: + if self.reader: # load the obj from the reader - obj_json = self._reader.getObjectById(obj_id) - self._db[obj_id] = obj_json + obj_json = self.reader.getObjectById(obj_id) + self.db[obj_id] = obj_json else: raise KeyError(f"obj_id: {obj_id} not found") - obj_json = self._db[obj_id] + obj_json = self.db[obj_id] return obj_json @@ -110,10 +165,10 @@ def getObjectIdByPath(self, h5path, parent_id=None): otherwise the root_id """ if h5path == "/": - return self._root_id # just return root id + return self.root_id # just return root id if parent_id is None: - parent_id = self._root_id + parent_id = self.root_id self.log.debug(f"getObjectIdDByPath(h5path: {h5path} parent_id: {parent_id}") obj_json = self.getObjectById(parent_id) @@ -175,9 +230,9 @@ def getObjectByPath(self, path): def getDtype(self, obj_id): """ Return numpy data type for given object id """ - if obj_id not in self._db: + if obj_id not in self.db: raise KeyError(f"{obj_id} not found") - obj_json = self._db[obj_id] + obj_json = self.db[obj_id] if "type" not in obj_json: # group id? 
raise TypeError(f"{obj_id} does not have a datatype") @@ -196,7 +251,7 @@ def createCommittedType(self, datatype, cpl=None): if cpl is None: cpl = {} - ctype_id = createObjId(obj_type="datatypes", root_id=self._root_id) + ctype_id = createObjId(obj_type="datatypes", root_id=self.root_id) if isinstance(datatype, np.dtype): dt = datatype else: @@ -207,7 +262,8 @@ def createCommittedType(self, datatype, cpl=None): ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} ctype_json["created"] = time.time() ctype_json["modified"] = None - self._db[ctype_id] = ctype_json + self.db[ctype_id] = ctype_json + self._new_objects.add(ctype_id) return ctype_id @@ -224,15 +280,19 @@ def getAttribute(self, obj_id, name, includeData=True): msg = f"Attribute: [{name }] not found in object: {obj_id}" self.log.info(msg) return None + if attrs[name] == None: + msg = f"Attribute: [{name}] has been deleted" + self.log.info(None) + return None attr_json = attrs[name] if includeData and "value" not in attr_json: # Reader may not have pre-loaded large attributes # fetch it now - if not self._reader: + if not self.reader: raise RuntimeError(f"Expected to find value for attribute {name} of {obj_id}") - attr_json = self._reader.get_attribute(obj_id, name) + attr_json = self.reader.get_attribute(obj_id, name) attr_json["value"] = attr_json # this will update the _db return attr_json @@ -245,8 +305,12 @@ def getAttributes(self, obj_id): obj_json = self.getObjectById(obj_id) attrs = obj_json["attributes"] + names = [] + for name in attrs: + if attrs[name] != None: + names.append(name) - return attrs + return names def getAttributeValue(self, obj_id, name): """ Return NDArray of the given attribute value """ @@ -277,7 +341,7 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): ctype_id = dtype[len("datatypes/"):] if getCollectionForId(ctype_id) != "datatypes": raise TypeError(f"unexpected dtype value for createAttribute: {dtype}") - if ctype_id not in self._db: + if 
ctype_id not in self.db: raise KeyError(f"ctype: {ctype_id} not found") ctype_json = self.getObjectById(ctype_id) type_json = ctype_json["type"].copy() @@ -345,21 +409,21 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): obj_json = self.getObjectById(obj_id) attrs_json = obj_json["attributes"] if name in attrs_json: - # replace, update modified timestamp + # replace, keep, created timestamp created = attrs_json["created"] - modified = time.time() else: created = time.time() - modified = None type_json = getTypeItem(dtype) # finally put it all together... attr_json = {"shape": shape_json, "type": type_json, "value": value_json} attr_json["created"] = created - attr_json["modified"] = modified # slot into the obj_json["attrs"] attrs_json[name] = attr_json + # mark object as dirty + self.make_dirty(obj_id) + def deleteAttribute(self, obj_id, name): """ delete the given attribute """ @@ -367,18 +431,93 @@ def deleteAttribute(self, obj_id, name): attrs_json = obj_json["attributes"] if name not in attrs_json: raise KeyError(f"attribute [{name}] not found in {obj_id}") - del attrs_json[name] + attrs_json[name] = None # mark key for deletion + + self.make_dirty(obj_id) - def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + def getDatasetValues(self, dset_id, sel): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same number of elements as the rank of the dataset. 
""" - self.log.info(f"getDatasetValues obj_id: {obj_id}, slices: {slices} format: {format}") - #TBD - + self.log.info(f"getDatasetValues dset_id: {dset_id}, sel: {sel}") + dset_json = self.getObjectById(dset_id) + shape_json = dset_json["shape"] + if not isinstance(sel, selections.Selection): + raise TypeError("Expected Selection class") + + if shape_json["class"] == "H5S_NULL": + return None + + if shape_json["class"] == "H5S_SCALAR": + if sel.select_type != sel.H5S_SELECT_ALL: + # TBD: support other selection types + raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") + if sel.shape != (): + raise ValueError("Selection shape does not match dataset shape") + else: + dims = tuple(shape_json["dims"]) + if sel.shape != dims: + raise ValueError("Selection shape does not match dataset shape") + rank = len(dims) + + dtype = self.getDtype(dset_id) + if self.reader: + arr = self.reader.getDatasetValues(dset_id, sel) + else: + # TBD: Initialize with fill value if non-zero + arr = np.zeros(sel.shape, dtype=dtype) + + if "updates" in dset_json: + # apply any non-flushed changes that intersect the current selection + updates = dset_json["updates"] + for (update_sel, update_val) in updates: + sel_inter = selections.intersect(sel, update_sel) + if sel_inter.nselect == 0: + continue + # update portion of arr, that intersects update_val + slices = [] + for dim in range(rank): + start = sel_inter.start[dim] - sel.start[dim] + stop = start + sel_inter.count[dim] + slices.append(slice(start, stop, 1)) + slices = tuple(slices) + arr[slices] = update_val + + return arr + + def setDatasetValues(self, dset_id, sel, arr): + """ + Write the given ndarray to the dataset using the selection + """ + dset_json = self.getObjectById(dset_id) + shape_json = dset_json["shape"] + if not isinstance(sel, selections.Selection): + raise TypeError("Expected Selection class") + if sel.select_type not in (selections.H5S_SELECT_HYPERSLABS, selections.H5S_SELECT_ALL): + # TBD: 
support other selection types + raise ValueError("Only hyperslab selections are currently supported") + if not isinstance(arr, np.ndarray): + raise TypeError("Expected ndarray for data value") + if shape_json["class"] == "H5S_NULL": + raise ValueError("writing to null space dataset not supported") + if shape_json["class"] == "H5S_SCALAR": + if sel.shape != (): + raise ValueError("Selection shape does not match dataset shape") + if len(arr.shape) > 0: + raise TypeError("Expected scalar ndarray for scalar dataset") + else: + dims = tuple(shape_json["dims"]) + if sel.shape != dims: + raise ValueError("Selection shape does not match dataset shape") + if "updates" not in dset_json or sel.select_type == selections.H5S_SELECT_ALL: + # for select all, throw out any existing updates since this will overwrite them + dset_json["updates"] = [] + updates = dset_json["updates"] + updates.append((sel, arr.copy())) + self.make_dirty(dset_id) def createDataset( self, @@ -414,8 +553,9 @@ def createDataset( kwds["cpl"] = cpl dset_json = make_new_dset(shape=shape, dtype=dtype, **kwds) - dset_id = createObjId("datasets", root_id=self._root_id) - self._db[dset_id] = dset_json + dset_id = createObjId("datasets", root_id=self.root_id) + self.db[dset_id] = dset_json + self._new_objects.add(dset_id) return dset_id @@ -426,18 +566,25 @@ def resizeDataset(self, dset_id, shape): self.log.info(f"resizeDataset {dset_id}, {shape}") dset_json = self.getObjectById(dset_id) # will throw exception if not found - resize_dataset(dset_json, shape) + if resize_dataset(dset_json, shape): + self._dirty_objects.add(dset_id) def deleteObject(self, obj_id): """ Delete the given object """ self.log.info(f"deleteObject: {obj_id}") - if obj_id not in self._db: + if obj_id not in self.db: raise KeyError(f"Object {obj_id} not found for deletion") - if obj_id == self._root_id: + if obj_id == self.root_id: raise KeyError("Root group cannot be deleted") - del self._db[obj_id] - # TBD: add to pending deleted items + 
self.db[obj_id] = None + + if obj_id in self._new_objects: + self._new_objects.remove(obj_id) + + if obj_id in self._dirty_objects: + self._dirty_objects.remove(obj_id) + def getLinks(self, grp_id): """ Get the links for the given group """ @@ -445,100 +592,113 @@ def getLinks(self, grp_id): if "links" not in grp_json: raise KeyError(f"No links - {grp_id} not a group?") links = grp_json["links"] - return links + names = [] + for name in links: + if links[name] != None: + names.append(name) + return names def getLink(self, grp_id, name): """ Get the given link """ - links = self.getLinks(grp_id) + obj_json = self.getObjectById(grp_id) + links = obj_json["links"] if name not in links: - raise KeyError(f"Link [{name}] not found in {grp_id}") + self.log.info(f"Link [{name}] not found in {grp_id}") + return None + if links[name] == None: + self.log.info(f"Link {name} in {grp_id} has been deleted") + return None + return links[name] + def _addLink(self, grp_id, name, link_json): + obj_json = self.getObjectById(grp_id) + links = obj_json["links"] + links[name] = link_json + self.make_dirty(grp_id) + def createHardLink(self, grp_id, name, tgt_id): """ Create a new hardlink """ - links = self.getLinks(grp_id) - if name in links: - self.deleteLink(grp_id, name) link_json = {"class": "H5L_TYPE_HARD", "id": tgt_id} link_json["created"] = time.time() - links[name] = link_json + self._addLink(grp_id, name, link_json) def createSoftLink(self, grp_id, name, h5path): """ Create a soft link """ - links = self.getLinks(grp_id) - if name in links: - self.deleteLink(grp_id, name) link_json = {"class": "H5L_TYPE_SOFT", "h5path": h5path} link_json["created"] = time.time() - links[name] = link_json + self._addLink(grp_id, name, link_json) def createCustomLink(self, grp_id, name, link_json): """ create a custom link """ - links = self.getLinks(grp_id) - if name in links: - self.deleteLink(grp_id, name) if link_json.get("class") != "H5L_TYPE_USER_DEFINED": link_json["class"] = 
"H5L_TYPE_USER_DEFINED" link_json["created"] = time.time() - links[name] = link_json - + self._addLink(grp_id, name, link_json) def createExternalLink(self, grp_id, name, h5path, filepath): """ Create a external link link """ - links = self.getLinks(grp_id) - if name in links: - self.deleteLink(grp_id, name) link_json = {"class": "H5L_TYPE_EXTERNAL", "h5path": h5path, "file": filepath} link_json["created"] = time.time() - links[name] = link_json + self._addLink(grp_id, name, link_json) def deleteLink(self, grp_id, name): """ Delete the given link """ grp_json = self.getObjectById(grp_id) if "links" not in grp_json: raise KeyError(f"No links - {grp_id} not a group?") - links = self.getLinks(grp_id) + links = grp_json["links"] if name not in links: raise KeyError(f"Link [{name}] not found in {grp_id}") - del links[name] - grp_json["modified"] = time.time() + links[name] = None # mark for deletion + self.make_dirty(grp_id) def createGroup(self, cpl=None): """ Create a new group """ - grp_id = createObjId("groups", root_id=self._root_id) + grp_id = createObjId("groups", root_id=self.root_id) group_json = {"attributes": {}, "links": {}} if cpl: group_json["cpl"] = cpl else: group_json["cpl"] = {} group_json["created"] = time.time() - group_json["modified"] = None - self._db[grp_id] = group_json + self.db[grp_id] = group_json + self._new_objects.add(grp_id) return grp_id def getCollection(self, col_type=None): obj_ids = [] - for obj_id in self._db: + for obj_id in self.db: + if self.db[obj_id] == None: + # skip deleted objects + continue if not col_type or getCollectionForId(obj_id) == col_type: obj_ids.append(obj_id) return obj_ids def __len__(self): # return the number of objects - return len(self._db) - + count = 0 + for obj_id in self.db: + # skip deleted objects + if self.db[obj_id] != None: + count += 1 + return count def __iter__(self): """ Iterate over object ids """ - for obj_id in self._db: + for obj_id in self.db: + if self.db[obj_id] == None: + # skip deleted 
objects + continue yield obj_id def __contains__(self, obj_id): """ Test if a obj id exists """ - return obj_id in self._db + return obj_id in self.db and self.db[obj_id] != None diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index dc9220ae..4e7c9b55 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -42,13 +42,10 @@ def __init__( ): self._id_map = {} self._addr_map = {} - """ if app_logger: self.log = app_logger else: self.log = logging.getLogger() - self._filepath = filepath - """ super().__init__(filepath, app_logger=app_logger) f = h5py.File(self._filepath) self._f = f @@ -264,11 +261,19 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): return obj_json - def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + def getDatasetValues(self, dset_id, selection): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same number of elements as the rank of the dataset. """ - pass + dset = self._id_map[dset_id] + self.log.info(f"getDatasetValues: {dset_id}") + if dset.shape is None: + # TBD: return something like h5py.Empty in this case? + return None + arr = dset[selection] + return arr + + diff --git a/src/h5json/reader/h5reader.py b/src/h5json/reader/h5reader.py index 6a37a07a..69a45d07 100644 --- a/src/h5json/reader/h5reader.py +++ b/src/h5json/reader/h5reader.py @@ -51,7 +51,7 @@ def getAttribute(self, obj_id, name, includeData=True): pass @abstractmethod - def getDatasetValues(self, obj_id, slices=Ellipsis, format="json"): + def getDatasetValues(self, obj_id, selection): """ Get values from dataset identified by obj_id. 
If a slices list or tuple is provided, it should have the same diff --git a/src/h5json/selections.py b/src/h5json/selections.py new file mode 100644 index 00000000..4d700d94 --- /dev/null +++ b/src/h5json/selections.py @@ -0,0 +1,834 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +# We use __getitem__ side effects, which pylint doesn't like. +# pylint: disable=pointless-statement + +""" + High-level access to HDF5 dataspace selections +""" + +from __future__ import absolute_import + +import numpy as np + +H5S_SEL_POINTS = 0 +H5S_SELECT_SET = 1 +H5S_SELECT_APPEND = 2 +H5S_SELECT_PREPEND = 3 +H5S_SELECT_OR = 4 +H5S_SELECT_NONE = 5 +H5S_SELECT_ALL = 6 +H5S_SELECT_HYPERSLABS = 7 +H5S_SELECT_NOTB = 8 +H5S_SELLECT_FANCY = 9 + + +def select(obj, args): + """ High-level routine to generate a selection from arbitrary arguments + to __getitem__. The arguments should be the following: + + obj + Datatset object + + args + Either a single argument or a tuple of arguments. See below for + supported classes of argument. + + Argument classes: + + Single Selection instance + Returns the argument. + + numpy.ndarray + Must be a boolean mask. Returns a PointSelection instance. + + RegionReference + Returns a Selection instance. + + Indices, slices, ellipses only + Returns a SimpleSelection instance + + Indices, slices, ellipses, lists or boolean index arrays + Returns a FancySelection instance. 
+ """ + if not isinstance(args, tuple): + args = (args,) + + if hasattr(obj, "shape") and obj.shape == (): + # scalar object + sel = ScalarSelection(obj.shape, args) + return sel + + # "Special" indexing objects + if len(args) == 1: + + arg = args[0] + + if isinstance(arg, Selection): + if arg.shape != obj.shape: + raise TypeError("Mismatched selection shape") + return arg + + elif isinstance(arg, np.ndarray) or isinstance(arg, list): + sel = PointSelection(obj.shape) + sel[arg] + return sel + """ + #todo - RegionReference + elif isinstance(arg, h5r.RegionReference): + sid = h5r.get_region(arg, dsid) + if shape != sid.shape: + raise TypeError("Reference shape does not match dataset shape") + + return Selection(shape, spaceid=sid) + """ + + for a in args: + use_fancy = False + if isinstance(a, np.ndarray): + use_fancy = True + elif a is []: + use_fancy = True + elif not isinstance(a, slice) and a is not Ellipsis: + try: + int(a) + except Exception: + use_fancy = True + if use_fancy and hasattr(obj, "shape"): + sel = FancySelection(obj.shape) + sel[args] + return sel + if hasattr(obj, "shape"): + sel = SimpleSelection(obj.shape) + else: + sel = SimpleSelection(obj) + sel[args] + return sel + +def intersect(s1, s2): + """ Return the intersection of two selections """ + # TBD: this is currently only working for simple selections with stride 1 + valid_select_types = (H5S_SELECT_HYPERSLABS, H5S_SELECT_ALL) + if not isinstance(s1, Selection): + raise TypeError("Expected selection type for first arg") + if not isinstance(s2, Selection): + raise TypeError("Expected selection type for second arg") + if s1.select_type not in valid_select_types: + raise TypeError("Expected hyperslab selection for first arg") + if s2.select_type not in valid_select_types: + raise TypeError("Expected hyperslab selection for second arg") + if s1.shape != s2.shape: + raise ValueError("selections have incompatible shapes") + + slices = [] + rank = len(s1.shape) + for dim in range(rank): + start = 
max(s1.start[dim], s2.start[dim]) + stop = min(s1.start[dim] + s1.count[dim], s2.start[dim] + s2.count[dim]) + msg = "stepped slices not currently supported" + if s1.step[dim] > 1: + raise ValueError(msg) + if s2.step[dim] > 1: + raise ValueError("stepped slices not currently supported") + if start > stop: + stop = start + slices.append(slice(start, stop, 1)) + slices = tuple(slices) + + return select(s1.shape, slices) + + +class Selection(object): + + """ + Base class for HDF5 dataspace selections. Subclasses support the + "selection protocol", which means they have at least the following + members: + + __init__(shape) => Create a new selection on "shape"-tuple + __getitem__(args) => Perform a selection with the range specified. + What args are allowed depends on the + particular subclass in use. + + id (read-only) => h5py.h5s.SpaceID instance + shape (read-only) => The shape of the dataspace. + mshape (read-only) => The shape of the selection region. + Not guaranteed to fit within "shape", although + the total number of points is less than + product(shape). + nselect (read-only) => Number of selected points. Always equal to + product(mshape). + + broadcast(target_shape) => Return an iterable which yields dataspaces + for read, based on target_shape. + + The base class represents "unshaped" selections (1-D). + """ + + def __init__(self, shape, *args, **kwds): + """ Create a selection. 
""" + + shape = tuple(shape) + self._shape = shape + + self._select_type = H5S_SELECT_ALL + + @property + def select_type(self): + """ SpaceID instance """ + return self._select_type + + @property + def shape(self): + """ Shape of whole dataspace """ + return self._shape + + @property + def nselect(self): + """ Number of elements currently selected """ + + return self.getSelectNpoints() + + @property + def mshape(self): + """ Shape of selection (always 1-D for this class) """ + return (self.nselect,) + + def getSelectNpoints(self): + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + else: + raise IOError("Unsupported select type") + return npoints + + def broadcast(self, target_shape): + """ Get an iterable for broadcasting """ + if np.product(target_shape) != self.nselect: + raise TypeError("Broadcasting is not supported for point-wise selections") + yield self._id + + def __getitem__(self, args): + raise NotImplementedError("This class does not support indexing") + + def __repr__(self): + return f"Selection(shape:{self._shape})" + + +class PointSelection(Selection): + + """ + Represents a point-wise selection. You can supply sequences of + points to the three methods append(), prepend() and set(), or a + single boolean array to __getitem__. + """ + def __init__(self, shape, *args, **kwds): + """ Create a Point selection. 
""" + Selection.__init__(self, shape, *args, **kwds) + self._points = [] + + @property + def points(self): + """ selection points """ + return self._points + + def getSelectNpoints(self): + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + elif self._select_type == H5S_SEL_POINTS: + dims = self._shape + rank = len(dims) + if len(self._points) == rank and not type(self._points[0]) in (list, tuple, np.ndarray): + npoints = 1 + else: + npoints = len(self._points) + else: + raise IOError("Unsupported select type") + return npoints + + def _perform_selection(self, points, op): + """ Internal method which actually performs the selection """ + if isinstance(points, np.ndarray) or True: + points = np.asarray(points, order='C', dtype='u8') + if len(points.shape) == 1: + # points.shape = (1,points.shape[0]) + pass + + if self._select_type != H5S_SEL_POINTS: + op = H5S_SELECT_SET + self._select_type = H5S_SEL_POINTS + + if op == H5S_SELECT_SET: + self._points = points + elif op == H5S_SELECT_APPEND: + self._points.extent(points) + elif op == H5S_SELECT_PREPEND: + tmp = self._points + self._points = points + self._points.extend(tmp) + else: + raise ValueError("Unsupported operation") + + # def _perform_list_selection(points, H5S_SELECT_SET): + + def __getitem__(self, arg): + """ Perform point-wise selection from a NumPy boolean array """ + if isinstance(arg, list): + points = arg + else: + if not (isinstance(arg, np.ndarray) and arg.dtype.kind == 'b'): + raise TypeError("PointSelection __getitem__ only works with bool arrays") + if not arg.shape == self._shape: + raise TypeError("Boolean indexing array has incompatible shape") + + points = np.transpose(arg.nonzero()) + self.set(points) + return self + + def append(self, points): + """ Add the sequence of points to the end of the current selection """ + 
self._perform_selection(points, H5S_SELECT_APPEND) + + def prepend(self, points): + """ Add the sequence of points to the beginning of the current selection """ + self._perform_selection(points, H5S_SELECT_PREPEND) + + def set(self, points): + """ Replace the current selection with the given sequence of points""" + """ + if isinstance(points, list): + # selection with list of points + self._perform_list_selection(points, H5S_SELECT_SET) + + else: + # selection with boolean ndarray + """ + self._perform_selection(points, H5S_SELECT_SET) + + def __repr__(self): + return f"PointSelection(shape:{self._shape}, {len(self._points)} points)" + + +class SimpleSelection(Selection): + + """ A single "rectangular" (regular) selection composed of only slices + and integer arguments. Can participate in broadcasting. + """ + + @property + def mshape(self): + """ Shape of current selection """ + return self._mshape + + @property + def start(self): + return self._sel[0] + + @property + def count(self): + return self._sel[1] + + @property + def step(self): + return self._sel[2] + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + rank = len(self._shape) + self._sel = ((0,) * rank, self._shape, (1,) * rank, (False,) * rank) + self._mshape = self._shape + self._select_type = H5S_SELECT_ALL + + def __getitem__(self, args): + + if not isinstance(args, tuple): + args = (args,) + + if self._shape == (): + if len(args) > 0 and args[0] not in (Ellipsis, ()): + raise TypeError("Invalid index for scalar dataset (only ..., () allowed)") + self._select_type = H5S_SELECT_ALL + return self + + start, count, step, scalar = _handle_simple(self._shape, args) + self._sel = (start, count, step, scalar) + + # self._id.select_hyperslab(start, count, step) + self._select_type = H5S_SELECT_HYPERSLABS + + self._mshape = tuple(x for x, y in zip(count, scalar) if not y) + + return self + + def getSelectNpoints(self): + """Return number of elements in current 
selection + """ + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + elif self._select_type == H5S_SELECT_HYPERSLABS: + dims = self._shape + npoints = 1 + rank = len(dims) + for i in range(rank): + npoints *= self.count[i] + else: + raise IOError("Unsupported select type") + return npoints + + def getQueryParam(self): + """ Get select param for use with HDF Rest API""" + param = '' + rank = len(self._shape) + if rank == 0: + return None + + param += "[" + for i in range(rank): + start = self.start[i] + stop = start + (self.count[i] * self.step[i]) + if stop > self._shape[i]: + stop = self._shape[i] + dim_sel = str(start) + ':' + str(stop) + if self.step[i] != 1: + dim_sel += ':' + str(self.step[i]) + if i != rank - 1: + dim_sel += ',' + param += dim_sel + param += ']' + return param + + def broadcast(self, target_shape): + """ Return an iterator over target dataspaces for broadcasting. + + Follows the standard NumPy broadcasting rules against the current + selection shape (self._mshape). 
+ """ + if self._shape == (): + if np.product(target_shape) != 1: + raise TypeError(f"Can't broadcast {target_shape} to scalar") + self._id.select_all() + yield self._id + return + + start, count, step, scalar = self._sel + + rank = len(count) + target = list(target_shape) + + tshape = [] + for idx in range(1, rank + 1): + if len(target) == 0 or scalar[-idx]: # Skip scalar axes + tshape.append(1) + else: + t = target.pop() + if t == 1 or count[-idx] == t: + tshape.append(t) + else: + raise TypeError(f"Can't broadcast {target_shape} -> {count}") + tshape.reverse() + tshape = tuple(tshape) + + chunks = tuple(x // y for x, y in zip(count, tshape)) + nchunks = int(np.product(chunks)) + + if nchunks == 1: + yield self._id + else: + sid = self._id.copy() + sid.select_hyperslab((0,) * rank, tshape, step) + for idx in range(nchunks): + offset = tuple(x * y * z + s for x, y, z, s in zip(np.unravel_index(idx, chunks), tshape, step, start)) + sid.offset_simple(offset) + yield sid + + def __repr__(self): + s = f"SimpleSelection(shape:{self._shape}, start: {self._sel[0]}," + s += f" count: {self._sel[1]}, step: {self._sel[2]}" + return s + + +class FancySelection(Selection): + + """ + Implements advanced NumPy-style selection operations in addition to + the standard slice-and-int behavior. + + Indexing arguments may be ints, slices, lists of indicies, or + per-axis (1D) boolean arrays. + + Broadcasting is not supported for these selections. 
+ """ + + @property + def slices(self): + return self._slices + + @property + def mshape(self): + """ Shape of current selection """ + return self._mshape + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + self._slices = [] + + def __getitem__(self, args): + + if not isinstance(args, tuple): + args = (args,) + + args = _expand_ellipsis(args, len(self._shape)) + select_type = H5S_SELECT_HYPERSLABS # will adjust if we have a coord + + # Create list of slices and/or coordinates + slices = [] + mshape = [] + num_coordinates = None + for idx, arg in enumerate(args): + length = self._shape[idx] + if isinstance(arg, slice): + _, count, _ = _translate_slice(arg, length) # raise exception for invalid slice + if arg.start is None: + start = 0 + else: + start = arg.start + if arg.stop is None: + stop = length + else: + stop = arg.stop + if arg.step is None: + step = 1 + else: + step = arg.step + slices.append(slice(start, stop, step)) + mshape.append(count) + + elif hasattr(arg, 'dtype') and arg.dtype == np.dtype('bool'): + if len(arg.shape) != 1: + raise TypeError("Boolean indexing arrays must be 1-D") + arg = arg.nonzero()[0] + try: + slices.append(list(arg)) + except TypeError: + pass + else: + if sorted(arg) != list(arg): + raise TypeError("Indexing elements must be in increasing order") + mshape.append(len(arg)) + select_type = H5S_SELLECT_FANCY + elif isinstance(arg, list) or hasattr(arg, 'dtype'): + # coordinate selection + slices.append(arg) + for x in arg: + if x < 0 or x >= length: + raise IndexError(f"Index ({arg}) out of range (0-{length - 1})") + if num_coordinates is None: + num_coordinates = len(arg) + elif num_coordinates == len(arg): + # second set of coordinates doesn't effect mshape + continue + else: + # this shouldn't happen since HSDS would have thrown an error + raise ValueError("coordinate num element missmatch") + mshape.append(len(arg)) + select_type = H5S_SELLECT_FANCY + elif isinstance(arg, int): + if 
arg < 0 or arg >= length: + raise IndexError(f"Index ({arg}) out of range (0-{length - 1})") + slices.append(arg) + elif isinstance(arg, type(Ellipsis)): + slices.append(slice(0, length, 1)) + else: + raise TypeError(f"Unexpected arg type: {arg} - {type(arg)}") + self._slices = slices + self._select_type = select_type + self._mshape = tuple(mshape) + + def getSelectNpoints(self): + """Return number of elements in current selection + """ + npoints = 1 + for idx, s in enumerate(self._slices): + if isinstance(s, slice): + length = self._shape[idx] + _, count, _ = _translate_slice(s, length) + elif isinstance(s, list): + count = len(s) + else: + # scalar selection + count = 1 + npoints *= count + + return npoints + + def getQueryParam(self): + """ Get select param for use with HDF Rest API""" + query = [] + query.append('[') + rank = len(self._slices) + for dim, s in enumerate(self._slices): + if isinstance(s, slice): + if s.start is None and s.stop is None: + query.append(':') + elif s.stop is None: + query.append(f"{s.start}:") + else: + query.append(f"{s.start}:{s.stop}") + if s.step and s.step != 1: + query.append(f":{s.step}") + elif isinstance(s, list) or hasattr(s, 'dtype'): + query.append('[') + for idx, n in enumerate(s): + query.append(str(n)) + if idx + 1 < len(s): + query.append(',') + query.append(']') + else: + # scalar selection + query.append(str(s)) + if dim + 1 < rank: + query.append(',') + query.append(']') + return "".join(query) + + def broadcast(self, target_shape): + raise TypeError("Broadcasting is not supported for complex selections") + + def __repr__(self): + return f"FancySelection(shape:{self._shape}, slices: {self._slices})" + + +def _expand_ellipsis(args, rank): + """ Expand ellipsis objects and fill in missing axes. 
+ """ + n_el = sum(1 for arg in args if arg is Ellipsis) + if n_el > 1: + raise ValueError("Only one ellipsis may be used.") + elif n_el == 0 and len(args) != rank: + args = args + (Ellipsis,) + + final_args = [] + n_args = len(args) + for arg in args: + + if arg is Ellipsis: + final_args.extend((slice(None, None, None),) * (rank - n_args + 1)) + else: + final_args.append(arg) + + if len(final_args) > rank: + raise TypeError("Argument sequence too long") + + return final_args + + +def _handle_simple(shape, args): + """ Process a "simple" selection tuple, containing only slices and + integer objects. Return is a 4-tuple with tuples for start, + count, step, and a flag which tells if the axis is a "scalar" + selection (indexed by an integer). + + If "args" is shorter than "shape", the remaining axes are fully + selected. + """ + args = _expand_ellipsis(args, len(shape)) + + start = [] + count = [] + step = [] + scalar = [] + + for arg, length in zip(args, shape): + if isinstance(arg, slice): + x, y, z = _translate_slice(arg, length) + s = False + else: + try: + x, y, z = _translate_int(int(arg), length) + s = True + except TypeError: + raise TypeError(f'Illegal index "{arg}" (must be a slice or number)') + start.append(x) + count.append(y) + step.append(z) + scalar.append(s) + + return tuple(start), tuple(count), tuple(step), tuple(scalar) + + +def _translate_int(exp, length): + """ Given an integer index, return a 3-tuple + (start, count, step) + for hyperslab selection + """ + if exp < 0: + exp = length + exp + + if not 0 <= exp < length: + raise IndexError(f"Index ({exp}) out of range (0-{length - 1})") + + return exp, 1, 1 + + +def _translate_slice(exp, length): + """ Given a slice object, return a 3-tuple + (start, count, step) + for use with the hyperslab selection routines + """ + start, stop, step = exp.indices(length) + # Now if step > 0, then start and stop are in [0, length]; + # if step < 0, they are in [-1, length - 1] (Python 2.6b2 and later; + # Python 
issue 3004). + + if step < 1: + raise ValueError("Step must be >= 1 (got %d)" % step) + if stop < start: + stop = start + + count = 1 + (stop - start - 1) // step + + return start, count, step + + +def guess_shape(sid): + """ Given a dataspace, try to deduce the shape of the selection. + + Returns one of: + * A tuple with the selection shape, same length as the dataspace + * A 1D selection shape for point-based and multiple-hyperslab selections + * None, for unselected scalars and for NULL dataspaces + """ + + sel_class = sid.get_simple_extent_type() # Dataspace class + sel_type = sid.get_select_type() # Flavor of selection in use + + if sel_class == 'H5S_NULL': + # NULL dataspaces don't support selections + return None + + elif sel_class == 'H5S_SCALAR': + # NumPy has no way of expressing empty 0-rank selections, so we use None + if sel_type == H5S_SELECT_NONE: + return None + if sel_type == H5S_SELECT_ALL: + return tuple() + + elif sel_class != 'H5S_SIMPLE': + raise TypeError(f"Unrecognized dataspace class {sel_class}") + + # We have a "simple" (rank >= 1) dataspace + + N = sid.get_select_npoints() + rank = len(sid.shape) + + if sel_type == H5S_SELECT_NONE: + return (0,) * rank + + elif sel_type == H5S_SELECT_ALL: + return sid.shape + + elif sel_type == H5S_SEL_POINTS: + # Like NumPy, point-based selections yield 1D arrays regardless of + # the dataspace rank + return (N,) + + elif sel_type != H5S_SELECT_HYPERSLABS: + raise TypeError(f"Unrecognized selection method {sel_type}") + + # We have a hyperslab-based selection + + if N == 0: + return (0,) * rank + + bottomcorner, topcorner = (np.array(x) for x in sid.get_select_bounds()) + + # Shape of full selection box + boxshape = topcorner - bottomcorner + np.ones((rank,)) + + def get_n_axis(sid, axis): + """ Determine the number of elements selected along a particular axis. + + To do this, we "mask off" the axis by making a hyperslab selection + which leaves only the first point along the axis. 
For a 2D dataset + with selection box shape (X, Y), for axis 1, this would leave a + selection of shape (X, 1). We count the number of points N_leftover + remaining in the selection and compute the axis selection length by + N_axis = N/N_leftover. + """ + + if (boxshape[axis]) == 1: + return 1 + + start = bottomcorner.copy() + start[axis] += 1 + count = boxshape.copy() + count[axis] -= 1 + + # Throw away all points along this axis + masked_sid = sid.copy() + masked_sid.select_hyperslab(tuple(start), tuple(count), op=H5S_SELECT_NOTB) + + N_leftover = masked_sid.get_select_npoints() + + return N // N_leftover + + shape = tuple(get_n_axis(sid, x) for x in range(rank)) + + if np.product(shape) != N: + # This means multiple hyperslab selections are in effect, + # so we fall back to a 1D shape + return (N,) + + return shape + + +class ScalarSelection(Selection): + + """ + Implements slicing for scalar datasets. + """ + + @property + def mshape(self): + return self._mshape + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + arg = None + if len(args) > 0: + arg = args[0] + if arg == (): + self._mshape = None + self._select_type = H5S_SELECT_ALL + elif arg == (Ellipsis,): + self._mshape = () + self._select_type = H5S_SELECT_ALL + else: + raise ValueError("Illegal slicing argument for scalar dataspace") diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 81f9b4f9..fb2c8a73 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -14,6 +14,8 @@ from .h5writer import H5Writer from ..objid import stripId, getCollectionForId +from ..array_util import bytesArrayToList +from .. 
import selections class H5JsonWriter(H5Writer): """ @@ -39,6 +41,7 @@ def flush(self): # json writer doesn't support incremental updates, so we'll wait # for close to write out database self.log.info("flush") + return False def close(self): """ close storage handle """ @@ -86,7 +89,7 @@ def dumpAttribute(self, obj_id, attr_name): response = {"name": attr_name} response["type"] = item["type"] response["shape"] = item["shape"] - if True: #not self.options.D: + if True: if "value" not in item: self.log.warning("no value key in attribute: " + attr_name) else: @@ -173,10 +176,18 @@ def dumpDataset(self, obj_id): shape_rsp = {} num_elements = 1 shape_rsp["class"] = shapeItem["class"] - if "dims" in shapeItem: + if shapeItem["class"] == "H5S_NULL": + dims = None + num_elements = 0 + elif shapeItem["class"] == "H5S_SCALAR": + dims = () + num_elements = 1 + else: shape_rsp["dims"] = shapeItem["dims"] - for dim in shapeItem["dims"]: - num_elements *= dim + dims = tuple(shapeItem["dims"]) + for extent in dims: + num_elements *= extent + if "maxdims" in shapeItem: maxdims = [] for dim in shapeItem["maxdims"]: @@ -196,8 +207,9 @@ def dumpDataset(self, obj_id): if not self._no_data: if num_elements > 0: - value = self.db.getDatasetValues(obj_id) - response["value"] = value # dump values unless header flag was passed + sel_all = selections.select(dims, ...) 
+ arr = self.db.getDatasetValues(obj_id, sel_all) + response["value"] = bytesArrayToList(arr) # dump values unless header flag was passed else: response["value"] = [] # empty list return response diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index 47ff3b1e..df69f029 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -17,6 +17,7 @@ from h5json.writer.h5json_writer import H5JsonWriter from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections class H5JsonWriterTest(unittest.TestCase): @@ -45,7 +46,7 @@ def __init__(self, *args, **kwargs): def testGroup(self): - with Hdf5db(h5_writer=H5JsonWriter("/tmp/foo.json", no_data=True), app_logger=self.log) as db: + with Hdf5db(h5_writer=H5JsonWriter("/tmp/foo.json", no_data=False), app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "attr1", value=[1,2,3,4]) db.createAttribute(root_id, "attr2", 42) @@ -57,6 +58,12 @@ def testGroup(self): g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) dset_111_id = db.createDataset(shape=(10,10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) 
+ db.setDatasetValues(dset_111_id, sel_all, arr) db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) db.createSoftLink(g2_id, "slink", "somewhere") db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") @@ -77,7 +84,6 @@ def testNullSpaceAttribute(self): self.assertTrue("class" in shape_item) self.assertEqual(shape_item["class"], "H5S_NULL") self.assertTrue(item["created"] > time.time() - 1.0) - self.assertEqual(item["modified"], None) value = db.getAttributeValue(root_id, "A1") self.assertEqual(value, None) @@ -98,7 +104,6 @@ def testScalarAttribute(self): self.assertEqual(item["value"], 42) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) shape = item["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") @@ -122,7 +127,6 @@ def testFixedStringAttribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) ret_value = db.getAttributeValue(root_id, "A1") @@ -147,7 +151,6 @@ def testVlenAsciiAttribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) def testVlenUtf8Attribute(self): with Hdf5db(app_logger=self.log) as db: @@ -170,8 +173,6 @@ def testVlenUtf8Attribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) - def testIntAttribute(self): @@ -183,7 +184,6 @@ def testIntAttribute(self): self.assertEqual(item["value"], [2, 3, 5, 7, 11]) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SIMPLE") self.assertEqual(item_shape["dims"], [5,]) @@ -257,7 +257,6 @@ def testCommittedType(self): item = db.getObjectById(ctype_id) now = 
int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_type = item["type"] @@ -294,7 +293,6 @@ def testCommittedCompoundType(self): item = db.getObjectById(ctype_id) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_type = item["type"] diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 2c2812dc..8931dd9c 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -14,6 +14,7 @@ import logging import numpy as np from h5json import Hdf5db +from h5json import selections from h5json.objid import isRootObjId, isValidUuid, isSchema2Id from h5json.hdf5dtype import special_dtype, Reference @@ -43,7 +44,6 @@ def __init__(self, *args, **kwargs): def testGroup(self): - with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") self.assertTrue(isSchema2Id(root_id)) @@ -120,15 +120,11 @@ def testGroup(self): except KeyError: pass # expected - try: - db.getLink(g2_id, "not_a_link") - self.assertTrue(False) - except KeyError: - pass # expected + ret = db.getLink(g2_id, "not_a_link") + self.assertTrue(ret is None) def testNullSpaceAttribute(self): - with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) @@ -138,7 +134,6 @@ def testNullSpaceAttribute(self): self.assertTrue("class" in shape_item) self.assertEqual(shape_item["class"], "H5S_NULL") self.assertTrue(item["created"] > time.time() - 1.0) - self.assertEqual(item["modified"], None) value = db.getAttributeValue(root_id, "A1") self.assertEqual(value, None) @@ -159,7 +154,6 @@ def testScalarAttribute(self): self.assertEqual(item["value"], 42) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) shape = item["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") @@ -183,8 +177,8 @@ def testFixedStringAttribute(self): 
self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) ret_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(ret_value, value.encode("ascii")) def testVlenAsciiAttribute(self): @@ -208,7 +202,6 @@ def testVlenAsciiAttribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) def testVlenUtf8Attribute(self): with Hdf5db(app_logger=self.log) as db: @@ -231,8 +224,6 @@ def testVlenUtf8Attribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) - def testIntAttribute(self): @@ -244,7 +235,6 @@ def testIntAttribute(self): self.assertEqual(item["value"], [2, 3, 5, 7, 11]) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SIMPLE") self.assertEqual(item_shape["dims"], [5,]) @@ -318,7 +308,6 @@ def testCommittedType(self): item = db.getObjectById(ctype_id) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_type = item["type"] @@ -337,7 +326,6 @@ def testCommittedType(self): self.assertEqual(attr_type["length"], 15) self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") - def testCommittedCompoundType(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") @@ -355,7 +343,6 @@ def testCommittedCompoundType(self): item = db.getObjectById(ctype_id) now = int(time.time()) self.assertTrue(item["created"] > now - 1) - self.assertEqual(item["modified"], None) item_type = item["type"] @@ -376,6 +363,59 @@ def testCommittedCompoundType(self): value = db.getAttributeValue(root_id, "A1") self.assertTrue(isinstance(value, 
np.ndarray)) + + def testSimpleDataset(self): + with Hdf5db(app_logger=self.log) as db: + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + self.assertEqual(arr.min(), 0) + self.assertEqual(arr.max(), 0) + row = np.zeros((ncols,), dtype=dtype) + for i in range(nrows): + row[:] = list(range(i*10, (i + 1)*10)) + row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) + db.setDatasetValues(dset_id, row_sel, row) + arr = db.getDatasetValues(dset_id, sel_all) + for i in range(nrows): + row = np.array(list(range(i*10, (i + 1)*10)), dtype=dtype) + np.testing.assert_array_equal(arr[i, :], row) + + + def testScalarDataset(self): + dtype = np.int32 + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset((), dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select((), ...) 
+ arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr[()], 0) + db.setDatasetValues(dset_id, sel_all, np.array(42, dtype=dtype)) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr.min(), 42) + self.assertEqual(arr.max(), 42) + + + + + + + if __name__ == "__main__": From c0a6cc369de5268d9a2c504690c1618fe6c9c0e2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 26 Feb 2025 13:39:31 -0800 Subject: [PATCH 012/129] update h5tojson script --- src/h5json/dset_util.py | 42 ------ src/h5json/h5tojson/h5tojson.py | 215 ++--------------------------- src/h5json/hdf5db.py | 120 ++++++++-------- src/h5json/reader/h5py_reader.py | 161 ++++++++++++++++++++- src/h5json/writer/h5json_writer.py | 42 ++++-- 5 files changed, 252 insertions(+), 328 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 7a3a7aa3..c89f141f 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -58,48 +58,6 @@ _H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") """ -def make_new_dset( - shape=None, - dtype=None, - chunks=None, - compression=None, - shuffle=None, - maxshape=None, - compression_opts=None, - fillvalue=None, - cpl=None - ): - - type_json = getTypeItem(dtype) - if shape == "H5S_NULL": - shape_json = {"class": "H5S_NULL"} - else: - shape_json = {"class": "H5S_SIMPLE"} - shape_json["dims"] = list(shape) - - if maxshape: - shape_json["maxshape"] = maxshape - if cpl is None: - cpl = {} - if chunks: - cpl["chunks"] = chunks - if compression: - cpl["compression"] = compression - if shuffle: - cpl["shuffle"] = shuffle - if compression_opts: - cpl["compression_opts"] = compression_opts - if fillvalue: - cpl["fillvalue"] = fillvalue - - - # TBD - other properties - dset_json = {"shape": shape_json, "type": type_json, "cpl": cpl, "attributes": {}} - dset_json["created"] = time.time() - 
dset_json["modified"] = None - - return dset_json - def resize_dataset(dset_json, shape): shape_json = dset_json["shape"] shape_class = shape_json["class"] diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index 89a65bdd..44a7a88c 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -10,216 +10,29 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import sys -import json import argparse import os.path as op -import tempfile import logging import logging.handlers -from h5json import Hdf5db -from h5json import hdf5dtype - - -class DumpJson: - """ - DumpJson - return json representation of all objects within the given file - """ - - def __init__(self, db, app_logger=None, options=None): - self.options = options - self.db = db - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - self.json = {} - - def dumpAttribute(self, col_name, uuid, attr_name): - self.log.info("dumpAttribute: [" + attr_name + "]") - item = self.db.getAttributeItem(col_name, uuid, attr_name) - response = {"name": attr_name} - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - response["shape"] = item["shape"] - if not self.options.D: - if "value" not in item: - self.log.warning("no value key in attribute: " + attr_name) - else: - response["value"] = item[ - "value" - ] # dump values unless header -D was passed - return response - - def dumpAttributes(self, col_name, uuid): - attr_list = self.db.getAttributeItems(col_name, uuid) - self.log.info("dumpAttributes: " + uuid) - items = [] - for attr in attr_list: - item = self.dumpAttribute(col_name, uuid, attr["name"]) - items.append(item) - - return items - - def dumpLink(self, uuid, name): - item = self.db.getLinkItemByUuid(uuid, name) - for key in ("ctime", "mtime", "href"): - if key in item: - del item[key] - return item - - def dumpLinks(self, uuid): - 
link_list = self.db.getLinkItems(uuid) - items = [] - for link in link_list: - item = self.dumpLink(uuid, link["title"]) - items.append(item) - return items - - def dumpGroup(self, uuid): - item = self.db.getGroupItemByUuid(uuid) - if "alias" in item: - alias = item["alias"] - if alias: - self.log.info("dumpGroup alias: [" + alias[0] + "]") - for key in ("ctime", "mtime", "linkCount", "attributeCount", "id"): - if key in item: - del item[key] - attributes = self.dumpAttributes("groups", uuid) - if attributes: - item["attributes"] = attributes - links = self.dumpLinks(uuid) - if links: - item["links"] = links - return item - - def dumpGroups(self): - groups = {} - item = self.dumpGroup(self.root_uuid) - groups[self.root_uuid] = item - uuids = self.db.getCollection("groups") - for uuid in uuids: - item = self.dumpGroup(uuid) - groups[uuid] = item - - self.json["groups"] = groups - - def dumpDataset(self, uuid): - response = {} - self.log.info("dumpDataset: " + uuid) - item = self.db.getDatasetItemByUuid(uuid) - if "alias" in item: - alias = item["alias"] - if alias: - self.log.info("dumpDataset alias: [" + alias[0] + "]") - response["alias"] = item["alias"] - - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - shapeItem = item["shape"] - shape_rsp = {} - num_elements = 1 - shape_rsp["class"] = shapeItem["class"] - if "dims" in shapeItem: - shape_rsp["dims"] = shapeItem["dims"] - for dim in shapeItem["dims"]: - num_elements *= dim - if "maxdims" in shapeItem: - maxdims = [] - for dim in shapeItem["maxdims"]: - if dim == 0: - maxdims.append("H5S_UNLIMITED") - else: - maxdims.append(dim) - shape_rsp["maxdims"] = maxdims - response["shape"] = shape_rsp - - if "creationProperties" in item: - response["creationProperties"] = item["creationProperties"] - - attributes = self.dumpAttributes("datasets", uuid) - if attributes: - response["attributes"] = attributes - - if not (self.options.D or self.options.d): - if num_elements > 0: - value = 
self.db.getDatasetValuesByUuid(uuid) - response["value"] = value # dump values unless header flag was passed - else: - response["value"] = [] # empty list - return response - - def dumpDatasets(self): - uuids = self.db.getCollection("datasets") - if uuids: - datasets = {} - for uuid in uuids: - item = self.dumpDataset(uuid) - datasets[uuid] = item - - self.json["datasets"] = datasets - - def dumpDatatype(self, uuid): - response = {} - item = self.db.getCommittedTypeItemByUuid(uuid) - response["alias"] = item["alias"] - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - attributes = self.dumpAttributes("datatypes", uuid) - if attributes: - response["attributes"] = attributes - return response - - def dumpDatatypes(self): - uuids = self.db.getCollection("datatypes") - if uuids: - datatypes = {} - for uuid in uuids: - item = self.dumpDatatype(uuid) - datatypes[uuid] = item - - self.json["datatypes"] = datatypes - - def dumpFile(self): - - self.root_uuid = self.db.getUUIDByPath("/") - - db_version_info = self.db.getVersionInfo() - - self.json["apiVersion"] = db_version_info["hdf5-json-version"] - self.json["root"] = self.root_uuid - - self.dumpGroups() - - self.dumpDatasets() - - self.dumpDatatypes() - - print(json.dumps(self.json, sort_keys=True, indent=4)) - - -def getTempFileName(): - """ - Generate a temporary filename to avoid problems with trying to create a dbfile - in a read-only directory. 
(See: https://github.com/HDFGroup/h5serv/issues/37) - """ - f = tempfile.NamedTemporaryFile(delete=False) - f.close() - return f.name +from h5json import Hdf5db +from h5json.writer.h5json_writer import H5JsonWriter +from h5json.reader.h5py_reader import H5pyReader + def main(): parser = argparse.ArgumentParser(usage="%(prog)s [-h] [-D|-d] ") - parser.add_argument("-D", action="store_true", help="surpress all data output") + parser.add_argument("-D", action="store_true", help="suppress all data output") parser.add_argument( "-d", action="store_true", - help="surpress data output for" + " datasets (but not attribute values)", + help="suppress data output for" + " datasets (but not attribute values)", ) parser.add_argument("filename", nargs="+", help="HDF5 to be converted to json") args = parser.parse_args() # create logger - log = logging.getLogger("h5serv") + log = logging.getLogger("h5tojson") # log.setLevel(logging.WARN) log.setLevel(logging.INFO) # add log handler @@ -230,16 +43,14 @@ def main(): filename = args.filename[0] if not op.isfile(filename): - sys.exit("Cannot find file: %s" % filename) - - log.info("h5tojson " + filename) + sys.exit(f"Cannot find file: {filename}") - dbFilename = getTempFileName() - log.info("Using dbFile: " + dbFilename) - with Hdf5db(filename, dbFilePath=dbFilename, readonly=True, app_logger=log) as db: - dumper = DumpJson(db, app_logger=log, options=args) - dumper.dumpFile() + log.info(f"h5tojson {filename}") + kwargs = {"app_logger": log} + + with Hdf5db(h5_reader=H5pyReader(filename, **kwargs), h5_writer=H5JsonWriter("/tmp/foo.json", no_data=False, **kwargs), **kwargs) as db: + pass if __name__ == "__main__": main() diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 991e7561..714059a6 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -14,7 +14,7 @@ import logging from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype from .array_util import jsonToArray, bytesArrayToList -from 
.dset_util import make_new_dset, resize_dataset +from .dset_util import resize_dataset from .objid import createObjId, getCollectionForId from . import selections from .apiversion import _apiver @@ -242,31 +242,6 @@ def getDtype(self, obj_id): return dtype - def createCommittedType(self, datatype, cpl=None): - """ - createCommittedType - creates new named datatype - Returns item - """ - self.log.info("createCommittedType") - if cpl is None: - cpl = {} - - ctype_id = createObjId(obj_type="datatypes", root_id=self.root_id) - if isinstance(datatype, np.dtype): - dt = datatype - else: - dt = createDataType(datatype) - - type_json = getTypeItem(dt) # get canonical json description of datatype - - ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} - ctype_json["created"] = time.time() - ctype_json["modified"] = None - self.db[ctype_id] = ctype_json - self._new_objects.add(ctype_id) - return ctype_id - - def getAttribute(self, obj_id, name, includeData=True): """ Get attribute given an object id and name @@ -519,45 +494,6 @@ def setDatasetValues(self, dset_id, sel, arr): updates.append((sel, arr.copy())) self.make_dirty(dset_id) - def createDataset( - self, - shape=None, - dtype=None, - chunks=None, - compression=None, - shuffle=None, - maxshape=None, - compression_opts=None, - fillvalue=None, - cpl=None, - ): - """ - createDataset - creates new dataset given shape and datatype - Returns obj_id - """ - - kwds = {} - if chunks: - kwds["chunks"] = chunks - if compression: - kwds["compression"] = compression - if shuffle: - kwds["shuffle"] = shuffle - if compression_opts: - kwds["compression_opts"] = compression_opts - if maxshape: - kwds["maxshape"] = maxshape - if fillvalue: - kwds["fillvalue"] = fillvalue - if cpl: - kwds["cpl"] = cpl - dset_json = make_new_dset(shape=shape, dtype=dtype, **kwds) - - dset_id = createObjId("datasets", root_id=self.root_id) - self.db[dset_id] = dset_json - self._new_objects.add(dset_id) - return dset_id - def resizeDataset(self, 
dset_id, shape): """ @@ -668,7 +604,59 @@ def createGroup(self, cpl=None): self.db[grp_id] = group_json self._new_objects.add(grp_id) return grp_id - + + + def createCommittedType(self, datatype, cpl=None): + """ + createCommittedType - creates new named datatype + Returns item + """ + self.log.info("createCommittedType") + if cpl is None: + cpl = {} + + ctype_id = createObjId(obj_type="datatypes", root_id=self.root_id) + if isinstance(datatype, np.dtype): + dt = datatype + else: + dt = createDataType(datatype) + + type_json = getTypeItem(dt) # get canonical json description of datatype + + ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} + ctype_json["created"] = time.time() + self.db[ctype_id] = ctype_json + self._new_objects.add(ctype_id) + return ctype_id + + + def createDataset( + self, + shape=None, + dtype=None, + cpl=None, + ): + """ + createDataset - creates new dataset given shape and datatype + Returns obj_id + """ + type_json = getTypeItem(dtype) + if shape == "H5S_NULL": + shape_json = {"class": "H5S_NULL"} + else: + shape_json = {"class": "H5S_SIMPLE"} + shape_json["dims"] = list(shape) + + dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} + if cpl: + dset_json["cpl"] = cpl + else: + dset_json["cpl"] = {} + + dset_id = createObjId("datasets", root_id=self.root_id) + self.db[dset_id] = dset_json + self._new_objects.add(dset_id) + return dset_id def getCollection(self, col_type=None): obj_ids = [] diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index 4e7c9b55..040d0ae4 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -11,12 +11,56 @@ ############################################################################## import h5py import numpy as np +import logging from ..objid import createObjId from ..hdf5dtype import getTypeItem from ..array_util import bytesArrayToList +from .. 
import selections from ..h5reader import H5Reader +_HDF_FILTERS = { + 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, + 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, + 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, + 4: { + "class": "H5Z_FILTER_SZIP", + "alias": "szip", + "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], + }, + 5: {"class": "H5Z_FILTER_NBIT"}, + 6: { + "class": "H5Z_FILTER_SCALEOFFSET", + "alias": "scaleoffset", + "options": ["scaleType", "scaleOffset"], + }, + 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, +} + +_HDF_FILTER_OPTION_ENUMS = { + "coding": { + h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", + h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", + }, + "scaleType": { + h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", + h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", + h5py.h5z.SO_INT: "H5Z_SO_INT", + }, +} + +# h5py supported filters +_H5PY_FILTERS = { + "gzip": 1, + "shuffle": 2, + "fletcher32": 3, + "szip": 4, + "scaleoffset": 6, + "lzf": 32000, +} + +_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") + class H5pyReader(H5Reader): """ @@ -196,6 +240,97 @@ def _getDatatype(self, ctype, include_attrs=True): return item + + def _getHDF5DatasetCreationProperties(self, dset, type_class): + """ Get dataset creation properties maintained by HDF5 library """ + + # + # Fill in creation properties + # + creationProps = {} + plist = h5py.h5d.DatasetID.get_create_plist(dset.id) + + # alloc time + nAllocTime = plist.get_alloc_time() + if nAllocTime == h5py.h5d.ALLOC_TIME_DEFAULT: + creationProps["allocTime"] = "H5D_ALLOC_TIME_DEFAULT" + elif nAllocTime == h5py.h5d.ALLOC_TIME_LATE: + creationProps["allocTime"] = "H5D_ALLOC_TIME_LATE" + elif nAllocTime == h5py.h5d.ALLOC_TIME_EARLY: + creationProps["allocTime"] = "H5D_ALLOC_TIME_EARLY" + elif nAllocTime == h5py.h5d.ALLOC_TIME_INCR: + creationProps["allocTime"] = "H5D_ALLOC_TIME_INCR" + else: + 
self.log.warning(f"Unknown alloc time value: {nAllocTime}") + + # fill time + nFillTime = plist.get_fill_time() + if nFillTime == h5py.h5d.FILL_TIME_ALLOC: + creationProps["fillTime"] = "H5D_FILL_TIME_ALLOC" + elif nFillTime == h5py.h5d.FILL_TIME_NEVER: + creationProps["fillTime"] = "H5D_FILL_TIME_NEVER" + elif nFillTime == h5py.h5d.FILL_TIME_IFSET: + creationProps["fillTime"] = "H5D_FILL_TIME_IFSET" + else: + self.log.warning(f"unknown fill time value: {nFillTime}") + + if type_class == "H5T_OPAQUE": + # TBD: store opaque fill value as a hex string + self.log.warning("Opaque fill value not supported") + else: + if plist.fill_value_defined() == h5py.h5d.FILL_VALUE_USER_DEFINED: + creationProps["fillValue"] = bytesArrayToList(dset.fillvalue) + + # layout + nLayout = plist.get_layout() + if nLayout == h5py.h5d.COMPACT: + creationProps["layout"] = {"class": "H5D_COMPACT"} + elif nLayout == h5py.h5d.CONTIGUOUS: + creationProps["layout"] = {"class": "H5D_CONTIGUOUS"} + elif nLayout == h5py.h5d.CHUNKED: + creationProps["layout"] = {"class": "H5D_CHUNKED", "dims": dset.chunks} + else: + self.log.warning(f"Unknown layout value: {nLayout}") + + num_filters = plist.get_nfilters() + filter_props = [] + if num_filters: + for n in range(num_filters): + filter_info = plist.get_filter(n) + opt_values = filter_info[2] + filter_prop = {} + filter_id = filter_info[0] + filter_prop["id"] = filter_id + if filter_info[3]: + filter_prop["name"] = self.bytesArrayToList(filter_info[3]) + if filter_id in _HDF_FILTERS: + hdf_filter = _HDF_FILTERS[filter_id] + filter_prop["class"] = hdf_filter["class"] + if "options" in hdf_filter: + filter_opts = hdf_filter["options"] + for i in range(len(filter_opts)): + if len(opt_values) <= i: + break # end of option values + opt_value = opt_values[i] + opt_value_enum = None + option_name = filter_opts[i] + if option_name in _HDF_FILTER_OPTION_ENUMS: + option_enums = _HDF_FILTER_OPTION_ENUMS[option_name] + if opt_value in option_enums: + opt_value_enum = 
option_enums[opt_value] + if opt_value_enum: + filter_prop[option_name] = opt_value_enum + else: + filter_prop[option_name] = opt_value + else: + # custom filter + filter_prop["class"] = "H5Z_FILTER_USER" + if opt_values: + filter_prop["parameters"] = opt_values + filter_props.append(filter_prop) + creationProps["filters"] = filter_props + + return creationProps def _getDataset(self, dset): self.log.info(f"getDataset alias: [{dset.name}]") @@ -207,7 +342,7 @@ def _getDataset(self, dset): type_uuid = None addr = h5py.h5o.get_info(typeid).addr type_uuid = self.getObjIdByAddress(addr) - committedType = self.getObjectByid(type_uuid) + committedType = self.getObjectById(type_uuid) typeItem = committedType["type"] typeItem["id"] = type_uuid else: @@ -237,7 +372,10 @@ def _getDataset(self, dset): if include_maxdims: shapeItem["maxdims"] = maxshape item["shape"] = shapeItem - + + item["cpl"] = self._getHDF5DatasetCreationProperties(dset, typeItem["class"]) + + return item def getObjectById(self, obj_id, include_attrs=True, include_links=True): @@ -261,7 +399,7 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): return obj_json - def getDatasetValues(self, dset_id, selection): + def getDatasetValues(self, dset_id, sel): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same @@ -272,7 +410,22 @@ def getDatasetValues(self, dset_id, selection): if dset.shape is None: # TBD: return something like h5py.Empty in this case? return None - arr = dset[selection] + if sel.select_type == selections.H5S_SELECT_ALL: + arr = dset[...] 
+ elif sel.select_type == selections.H5S_SELECT_HYPERSLABS: + rank = len(dset.shape) + + slices = [] + for dim in range(rank): + start = sel.start[dim] + stop = start + sel.count[dim] + step = sel.step[dim] + slices.append(slice(start, stop, step)) + slices = tuple(slices) + arr = dset[slices] + else: + raise TypeError("selection type not supported") + return arr diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index fb2c8a73..4ca75cbe 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -47,12 +47,28 @@ def close(self): """ close storage handle """ self.dumpFile() + def getAliasList(self, obj_id): + """ return list of alias """ + if obj_id not in self.alias_db: + self.alias_db[obj_id] = [] + return self.alias_db[obj_id] + + + def updateAliasList(self): + """ update the alias list for each object """ + # clear exiting aliases + obj_ids = self.db.getCollection() + for obj_id in obj_ids: + self.alias_db[obj_id] = [] + + self._setAlias(self._root_uuid, set(), "/") + def _setAlias(self, obj_id, id_set, h5path): """ add the given h5path to the object's alias list If the object is a group, recurse through each hard link """ obj_json = self.db.getObjectById(obj_id) - alias_list = self.alias_db[obj_id] + alias_list = self.getAliasList(obj_id) if h5path in alias_list: return # nothing to do alias_list.append(h5path) @@ -73,15 +89,6 @@ def _setAlias(self, obj_id, id_set, h5path): self._setAlias(tgt_id, id_set, h5path+link_name) id_set.remove(obj_id) - def getAliasList(self): - """ update the alias list for each object """ - # clear exiting aliases - obj_ids = self.db.getCollection() - for obj_id in obj_ids: - self.alias_db[obj_id] = [] - - self._setAlias(self._root_uuid, set(), "/") - def dumpAttribute(self, obj_id, attr_name): self.log.info(f"dumpAttribute: [{attr_name}]") @@ -133,7 +140,8 @@ def dumpLinks(self, obj_id): def dumpGroup(self, obj_id): item = self.db.getObjectById(obj_id) response = {} 
- alias = self.alias_db[obj_id] + + alias = self.getAliasList(obj_id) response["alias"] = alias if "cpl" in item: @@ -220,7 +228,8 @@ def dumpDatasets(self): datasets = {} for obj_id in obj_ids: item = self.dumpDataset(obj_id) - datasets[obj_id] = item + obj_uuid = stripId(obj_id) + datasets[obj_uuid] = item self.json["datasets"] = datasets @@ -242,7 +251,8 @@ def dumpDatatypes(self): datatypes = {} for obj_id in obj_ids: item = self.dumpDatatype(obj_id) - datatypes[obj_id] = item + obj_uuid = stripId(obj_id) + datatypes[obj_uuid] = item self.json["datatypes"] = datatypes @@ -254,13 +264,17 @@ def dumpFile(self): self.json["apiVersion"] = db_version_info["hdf5-json-version"] self.json["root"] = stripId(self._root_uuid) - self.getAliasList() # create alias_db with obj_id to alias list dict + + self.updateAliasList() # create alias_db with obj_id to alias list dict + self.dumpGroups() self.dumpDatasets() self.dumpDatatypes() + + print(json.dumps(self.json, sort_keys=True, indent=4)) From 48d43e4968b1df20ffde35e1d165b84669312753 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 26 Feb 2025 18:19:58 -0800 Subject: [PATCH 013/129] added h5json read --- src/h5json/reader/h5json_reader.py | 186 +++++++++++++++++++++++++++++ src/h5json/reader/h5py_reader.py | 19 +-- src/h5json/reader/h5reader.py | 2 +- src/h5json/selections.py | 12 ++ src/h5json/writer/h5json_writer.py | 2 - test/unit/h5json_reader_test.py | 121 +++++++++++++++++++ test/unit/h5py_reader_test.py | 9 +- 7 files changed, 327 insertions(+), 24 deletions(-) create mode 100644 src/h5json/reader/h5json_reader.py create mode 100644 test/unit/h5json_reader_test.py diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py new file mode 100644 index 00000000..44d178a5 --- /dev/null +++ b/src/h5json/reader/h5json_reader.py @@ -0,0 +1,186 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. 
# +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import json +import logging + +from ..objid import getCollectionForId, stripId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray +from .. import selections +from ..h5reader import H5Reader + + +class H5JsonReader(H5Reader): + """ + This class can be used by HDF5DB to read content from an hdf5-json file + """ + + + def __init__( + self, + filepath, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + super().__init__(filepath, app_logger=app_logger) + + with open(filepath) as f: + text = f.read() + + # parse the json file + h5json = json.loads(text) + + self._h5json = h5json + + if "root" not in h5json: + raise Exception("no root key in input file") + self._root_id = "g-" + h5json["root"] + + def close(self): + pass + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + collection = getCollectionForId(obj_id) + if collection not in self._h5json: + self.log.warning(f"getObjectBId - collection: {collection} not found") + return None + json_objs = self._h5json[collection] + obj_uuid = stripId(obj_id) + if obj_uuid not in json_objs: + self.log.warning(f"getObjectById - {obj_id} not found") + return None + json_obj = json_objs[obj_uuid] + + resp = {} + # selectively copy from the db dict + for k in json_obj: + for k in ("shape", "type", "cpl", "dcpl"): + if k in 
json_obj: + resp[k] = json_obj[k] + if include_attrs and "attributes" in json_obj: + attrs = {} + attr_list = json_obj["attributes"] + for item in attr_list: + if "name" not in item: + self.log.warning(f"expected to find name key for {obj_id} attributes") + continue + name = item["name"] + attr = {} + for k in ("type", "shape", "value"): + attr[k] = item[k] + attrs[name] = attr + resp["attributes"] = attrs + + if include_links and "links" in json_obj: + links = {} + link_list = json_obj["links"] + for item in link_list: + if "title" not in item: + self.log.warning(f"expected to find title key for {obj_id} links") + continue + title = item["title"] + link = {} + for k in ("class", "file", "h5path"): + if k in item: + link[k] = item[k] + if "collection" in item: + collection = item["collection"] + if "id" not in item: + self.log.warning(f"expected to find id key for {obj_id} link item") + continue + obj_uuid = item["id"] + if collection == "groups": + obj_id = "g-" + obj_uuid + elif collection == "datasets": + obj_id = "d-" + obj_uuid + elif collection == "datatypes": + obj_id = "t-" + obj_uuid + else: + self.log.warning(f"unexpected collection type: {collection}") + continue + item["id"] = obj_id + links[title] = item + resp["links"] = links + + return resp + + + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + self.log.debug(f"getAttribute({obj_id}), [{name}], include_data={includeData})") + json_obj = self.getObjectById(obj_id) + if json_obj is None: + return None + if "attributes" not in json_obj: + self.log.warning(f"obj: {obj_id} has no attributes collection") + return None + attributes = json_obj["attributes"] + if name not in attributes: + self.log.info(f"attr: [{name}] of {obj_id} not found") + return None + return attributes[name] + + + def getDatasetValues(self, obj_id, sel=None): + """ + Get values from dataset identified by obj_id. 
+ If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + self.log.debug(f"getDatasetValues({obj_id}), sel={sel}") + json_obj = self.getObjectById(obj_id) + if json_obj is None: + return None + if "value" not in json_obj: + self.log.warning("value key not found for {obj_id}") + return None + json_value = json_obj["value"] + shape_json = json_obj["shape"] + if shape_json["class"] == "H5S_NULL": + self.log.warning("getDatasetValues called for null space object: {obj_id}") + return None + elif shape_json["class"] == "H5S_SCALAR": + dims = () + else: + dims = shape_json["dims"] + + type_item = json_obj["type"] + dtype = createDataType(type_item) + arr = jsonToArray(dims, dtype, json_value) + if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + pass # just return the entire array + elif isinstance(sel, selections.SimpleSelection): + arr = arr[sel.slices] + else: + raise NotImplementedError("selection type not supported") + + return arr + + + + + + + diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index 040d0ae4..6d06e6c1 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -399,7 +399,7 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): return obj_json - def getDatasetValues(self, dset_id, sel): + def getDatasetValues(self, dset_id, sel=None): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same @@ -410,21 +410,12 @@ def getDatasetValues(self, dset_id, sel): if dset.shape is None: # TBD: return something like h5py.Empty in this case? return None - if sel.select_type == selections.H5S_SELECT_ALL: + if sel is None or sel.select_type == selections.H5S_SELECT_ALL: arr = dset[...] 
- elif sel.select_type == selections.H5S_SELECT_HYPERSLABS: - rank = len(dset.shape) - - slices = [] - for dim in range(rank): - start = sel.start[dim] - stop = start + sel.count[dim] - step = sel.step[dim] - slices.append(slice(start, stop, step)) - slices = tuple(slices) - arr = dset[slices] + elif isinstance(sel, selections.SimpleSelection): + arr = dset[sel.slices] else: - raise TypeError("selection type not supported") + raise NotImplementedError("selection type not supported") return arr diff --git a/src/h5json/reader/h5reader.py b/src/h5json/reader/h5reader.py index 69a45d07..3923bb15 100644 --- a/src/h5json/reader/h5reader.py +++ b/src/h5json/reader/h5reader.py @@ -51,7 +51,7 @@ def getAttribute(self, obj_id, name, includeData=True): pass @abstractmethod - def getDatasetValues(self, obj_id, selection): + def getDatasetValues(self, obj_id, sel=None): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 4d700d94..ef296d70 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -468,6 +468,18 @@ def broadcast(self, target_shape): sid.offset_simple(offset) yield sid + @property + def slices(self): + """ return tuple of slices for this selection """ + rank = len(self.shape) + slices = [] + for dim in range(rank): + start = self.start[dim] + stop = start + self.count[dim] + step = self.step[dim] + slices.append(slice(start, stop, step)) + return tuple(slices) + def __repr__(self): s = f"SimpleSelection(shape:{self._shape}, start: {self._sel[0]}," s += f" count: {self._sel[1]}, step: {self._sel[2]}" diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 4ca75cbe..85dd8e38 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -273,8 +273,6 @@ def dumpFile(self): self.dumpDatatypes() - - print(json.dumps(self.json, sort_keys=True, indent=4)) diff 
--git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py new file mode 100644 index 00000000..effa0e58 --- /dev/null +++ b/test/unit/h5json_reader_test.py @@ -0,0 +1,121 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import os +import os.path as op +import stat +import logging +import shutil +from h5json import Hdf5db +from h5json.reader.h5json_reader import H5JsonReader + + +def getFile(name, tgt, ro=False): + src = "data/json/" + name + logging.info("copying file to this directory: " + src) + + filepath = "./out/" + tgt + + if op.isfile(filepath): + # make sure it's writable, before we copy over it + os.chmod(filepath, stat.S_IWRITE | stat.S_IREAD) + shutil.copyfile(src, filepath) + if ro: + logging.info("make read-only") + os.chmod(filepath, stat.S_IREAD) + return filepath + + +def removeFile(name): + try: + os.stat(name) + except OSError: + return + # file does not exist + os.remove(name) + + +class H5pyReaderTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5pyReaderTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.INFO) + handler = logging.FileHandler("./h5json_reader_test.log") + # add handler to logger + 
self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = getFile("tall.json", "tall.json", ro=True) + kwargs = {"app_logger": self.log} + with Hdf5db(h5_reader=H5JsonReader(filepath, **kwargs), **kwargs) as db: + root_id = db.getObjectIdByPath("/") + root_json = db.getObjectById(root_id) + print("root_json:", root_json) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10,10]) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + +if 
__name__ == "__main__": + # setup test files + + unittest.main() + + + + diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index 420909ca..c612adc6 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -11,14 +11,13 @@ ############################################################################## import unittest import os -import time -import errno + import os.path as op import stat import logging import shutil from h5json import Hdf5db -from h5json.h5py_reader import H5pyReader +from h5json.reader.h5py_reader import H5pyReader def getFile(name, tgt, ro=False): @@ -111,10 +110,6 @@ def testSimple(self): db.close() - - - - if __name__ == "__main__": # setup test files From 06b5a6fe0e5ee1b390cdb13f584e136f2d012e88 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 27 Feb 2025 00:21:42 -0800 Subject: [PATCH 014/129] added h5py writer --- src/h5json/h5tojson/h5tojson.py | 26 +-- src/h5json/hdf5db.py | 2 +- src/h5json/jsontoh5/jsontoh5.py | 277 +++---------------------- src/h5json/reader/h5py_reader.py | 8 +- src/h5json/writer/h5json_writer.py | 3 +- src/h5json/writer/h5py_writer.py | 186 +++++++++++++++++ src/h5json/writer/h5writer.py | 2 + test/unit/h5py_writer_test.py | 321 +++++++++++++++++++++++++++++ 8 files changed, 559 insertions(+), 266 deletions(-) create mode 100644 src/h5json/writer/h5py_writer.py create mode 100644 test/unit/h5py_writer_test.py diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index 44a7a88c..48a4b83b 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -10,7 +10,6 @@ # request a copy from help@hdfgroup.org. 
# ############################################################################## import sys -import argparse import os.path as op import logging import logging.handlers @@ -21,16 +20,18 @@ def main(): - parser = argparse.ArgumentParser(usage="%(prog)s [-h] [-D|-d] ") - parser.add_argument("-D", action="store_true", help="suppress all data output") - parser.add_argument( - "-d", - action="store_true", - help="suppress data output for" + " datasets (but not attribute values)", - ) - parser.add_argument("filename", nargs="+", help="HDF5 to be converted to json") - args = parser.parse_args() - + if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"): + print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + sys.exit(0) + + no_data = False + filename = None + for i in range(1, len(sys.argv)): + if sys.argv[i] == "--nodata": + no_data = True + else: + filename = sys.argv[i] + # create logger log = logging.getLogger("h5tojson") # log.setLevel(logging.WARN) @@ -41,7 +42,6 @@ def main(): # add handler to logger log.addHandler(handler) - filename = args.filename[0] if not op.isfile(filename): sys.exit(f"Cannot find file: {filename}") @@ -49,7 +49,7 @@ def main(): kwargs = {"app_logger": log} - with Hdf5db(h5_reader=H5pyReader(filename, **kwargs), h5_writer=H5JsonWriter("/tmp/foo.json", no_data=False, **kwargs), **kwargs) as db: + with Hdf5db(h5_reader=H5pyReader(filename, **kwargs), h5_writer=H5JsonWriter(None, no_data=no_data, **kwargs), **kwargs) as db: pass if __name__ == "__main__": diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 714059a6..e1194264 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -224,7 +224,7 @@ def getObjectIdByPath(self, h5path, parent_id=None): def getObjectByPath(self, path): """ Get Object JSON at given path """ - obj_id = self.getObjectIDByPath(path) + obj_id = self.getObjectIdByPath(path) obj_json = self.getObjectById(obj_id) return obj_json diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py 
index c12d037a..bd1455e8 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -9,238 +9,35 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -import json -import argparse -import h5py +import sys +import os.path as op import logging import logging.handlers from h5json import Hdf5db +from h5json.writer.h5py_writer import H5pyWriter +from h5json.reader.h5json_reader import H5JsonReader - -""" -Writeh5 - return json representation of all objects within the given file - h5writer = Writeh5(db, h5json) - h5writer.writeFile() -""" - - -class Writeh5: - def __init__(self, db, json, options=None): - self.options = options - self.db = db - self.json = json - self.root_uuid = None - - # - # Create a hard, soft, or external link - # - def createLink(self, link_obj, parent_uuid): - title = link_obj["title"] - link_class = link_obj["class"] - if link_class == "H5L_TYPE_HARD": - child_uuid = link_obj["id"] - self.db.linkObject(parent_uuid, child_uuid, title) - elif link_class == "H5L_TYPE_SOFT": - h5path = link_obj["h5path"] - self.db.createSoftLink(parent_uuid, h5path, title) - elif link_class == "H5L_TYPE_EXTERNAL": - h5path = link_obj["h5path"] - link_file = link_obj["file"] - self.db.createExternalLink(parent_uuid, link_file, h5path, title) - else: - print("Unable to create link with class:", link_class) - - # - # Create HDF5 dataset object and write data values - # - def createDataset(self, uuid, body): - datatype = body["type"] - if isinstance(datatype, str) and datatype.startswith("datatypes/"): - # committed datatype, just pass in the UUID part - datatype = datatype[len("datatypes/") :] - dims = () # if no space in body, default to scalar - max_shape = None - creation_props = None - if "creationProperties" in body: - creation_props = body["creationProperties"] - if "shape" in body: - shape = 
body["shape"] - if shape["class"] == "H5S_SIMPLE": - dims = shape["dims"] - if isinstance(dims, int): - # convert int to array - dim1 = shape - dims = [dim1] - if "maxdims" in shape: - max_shape = shape["maxdims"] - if isinstance(max_shape, int): - # convert to array - dim1 = max_shape - max_shape = [dim1] - # convert H5S_UNLIMITED's to None's - for i in range(len(max_shape)): - if max_shape[i] == "H5S_UNLIMITED": - max_shape[i] = None - elif shape["class"] == "H5S_NULL": - dims = None - - self.db.createDataset( - datatype, - dims, - max_shape=max_shape, - creation_props=creation_props, - obj_uuid=uuid, - ) - - if "value" in body: - data = body["value"] - if data: - data = self.db.toRef(len(dims), datatype, data) - self.db.setDatasetValuesByUuid(uuid, data) - - def createAttribute(self, attr_json, col_name, uuid): - attr_name = attr_json["name"] - datatype = attr_json["type"] - if isinstance(datatype, str) and datatype.startswith("datatypes/"): - # committed datatype, just pass in the UUID part - datatype = datatype[len("datatypes/") :] - - attr_value = None - if "value" in attr_json: - attr_value = attr_json["value"] - dims = None - if "shape" in attr_json: - shape = attr_json["shape"] - if shape["class"] == "H5S_SIMPLE": - dims = shape["dims"] - if isinstance(dims, int): - # convert int to array - dim1 = shape - dims = [dim1] - elif shape["class"] == "H5S_SCALAR": - dims = () # empty tuple for scalar - self.db.createAttribute(col_name, uuid, attr_name, dims, datatype, attr_value) - - # - # create committed datatype HDF5 object - # - def createDatatype(self, uuid, body): - datatype = body["type"] - self.db.createCommittedType(datatype, obj_uuid=uuid) - - # - # Create HDF5 group object (links and attributes will be added later) - # - def createGroup(self, uuid, body): - if uuid != self.root_uuid: - self.db.createGroup(obj_uuid=uuid) - - # - # Create all the HDF5 objects defined in the JSON file - # - def createObjects(self): - # create datatypes - if "datatypes" in 
self.json: - datatypes = self.json["datatypes"] - for uuid in datatypes: - json_obj = datatypes[uuid] - self.createDatatype(uuid, json_obj) - # create groups - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - json_obj = groups[uuid] - self.createGroup(uuid, json_obj) - # create datasets - if "datasets" in self.json: - datasets = self.json["datasets"] - for uuid in datasets: - json_obj = datasets[uuid] - self.createDataset(uuid, json_obj) - - # - # Create all the attributes for HDF5 objects defined in the JSON file - # Note: this needs to be done after createObjects since an attribute - # may use a committed datatype - # - def createAttributes(self): - dimension_list_attrs = [] # track dimension list attributes - # create datatype attributes - if "datatypes" in self.json: - datatypes = self.json["datatypes"] - for uuid in datatypes: - body = datatypes[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - self.createAttribute(attribute, "datatypes", uuid) - # create group attributes - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - body = groups[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - self.createAttribute(attribute, "groups", uuid) - # create datasets - if "datasets" in self.json: - datasets = self.json["datasets"] - for uuid in datasets: - body = datasets[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - if attribute["name"] == "DIMENSION_LIST": - # defer dimension list creation until after we've created all other - # attributes (otherwsie attach_scale may fail) - dimension_list_attrs.append( - {"attribute": attribute, "uuid": uuid} - ) - else: - self.createAttribute(attribute, "datasets", uuid) - - # finally, do dimension_list attributes - for item in dimension_list_attrs: - attribute = item["attribute"] - uuid = item["uuid"] - 
self.createAttribute(attribute, "datasets", uuid) - - # - # Link all the objects - # Note: this will "de-anonymous-ize" objects defined in the HDF5 file - # Any non-linked objects will be deleted when the __db__ group is deleted - # - def createLinks(self): - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - json_obj = groups[uuid] - if "links" in json_obj: - links = json_obj["links"] - for link in links: - self.createLink(link, uuid) - - def writeFile(self): - - self.root_uuid = self.json["root"] - - self.createObjects() # create datasets, groups, committed datatypes - self.createAttributes() # create attributes for objects - self.createLinks() # link it all together - + def main(): - parser = argparse.ArgumentParser(usage="%(prog)s [-h] ") - parser.add_argument( - "in_filename", nargs="+", help="JSon file to be converted to h5" - ) - parser.add_argument("out_filename", nargs="+", help="name of HDF5 output file") - args = parser.parse_args() - + if len(sys.argv) < 3 or sys.argv[1] in ("-h", "--help"): + print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + sys.exit(0) + + no_data = False + json_filename = None + hdf5_filename = None + for i in range(1, len(sys.argv)): + if sys.argv[i] == "--nodata": + no_data = True + elif not json_filename: + json_filename = sys.argv[i] + else: + hdf5_filename = sys.argv[i] + # create logger - log = logging.getLogger("h5serv") + log = logging.getLogger("h5json") # log.setLevel(logging.WARN) log.setLevel(logging.INFO) # add log handler @@ -249,34 +46,16 @@ def main(): # add handler to logger log.addHandler(handler) - text = open(args.in_filename[0]).read() - - # parse the json file - h5json = json.loads(text) - - if "root" not in h5json: - raise Exception("no root key in input file") - root_uuid = h5json["root"] - - filename = args.out_filename[0] - - # create the file, will raise IOError if there's a problem - Hdf5db.createHDF5File(filename) + if not op.isfile(json_filename): + sys.exit(f"Cannot find 
file: {json_filename}") - with Hdf5db( - filename, root_uuid=root_uuid, update_timestamps=False, app_logger=log - ) as db: - h5writer = Writeh5(db, h5json) - h5writer.writeFile() + log.info(f"jsontoh5 {json_filename} to {hdf5_filename}") - # open with h5py and remove the _db_ group - # Note: this will delete any anonymous (un-linked) objects - f = h5py.File(filename, "a") - if "__db__" in f: - del f["__db__"] - f.close() + kwargs = {"app_logger": log} + + with Hdf5db(h5_reader=H5JsonReader(json_filename, **kwargs), h5_writer=H5pyWriter(hdf5_filename, no_data=no_data, **kwargs), **kwargs) as db: + pass - print("done!") if __name__ == "__main__": diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index 6d06e6c1..57f0f3a0 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -108,6 +108,12 @@ def get_root_id(self): """ Return root id """ return self._root_id + def getObjIdByAddress(self, addr): + if addr in self._addr_map: + return self._addr_map[addr] + else: + return None + def getAttribute(self, obj_id, name, include_data=True): """ Return JSON for the given attribute """ @@ -130,7 +136,7 @@ def getAttribute(self, obj_id, name, include_data=True): type_uuid = None addr = h5py.h5o.get_info(typeid).addr type_uuid = self.getObjIdByAddress(addr) - committedType = self.getCommittedTypeItemByUuid(type_uuid) + committedType = self._id_map[type_uuid] type_item = committedType["type"].copy() type_item["id"] = type_uuid else: diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 85dd8e38..8c5ce6af 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -30,10 +30,9 @@ def __init__( no_data=False, app_logger=None ): - super().__init__(filepath, append=append, app_logger=app_logger) + super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) self.alias_db = {} self.json = {} - self._no_data = no_data self._root_uuid = 
None def flush(self): diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py new file mode 100644 index 00000000..571dc37f --- /dev/null +++ b/src/h5json/writer/h5py_writer.py @@ -0,0 +1,186 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import h5py + +from ..objid import getCollectionForId +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray + +from .h5writer import H5Writer + + + +class H5pyWriter(H5Writer): + """ + This class saves state from the Hdf5Db class into an HDF5 file. 
+ """ + + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) + + if append: + self._mode = "a" + else: + self._mode = "w" + + self._f = None + self._id_map = {} + + def _createGroup(self, parent, grp_json, name=None): + """ create the group and any links it contains """ + grp = parent.create_group(name) + if "links" in grp_json: + grp_links = grp_json["links"] + self._createLinks(grp, grp_links) + + + def _createDataset(self, parent, dset_json, name=None): + """ create a dataset object """ + + type_item = dset_json["type"] + dtype = createDataType(type_item) + kwds = {"dtype": dtype} + shape_json = dset_json["shape"] + if shape_json["class"] == "H5S_NULL": + # skip the shape keyword to create a null space dataset + pass + elif shape_json["class"] == "H5S_SCALAR": + kwds["shape"] = () + else: + kwds["shape"] = shape_json["dims"] + parent.create_dataset(name, **kwds) + + + def _createDatatype(self, parent, ctype_json, name=None): + """ create a datatype object """ + + type_item = ctype_json["type"] + dtype = createDataType(type_item) + parent[name] = dtype + + + def _createLinks(self, parent, links_json): + """ create links in the given group """ + for title in links_json: + if title in parent: + # TBD: this will do the wrong thing if the link tgt has changed + continue + link_json = links_json[title] + link_class = link_json["class"] + if link_class == "H5L_TYPE_SOFT": + h5path = link_json["h5path"] + parent[title] = h5py.SoftLink(h5path) + elif link_class == "H5L_TYPE_EXTERNAL": + h5path = link_json["h5path"] + filename = link_json["file"] + parent[title] = h5py.ExternalLink(filename, h5path) + elif link_class == "H5L_TYPE_USER_DEFINED": + self.log.warning("unable to create user-defined link: {title}") + elif link_class == "H5L_TYPE_HARD": + tgt_id = link_json["id"] + if tgt_id in self._id_map: + tgt_path = self._id_map[tgt_id] + tgt_obj = 
parent[tgt_path] + parent[title] = tgt_obj + else: + obj_json = self.db.getObjectById(tgt_id) + parent_path = parent.name + if parent_path[-1] != '/': + parent_path += '/' + self._id_map[tgt_id] = parent_path + title + collection = getCollectionForId(tgt_id) + kwds = {"name": title} + if collection == "groups": + tgt_obj = self._createGroup(parent, obj_json, **kwds) + elif collection == "datasets": + tgt_obj = self._createDataset(parent, obj_json, **kwds) + elif collection == "datatypes": + tgt_obj = self._createDatatype(parent, obj_json, **kwds) + else: + self.log.warning(f"unexpected collection: {collection}") + tgt_obj = None + if tgt_obj: + parent[title] = tgt_obj + else: + self.log.warning(f"unexpected link class: {link_class}") + + def createAttribute(self, obj, name, attr_json): + """ add the given attribute to obj """ + + dtype = createDataType(attr_json["type"]) + shape_json = attr_json["shape"] + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + dims = None + elif shape_class == "H5S_SCALAR": + dims = () + else: + dims = tuple(shape_json["dims"]) + + if dims is None: + obj.attrs[name] = h5py.Empty(dtype) + else: + json_value = attr_json["value"] + arr = jsonToArray(dims, dtype, json_value) + obj.attrs[name] = arr + + + def createAttributes(self, obj, obj_json): + """ create attributes """ + + if "attributes" not in obj_json: + # no attributes + return + + attrs = obj_json["attributes"] + for name in attrs: + attr_json = attrs[name] + self.createAttribute(obj, name, attr_json) + + + def visitAttributes(self, path, obj): + name = obj.__class__.__name__ + self.log.info(f"visit: {path} name: {name}") + + obj_json = self.db.getObjectByPath(path) + self.createAttributes(obj, obj_json) + + def flush(self): + """ Write dirty items """ + if not self.db: + # no db set yet + return + + root_id = self.db.root_id + self._id_map[root_id] = "/" + with h5py.File(self._filepath, mode=self._mode) as f: + root_json = self.db.getObjectById(root_id) + if 
"links" in root_json: + root_links = root_json["links"] + self._createLinks(f, root_links) + # update attributes + self.createAttributes(f, root_json) + f.visititems(self.visitAttributes) + self._mode = "a" # use append mode for future updates + + + def close(self): + """ close storage handle """ + self.flush() + diff --git a/src/h5json/writer/h5writer.py b/src/h5json/writer/h5writer.py index 3aa77bb9..4e57048f 100644 --- a/src/h5json/writer/h5writer.py +++ b/src/h5json/writer/h5writer.py @@ -25,10 +25,12 @@ def __init__( self, filepath, append=False, + no_data=False, app_logger=None ): self._filepath = filepath self._append = append + self._no_data = no_data self._filepath = filepath self._db_ref = None if app_logger: diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py new file mode 100644 index 00000000..38447aff --- /dev/null +++ b/test/unit/h5py_writer_test.py @@ -0,0 +1,321 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import time +import logging +import numpy as np +from h5json import Hdf5db +from h5json.writer.h5py_writer import H5pyWriter +from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId +from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections + + +class H5pyWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5pyWriterTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + # create logger + + handler = logging.FileHandler("./hdf5dbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + # self.log.propagate = False # prevent log out going to stdout + self.log.info("init!") + + + def testGroup(self): + + with Hdf5db(h5_writer=H5pyWriter("/tmp/foo2.h5", no_data=False), app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "attr1", value=[1,2,3,4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10,10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) 
+ db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() + + + + + def testNullSpaceAttribute(self): + + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + + def testScalarAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + + + def testFixedStringAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + value = "Hello, world!" 
+ db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + ret_value = db.getAttributeValue(root_id, "A1") + + + def testVlenAsciiAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" + dt = special_dtype(vlen=bytes) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + def testVlenUtf8Attribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" 
+ dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + + def testIntAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + value = [2, 3, 5, 7, 11] + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + + def testCreateReferenceAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + + dt = special_dtype(ref=Reference) + + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = item["value"] + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref) + + 
def testCreateVlenReferenceAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + + def testCommittedType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) + + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + 
self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + + + def testCommittedCompoundType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + + +if __name__ == "__main__": + # setup test files + + unittest.main() From 8fceb5f4efad7dc528d3c32dfb60b1e52a60fd3c Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 27 Feb 2025 11:14:08 -0800 Subject: [PATCH 015/129] added filters.py --- src/h5json/filters.py | 56 ++++++++++++++++ src/h5json/reader/h5py_reader.py | 55 ++-------------- src/h5json/writer/h5py_writer.py | 106 +++++++++++++++++++++++++++++-- 3 files changed, 162 insertions(+), 55 deletions(-) create mode 100644 src/h5json/filters.py diff --git a/src/h5json/filters.py b/src/h5json/filters.py new file mode 100644 index 00000000..e6511366 --- /dev/null +++ b/src/h5json/filters.py @@ -0,0 +1,56 @@ 
+############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import h5py + +_HDF_FILTERS = { + 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, + 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, + 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, + 4: { + "class": "H5Z_FILTER_SZIP", + "alias": "szip", + "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], + }, + 5: {"class": "H5Z_FILTER_NBIT"}, + 6: { + "class": "H5Z_FILTER_SCALEOFFSET", + "alias": "scaleoffset", + "options": ["scaleType", "scaleOffset"], + }, + 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, +} + +_HDF_FILTER_OPTION_ENUMS = { + "coding": { + h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", + h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", + }, + "scaleType": { + h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", + h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", + h5py.h5z.SO_INT: "H5Z_SO_INT", + }, +} + +# h5py supported filters +_H5PY_FILTERS = { + "gzip": 1, + "shuffle": 2, + "fletcher32": 3, + "szip": 4, + "scaleoffset": 6, + "lzf": 32000, +} + +_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") + diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index 57f0f3a0..2100dec6 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -17,50 +17,9 @@ from ..hdf5dtype import getTypeItem 
from ..array_util import bytesArrayToList from .. import selections -from ..h5reader import H5Reader - -_HDF_FILTERS = { - 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, - 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, - 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, - 4: { - "class": "H5Z_FILTER_SZIP", - "alias": "szip", - "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], - }, - 5: {"class": "H5Z_FILTER_NBIT"}, - 6: { - "class": "H5Z_FILTER_SCALEOFFSET", - "alias": "scaleoffset", - "options": ["scaleType", "scaleOffset"], - }, - 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, -} - -_HDF_FILTER_OPTION_ENUMS = { - "coding": { - h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", - h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", - }, - "scaleType": { - h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", - h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", - h5py.h5z.SO_INT: "H5Z_SO_INT", - }, -} - -# h5py supported filters -_H5PY_FILTERS = { - "gzip": 1, - "shuffle": 2, - "fletcher32": 3, - "szip": 4, - "scaleoffset": 6, - "lzf": 32000, -} - -_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") - +from .. 
import filters +from .h5reader import H5Reader + class H5pyReader(H5Reader): """ @@ -309,8 +268,8 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): filter_prop["id"] = filter_id if filter_info[3]: filter_prop["name"] = self.bytesArrayToList(filter_info[3]) - if filter_id in _HDF_FILTERS: - hdf_filter = _HDF_FILTERS[filter_id] + if filter_id in filters._HDF_FILTERS: + hdf_filter = filters._HDF_FILTERS[filter_id] filter_prop["class"] = hdf_filter["class"] if "options" in hdf_filter: filter_opts = hdf_filter["options"] @@ -320,8 +279,8 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): opt_value = opt_values[i] opt_value_enum = None option_name = filter_opts[i] - if option_name in _HDF_FILTER_OPTION_ENUMS: - option_enums = _HDF_FILTER_OPTION_ENUMS[option_name] + if option_name in filters._HDF_FILTER_OPTION_ENUMS: + option_enums = filters._HDF_FILTER_OPTION_ENUMS[option_name] if opt_value in option_enums: opt_value_enum = option_enums[opt_value] if opt_value_enum: diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 571dc37f..59865d5c 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -14,7 +14,7 @@ from ..objid import getCollectionForId from ..hdf5dtype import createDataType from ..array_util import jsonToArray - +from .. 
import filters from .h5writer import H5Writer @@ -55,17 +55,109 @@ def _createDataset(self, parent, dset_json, name=None): type_item = dset_json["type"] dtype = createDataType(type_item) - kwds = {"dtype": dtype} + kwargs = {"dtype": dtype} shape_json = dset_json["shape"] - if shape_json["class"] == "H5S_NULL": + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": # skip the shape keyword to create a null space dataset pass - elif shape_json["class"] == "H5S_SCALAR": - kwds["shape"] = () + elif shape_class == "H5S_SCALAR": + kwargs["shape"] = () else: - kwds["shape"] = shape_json["dims"] - parent.create_dataset(name, **kwds) + kwargs["shape"] = shape_json["dims"] + if "dcpl" in dset_json and shape_class != "H5S_NULL": + creation_props = dset_json["dcpl"] + if "fillValue" in creation_props: + fillvalue = creation_props["fillValue"] + if fillvalue and len(dtype) > 1 and type(fillvalue) in (list, tuple): + # for compound types, need to convert from list to dataset compatible element + + if len(dtype) != len(fillvalue): + msg = "fillvalue has incorrect number of elements" + self.log.warning(msg) + raise ValueError(msg) + + fillvalue = jsonToArray((), dtype, fillvalue) + + kwargs["fillvalue"] = fillvalue + if "trackTimes" in creation_props: + kwargs["track_times"] = creation_props["trackTimes"] + if "layout" in creation_props: + layout = creation_props["layout"] + if "dims" in layout: + kwargs["chunks"] = tuple(layout["dims"]) + if "filters" in creation_props: + filter_props = creation_props["filters"] + for filter_prop in filter_props: + if "id" not in filter_prop: + self.log.warning("filter id not provided") + continue + filter_id = filter_prop["id"] + if filter_id not in filters._HDF_FILTERS: + self.log.warning(f"unknown filter id: {filter_id} ignoring") + continue + + hdf_filter = filters._HDF_FILTERS[filter_id] + + self.log.info(f"got filter: {filter_id}") + if "alias" not in hdf_filter: + self.log.warning(f"unsupported filter id: {filter_id} 
ignoring") + continue + + filter_alias = hdf_filter["alias"] + if not h5py.h5z.filter_avail(filter_id): + msg = "compression filter not available, filter: {filter_alias}, ignoring" + self.log.warning(msg) + continue + if filter_alias in filters._H5PY_COMPRESSION_FILTERS: + if kwargs.get("compression"): + msg = f"compression filter already set for {filter_alias}, ignoring" + self.log.info(msg) + continue + + kwargs["compression"] = filter_alias + self.log.info("setting compression filter to: {filter_alias}") + if filter_alias == "gzip": + # check for an optional compression value + if "level" in filter_prop: + kwargs["compression_opts"] = filter_prop["level"] + elif filter_alias == "szip": + bitsPerPixel = None + coding = "nn" + + if "bitsPerPixel" in filter_prop: + bitsPerPixel = filter_prop["bitsPerPixel"] + if "coding" in filter_prop: + if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": + coding = "ec" + elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": + coding = "nn" + else: + self.log.warning("invalid szip option: 'coding'") + # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, + # so these options will be ignored + if "pixelsPerBlock" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerBlock'") + if "pixelsPerScanline" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerScanline'") + if bitsPerPixel: + kwargs["compression_opts"] = (coding, bitsPerPixel) + else: + if filter_alias == "shuffle": + kwargs["shuffle"] = True + elif filter_alias == "fletcher32": + kwargs["fletcher32"] = True + elif filter_alias == "scaleoffset": + if "scaleOffset" not in filter_prop: + msg = "No scale_offset provided for scale offset filter, ignoring" + self.log(msg) + continue + kwargs["scaleoffset"] = filter_prop["scaleOffset"] + else: + self.log.info(f"Unexpected filter name: {filter_alias}, ignoring") + + parent.create_dataset(name, **kwargs) def _createDatatype(self, parent, ctype_json, name=None): """ create a datatype 
object """ From af4d46a2842affe7c684475150f218781f114a52 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Mar 2025 12:28:15 -0800 Subject: [PATCH 016/129] updates for h5py_writer to write dataset values --- src/h5json/dset_util.py | 48 +------------------------ src/h5json/hdf5db.py | 26 ++++++++++---- src/h5json/writer/h5py_writer.py | 60 +++++++++++++++++++++----------- test/unit/h5json_reader_test.py | 32 +---------------- test/unit/h5py_reader_test.py | 27 +------------- test/unit/h5py_writer_test.py | 23 +++++++++++- 6 files changed, 85 insertions(+), 131 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index c89f141f..6cd51c3d 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -2,7 +2,7 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # # Utilities. The full HDF5 REST Server copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # @@ -11,52 +11,6 @@ ############################################################################## import time -from .hdf5dtype import getTypeItem - -""" -# standard compress filters -_HDF_FILTERS = { - 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, - 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, - 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, - 4: { - "class": "H5Z_FILTER_SZIP", - "alias": "szip", - "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], - }, - 5: {"class": "H5Z_FILTER_NBIT"}, - 6: { - "class": "H5Z_FILTER_SCALEOFFSET", - "alias": "scaleoffset", - "options": ["scaleType", "scaleOffset"], - }, - 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, -} - -_HDF_FILTER_OPTION_ENUMS = { - "coding": { - 
h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", - h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", - }, - "scaleType": { - h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", - h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", - h5py.h5z.SO_INT: "H5Z_SO_INT", - }, -} - -# h5py supported filters -_H5PY_FILTERS = { - "gzip": 1, - "shuffle": 2, - "fletcher32": 3, - "szip": 4, - "scaleoffset": 6, - "lzf": 32000, -} - -_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") -""" def resize_dataset(dset_json, shape): shape_json = dset_json["shape"] diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index e1194264..352d6794 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -111,8 +111,7 @@ def make_dirty(self, obj_id): # object deleted, just return return obj_json = self.db[obj_id] - now = time.time() - obj_json["lastModified"] = now + obj_json["lastModified"] = time.time() self._dirty_objects.add(obj_id) @@ -120,10 +119,25 @@ def flush(self): """ write out any changes """ if not self.writer: return # nothing to do - if self.writer.flush(): - # reset new and dirty sets - self._new_objects = set() - self._dirty_objects = set() + + print("self._new_objects:", self._new_objects) + print("self._dirty_objects:", self._dirty_objects) + obj_ids = self._new_objects.union(self._dirty_objects) + print(f"hdf5db_flush {len(obj_ids)} objects") + + if not self.writer.flush(): + # flush not successful, don't clear dirty set + return + + + for obj_id in obj_ids: + obj_json = self._db[obj_id] + if "values" in obj_json: + obj_json["values"] = [] + + # reset new and dirty sets + self._new_objects = set() + self._dirty_objects = set() def close(self): """ close reader and writer handles """ diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 59865d5c..247098a8 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -47,8 +47,7 @@ def _createGroup(self, parent, grp_json, name=None): grp = 
parent.create_group(name) if "links" in grp_json: grp_links = grp_json["links"] - self._createLinks(grp, grp_links) - + self._createObjects(grp, grp_links) def _createDataset(self, parent, dset_json, name=None): """ create a dataset object """ @@ -167,8 +166,8 @@ def _createDatatype(self, parent, ctype_json, name=None): parent[name] = dtype - def _createLinks(self, parent, links_json): - """ create links in the given group """ + def _createObjects(self, parent, links_json): + """ create child object in the given group, recurse for any sub-groups """ for title in links_json: if title in parent: # TBD: this will do the wrong thing if the link tgt has changed @@ -212,8 +211,27 @@ def _createLinks(self, parent, links_json): else: self.log.warning(f"unexpected link class: {link_class}") + def updateDatasetValues(self, dset_id, dset): + """ write any pending dataset values """ + dset_json = self.db.getObjectById(dset_id) + if "updates" not in dset_json: + return + updates = dset_json["updates"] + for (sel, val) in updates: + slices = [] + for dim in range(len(sel.shape)): + start = sel.start[dim] + stop = start + sel.count[dim] + step = sel.step[dim] + slices.append(slice(start, stop, step)) + slices = tuple(slices) + dset[slices] = val + self.log.debug(f"h5py_writer dset {dset.name} updated") + + def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ + print(f"h5py_writer.createAttribute {obj.name}: {name}") dtype = createDataType(attr_json["type"]) shape_json = attr_json["shape"] @@ -233,9 +251,11 @@ def createAttribute(self, obj, name, attr_json): obj.attrs[name] = arr - def createAttributes(self, obj, obj_json): - """ create attributes """ + def updateAttributes(self, obj_id, obj): + """ create/replace any modified attributes """ + obj_json = self.db.getObjectById(obj_id) + if "attributes" not in obj_json: # no attributes return @@ -245,31 +265,31 @@ def createAttributes(self, obj, obj_json): attr_json = attrs[name] 
self.createAttribute(obj, name, attr_json) - - def visitAttributes(self, path, obj): - name = obj.__class__.__name__ - self.log.info(f"visit: {path} name: {name}") - - obj_json = self.db.getObjectByPath(path) - self.createAttributes(obj, obj_json) - + def flush(self): """ Write dirty items """ if not self.db: # no db set yet - return - + return False + + self.log.info("h5py_writer.flush()") root_id = self.db.root_id self._id_map[root_id] = "/" with h5py.File(self._filepath, mode=self._mode) as f: root_json = self.db.getObjectById(root_id) if "links" in root_json: root_links = root_json["links"] - self._createLinks(f, root_links) - # update attributes - self.createAttributes(f, root_json) - f.visititems(self.visitAttributes) + self._createObjects(f, root_links) + # update attributes, dataset values + for obj_id in self._id_map: + if self.db.is_dirty(obj_id): + h5path = self._id_map[obj_id] + obj = f[h5path] + self.updateAttributes(obj_id, obj) + self.updateDatasetValues(obj_id, obj) + self._mode = "a" # use append mode for future updates + return True # all objects written successfully def close(self): diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index effa0e58..5027232e 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -10,40 +10,11 @@ # request a copy from help@hdfgroup.org. 
# ############################################################################## import unittest -import os -import os.path as op -import stat import logging -import shutil from h5json import Hdf5db from h5json.reader.h5json_reader import H5JsonReader -def getFile(name, tgt, ro=False): - src = "data/json/" + name - logging.info("copying file to this directory: " + src) - - filepath = "./out/" + tgt - - if op.isfile(filepath): - # make sure it's writable, before we copy over it - os.chmod(filepath, stat.S_IWRITE | stat.S_IREAD) - shutil.copyfile(src, filepath) - if ro: - logging.info("make read-only") - os.chmod(filepath, stat.S_IREAD) - return filepath - - -def removeFile(name): - try: - os.stat(name) - except OSError: - return - # file does not exist - os.remove(name) - - class H5pyReaderTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(H5pyReaderTest, self).__init__(*args, **kwargs) @@ -64,12 +35,11 @@ def __init__(self, *args, **kwargs): self.log.removeHandler(lhStdout) def testSimple(self): - filepath = getFile("tall.json", "tall.json", ro=True) + filepath = "data/json/tall.json" kwargs = {"app_logger": self.log} with Hdf5db(h5_reader=H5JsonReader(filepath, **kwargs), **kwargs) as db: root_id = db.getObjectIdByPath("/") root_json = db.getObjectById(root_id) - print("root_json:", root_json) root_attrs = root_json["attributes"] self.assertEqual(len(root_attrs), 2) diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index c612adc6..b878434e 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -20,31 +20,6 @@ from h5json.reader.h5py_reader import H5pyReader -def getFile(name, tgt, ro=False): - src = "data/hdf5/" + name - logging.info("copying file to this directory: " + src) - - filepath = "./out/" + tgt - - if op.isfile(filepath): - # make sure it's writable, before we copy over it - os.chmod(filepath, stat.S_IWRITE | stat.S_IREAD) - shutil.copyfile(src, filepath) - if ro: - logging.info("make 
read-only") - os.chmod(filepath, stat.S_IREAD) - return filepath - - -def removeFile(name): - try: - os.stat(name) - except OSError: - return - # file does not exist - os.remove(name) - - class H5pyReaderTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(H5pyReaderTest, self).__init__(*args, **kwargs) @@ -65,7 +40,7 @@ def __init__(self, *args, **kwargs): self.log.removeHandler(lhStdout) def testSimple(self): - filepath = getFile("tall.h5", "tall.h5", ro=True) + filepath = "data/hdf5/tall.h5" kwargs = {"app_logger": self.log} with Hdf5db(h5_reader=H5pyReader(filepath, **kwargs), **kwargs) as db: root_id = db.getObjectIdByPath("/") diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 38447aff..9d595673 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -12,6 +12,7 @@ import unittest import time import logging +import h5py import numpy as np from h5json import Hdf5db from h5json.writer.h5py_writer import H5pyWriter @@ -46,12 +47,14 @@ def __init__(self, *args, **kwargs): def testGroup(self): - with Hdf5db(h5_writer=H5pyWriter("/tmp/foo2.h5", no_data=False), app_logger=self.log) as db: + filepath = "test/unit/out/h5py_writer_test_testGroup.h5" + with Hdf5db(h5_writer=H5pyWriter(filepath, no_data=False), app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "attr1", value=[1,2,3,4]) db.createAttribute(root_id, "attr2", 42) g1_id = db.createGroup() db.createHardLink(root_id, "g1", g1_id) + db.createAttribute(g1_id, "a1", "hello") g2_id = db.createGroup() db.createHardLink(root_id, "g2", g2_id) @@ -69,6 +72,24 @@ def testGroup(self): db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) db.flush() + with h5py.File(filepath) as f: + self.assertTrue("attr1", f.attrs) + self.assertTrue("attr2", f.attrs) + self.assertTrue("g1" in f) + g1 = f["g1"] + self.assertTrue("a1" in g1.attrs) + 
self.assertTrue("g1.1" in g1) + g11 = g1["g1.1"] + self.assertTrue("dset1.1.1" in g11) + dset = g11["dset1.1.1"] + self.assertEqual(dset.shape, (10,10)) + for i in range(10): + for j in range(10): + self.assertEqual(dset[i, j], i*j) + self.assertTrue("g2" in f) + g2 = f["g2"] + self.assertTrue("extlink" in g2) + self.assertTrue("slink" in g2) From 7c393b6c9a01069ba95b75ede407f7ddf5c07b0e Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Mar 2025 18:10:26 -0800 Subject: [PATCH 017/129] revert to using members for dtype enums --- src/h5json/hdf5db.py | 8 ++++ src/h5json/hdf5dtype.py | 14 ++++-- src/h5json/reader/h5py_reader.py | 4 +- src/h5json/writer/h5py_writer.py | 76 +++++++++++++++++++++----------- test/unit/h5py_writer_test.py | 38 ++++++++++++++++ test/unit/hdf5dtype_test.py | 24 ++++++---- 6 files changed, 125 insertions(+), 39 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 352d6794..0d19ef7a 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -99,6 +99,14 @@ def is_dirty(self, obj_id): return True return obj_id in self._dirty_objects + @property + def new_objects(self): + return self._new_objects + + @property + def dirty_objects(self): + return self._dirty_objects + def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ if self.is_new(obj_id): diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index be1ffd62..47f53a68 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -424,9 +424,11 @@ def getTypeItem(dt, metadata=None): if dt.base.byteorder == ">": byteorder = "BE" # this mapping is an h5py convention for boolean support - mapping = {"FALSE": 0, "TRUE": 1} + bool_false = {"name": "FALSE", "value": 0} + bool_true = {"name": "TRUE", "value": 1} + members = [bool_false, bool_true] type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping + type_info["members"] = members base_info = {"class": "H5T_INTEGER"} base_info["base"] = 
"H5T_STD_I8" + byteorder type_info["base"] = base_info @@ -456,7 +458,13 @@ def getTypeItem(dt, metadata=None): # yes, this is an enum! mapping = metadata["enum"] type_info["class"] = "H5T_ENUM" - type_info["mapping"] = mapping + members = [] + for name in mapping: + value = mapping[name] + item = {"name": name, "value": value} + members.append(item) + type_info["members"] = members + #type_info["mapping"] = mapping if dt.name not in predefined_int_types: raise TypeError("Unexpected integer type: " + dt.name) # maps to one of the HDF5 predefined types diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index 2100dec6..cfae72cc 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -96,7 +96,7 @@ def getAttribute(self, obj_id, name, include_data=True): addr = h5py.h5o.get_info(typeid).addr type_uuid = self.getObjIdByAddress(addr) committedType = self._id_map[type_uuid] - type_item = committedType["type"].copy() + type_item = getTypeItem(committedType.dtype) type_item["id"] = type_uuid else: type_item = getTypeItem(attrObj.dtype) @@ -353,7 +353,7 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): elif isinstance(h5obj, h5py.Dataset): obj_json = self._getDataset(h5obj) elif isinstance(h5obj, h5py.Datatype): - obj_json = self._getDataType(h5obj) + obj_json = self._getDatatype(h5obj) else: raise TypeError(f"unexpected object type: {type(h5obj)}") diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 247098a8..b932b2f5 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -45,9 +45,8 @@ def __init__( def _createGroup(self, parent, grp_json, name=None): """ create the group and any links it contains """ grp = parent.create_group(name) - if "links" in grp_json: - grp_links = grp_json["links"] - self._createObjects(grp, grp_links) + return grp + def _createDataset(self, parent, dset_json, name=None): """ create a dataset 
object """ @@ -156,7 +155,8 @@ def _createDataset(self, parent, dset_json, name=None): else: self.log.info(f"Unexpected filter name: {filter_alias}, ignoring") - parent.create_dataset(name, **kwargs) + dset = parent.create_dataset(name, **kwargs) + return dset def _createDatatype(self, parent, ctype_json, name=None): """ create a datatype object """ @@ -164,50 +164,76 @@ def _createDatatype(self, parent, ctype_json, name=None): type_item = ctype_json["type"] dtype = createDataType(type_item) parent[name] = dtype + return parent[name] - def _createObjects(self, parent, links_json): + def _createObjects(self, parent, links_json, visited=set()): """ create child object in the given group, recurse for any sub-groups """ + for title in links_json: - if title in parent: - # TBD: this will do the wrong thing if the link tgt has changed - continue + #if title in parent: + # # TBD: this will do the wrong thing if the link tgt has changed + # continue link_json = links_json[title] link_class = link_json["class"] - if link_class == "H5L_TYPE_SOFT": + if link_class == "H5L_TYPE_SOFT" and title not in parent: h5path = link_json["h5path"] parent[title] = h5py.SoftLink(h5path) - elif link_class == "H5L_TYPE_EXTERNAL": + elif link_class == "H5L_TYPE_EXTERNAL" and title not in parent: h5path = link_json["h5path"] filename = link_json["file"] parent[title] = h5py.ExternalLink(filename, h5path) - elif link_class == "H5L_TYPE_USER_DEFINED": + elif link_class == "H5L_TYPE_USER_DEFINED" and title not in parent: self.log.warning("unable to create user-defined link: {title}") elif link_class == "H5L_TYPE_HARD": tgt_id = link_json["id"] + """ + if tgt_id in visited: + # we've already processed this object + if title not in parent: + if tgt_id in self._id_map: + tgt_obj = self._id_map[tgt_id] + parent[title] = tgt_obj + else: + self.log.warning("h5py_writer - expected to find {tgt_id} in id_map") + continue + """ + + collection = getCollectionForId(tgt_id) + + obj_json = 
self.db.getObjectById(tgt_id) + if tgt_id in self._id_map: + # object has already been created tgt_path = self._id_map[tgt_id] tgt_obj = parent[tgt_path] - parent[title] = tgt_obj + if title not in parent: + parent[title] = tgt_obj + if collection == "groups" and tgt_id not in visited: + # recurse over sub-objects to pick up any new links + grp_links = obj_json["links"] + visited.add(tgt_id) + self._createObjects(tgt_obj, grp_links, visited=visited) else: - obj_json = self.db.getObjectById(tgt_id) parent_path = parent.name if parent_path[-1] != '/': parent_path += '/' self._id_map[tgt_id] = parent_path + title - collection = getCollectionForId(tgt_id) kwds = {"name": title} if collection == "groups": - tgt_obj = self._createGroup(parent, obj_json, **kwds) + tgt_grp = self._createGroup(parent, obj_json, **kwds) + if "links" in obj_json: + grp_links = obj_json["links"] + visited.add(tgt_id) + self._createObjects(tgt_grp, grp_links, visited=visited) elif collection == "datasets": - tgt_obj = self._createDataset(parent, obj_json, **kwds) + self._createDataset(parent, obj_json, **kwds) elif collection == "datatypes": - tgt_obj = self._createDatatype(parent, obj_json, **kwds) + self._createDatatype(parent, obj_json, **kwds) else: self.log.warning(f"unexpected collection: {collection}") - tgt_obj = None - if tgt_obj: - parent[title] = tgt_obj + visited.add(tgt_id) + else: self.log.warning(f"unexpected link class: {link_class}") @@ -231,7 +257,6 @@ def updateDatasetValues(self, dset_id, dset): def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ - print(f"h5py_writer.createAttribute {obj.name}: {name}") dtype = createDataType(attr_json["type"]) shape_json = attr_json["shape"] @@ -276,10 +301,11 @@ def flush(self): root_id = self.db.root_id self._id_map[root_id] = "/" with h5py.File(self._filepath, mode=self._mode) as f: - root_json = self.db.getObjectById(root_id) - if "links" in root_json: - root_links = root_json["links"] - 
self._createObjects(f, root_links) + if self.db.new_objects: + root_json = self.db.getObjectById(root_id) + if "links" in root_json: + root_links = root_json["links"] + self._createObjects(f, root_links, visited=set(root_id)) # update attributes, dataset values for obj_id in self._id_map: if self.db.is_dirty(obj_id): diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 9d595673..3d81011c 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -90,6 +90,44 @@ def testGroup(self): g2 = f["g2"] self.assertTrue("extlink" in g2) self.assertTrue("slink" in g2) + + db.createAttribute(g1_id, "a2", "bye-bye") + db.flush() + + with h5py.File(filepath) as f: + g1 = f["g1"] + self.assertEqual(len(g1.attrs), 2) + self.assertTrue("a1" in g1.attrs) + self.assertTrue("a2" in g1.attrs) + + print("create group /g2/g2.1") + g21 = db.createGroup() + db.createHardLink(g2_id, "g2.1", g21) + db.flush() + + with h5py.File(filepath) as f: + g2 = f["g2"] + self.assertTrue("g2.1" in g2) + + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + db.setDatasetValues(dset_111_id, sel, arr) + db.flush() + + with h5py.File(filepath) as f: + dset = f["/g1/g1.1/dset1.1.1"] + for i in range(10): + for j in range(10): + if i == 4 and j == 4: + # this is the one element that was updated + expected = 42 + else: + expected = i * j + self.assertEqual(dset[i, j], expected) + + + diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index dbc806bb..63efc239 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -125,8 +125,10 @@ def testBaseEnumTypeItem(self): baseItem = typeItem["base"] self.assertEqual(baseItem["class"], "H5T_INTEGER") self.assertEqual(baseItem["base"], "H5T_STD_I8LE") - self.assertTrue("mapping" in typeItem) - self.assertEqual(typeItem["mapping"]["GREEN"], 1) + self.assertTrue("members" in typeItem) + members = typeItem["members"] + 
expected = [{'name': 'RED', 'value': 0}, {'name': 'GREEN', 'value': 1}, {'name': 'BLUE', 'value': 2}] + self.assertEqual(members, expected) self.assertEqual(typeSize, 1) def testBaseBoolTypeItem(self): @@ -136,11 +138,11 @@ def testBaseBoolTypeItem(self): baseItem = typeItem["base"] self.assertEqual(baseItem["class"], "H5T_INTEGER") self.assertEqual(baseItem["base"], "H5T_STD_I8LE") - self.assertTrue("mapping" in typeItem) - mapping = typeItem["mapping"] - self.assertEqual(len(mapping), 2) - self.assertEqual(mapping["FALSE"], 0) - self.assertEqual(mapping["TRUE"], 1) + self.assertTrue("members" in typeItem) + members = typeItem["members"] + self.assertEqual(len(members), 2) + self.assertEqual(members[0], {"name": "FALSE", "value": 0}) + self.assertEqual(members[1], {"name": "TRUE", "value": 1}) self.assertEqual(typeSize, 1) def testBaseArrayTypeItem(self): @@ -205,8 +207,12 @@ def testEnumArrayTypeItem(self): self.assertEqual(typeItem["dims"], (2, 3)) baseItem = typeItem["base"] self.assertEqual(baseItem["class"], "H5T_ENUM") - self.assertTrue("mapping" in baseItem) - self.assertEqual(baseItem["mapping"]["GREEN"], 1) + self.assertTrue("members" in baseItem) + members = baseItem["members"] + self.assertEqual(len(members), 3) + self.assertEqual(members[0], {"name": "RED", "value": 0}) + self.assertEqual(members[1], {"name": "GREEN", "value": 1}) + self.assertEqual(members[2], {"name": "BLUE", "value": 2}) self.assertTrue("base" in baseItem) basePrim = baseItem["base"] self.assertEqual(basePrim["class"], "H5T_INTEGER") From 825fc89f3522929611a35cdebb1cfdcd0ed89a1f Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 6 Mar 2025 22:00:14 -0800 Subject: [PATCH 018/129] add support for reference types --- src/h5json/h5py_util.py | 108 +++++++++++++++++++++++++ src/h5json/hdf5db.py | 22 ++++++ src/h5json/hdf5dtype.py | 4 +- src/h5json/objid.py | 19 ++--- src/h5json/writer/h5py_writer.py | 131 +++++++++++++++++++++++++++---- test/unit/h5py_writer_test.py | 113 
+++++++++++++++++++++----- 6 files changed, 355 insertions(+), 42 deletions(-) create mode 100644 src/h5json/h5py_util.py diff --git a/src/h5json/h5py_util.py b/src/h5json/h5py_util.py new file mode 100644 index 00000000..22df9ee0 --- /dev/null +++ b/src/h5json/h5py_util.py @@ -0,0 +1,108 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import h5py +import numpy as np + +from . import hdf5dtype + +def is_reference(val): + """ Return True if the type or value is a Reference """ + + if isinstance(val, object) and val.__class__.__name__ == "Reference": + return True + elif isinstance(val, type) and val.__name__ == "Reference": + return True + + return False + + +def is_regionreference(val): + """ Return True if the type or value is a RegionReference """ + + if isinstance(val, object) and val.__class__.__name__ == "RegionReference": + return True + elif isinstance(val, type) and val.__name__ == "RegionReference": + return True + + return False + + +def has_reference(dtype): + """ return True if the dtype (or a sub-type) is a Reference type """ + has_ref = False + if not isinstance(dtype, np.dtype): + return False + if len(dtype) > 0: + for name in dtype.fields: + item = dtype.fields[name] + if has_reference(item[0]): + has_ref = True + break + elif dtype.metadata and "ref" in dtype.metadata: + basedt = dtype.metadata["ref"] + has_ref = is_reference(basedt) + elif dtype.metadata and "vlen" 
in dtype.metadata: + basedt = dtype.metadata["vlen"] + has_ref = has_reference(basedt) + return has_ref + + +def convert_dtype(srcdt, to_h5py=True): + """Return a dtype based on input dtype, converting any Reference types from + h5py style to h5pyd and vice-versa. + """ + + if len(srcdt) > 0: + fields = [] + for name in srcdt.fields: + item = srcdt.fields[name] + # item is a tuple of dtype and integer offset + field_dt = convert_dtype(item[0], to_h5py=to_h5py) + fields.append((name, field_dt)) + tgt_dt = np.dtype(fields) + else: + # check if this a "special dtype" + if srcdt.metadata and "ref" in srcdt.metadata: + ref = srcdt.metadata["ref"] + if is_reference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.Reference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.Reference) + elif is_regionreference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.RegionReference) + else: + msg = f"Unexpected ref type: {srcdt}" + raise TypeError(msg) + elif srcdt.metadata and "vlen" in srcdt.metadata: + src_vlen = srcdt.metadata["vlen"] + if isinstance(src_vlen, np.dtype): + tgt_base = convert_dtype(src_vlen, to_h5py=to_h5py) + else: + tgt_base = src_vlen + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=tgt_base) + else: + tgt_dt = h5pyd.special_dtype(vlen=tgt_base) + elif srcdt.kind == "U": + # use vlen for unicode strings + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=str) + else: + tgt_dt = hdf5dtype.special_dtype(vlen=str) + else: + tgt_dt = srcdt + return tgt_dt diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 0d19ef7a..cc9d4220 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -79,11 +79,29 @@ def reader(self): """ return reader instance """ return self._reader + @reader.setter + def reader(self, value: H5Reader): + """ set the reader """ + if self._reader: + self._reader.close() + self._reader = value + if self._reader: + 
self._reader.set_db(self) + @property def writer(self): """ return writer instance """ return self._writer + @writer.setter + def writer(self, value: H5Reader): + """ set the writer """ + if self._writer: + self._writer.close() + self._writer = value + if self._writer: + self._writer.set_db(self) + @property def root_id(self): """ return root uuid """ @@ -321,8 +339,12 @@ def getAttributeValue(self, obj_id, name): else: dims = shape_json["dims"] dtype = createDataType(attr_json["type"]) + print("getAttributeValue dtype, metadata:", dtype.metadata) + value = attr_json["value"] arr = jsonToArray(dims, dtype, value) + print("getAttributeValue returning arr.dtype, metadata:", arr.dtype.metadata) + return arr diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index 47f53a68..acbb2d21 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -17,6 +17,7 @@ numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) numpy_float_types = (np.float16, np.float32, np.float64) + class Reference: """ Represents an HDF5 object reference @@ -743,7 +744,7 @@ def createBaseDataType(typeItem): type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": # use the same type_code as ascii strings - # (othewise, numpy will reserve bytes for UTF32 representation) + # (otherwise, numpy will reserve bytes for UTF32 representation) type_code = "S" else: raise TypeError("unexpected 'charSet' value") @@ -804,6 +805,7 @@ def createBaseDataType(typeItem): raise KeyError("'base' not provided") if typeItem["base"] == "H5T_STD_REF_OBJ": dtRet = special_dtype(ref=Reference) + print("special dtype, metadata:", dtRet.metadata) elif typeItem["base"] == "H5T_STD_REF_DSETREG": dtRet = special_dtype(ref=RegionReference) else: diff --git a/src/h5json/objid.py b/src/h5json/objid.py index e36e8a22..bd34bc56 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -129,15 +129,6 @@ def getCollectionForId(obj_id): raise 
ValueError(f"{obj_id} not a collection id") return collection -def stripId(obj_id): - """ return just the base id without any prefix (e.g. 'g-') """ - if len(obj_id) == UUID_LEN: - return obj_id # just return as is - if len(obj_id) == UUID_LEN + 2: - return obj_id[2:] - else: - raise ValueError("unexpected obj_id: {obj_id}") - def isRootObjId(id): """returns true if this is a root id (only for v2 schema)""" @@ -494,3 +485,13 @@ def getUuidFromId(id): return id[2:] else: raise ValueError(f"Unexpected obj_id: {id}") + +def stripId(obj_id): + """ return just the base id without any prefix (e.g. 'g-') """ + if len(obj_id) == UUID_LEN: + return obj_id # just return as is + if len(obj_id) == UUID_LEN + 2: + return obj_id[2:] + else: + raise ValueError("unexpected obj_id: {obj_id}") + diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index b932b2f5..07717ddf 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -10,9 +10,11 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import h5py +import numpy as np -from ..objid import getCollectionForId +from ..objid import getCollectionForId, isValidUuid from ..hdf5dtype import createDataType +from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype from ..array_util import jsonToArray from .. 
import filters from .h5writer import H5Writer @@ -39,9 +41,108 @@ def __init__( else: self._mode = "w" - self._f = None self._id_map = {} + + def _copy_element(self, val, src_dt, tgt_dt, fout=None): + """ convert the given dataset or attribute element to h5py equivalent """ + + out = None + if len(src_dt) > 0: + out_fields = [] + i = 0 + for name in src_dt.fields: + field_src_dt = src_dt.fields[name][0] + field_tgt_dt = tgt_dt.fields[name][0] + field_val = val[i] + i += 1 + out_field = self._copy_element(field_val, field_src_dt, field_tgt_dt) + out_fields.append(out_field) + out = tuple(out_fields) + elif src_dt.metadata and "ref" in src_dt.metadata: + if not tgt_dt.metadata or "ref" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be ref, but got: {tgt_dt}") + ref = tgt_dt.metadata["ref"] + if is_reference(ref): + # initialize out to null ref + out = h5py.Reference() # null h5py ref + + if ref and val: + if isinstance(val, bytes): + val = val.decode("ascii") + # strip out collection prefix if present + parts = val.split("/") + obj_uuid = parts[-1] + if not isValidUuid(obj_uuid): + msg = f"invalid uuid: {obj_uuid}" + self.log.warning(msg) + elif obj_uuid not in self._id_map: + self.log.warning(f"ref object {obj_uuid} not found") + else: + h5path = self._id_map[obj_uuid] + try: + obj = fout[h5path] + out = obj.ref + except KeyError: + self.log.warning(f"referenced object: {h5path} not found") + + elif is_regionreference(ref): + self.log.warning("region reference not supported") + # TBD: just return a null region reference till we have support + out = h5py.RegionReference() + else: + raise TypeError(f"Unexpected ref type: {type(ref)}") + elif src_dt.metadata and "vlen" in src_dt.metadata: + if not isinstance(val, np.ndarray): + raise TypeError(f"Expecting ndarray or vlen element, but got: {type(val)}") + if not tgt_dt.metadata or "vlen" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be vlen, but got: {tgt_dt}") + src_vlen_dt = 
src_dt.metadata["vlen"] + tgt_vlen_dt = tgt_dt.metadata["vlen"] + if has_reference(src_vlen_dt): + if len(val.shape) == 0: + # scalar array + e = val[()] + v = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fout=fout) + out = np.array(v, dtype=tgt_dt) + else: + out = np.zeros(val.shape, dtype=tgt_dt) + for i in range(len(out)): + e = val[i] + out[i] = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fout=fout) + else: + # can just directly copy the array + out = np.zeros(val.shape, dtype=tgt_dt) + out[...] = val[...] + else: + out = val # can just copy as is + return out + + def _copy_array(self, src_arr, fout=None): + """Copy the numpy array to a new array. + Convert any reference type to point to item in the target's hierarchy. + """ + + if not isinstance(src_arr, np.ndarray): + raise TypeError(f"Expecting ndarray, but got: {src_arr}") + tgt_dt = convert_dtype(src_arr.dtype, to_h5py=True) + tgt_arr = np.zeros(src_arr.shape, dtype=tgt_dt) + + if has_reference(src_arr.dtype): + # flatten array to simplify iteration + count = int(np.prod(src_arr.shape)) + tgt_arr_flat = tgt_arr.reshape((count,)) + src_arr_flat = src_arr.reshape((count,)) + for i in range(count): + e = src_arr_flat[i] + element = self._copy_element(e, src_arr.dtype, tgt_dt, fout=fout) + tgt_arr_flat[i] = element + tgt_arr = tgt_arr_flat.reshape(src_arr.shape) + else: + # can just copy the entire array + tgt_arr[...] = src_arr[...] 
+ return tgt_arr + def _createGroup(self, parent, grp_json, name=None): """ create the group and any links it contains """ grp = parent.create_group(name) @@ -254,26 +355,28 @@ def updateDatasetValues(self, dset_id, dset): dset[slices] = val self.log.debug(f"h5py_writer dset {dset.name} updated") + def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ - - dtype = createDataType(attr_json["type"]) + + src_dt = createDataType(attr_json["type"]) + + # handle special case of null space attribute here shape_json = attr_json["shape"] shape_class = shape_json["class"] if shape_class == "H5S_NULL": - dims = None - elif shape_class == "H5S_SCALAR": + obj.attrs[name] = h5py.Empty(convert_dtype(src_dt, to_h5py=True)) + return + + if shape_class == "H5S_SCALAR": dims = () else: - dims = tuple(shape_json["dims"]) - - if dims is None: - obj.attrs[name] = h5py.Empty(dtype) - else: - json_value = attr_json["value"] - arr = jsonToArray(dims, dtype, json_value) - obj.attrs[name] = arr + dims = shape_json["dims"] + src_arr = jsonToArray(dims, src_dt, attr_json["value"]) + tgt_arr = self._copy_array(src_arr, fout=obj.file) + + obj.attrs[name] = tgt_arr def updateAttributes(self, obj_id, obj): diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 3d81011c..75b7e37b 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -45,10 +45,11 @@ def __init__(self, *args, **kwargs): self.log.info("init!") - def testGroup(self): + def testSimple(self): - filepath = "test/unit/out/h5py_writer_test_testGroup.h5" - with Hdf5db(h5_writer=H5pyWriter(filepath, no_data=False), app_logger=self.log) as db: + filepath = "test/unit/out/h5py_writer_test_testSimple.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "attr1", value=[1,2,3,4]) db.createAttribute(root_id, "attr2", 42) @@ -126,15 +127,11 @@ def 
testGroup(self): expected = i * j self.assertEqual(dset[i, j], expected) - - - - - - def testNullSpaceAttribute(self): + filepath = "test/unit/out/h5py_writer_test_testNullSpaceAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) item = db.getAttribute(root_id, "A1") @@ -145,9 +142,17 @@ def testNullSpaceAttribute(self): self.assertTrue(item["created"] > time.time() - 1.0) value = db.getAttributeValue(root_id, "A1") self.assertEqual(value, None) + db.flush() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + self.assertEqual(f.attrs["A1"], h5py.Empty(dtype=np.int32)) def testScalarAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testNullScalarAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") dims = () value = 42 @@ -165,13 +170,21 @@ def testScalarAttribute(self): self.assertTrue(item["created"] > now - 1) shape = item["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") - self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, np.int32)) + self.assertEqual(a1, 42) def testFixedStringAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testFixedStringAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") value = "Hello, world!" 
db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) @@ -186,14 +199,23 @@ def testFixedStringAttribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - ret_value = db.getAttributeValue(root_id, "A1") + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, bytes)) + self.assertEqual(a1, b'Hello, world!') def testVlenAsciiAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenAsciiAttribute.h5" + value = b"Hello, world!" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - value = b"Hello, world!" dt = special_dtype(vlen=bytes) # write the attribute @@ -211,11 +233,21 @@ def testVlenAsciiAttribute(self): now = int(time.time()) self.assertTrue(item["created"] > now - 1) + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, str)) + self.assertEqual(a1, value.decode("ascii")) + def testVlenUtf8Attribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenUtf8Attribute.h5" + value = "one: \u4e00" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - value = b"Hello, world!" 
dt = special_dtype(vlen=str) # write the attribute @@ -229,15 +261,25 @@ def testVlenUtf8Attribute(self): self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") self.assertEqual(item_type["length"], "H5T_VARIABLE") self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item["value"], "Hello, world!") + self.assertEqual(item["value"], value) now = int(time.time()) self.assertTrue(item["created"] > now - 1) + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, str)) + self.assertEqual(a1, value) def testIntAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testIntAttribute.h5" + value = [2, 3, 5, 7, 11] + with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - value = [2, 3, 5, 7, 11] db.createAttribute(root_id, "A1", value, dtype=np.int16) item = db.getAttribute(root_id, "A1") self.assertEqual(item["value"], [2, 3, 5, 7, 11]) @@ -250,8 +292,20 @@ def testIntAttribute(self): self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I16LE") + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, np.ndarray)) + self.assertEqual(a1.shape, (5,)) + for i in range(5): + self.assertEqual(a1[i], value[i]) + + def testCreateReferenceAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testCreateReferenceAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") dset_id = db.createDataset(shape=(), dtype=np.int32) @@ -262,19 +316,28 @@ def testCreateReferenceAttribute(self): ds1_ref = "datasets/" + dset_id value = [ds1_ref,] db.createAttribute(root_id, "A1", value, dtype=dt) - item = db.getAttribute(root_id, "A1") attr = db.getAttribute(root_id, "A1") self.assertTrue("shape" in attr) attr_type = attr["type"] 
self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") - attr_value = item["value"] + attr_value = db.getAttributeValue(root_id, "A1") self.assertEqual(len(attr_value), 1) - self.assertEqual(attr_value[0], ds1_ref) + self.assertEqual(attr_value[0], ds1_ref.encode('ascii')) + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + obj_ref = a1[0] + obj = f[obj_ref] + self.assertEqual(obj.name, "/DS1") def testCreateVlenReferenceAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenReferenceAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) @@ -304,10 +367,24 @@ def testCreateVlenReferenceAttribute(self): item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SCALAR") + + print("open:", filepath) + with h5py.File(filepath) as f: + self.assertTrue("DS1" in f) + ds1 = f["DS1"] + self.assertTrue("G1" in f) + g1 = f["G1"] + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + ref_obj = f[a1[0]] + self.assertEqual(ref_obj.name, "/DS1") def testCommittedType(self): + + filepath = "test/unit/out/h5py_writer_test_testCommittedType.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") dt = np.dtype("S15") From 88fa1eb6b4a8dfc9118c93925b570f7c01d882d2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 25 Mar 2025 18:45:02 +0100 Subject: [PATCH 019/129] support for h5py and json readers and writers --- data/hdf5/dset_creationprop.h5 | Bin 8058 -> 7228 bytes data/json/nullspace_dset.json | 34 ------ setup.cfg | 6 + src/h5json/array_util.py | 3 +- src/h5json/dset_util.py | 20 ++- src/h5json/filters.py | 3 +- src/h5json/h5py_util.py | 9 +- src/h5json/hdf5db.py | 156 +++++++++++------------- 
src/h5json/hdf5dtype.py | 19 ++- src/h5json/objid.py | 4 +- src/h5json/reader/h5json_reader.py | 26 ++-- src/h5json/reader/h5py_reader.py | 188 ++++++++++++++++++++++------- src/h5json/reader/h5reader.py | 12 +- src/h5json/selections.py | 3 +- src/h5json/writer/h5json_writer.py | 56 ++++----- src/h5json/writer/h5py_writer.py | 44 +++---- src/h5json/writer/h5writer.py | 10 +- test/unit/h5json_writer_test.py | 38 +++++- test/unit/h5py_writer_test.py | 41 ++++++- test/unit/hdf5db_test.py | 28 +++++ test/unit/hdf5dtype_test.py | 2 + 21 files changed, 429 insertions(+), 273 deletions(-) delete mode 100644 data/json/nullspace_dset.json create mode 100644 setup.cfg diff --git a/data/hdf5/dset_creationprop.h5 b/data/hdf5/dset_creationprop.h5 index ff5b7a723a1800126515ab515f1957ef12bddf97..12b7a3265d45fa1c8b36a9e15cb8e27f6c9ea3c7 100644 GIT binary patch delta 35 mcmexmx5r|F29u4ar18$M{xkBbO@&a delta 402 zcmdmE@yl+422+*%My-Bk9wr731_)4Kp14tN^KNDd#?8N39K}WbojtsH%otdh8CV%a z7+4q>%q&ezCX2ADvN3@)NKD))#S&*fuYB?W4qwg;S%@J?le^e0{W7m2{JG+ zGO$%Arlb_rny^EZut|Xg`KwbBOA?DyOX3TP@(YS;!I$SwtGAOQp>(XPkA<^+S(8cm+asl^+glbM%o6`ca~kzwp)UM@-g O`<3T42&!TMsR96XQ%-mQ diff --git a/data/json/nullspace_dset.json b/data/json/nullspace_dset.json deleted file mode 100644 index 8808f215..00000000 --- a/data/json/nullspace_dset.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "apiVersion": "1.1.0", - "datasets": { - "23d3e919-7b53-11e4-961d-3c15c2da029e": { - "alias": [ - "/DS1" - ], - "shape": { - "class": "H5S_NULL" - }, - "type": { - "base": "H5T_STD_I32LE", - "class": "H5T_INTEGER" - }, - "value": null - } - }, - "groups": { - "23d2e06b-7b53-11e4-9910-3c15c2da029e": { - "alias": [ - "/" - ], - "links": [ - { - "class": "H5L_TYPE_HARD", - "collection": "datasets", - "id": "23d3e919-7b53-11e4-961d-3c15c2da029e", - "title": "DS1" - } - ] - } - }, - "root": "23d2e06b-7b53-11e4-9910-3c15c2da029e" -} diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..b2f3e822 --- /dev/null +++ b/setup.cfg @@ -0,0 
+1,6 @@ +[flake8] +max-line-length = 120 +# E402: module level import not at top of file +# C901: too complex +# F401: unused exports are necessary in __init__.py +ignore = E402, C901, F401 diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index bef4587e..67c847c3 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -491,6 +491,7 @@ def arrayToBytes(arr, encoding=None): data = encodeData(data) return data + def bytesToArray(data, dt, shape, encoding=None): """ Create numpy array based on byte representation @@ -522,7 +523,7 @@ def bytesToArray(data, dt, shape, encoding=None): return arr - + def getNumpyValue(value, dt=None, encoding=None): """ Return value as numpy type for given dtype and encoding diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 6cd51c3d..5b10323f 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -12,17 +12,31 @@ import time + def resize_dataset(dset_json, shape): shape_json = dset_json["shape"] shape_class = shape_json["class"] if shape_class != "H5S_SIMPLE": raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") - if len(shape_class["dims"]) != len(shape): + if len(shape_json["dims"]) != len(shape): raise ValueError("Resize shape parameter doesn't match dataset's rank") + if "maxdims" not in shape_json: + raise ValueError("Dataset is not resizable") + dims = shape_json["dims"] + maxdims = shape_json["maxdims"] + if shape_json["dims"] == list(shape): # no change, just return return + for i in range(len(dims)): + extent = shape[i] + if extent < 0: + raise ValueError("dimensions can't be negative") + if maxdims[i] == "H5S_UNLIMITED": + # any positive extent is ok + continue + if extent > maxdims[i]: + raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}") + shape_json["dims"] = list(shape) dset_json["modified"] = time.time() - - \ No newline at end of file diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 
e6511366..cda38178 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -9,7 +9,7 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## - + import h5py _HDF_FILTERS = { @@ -53,4 +53,3 @@ } _H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") - diff --git a/src/h5json/h5py_util.py b/src/h5json/h5py_util.py index 22df9ee0..ebe2dbdb 100644 --- a/src/h5json/h5py_util.py +++ b/src/h5json/h5py_util.py @@ -15,6 +15,7 @@ from . import hdf5dtype + def is_reference(val): """ Return True if the type or value is a Reference """ @@ -22,8 +23,8 @@ def is_reference(val): return True elif isinstance(val, type) and val.__name__ == "Reference": return True - - return False + else: + return False def is_regionreference(val): @@ -59,7 +60,7 @@ def has_reference(dtype): def convert_dtype(srcdt, to_h5py=True): """Return a dtype based on input dtype, converting any Reference types from - h5py style to h5pyd and vice-versa. + h5py style to h5json and vice-versa. """ if len(srcdt) > 0: @@ -96,7 +97,7 @@ def convert_dtype(srcdt, to_h5py=True): if to_h5py: tgt_dt = h5py.special_dtype(vlen=tgt_base) else: - tgt_dt = h5pyd.special_dtype(vlen=tgt_base) + tgt_dt = hdf5dtype.special_dtype(vlen=tgt_base) elif srcdt.kind == "U": # use vlen for unicode strings if to_h5py: diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index cc9d4220..5c7e37a6 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -28,7 +28,7 @@ class Hdf5db: and Datatypes). By default all data is held in-memory. Initialize with h5_reader to read from an HDF5 compatible storage pool, and or, h5_writer to write to an HDF5 compatible storage pool. 
""" - + @staticmethod def getVersionInfo(): versionInfo = {} @@ -39,7 +39,7 @@ def __init__( self, h5_reader: H5Reader = None, h5_writer: H5Writer = None, - app_logger = None, + app_logger=None, ): if app_logger: self.log = app_logger @@ -50,10 +50,10 @@ def __init__( self._reader = h5_reader self._writer = h5_writer - + self._new_objects = set() # set of obj_id's self._dirty_objects = set() # set of obj_id's - + if self._reader: root_id = self._reader.get_root_id() group_json = self._reader.getObjectById(root_id) @@ -65,7 +65,7 @@ def __init__( if self._writer: self._writer.set_db(self) - + self._db[root_id] = group_json self._root_id = root_id @@ -73,12 +73,12 @@ def __init__( def db(self): """ return object db dictionary """ return self._db - + @property def reader(self): """ return reader instance """ return self._reader - + @reader.setter def reader(self, value: H5Reader): """ set the reader """ @@ -87,44 +87,44 @@ def reader(self, value: H5Reader): self._reader = value if self._reader: self._reader.set_db(self) - + @property def writer(self): """ return writer instance """ return self._writer - + @writer.setter - def writer(self, value: H5Reader): + def writer(self, value: H5Writer): """ set the writer """ if self._writer: self._writer.close() self._writer = value if self._writer: self._writer.set_db(self) - + @property def root_id(self): """ return root uuid """ return self._root_id - + def is_new(self, obj_id): """ return true if this is a new object (has not been persisted) """ return obj_id in self._new_objects - + def is_dirty(self, obj_id): """ return true if this object has been modified """ if self.is_new(obj_id): return True return obj_id in self._dirty_objects - + @property def new_objects(self): return self._new_objects - + @property def dirty_objects(self): return self._dirty_objects - + def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ if self.is_new(obj_id): @@ -140,21 +140,16 @@ def 
make_dirty(self, obj_id): obj_json["lastModified"] = time.time() self._dirty_objects.add(obj_id) - def flush(self): """ write out any changes """ if not self.writer: return # nothing to do - - print("self._new_objects:", self._new_objects) - print("self._dirty_objects:", self._dirty_objects) + obj_ids = self._new_objects.union(self._dirty_objects) - print(f"hdf5db_flush {len(obj_ids)} objects") if not self.writer.flush(): # flush not successful, don't clear dirty set - return - + return for obj_id in obj_ids: obj_json = self._db[obj_id] @@ -164,12 +159,12 @@ def flush(self): # reset new and dirty sets self._new_objects = set() self._dirty_objects = set() - + def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") self.flush() - if self.writer: + if self.writer: self.writer.close() if self.reader: self.reader.close() @@ -185,7 +180,6 @@ def __exit__(self, type, value, traceback): """ called on package exit """ self.log.info("Hdf5db __exit") self.close() - def getObjectById(self, obj_id): """ return object with given id """ @@ -210,7 +204,7 @@ def getObjectIdByPath(self, h5path, parent_id=None): if parent_id is None: parent_id = self.root_id self.log.debug(f"getObjectIdDByPath(h5path: {h5path} parent_id: {parent_id}") - + obj_json = self.getObjectById(parent_id) if obj_json is None: self.log.warning("getObjectIdDByPath - parent_id not found") @@ -261,12 +255,12 @@ def getObjectIdByPath(self, h5path, parent_id=None): self.log.warning(f"get_bypath {h5path} not found") raise KeyError(h5path) return obj_id - + def getObjectByPath(self, path): """ Get Object JSON at given path """ obj_id = self.getObjectIdByPath(path) obj_json = self.getObjectById(obj_id) - return obj_json + return obj_json def getDtype(self, obj_id): """ Return numpy data type for given object id """ @@ -277,11 +271,10 @@ def getDtype(self, obj_id): # group id? 
raise TypeError(f"{obj_id} does not have a datatype") type_json = obj_json["type"] - + dtype = createDataType(type_json) return dtype - - + def getAttribute(self, obj_id, name, includeData=True): """ Get attribute given an object id and name @@ -290,28 +283,20 @@ def getAttribute(self, obj_id, name, includeData=True): obj_json = self.getObjectById(obj_id) attrs = obj_json["attributes"] - + if name not in attrs: msg = f"Attribute: [{name }] not found in object: {obj_id}" self.log.info(msg) return None - if attrs[name] == None: + if attrs[name] is None: msg = f"Attribute: [{name}] has been deleted" self.log.info(None) return None - + attr_json = attrs[name] - if includeData and "value" not in attr_json: - # Reader may not have pre-loaded large attributes - # fetch it now - if not self.reader: - raise RuntimeError(f"Expected to find value for attribute {name} of {obj_id}") - attr_json = self.reader.get_attribute(obj_id, name) - attr_json["value"] = attr_json # this will update the _db - return attr_json - + def getAttributes(self, obj_id): """ Get attributes given an object id and name @@ -322,11 +307,11 @@ def getAttributes(self, obj_id): attrs = obj_json["attributes"] names = [] for name in attrs: - if attrs[name] != None: + if attrs[name] is not None: names.append(name) - + return names - + def getAttributeValue(self, obj_id, name): """ Return NDArray of the given attribute value """ attr_json = self.getAttribute(obj_id, name) @@ -339,20 +324,17 @@ def getAttributeValue(self, obj_id, name): else: dims = shape_json["dims"] dtype = createDataType(attr_json["type"]) - print("getAttributeValue dtype, metadata:", dtype.metadata) value = attr_json["value"] arr = jsonToArray(dims, dtype, value) - print("getAttributeValue returning arr.dtype, metadata:", arr.dtype.metadata) return arr - def createAttribute(self, obj_id, name, value, shape=None, dtype=None): """ create an attribute - will override any existing attributes """ - + # TBD: if dtype is a committed ref type, 
fetch it first # TBD: also, check special case for complex types @@ -367,7 +349,7 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): type_json["id"] = ctype_id dtype = createDataType(type_json) - # First, make sure we have a NumPy array. + # First, make sure we have a NumPy array if isinstance(value, Reference) and dtype is None: dtype = special_dtype(ref=Reference) if shape == "H5S_NULL": @@ -383,7 +365,7 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): dtype = value.dtype else: dtype = np.dtype(dtype) # In case a string, e.g. 'i8' is passed - + # Where a top-level array type is requested, we have to do some # fiddling around to present the data as a smaller array of # sub-arrays. @@ -443,7 +425,6 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): # mark object as dirty self.make_dirty(obj_id) - def deleteAttribute(self, obj_id, name): """ delete the given attribute """ obj_json = self.getObjectById(obj_id) @@ -451,9 +432,8 @@ def deleteAttribute(self, obj_id, name): if name not in attrs_json: raise KeyError(f"attribute [{name}] not found in {obj_id}") attrs_json[name] = None # mark key for deletion - - self.make_dirty(obj_id) + self.make_dirty(obj_id) def getDatasetValues(self, dset_id, sel): """ @@ -466,22 +446,23 @@ def getDatasetValues(self, dset_id, sel): shape_json = dset_json["shape"] if not isinstance(sel, selections.Selection): raise TypeError("Expected Selection class") - + if shape_json["class"] == "H5S_NULL": return None if shape_json["class"] == "H5S_SCALAR": - if sel.select_type != sel.H5S_SELECT_ALL: + if sel.select_type != selections.H5S_SELECT_ALL: # TBD: support other selection types raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") if sel.shape != (): raise ValueError("Selection shape does not match dataset shape") + rank = 0 else: dims = tuple(shape_json["dims"]) if sel.shape != dims: raise ValueError("Selection shape does not match dataset 
shape") - rank = len(dims) - + rank = len(dims) + dtype = self.getDtype(dset_id) if self.reader: arr = self.reader.getDatasetValues(dset_id, sel) @@ -506,7 +487,7 @@ def getDatasetValues(self, dset_id, sel): arr[slices] = update_val return arr - + def setDatasetValues(self, dset_id, sel, arr): """ Write the given ndarray to the dataset using the selection @@ -538,17 +519,15 @@ def setDatasetValues(self, dset_id, sel, arr): updates.append((sel, arr.copy())) self.make_dirty(dset_id) - def resizeDataset(self, dset_id, shape): """ Resize existing Dataset """ self.log.info(f"resizeDataset {dset_id}, {shape}") - + dset_json = self.getObjectById(dset_id) # will throw exception if not found if resize_dataset(dset_json, shape): self._dirty_objects.add(dset_id) - def deleteObject(self, obj_id): """ Delete the given object """ @@ -558,14 +537,13 @@ def deleteObject(self, obj_id): if obj_id == self.root_id: raise KeyError("Root group cannot be deleted") self.db[obj_id] = None - + if obj_id in self._new_objects: self._new_objects.remove(obj_id) if obj_id in self._dirty_objects: self._dirty_objects.remove(obj_id) - def getLinks(self, grp_id): """ Get the links for the given group """ grp_json = self.getObjectById(grp_id) @@ -574,30 +552,30 @@ def getLinks(self, grp_id): links = grp_json["links"] names = [] for name in links: - if links[name] != None: + if links[name] is not None: names.append(name) return names - + def getLink(self, grp_id, name): """ Get the given link """ - + obj_json = self.getObjectById(grp_id) links = obj_json["links"] if name not in links: self.log.info(f"Link [{name}] not found in {grp_id}") return None - if links[name] == None: + if links[name] is None: self.log.info(f"Link {name} in {grp_id} has been deleted") return None return links[name] - + def _addLink(self, grp_id, name, link_json): obj_json = self.getObjectById(grp_id) links = obj_json["links"] links[name] = link_json self.make_dirty(grp_id) - + def createHardLink(self, grp_id, name, tgt_id): """ 
Create a new hardlink """ link_json = {"class": "H5L_TYPE_HARD", "id": tgt_id} @@ -622,7 +600,7 @@ def createExternalLink(self, grp_id, name, h5path, filepath): link_json = {"class": "H5L_TYPE_EXTERNAL", "h5path": h5path, "file": filepath} link_json["created"] = time.time() self._addLink(grp_id, name, link_json) - + def deleteLink(self, grp_id, name): """ Delete the given link """ grp_json = self.getObjectById(grp_id) @@ -633,7 +611,6 @@ def deleteLink(self, grp_id, name): raise KeyError(f"Link [{name}] not found in {grp_id}") links[name] = None # mark for deletion self.make_dirty(grp_id) - def createGroup(self, cpl=None): """ Create a new group """ @@ -648,7 +625,6 @@ def createGroup(self, cpl=None): self.db[grp_id] = group_json self._new_objects.add(grp_id) return grp_id - def createCommittedType(self, datatype, cpl=None): """ @@ -658,7 +634,7 @@ def createCommittedType(self, datatype, cpl=None): self.log.info("createCommittedType") if cpl is None: cpl = {} - + ctype_id = createObjId(obj_type="datatypes", root_id=self.root_id) if isinstance(datatype, np.dtype): dt = datatype @@ -672,11 +648,11 @@ def createCommittedType(self, datatype, cpl=None): self.db[ctype_id] = ctype_json self._new_objects.add(ctype_id) return ctype_id - - + def createDataset( self, shape=None, + maxdims=None, dtype=None, cpl=None, ): @@ -687,25 +663,34 @@ def createDataset( type_json = getTypeItem(dtype) if shape == "H5S_NULL": shape_json = {"class": "H5S_NULL"} + elif shape == (): + shape_json = {"class": "H5S_SCALAR"} else: shape_json = {"class": "H5S_SIMPLE"} shape_json["dims"] = list(shape) + if maxdims: + if shape_json["class"] != "H5S_SIMPLE": + raise ValueError("only simple shapes can be resizable") + if len(shape) != len(maxdims): + raise ValueError("maxdims length not equal to shape rank") + shape_json["maxdims"] = ["H5S_UNLIMITED" if x is None else x for x in maxdims] + dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} if cpl: dset_json["cpl"] = cpl else: 
dset_json["cpl"] = {} - - dset_id = createObjId("datasets", root_id=self.root_id) - self.db[dset_id] = dset_json + + dset_id = createObjId("datasets", root_id=self.root_id) + self.db[dset_id] = dset_json self._new_objects.add(dset_id) return dset_id def getCollection(self, col_type=None): obj_ids = [] for obj_id in self.db: - if self.db[obj_id] == None: + if self.db[obj_id] is None: # skip deleted objects continue if not col_type or getCollectionForId(obj_id) == col_type: @@ -717,7 +702,7 @@ def __len__(self): count = 0 for obj_id in self.db: # skip deleted objects - if self.db[obj_id] != None: + if self.db[obj_id] is not None: count += 1 return count @@ -725,12 +710,11 @@ def __iter__(self): """ Iterate over object ids """ for obj_id in self.db: - if self.db[obj_id] == None: + if self.db[obj_id] is None: # skip deleted objects continue yield obj_id - def __contains__(self, obj_id): """ Test if a obj id exists """ - return obj_id in self.db and self.db[obj_id] != None + return obj_id in self.db and self.db[obj_id] is not None diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index acbb2d21..cd3c6a45 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -152,7 +152,6 @@ def special_dtype(**kwds): raise TypeError(f'Unknown special type "{name}"') - def find_item_type(data): """Find the item type of a simple object or collection of objects. @@ -182,6 +181,7 @@ def find_item_type(data): return None return item_types.pop() + def guess_dtype(data): """ Attempt to guess an appropriate dtype for the object, returning None if nothing is appropriate (or if it should be left up the the array @@ -197,6 +197,7 @@ def guess_dtype(data): return None + def is_float16_dtype(dt): if dt is None: return False @@ -204,6 +205,7 @@ def is_float16_dtype(dt): dt = np.dtype(dt) # normalize strings -> np.dtype objects return dt.kind == 'f' and dt.itemsize == 2 + def check_dtype(**kwds): """Check a dtype for h5py special type "hint" information. 
Only one keyword may be given. @@ -307,7 +309,7 @@ def getTypeItem(dt, metadata=None): "float32": "H5T_IEEE_F32", "float64": "H5T_IEEE_F64", } - + dt = np.dtype(dt) # convert 'int32', np.int32, etc. to a dtype if not metadata and dt.metadata: @@ -465,7 +467,6 @@ def getTypeItem(dt, metadata=None): item = {"name": name, "value": value} members.append(item) type_info["members"] = members - #type_info["mapping"] = mapping if dt.name not in predefined_int_types: raise TypeError("Unexpected integer type: " + dt.name) # maps to one of the HDF5 predefined types @@ -505,6 +506,17 @@ def isVlen(dt): return is_vlen +def isOpaqueDtype(dt): + """ + Return True if this is an opaque dtype + """ + if dt.kind == "V" and len(dt) <= 1 and len(dt.shape) == 0 and not dt.names: + return True + if dt.metadata and dt.metadata.get('h5py_opaque'): + return True + return False + + def getItemSize(typeItem): """ Get size of an item in bytes. @@ -805,7 +817,6 @@ def createBaseDataType(typeItem): raise KeyError("'base' not provided") if typeItem["base"] == "H5T_STD_REF_OBJ": dtRet = special_dtype(ref=Reference) - print("special dtype, metadata:", dtRet.metadata) elif typeItem["base"] == "H5T_STD_REF_DSETREG": dtRet = special_dtype(ref=RegionReference) else: diff --git a/src/h5json/objid.py b/src/h5json/objid.py index bd34bc56..a5453641 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -117,7 +117,7 @@ def getCollectionForId(obj_id): """return groups/datasets/datatypes based on id""" if not isinstance(obj_id, str): raise ValueError("invalid object id") - + collection = None if obj_id.startswith("g-"): collection = "groups" @@ -486,6 +486,7 @@ def getUuidFromId(id): else: raise ValueError(f"Unexpected obj_id: {id}") + def stripId(obj_id): """ return just the base id without any prefix (e.g. 
'g-') """ if len(obj_id) == UUID_LEN: @@ -494,4 +495,3 @@ def stripId(obj_id): return obj_id[2:] else: raise ValueError("unexpected obj_id: {obj_id}") - diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py index 44d178a5..6666587c 100644 --- a/src/h5json/reader/h5json_reader.py +++ b/src/h5json/reader/h5json_reader.py @@ -18,14 +18,13 @@ from ..array_util import jsonToArray from .. import selections from ..h5reader import H5Reader - + class H5JsonReader(H5Reader): """ This class can be used by HDF5DB to read content from an hdf5-json file """ - def __init__( self, filepath, @@ -55,7 +54,7 @@ def close(self): def get_root_id(self): """ Return root id """ return self._root_id - + def getObjectById(self, obj_id, include_attrs=True, include_links=True): """ return object with given id """ collection = getCollectionForId(obj_id) @@ -84,8 +83,14 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): continue name = item["name"] attr = {} - for k in ("type", "shape", "value"): - attr[k] = item[k] + if "type" not in item: + raise KeyError(f"expected to find type key for attribute {name} of {obj_id}") + attr["type"] = item["type"] + if "shape" not in item: + raise KeyError(f"expected to find shape key for attribute {name} of {obj_id}") + attr["shape"] = item["shape"] + if "value" in item: + attr["value"] = item["value"] attrs[name] = attr resp["attributes"] = attrs @@ -122,7 +127,6 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): return resp - def getAttribute(self, obj_id, name, includeData=True): """ Get attribute given an object id and name @@ -140,7 +144,6 @@ def getAttribute(self, obj_id, name, includeData=True): self.log.info(f"attr: [{name}] of {obj_id} not found") return None return attributes[name] - def getDatasetValues(self, obj_id, sel=None): """ @@ -175,12 +178,5 @@ def getDatasetValues(self, obj_id, sel=None): arr = arr[sel.slices] else: raise NotImplementedError("selection type 
not supported") - - return arr - - - - - - + return arr diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/reader/h5py_reader.py index cfae72cc..7042a259 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/reader/h5py_reader.py @@ -13,31 +13,130 @@ import numpy as np import logging -from ..objid import createObjId -from ..hdf5dtype import getTypeItem +from ..objid import createObjId, getCollectionForId +from ..hdf5dtype import getTypeItem, isOpaqueDtype from ..array_util import bytesArrayToList from .. import selections from .. import filters + +from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype from .h5reader import H5Reader - + class H5pyReader(H5Reader): """ - This class can be used by HDF5DB to read content from an HDF5 file (using h5py) + This class can be used by HDF5DB to read content from an HDF5 file (using h5py) """ + def _copy_element(self, val, src_dt, tgt_dt, fin=None): + """ convert the given dataset or attribute element from h5py to h5json equivalent """ + + out = None + if len(src_dt) > 0: + out_fields = [] + i = 0 + for name in src_dt.fields: + field_src_dt = src_dt.fields[name][0] + field_tgt_dt = tgt_dt.fields[name][0] + field_val = val[i] + i += 1 + out_field = self._copy_element(field_val, field_src_dt, field_tgt_dt, fin=fin) + out_fields.append(out_field) + out = tuple(out_fields) + elif src_dt.metadata and "ref" in src_dt.metadata: + if not tgt_dt.metadata or "ref" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be ref, but got: {tgt_dt}") + ref = tgt_dt.metadata["ref"] + if is_reference(ref): + # initialize out to null ref + out = h5py.Reference() # null h5py ref + + if ref and val: + try: + fin_obj = fin[val] + except AttributeError as ae: + msg = f"Unable able to get obj for ref value: {ae}" + self.log.error(msg) + raise ValueError(msg) + + addr = h5py.h5o.get_info(fin_obj.id).addr + if addr not in self._addr_map: + msg = f"No object found for ref object: 
{fin_obj.name}" + self.log.warning(msg) + out = "" + else: + obj_id = self._addr_map[addr] + collection = getCollectionForId(obj_id) + out = f"{collection}/{obj_id}" + + elif is_regionreference(ref): + self.log.warning("region reference not supported") + # TBD: just return a null region reference till we have support + out = "" + else: + raise TypeError(f"Unexpected ref type: {type(ref)}") + elif src_dt.metadata and "vlen" in src_dt.metadata: + if not isinstance(val, np.ndarray): + raise TypeError(f"Expecting ndarray or vlen element, but got: {type(val)}") + if not tgt_dt.metadata or "vlen" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be vlen, but got: {tgt_dt}") + src_vlen_dt = src_dt.metadata["vlen"] + tgt_vlen_dt = tgt_dt.metadata["vlen"] + if has_reference(src_vlen_dt): + if len(val.shape) == 0: + # scalar array + e = val[()] + v = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fin=fin) + out = np.array(v, dtype=tgt_dt) + else: + out = np.zeros(val.shape, dtype=tgt_dt) + for i in range(len(out)): + e = val[i] + out[i] = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fin=fin) + else: + # can just directly copy the array + out = np.zeros(val.shape, dtype=tgt_dt) + out[...] = val[...] + else: + out = val # can just copy as is + return out + + def _copy_array(self, src_arr, fin=None): + """Copy the numpy array to a new array. + Convert any reference type to point to item in the target's hierarchy. 
+ """ + + if not isinstance(src_arr, np.ndarray): + raise TypeError(f"Expecting ndarray, but got: {src_arr}") + tgt_dt = convert_dtype(src_arr.dtype, to_h5py=False) + tgt_arr = np.zeros(src_arr.shape, dtype=tgt_dt) + + if has_reference(src_arr.dtype): + # flatten array to simplify iteration + count = int(np.prod(src_arr.shape)) + tgt_arr_flat = tgt_arr.reshape((count,)) + src_arr_flat = src_arr.reshape((count,)) + for i in range(count): + e = src_arr_flat[i] + element = self._copy_element(e, src_arr.dtype, tgt_dt, fin=fin) + tgt_arr_flat[i] = element + tgt_arr = tgt_arr_flat.reshape(src_arr.shape) + else: + # can just copy the entire array + tgt_arr[...] = src_arr[...] + return tgt_arr + def visit(self, path, obj): name = obj.__class__.__name__ self.log.info(f"visit: {path} name: {name}") - + obj_id = createObjId(obj_type=name, root_id=self._root_id) # create uuid - self._id_map[obj_id] = obj - + self._id_map[obj_id] = obj + addr = h5py.h5o.get_info(obj.id).addr self._addr_map[addr] = obj_id - def __init__( self, filepath, @@ -66,13 +165,13 @@ def close(self): def get_root_id(self): """ Return root id """ return self._root_id - + def getObjIdByAddress(self, addr): if addr in self._addr_map: return self._addr_map[addr] else: return None - + def getAttribute(self, obj_id, name, include_data=True): """ Return JSON for the given attribute """ @@ -117,7 +216,7 @@ def getAttribute(self, obj_id, name, include_data=True): item["shape"] = shape_item if shape_item["class"] == "H5S_NULL": include_data = False - elif isinstance(type_item, dict) and type_item["class"] in ("H5T_OPAQUE"): + elif isinstance(type_item, dict) and type_item["class"] == "H5T_OPAQUE": # TBD - don't include data for OPAQUE until JSON serialization # issues are addressed include_data = False @@ -126,13 +225,18 @@ def getAttribute(self, obj_id, name, include_data=True): if include_data: try: - data = obj.attrs[name] + data = obj.attrs[name] + # convert from h5py to h5json + data = self._copy_array(data, 
fin=obj.file) except TypeError: self.log.warning("type error reading attribute") if include_data and data is not None: - item["value"] = bytesArrayToList(data) - + value = bytesArrayToList(data) + item["value"] = value + else: + pass # no data + # timestamps will be added by getAttributeItem() return item @@ -146,7 +250,7 @@ def getAttributes(self, obj_id, include_data=True): items[name] = item return items - + def _getLink(self, parent, link_name): if link_name not in parent: return None @@ -178,7 +282,7 @@ def _getLink(self, parent, link_name): item["id"] = None else: item["id"] = self._addr_map[addr] - + return item def _getLinks(self, grp): @@ -197,7 +301,7 @@ def _getGroup(self, grp, include_links=True): links = self._getLinks(grp) item["links"] = links return item - + def _getDatatype(self, ctype, include_attrs=True): self.log.info(f"getDatatype alias: ]{ctype.name}") item = {"alias": ctype.name} @@ -205,7 +309,6 @@ def _getDatatype(self, ctype, include_attrs=True): return item - def _getHDF5DatasetCreationProperties(self, dset, type_class): """ Get dataset creation properties maintained by HDF5 library """ @@ -267,7 +370,7 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): filter_id = filter_info[0] filter_prop["id"] = filter_id if filter_info[3]: - filter_prop["name"] = self.bytesArrayToList(filter_info[3]) + filter_prop["name"] = bytesArrayToList(filter_info[3]) if filter_id in filters._HDF_FILTERS: hdf_filter = filters._HDF_FILTERS[filter_id] filter_prop["class"] = hdf_filter["class"] @@ -296,8 +399,8 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): creationProps["filters"] = filter_props return creationProps - - def _getDataset(self, dset): + + def _getDataset(self, dset): self.log.info(f"getDataset alias: [{dset.name}]") item = {"alias": dset.name} @@ -308,21 +411,21 @@ def _getDataset(self, dset): addr = h5py.h5o.get_info(typeid).addr type_uuid = self.getObjIdByAddress(addr) committedType = self.getObjectById(type_uuid) 
- typeItem = committedType["type"] - typeItem["id"] = type_uuid + type_item = committedType["type"] + type_item["id"] = type_uuid else: - typeItem = getTypeItem(dset.dtype) - item["type"] = typeItem - - shapeItem = {} + type_item = getTypeItem(dset.dtype) + item["type"] = type_item + + shape_item = {} if dset.shape is None: # new with h5py 2.6, null space datasets will return None for shape - shapeItem["class"] = "H5S_NULL" + shape_item["class"] = "H5S_NULL" elif len(dset.shape) == 0: - shapeItem["class"] = "H5S_SCALAR" + shape_item["class"] = "H5S_SCALAR" else: - shapeItem["class"] = "H5S_SIMPLE" - shapeItem["dims"] = list(dset.shape) + shape_item["class"] = "H5S_SIMPLE" + shape_item["dims"] = list(dset.shape) maxshape = [] include_maxdims = False for i in range(len(dset.shape)): @@ -335,14 +438,13 @@ def _getDataset(self, dset): include_maxdims = True maxshape.append(extent) if include_maxdims: - shapeItem["maxdims"] = maxshape - item["shape"] = shapeItem - - item["cpl"] = self._getHDF5DatasetCreationProperties(dset, typeItem["class"]) + shape_item["maxdims"] = maxshape + item["shape"] = shape_item + item["cpl"] = self._getHDF5DatasetCreationProperties(dset, type_item["class"]) return item - + def getObjectById(self, obj_id, include_attrs=True, include_links=True): """ return object with given id """ if obj_id not in self._id_map: @@ -356,33 +458,35 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): obj_json = self._getDatatype(h5obj) else: raise TypeError(f"unexpected object type: {type(h5obj)}") - + if include_attrs: attributes = self.getAttributes(obj_id) obj_json["attributes"] = attributes return obj_json - def getDatasetValues(self, dset_id, sel=None): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same number of elements as the rank of the dataset. 
""" + dset = self._id_map[dset_id] self.log.info(f"getDatasetValues: {dset_id}") if dset.shape is None: # TBD: return something like h5py.Empty in this case? return None + if isOpaqueDtype(dset.dtype): + # TBD: Opaque data not supported yet + return None if sel is None or sel.select_type == selections.H5S_SELECT_ALL: arr = dset[...] elif isinstance(sel, selections.SimpleSelection): arr = dset[sel.slices] else: raise NotImplementedError("selection type not supported") - - return arr - - + # convert any h5py references to h5json references + arr = self._copy_array(arr, fin=dset.file) + return arr diff --git a/src/h5json/reader/h5reader.py b/src/h5json/reader/h5reader.py index 3923bb15..377bc3f9 100644 --- a/src/h5json/reader/h5reader.py +++ b/src/h5json/reader/h5reader.py @@ -16,11 +16,10 @@ class H5Reader(ABC): """ - This abstract class defines properties and methods that the Hdf5db class uses for reading from an HDF5 - compatible storage medium. + This abstract class defines properties and methods that the Hdf5db class uses for reading from an HDF5 + compatible storage medium. 
""" - def __init__( self, filepath, @@ -31,17 +30,17 @@ def __init__( self.log = app_logger else: self.log = logging.getLogger() - + @abstractmethod def get_root_id(self): """ Return root id """ pass - @abstractmethod + @abstractmethod def getObjectById(self, obj_id, include_attrs=True, include_links=True): """ return object with given id """ pass - + @abstractmethod def getAttribute(self, obj_id, name, includeData=True): """ @@ -63,4 +62,3 @@ def getDatasetValues(self, obj_id, sel=None): def close(self): """ close any open handles to the storage """ pass - diff --git a/src/h5json/selections.py b/src/h5json/selections.py index ef296d70..3a94b094 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -115,6 +115,7 @@ def select(obj, args): sel[args] return sel + def intersect(s1, s2): """ Return the intersection of two selections """ # TBD: this is currently only working for simple selections with stride 1 @@ -129,7 +130,7 @@ def intersect(s1, s2): raise TypeError("Expected hyperslab selection for second arg") if s1.shape != s2.shape: raise ValueError("selections have incompatible shapes") - + slices = [] rank = len(s1.shape) for dim in range(rank): diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 8c5ce6af..097a1ccc 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -17,10 +17,11 @@ from ..array_util import bytesArrayToList from .. import selections + class H5JsonWriter(H5Writer): """ - This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 - compatible storage medium. + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. 
""" def __init__( @@ -34,14 +35,14 @@ def __init__( self.alias_db = {} self.json = {} self._root_uuid = None - + def flush(self): """ Write dirty items """ # json writer doesn't support incremental updates, so we'll wait # for close to write out database self.log.info("flush") return False - + def close(self): """ close storage handle """ self.dumpFile() @@ -51,8 +52,7 @@ def getAliasList(self, obj_id): if obj_id not in self.alias_db: self.alias_db[obj_id] = [] return self.alias_db[obj_id] - - + def updateAliasList(self): """ update the alias list for each object """ # clear exiting aliases @@ -62,7 +62,6 @@ def updateAliasList(self): self._setAlias(self._root_uuid, set(), "/") - def _setAlias(self, obj_id, id_set, h5path): """ add the given h5path to the object's alias list If the object is a group, recurse through each hard link """ @@ -83,24 +82,23 @@ def _setAlias(self, obj_id, id_set, h5path): if link_json["class"] == "H5L_TYPE_HARD": tgt_id = link_json["id"] if tgt_id in id_set: - self.log.info(f"_setAlias - circular loop found") + self.log.info("_setAlias - circular loop found") else: - self._setAlias(tgt_id, id_set, h5path+link_name) + self._setAlias(tgt_id, id_set, f"{h5path}{link_name}") id_set.remove(obj_id) - def dumpAttribute(self, obj_id, attr_name): self.log.info(f"dumpAttribute: [{attr_name}]") item = self.db.getAttribute(obj_id, attr_name) response = {"name": attr_name} response["type"] = item["type"] response["shape"] = item["shape"] - if True: - if "value" not in item: - self.log.warning("no value key in attribute: " + attr_name) - else: - # dump values unless header -D was passed - response["value"] = item["value"] + + if "value" not in item: + self.log.warning(f"no value key in attribute: {attr_name}") + else: + # dump values unless header -D was passed + response["value"] = item["value"] return response def dumpAttributes(self, obj_id): @@ -142,7 +140,7 @@ def dumpGroup(self, obj_id): alias = self.getAliasList(obj_id) response["alias"] = alias 
- + if "cpl" in item: item["creationProperties"] = item["cpl"] attributes = self.dumpAttributes(obj_id) @@ -172,11 +170,8 @@ def dumpDataset(self, obj_id): response = {} self.log.info("dumpDataset: " + obj_id) item = self.db.getObjectById(obj_id) - if "alias" in item: - alias = item["alias"] - if alias: - self.log.info(f"dumpDataset alias: [{alias[0]}]") - response["alias"] = item["alias"] + alias = self.getAliasList(obj_id) + response["alias"] = alias response["type"] = item["type"] shapeItem = item["shape"] @@ -217,8 +212,6 @@ def dumpDataset(self, obj_id): sel_all = selections.select(dims, ...) arr = self.db.getDatasetValues(obj_id, sel_all) response["value"] = bytesArrayToList(arr) # dump values unless header flag was passed - else: - response["value"] = [] # empty list return response def dumpDatasets(self): @@ -235,7 +228,8 @@ def dumpDatasets(self): def dumpDatatype(self, obj_id): response = {} item = self.db.getObjectById(obj_id) - response["alias"] = item["alias"] + alias = self.getAliasList(obj_id) + response["alias"] = alias response["type"] = item["type"] if "cpl" in item: response["creationProperties"] = item["cpl"] @@ -255,7 +249,6 @@ def dumpDatatypes(self): self.json["datatypes"] = datatypes - def dumpFile(self): self._root_uuid = self.db.getObjectIdByPath("/") @@ -272,7 +265,10 @@ def dumpFile(self): self.dumpDatatypes() - print(json.dumps(self.json, sort_keys=True, indent=4)) - - - + indent = 4 + ensure_ascii = False + if self._filepath: + with open('data.json', 'w', encoding='utf-8') as f: + json.dump(self.json, f, ensure_ascii=ensure_ascii, indent=indent) + else: + print(json.dumps(self.json, sort_keys=True, ensure_ascii=ensure_ascii, indent=indent)) diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 07717ddf..2d212102 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -20,13 +20,11 @@ from .h5writer import H5Writer - class H5pyWriter(H5Writer): """ - This class saves state 
from the Hdf5Db class into an HDF5 file. + This class saves state from the Hdf5Db class into an HDF5 file. """ - def __init__( self, filepath, @@ -43,7 +41,6 @@ def __init__( self._id_map = {} - def _copy_element(self, val, src_dt, tgt_dt, fout=None): """ convert the given dataset or attribute element to h5py equivalent """ @@ -66,7 +63,7 @@ def _copy_element(self, val, src_dt, tgt_dt, fout=None): if is_reference(ref): # initialize out to null ref out = h5py.Reference() # null h5py ref - + if ref and val: if isinstance(val, bytes): val = val.decode("ascii") @@ -148,7 +145,6 @@ def _createGroup(self, parent, grp_json, name=None): grp = parent.create_group(name) return grp - def _createDataset(self, parent, dset_json, name=None): """ create a dataset object """ @@ -175,7 +171,7 @@ def _createDataset(self, parent, dset_json, name=None): msg = "fillvalue has incorrect number of elements" self.log.warning(msg) raise ValueError(msg) - + fillvalue = jsonToArray((), dtype, fillvalue) kwargs["fillvalue"] = fillvalue @@ -255,7 +251,7 @@ def _createDataset(self, parent, dset_json, name=None): kwargs["scaleoffset"] = filter_prop["scaleOffset"] else: self.log.info(f"Unexpected filter name: {filter_alias}, ignoring") - + dset = parent.create_dataset(name, **kwargs) return dset @@ -267,14 +263,10 @@ def _createDatatype(self, parent, ctype_json, name=None): parent[name] = dtype return parent[name] - def _createObjects(self, parent, links_json, visited=set()): """ create child object in the given group, recurse for any sub-groups """ for title in links_json: - #if title in parent: - # # TBD: this will do the wrong thing if the link tgt has changed - # continue link_json = links_json[title] link_class = link_json["class"] if link_class == "H5L_TYPE_SOFT" and title not in parent: @@ -299,11 +291,11 @@ def _createObjects(self, parent, links_json, visited=set()): self.log.warning("h5py_writer - expected to find {tgt_id} in id_map") continue """ - + collection = 
getCollectionForId(tgt_id) obj_json = self.db.getObjectById(tgt_id) - + if tgt_id in self._id_map: # object has already been created tgt_path = self._id_map[tgt_id] @@ -351,55 +343,51 @@ def updateDatasetValues(self, dset_id, dset): stop = start + sel.count[dim] step = sel.step[dim] slices.append(slice(start, stop, step)) - slices = tuple(slices) + slices = tuple(slices) dset[slices] = val self.log.debug(f"h5py_writer dset {dset.name} updated") - - def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ - + src_dt = createDataType(attr_json["type"]) - - # handle special case of null space attribute here + + # handle special case of null space attribute here shape_json = attr_json["shape"] shape_class = shape_json["class"] if shape_class == "H5S_NULL": obj.attrs[name] = h5py.Empty(convert_dtype(src_dt, to_h5py=True)) return - + if shape_class == "H5S_SCALAR": dims = () else: dims = shape_json["dims"] src_arr = jsonToArray(dims, src_dt, attr_json["value"]) tgt_arr = self._copy_array(src_arr, fout=obj.file) - - obj.attrs[name] = tgt_arr + obj.attrs[name] = tgt_arr def updateAttributes(self, obj_id, obj): """ create/replace any modified attributes """ obj_json = self.db.getObjectById(obj_id) - + if "attributes" not in obj_json: # no attributes return - + attrs = obj_json["attributes"] for name in attrs: attr_json = attrs[name] self.createAttribute(obj, name, attr_json) - def flush(self): """ Write dirty items """ if not self.db: # no db set yet return False - + self.log.info("h5py_writer.flush()") root_id = self.db.root_id self._id_map[root_id] = "/" @@ -420,8 +408,6 @@ def flush(self): self._mode = "a" # use append mode for future updates return True # all objects written successfully - def close(self): """ close storage handle """ self.flush() - diff --git a/src/h5json/writer/h5writer.py b/src/h5json/writer/h5writer.py index 4e57048f..aaab2e51 100644 --- a/src/h5json/writer/h5writer.py +++ b/src/h5json/writer/h5writer.py @@ -16,11 
+16,10 @@ class H5Writer(ABC): """ - This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 - compatible storage medium. + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. """ - def __init__( self, filepath, @@ -38,9 +37,7 @@ def __init__( else: self.log = logging.getLogger() - def set_db(self, db): - #TBD - use weak ref? self._db_ref = weakref.ref(db) @property @@ -53,9 +50,8 @@ def db(self): def flush(self): """ Write dirty items """ pass - + @abstractmethod def close(self): """ close storage handle """ pass - diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index df69f029..e6512d7f 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -44,9 +44,12 @@ def __init__(self, *args, **kwargs): self.log.info("init!") - def testGroup(self): + def testSimple(self): - with Hdf5db(h5_writer=H5JsonWriter("/tmp/foo.json", no_data=False), app_logger=self.log) as db: + filepath = "test/unit/out/h5json_writer_testSimple.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "attr1", value=[1,2,3,4]) db.createAttribute(root_id, "attr2", 42) @@ -72,10 +75,12 @@ def testGroup(self): - def testNullSpaceAttribute(self): + + filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.h5" with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) item = db.getAttribute(root_id, "A1") @@ -88,7 +93,10 @@ def testNullSpaceAttribute(self): self.assertEqual(value, None) def testScalarAttribute(self): + filepath = "test/unit/out/h5json_writer_testScalarAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, 
app_logger=self.log) root_id = db.getObjectIdByPath("/") dims = () value = 42 @@ -112,7 +120,10 @@ def testScalarAttribute(self): def testFixedStringAttribute(self): + filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") value = "Hello, world!" db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) @@ -131,7 +142,10 @@ def testFixedStringAttribute(self): def testVlenAsciiAttribute(self): + filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") value = b"Hello, world!" @@ -153,7 +167,10 @@ def testVlenAsciiAttribute(self): self.assertTrue(item["created"] > now - 1) def testVlenUtf8Attribute(self): + filepath = "test/unit/out/h5json_writer_testVlenutf8Attribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") value = b"Hello, world!" 
@@ -176,7 +193,10 @@ def testVlenUtf8Attribute(self): def testIntAttribute(self): + filepath = "test/unit/out/h5json_writer_testIntAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") value = [2, 3, 5, 7, 11] db.createAttribute(root_id, "A1", value, dtype=np.int16) @@ -192,7 +212,10 @@ def testIntAttribute(self): self.assertEqual(item_type["base"], "H5T_STD_I16LE") def testCreateReferenceAttribute(self): + filepath = "test/unit/out/h5json_writer_testCreateReferenceAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") dset_id = db.createDataset(shape=(), dtype=np.int32) @@ -215,7 +238,10 @@ def testCreateReferenceAttribute(self): self.assertEqual(attr_value[0], ds1_ref) def testCreateVlenReferenceAttribute(self): + filepath = "test/unit/out/h5json_writer_testVlenReferenceAttribute.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) @@ -248,7 +274,10 @@ def testCreateVlenReferenceAttribute(self): def testCommittedType(self): + filepath = "test/unit/out/h5json_writer_testCommittedType.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") dt = np.dtype("S15") @@ -277,7 +306,10 @@ def testCommittedType(self): def testCommittedCompoundType(self): + filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") dt_str = special_dtype(vlen=str) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 75b7e37b..3a8964e0 100644 --- 
a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -368,7 +368,6 @@ def testCreateVlenReferenceAttribute(self): item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SCALAR") - print("open:", filepath) with h5py.File(filepath) as f: self.assertTrue("DS1" in f) ds1 = f["DS1"] @@ -383,16 +382,19 @@ def testCreateVlenReferenceAttribute(self): def testCommittedType(self): filepath = "test/unit/out/h5py_writer_test_testCommittedType.h5" + dt = np.dtype("S15") + with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - dt = np.dtype("S15") ctype_id = db.createCommittedType(dt) db.createHardLink(root_id, "ctype", ctype_id) item = db.getObjectById(ctype_id) now = int(time.time()) self.assertTrue(item["created"] > now - 1) + db.createHardLink(root_id, "T1", ctype_id) + item_type = item["type"] @@ -411,15 +413,30 @@ def testCommittedType(self): self.assertEqual(attr_type["length"], 15) self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + with h5py.File(filepath) as f: + self.assertTrue("T1" in f) + t1 = f["T1"] + self.assertTrue(isinstance(t1, h5py.Datatype)) + self.assertEqual(t1.dtype, dt) + + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + print("a1:", a1) + self.assertEqual(a1, b"hello, world!") + def testCommittedCompoundType(self): + + filepath = "test/unit/out/h5py_writer_test_testCommittedCompoundType.h5" + with Hdf5db(app_logger=self.log) as db: + db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") dt_str = special_dtype(vlen=str) fields = [] fields.append(("field_1", np.dtype(">i8"))) - fields.append(("field_2", ">f8")) + fields.append(("field_2", np.dtype(">f8"))) fields.append(("field_3", np.dtype("S15"))) fields.append(("field_4", dt_str)) dt = np.dtype(fields) @@ -429,6 +446,7 @@ def testCommittedCompoundType(self): item = db.getObjectById(ctype_id) now = int(time.time()) self.assertTrue(item["created"] > now - 1) 
+ db.createHardLink(root_id, "T1", ctype_id) item_type = item["type"] @@ -449,6 +467,23 @@ def testCommittedCompoundType(self): value = db.getAttributeValue(root_id, "A1") self.assertTrue(isinstance(value, np.ndarray)) + + with h5py.File(filepath) as f: + self.assertTrue("T1" in f) + t1 = f["T1"] + self.assertTrue(isinstance(t1, h5py.Datatype)) + print("dtype:", t1.dtype) + self.assertEqual(len(t1.dtype), 4) + sub_dt = t1.dtype["field_1"] + self.assertEqual(sub_dt, np.dtype(">i8")) + sub_dt = t1.dtype["field_2"] + self.assertEqual(sub_dt, np.dtype(">f8")) + sub_dt = t1.dtype["field_3"] + self.assertEqual(sub_dt, np.dtype("S15")) + sub_dt = t1.dtype["field_4"] + self.assertEqual(sub_dt, h5py.special_dtype(vlen=str)) + + if __name__ == "__main__": diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 8931dd9c..dd6869ec 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -410,6 +410,34 @@ def testScalarDataset(self): self.assertEqual(arr.min(), 42) self.assertEqual(arr.max(), 42) + def testResizableDataset(self): + with Hdf5db(app_logger=self.log) as db: + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + maxdims = (None, ncols*2) + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + + # resize limited dimension + db.resizeDataset(dset_id, (nrows, ncols*2)) + + # try to go beyond max extent + try: + db.resizeDataset(dset_id, (nrows, ncols*3)) + self.assertTrue(False) + except ValueError: + pass # expected + + # resize unlimited dimension + db.resizeDataset(dset_id, (nrows*10, ncols)) + + + + diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index 63efc239..2f798378 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -18,6 +18,7 @@ from h5json.hdf5dtype import check_dtype from h5json.hdf5dtype import Reference from 
h5json.hdf5dtype import RegionReference +from h5json.hdf5dtype import isOpaqueDtype class Hdf5dtypeTest(unittest.TestCase): @@ -287,6 +288,7 @@ def testCompoundArrayVlenStringTypeItem(self): def testOpaqueTypeItem(self): dt = np.dtype("V200") + self.assertTrue(isOpaqueDtype(dt)) typeItem = hdf5dtype.getTypeItem(dt) typeSize = hdf5dtype.getItemSize(typeItem) self.assertEqual(typeItem["class"], "H5T_OPAQUE") From 541b96663b4c78fade681907429ae6eb2a4a2de8 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 28 Mar 2025 13:45:03 +0100 Subject: [PATCH 020/129] fix for vlen encoding --- src/h5json/array_util.py | 41 ++++---- src/h5json/hdf5db.py | 2 + src/h5json/writer/h5json_writer.py | 1 - src/h5json/writer/h5py_writer.py | 24 ++--- test/unit/array_util_test.py | 162 +++++++++++++++++++++-------- test/unit/h5json_reader_test.py | 6 +- test/unit/h5json_writer_test.py | 37 +++---- test/unit/h5py_reader_test.py | 6 +- test/unit/h5py_writer_test.py | 50 ++++----- test/unit/hdf5db_test.py | 55 ++++------ testall.py | 11 +- 11 files changed, 216 insertions(+), 179 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 67c847c3..1640d687 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -30,7 +30,7 @@ def bytesArrayToList(data): if len(data.shape) == 0: is_list = False data = data.tolist() # tolist will return a scalar in this case - if type(data) in (list, tuple): + if type(data) in (list, tuple, np.ndarray): is_list = True else: is_list = False @@ -40,7 +40,6 @@ def bytesArrayToList(data): is_list = True else: is_list = False - if is_list: out = [] for item in data: @@ -71,8 +70,6 @@ def toTuple(rank, data): else: return tuple(toTuple(rank - 1, x) for x in data) else: - if isinstance(data, str): - data = data.encode("utf8") return data @@ -124,12 +121,15 @@ def jsonToArray(data_shape, data_dtype, data_json): Return numpy array from the given json array. 
""" def fillVlenArray(rank, data, arr, index): - for i in range(len(data)): - if rank > 1: - index = fillVlenArray(rank - 1, data[i], arr, index) - else: - arr[index] = data[i] - index += 1 + if arr.shape == (): + arr[()] = data + else: + for i in range(len(data)): + if rank > 1: + index = fillVlenArray(rank - 1, data[i], arr, index) + else: + arr[index] = data[i] + index += 1 return index if data_json is None: @@ -149,25 +149,26 @@ def fillVlenArray(rank, data, arr, index): if type(data_json) in (list, tuple): converted_data = [] - if npoints == 1 and len(data_json) == len(data_dtype): - converted_data.append(toTuple(0, data_json)) + if npoints == 1: + converted_data = toTuple(np_shape_rank, data_json) else: converted_data = toTuple(np_shape_rank, data_json) data_json = converted_data - else: - if isinstance(data_json, str): - data_json = data_json.encode("utf8") - data_json = [data_json,] # listify if isVlen(data_dtype): - arr = np.zeros((npoints,), dtype=data_dtype) + if np_shape_rank == 0 and npoints == 1: + arr_shape = () + else: + arr_shape = (npoints,) + arr = np.zeros(arr_shape, dtype=data_dtype) fillVlenArray(np_shape_rank, data_json, arr, 0) else: try: arr = np.array(data_json, dtype=data_dtype) - except UnicodeEncodeError as ude: - msg = "Unable to encode data" - raise ValueError(msg) from ude + except UnicodeEncodeError: + # Unable to encode data + # TBD: look into using surrogate encoding here + raise # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 5c7e37a6..029c6645 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -393,7 +393,9 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): # We need this to handle special string types. 
value = np.asarray(value, dtype=dtype) + value_json = bytesArrayToList(value) + else: value_json = None diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 097a1ccc..bdf59822 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -264,7 +264,6 @@ def dumpFile(self): self.dumpDatasets() self.dumpDatatypes() - indent = 4 ensure_ascii = False if self._filepath: diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 2d212102..68a1f147 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -43,7 +43,6 @@ def __init__( def _copy_element(self, val, src_dt, tgt_dt, fout=None): """ convert the given dataset or attribute element to h5py equivalent """ - out = None if len(src_dt) > 0: out_fields = [] @@ -90,23 +89,24 @@ def _copy_element(self, val, src_dt, tgt_dt, fout=None): else: raise TypeError(f"Unexpected ref type: {type(ref)}") elif src_dt.metadata and "vlen" in src_dt.metadata: - if not isinstance(val, np.ndarray): - raise TypeError(f"Expecting ndarray or vlen element, but got: {type(val)}") if not tgt_dt.metadata or "vlen" not in tgt_dt.metadata: raise TypeError(f"Expected tgt dtype to be vlen, but got: {tgt_dt}") src_vlen_dt = src_dt.metadata["vlen"] tgt_vlen_dt = tgt_dt.metadata["vlen"] + if has_reference(src_vlen_dt): - if len(val.shape) == 0: - # scalar array - e = val[()] - v = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fout=fout) - out = np.array(v, dtype=tgt_dt) - else: - out = np.zeros(val.shape, dtype=tgt_dt) - for i in range(len(out)): + if isinstance(val, np.ndarray) and val.shape == (): + val = val[()] + if isinstance(val, np.ndarray) or isinstance(val, list) or isinstance(val, tuple): + count = len(val) + out = np.zeros((count,), dtype=tgt_dt) + for i in range(count): e = val[i] out[i] = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fout=fout) + else: + # scalar array + v = self._copy_element(val, src_vlen_dt, 
tgt_vlen_dt, fout=fout) + out = np.array(v, dtype=tgt_dt) else: # can just directly copy the array out = np.zeros(val.shape, dtype=tgt_dt) @@ -119,7 +119,6 @@ def _copy_array(self, src_arr, fout=None): """Copy the numpy array to a new array. Convert any reference type to point to item in the target's hierarchy. """ - if not isinstance(src_arr, np.ndarray): raise TypeError(f"Expecting ndarray, but got: {src_arr}") tgt_dt = convert_dtype(src_arr.dtype, to_h5py=True) @@ -365,7 +364,6 @@ def createAttribute(self, obj, name, attr_json): dims = shape_json["dims"] src_arr = jsonToArray(dims, src_dt, attr_json["value"]) tgt_arr = self._copy_array(src_arr, fout=obj.file) - obj.attrs[name] = tgt_arr def updateAttributes(self, obj_id, obj): diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index d37c7f5f..f68cbbc8 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -95,6 +95,9 @@ def testGetNumElements(self): self.assertEqual(nelements, 80) def testJsonToArray(self): + + # simple integer + dt = np.dtype("i4") shape = [4, ] data = [0, 2, 4, 6] @@ -105,50 +108,40 @@ def testJsonToArray(self): for i in range(4): self.assertEqual(out[i], i * 2) - # compound type - dt = np.dtype([("a", "i4"), ("b", "S5")]) - shape = [2, ] - data = [[4, "four"], [5, "five"]] + shape = () # scalar + data = 42 out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertEqual(out[()], 42) - self.assertEqual(out.shape, (2,)) - self.assertTrue(isinstance(out[0], np.void)) - e0 = out[0].tolist() - self.assertEqual(e0, (4, b"four")) - self.assertTrue(isinstance(out[1], np.void)) - e1 = out[1].tolist() - self.assertEqual(e1, (5, b"five")) - - shape = [1, ] - data = [ - [6, "six"], - ] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, b"six")) - - data = [6, "six"] - out = jsonToArray(shape, dt, data) - e0 = out[0].tolist() - self.assertEqual(e0, (6, 
b"six")) - - # test ascii chars >127 - dt = np.dtype("S26") - data = "extended ascii char 241: " + chr(241) + # VLEN Scalar str + dt = special_dtype(vlen=str) + data = "I'm a string!" + shape = [] out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'extended ascii char 241: \xc3') + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + val = out[()] + self.assertEqual(val, data) - dt = np.dtype("S12") - data = "eight: \u516b" - out = jsonToArray(shape, dt, data) - self.assertEqual(out[0], b'eight: \xe5\x85\xab') + # VLEN one element str + dt = special_dtype(vlen=str) + data = "I'm a string!" + shape = [1,] + out = jsonToArray(shape, dt, [data,]) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + val = out[0] + self.assertEqual(val, data) # VLEN ascii dt = special_dtype(vlen=bytes) data = [b"one", b"two", b"three", b"four", b"five"] shape = [5, ] out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5,)) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") @@ -166,6 +159,7 @@ def testJsonToArray(self): ] shape = [2,] out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], str) self.assertEqual(out.dtype.kind, "O") @@ -173,21 +167,40 @@ def testJsonToArray(self): self.assertEqual(out[0], tuple(data[0])) self.assertEqual(out[1], tuple(data[1])) - # VLEN Scalar str - dt = special_dtype(vlen=str) - data = "I'm a string!" 
- shape = [1, ] - out = jsonToArray(shape, dt, data) - + # VLEN unicode dt = special_dtype(vlen=bytes) data = ["one", "two", "three", "four", "five"] shape = [5, ] out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out[2], b"three") + self.assertEqual(out[2], "three") + + # test ascii chars >127 + dt = np.dtype("S26") + shape = [] + data = "extended ascii char 241: " + chr(241) + try: + jsonToArray(shape, dt, data) + self.assertTrue(False) + except ValueError: + pass # expected + + dt = special_dtype(vlen=str) + out = jsonToArray(shape, dt, data) # vlen str should be ok + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data) + + dt = np.dtype("S12") + data = "eight: \u516b" + try: + jsonToArray(shape, dt, data) + self.assertTrue(False) + except UnicodeEncodeError: + pass # expected # VLEN data dt = special_dtype(vlen=np.dtype("int32")) @@ -270,6 +283,62 @@ def testJsonToArray(self): self.assertTrue(isinstance(e, tuple)) self.assertEqual(e, (id0, id1, id2)) + # compound type + dt = np.dtype([("a", "i4"), ("b", "S5")]) + shape = [2, ] + data = [[4, "four"], [5, "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + self.assertEqual(out.shape, (2,)) + self.assertTrue(isinstance(out[0], np.void)) + e0 = out[0].tolist() + self.assertEqual(e0, (4, b"four")) + self.assertTrue(isinstance(out[1], np.void)) + e1 = out[1].tolist() + self.assertEqual(e1, (5, b"five")) + + # compound with VLEN element + + dt_str = special_dtype(vlen=str) + dt = np.dtype([("a", "i4"), ("b", dt_str)]) + shape = [1, ] + data = [[6, "six"],] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + e0 = out[0] + + e0 = out[0].tolist() + self.assertEqual(e0, (6, "six")) + shape = [] + 
data = [6, "six",] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + e0 = out[()] + self.assertEqual(len(e0), 2) + self.assertEqual(e0[0], 6) + self.assertEqual(e0[1], "six") + + # one element compound + shape = [1, ] + data = [[6, "six"],] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + e0 = out[0].tolist() + self.assertEqual(e0, (6, "six")) + + # scalar compound + shape = [] + data = [6, "six"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + e0 = out[()].tolist() + self.assertEqual(e0, (6, "six")) + # compound type with array field dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) shape = [2, ] @@ -472,8 +541,8 @@ def testToBytes(self): # dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": str}) + arr = np.zeros((4,), dtype=dt) + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) buffer = arrayToBytes(arr) @@ -499,7 +568,8 @@ def testToBytes(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": bytes}) + + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) buffer = arrayToBytes(arr) @@ -625,7 +695,8 @@ def testArrToBytesBase64(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": str}) + + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray(["hi", "bye"], 
dtype=dt_str)) arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) buffer = arrayToBytes(arr, encoding="base64") @@ -645,7 +716,8 @@ def testArrToBytesBase64(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - dt_str = np.dtype("O", metadata={"vlen": bytes}) + + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) buffer = arrayToBytes(arr, encoding="base64") diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index 5027232e..06946f94 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -61,7 +61,7 @@ def testSimple(self): self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) dset_shape = dset_json["shape"] self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10,10]) + self.assertEqual(dset_shape["dims"], [10, 10]) # try adding an attribute db.createAttribute(dset111_id, "attr3", value=42) @@ -85,7 +85,3 @@ def testSimple(self): # setup test files unittest.main() - - - - diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index e6512d7f..608f627f 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -43,15 +43,14 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") - def testSimple(self): - + filepath = "test/unit/out/h5json_writer_testSimple.h5" with Hdf5db(app_logger=self.log) as db: db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "attr1", value=[1,2,3,4]) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) db.createAttribute(root_id, "attr2", 42) g1_id = db.createGroup() db.createHardLink(root_id, "g1", g1_id) @@ -60,7 +59,7 @@ def 
testSimple(self): g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10,10), dtype=np.int32) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) arr = np.zeros((10, 10), dtype=np.int32) for i in range(10): for j in range(10): @@ -72,11 +71,9 @@ def testSimple(self): db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) db.flush() - - def testNullSpaceAttribute(self): - + filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.h5" with Hdf5db(app_logger=self.log) as db: @@ -117,7 +114,6 @@ def testScalarAttribute(self): self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") - def testFixedStringAttribute(self): filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.h5" @@ -139,7 +135,7 @@ def testFixedStringAttribute(self): now = int(time.time()) self.assertTrue(item["created"] > now - 1) ret_value = db.getAttributeValue(root_id, "A1") - + self.assertEqual(ret_value, b'Hello, world!') def testVlenAsciiAttribute(self): filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.h5" @@ -147,7 +143,7 @@ def testVlenAsciiAttribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") - + value = b"Hello, world!" dt = special_dtype(vlen=bytes) @@ -172,7 +168,7 @@ def testVlenUtf8Attribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") - + value = b"Hello, world!" 
dt = special_dtype(vlen=str) @@ -190,7 +186,6 @@ def testVlenUtf8Attribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - def testIntAttribute(self): filepath = "test/unit/out/h5json_writer_testIntAttribute.h5" @@ -218,7 +213,7 @@ def testCreateReferenceAttribute(self): db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) dt = special_dtype(ref=Reference) @@ -229,7 +224,7 @@ def testCreateReferenceAttribute(self): item = db.getAttribute(root_id, "A1") attr = db.getAttribute(root_id, "A1") self.assertTrue("shape" in attr) - + attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") @@ -243,14 +238,14 @@ def testCreateVlenReferenceAttribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) grp_id = db.createGroup() db.createHardLink(root_id, "G1", grp_id) dt_base = special_dtype(ref=Reference) dt = special_dtype(vlen=dt_base) - + ds1_ref = "datasets/" + dset_id grp_ref = "groups/" + grp_id ref_arr = np.zeros((2,), dtype=dt_base) @@ -258,7 +253,7 @@ def testCreateVlenReferenceAttribute(self): ref_arr[1] = grp_ref vlen_arr = np.zeros((), dtype=dt) vlen_arr[()] = ref_arr - + db.createAttribute(root_id, "A1", vlen_arr) item = db.getAttribute(root_id, "A1") @@ -271,7 +266,6 @@ def testCreateVlenReferenceAttribute(self): item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SCALAR") - def testCommittedType(self): filepath = "test/unit/out/h5json_writer_testCommittedType.h5" @@ 
-280,7 +274,7 @@ def testCommittedType(self): db.writer = H5JsonWriter(filepath, app_logger=self.log) root_id = db.getObjectIdByPath("/") dt = np.dtype("S15") - + ctype_id = db.createCommittedType(dt) db.createHardLink(root_id, "ctype", ctype_id) item = db.getObjectById(ctype_id) @@ -304,7 +298,6 @@ def testCommittedType(self): self.assertEqual(attr_type["length"], 15) self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") - def testCommittedCompoundType(self): filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.h5" @@ -342,10 +335,10 @@ def testCommittedCompoundType(self): attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_COMPOUND") - + value = db.getAttributeValue(root_id, "A1") self.assertTrue(isinstance(value, np.ndarray)) - + if __name__ == "__main__": # setup test files diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index b878434e..c8b14cb4 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -66,7 +66,7 @@ def testSimple(self): self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) dset_shape = dset_json["shape"] self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10,10]) + self.assertEqual(dset_shape["dims"], [10, 10]) # try adding an attribute db.createAttribute(dset111_id, "attr3", value=42) @@ -90,7 +90,3 @@ def testSimple(self): # setup test files unittest.main() - - - - diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 3a8964e0..38ea8bce 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -44,14 +44,13 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") - def testSimple(self): - + filepath = "test/unit/out/h5py_writer_test_testSimple.h5" with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - 
db.createAttribute(root_id, "attr1", value=[1,2,3,4]) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) db.createAttribute(root_id, "attr2", 42) g1_id = db.createGroup() db.createHardLink(root_id, "g1", g1_id) @@ -61,7 +60,7 @@ def testSimple(self): g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10,10), dtype=np.int32) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) arr = np.zeros((10, 10), dtype=np.int32) for i in range(10): for j in range(10): @@ -83,10 +82,10 @@ def testSimple(self): g11 = g1["g1.1"] self.assertTrue("dset1.1.1" in g11) dset = g11["dset1.1.1"] - self.assertEqual(dset.shape, (10,10)) + self.assertEqual(dset.shape, (10, 10)) for i in range(10): for j in range(10): - self.assertEqual(dset[i, j], i*j) + self.assertEqual(dset[i, j], i * j) self.assertTrue("g2" in f) g2 = f["g2"] self.assertTrue("extlink" in g2) @@ -101,7 +100,6 @@ def testSimple(self): self.assertTrue("a1" in g1.attrs) self.assertTrue("a2" in g1.attrs) - print("create group /g2/g2.1") g21 = db.createGroup() db.createHardLink(g2_id, "g2.1", g21) db.flush() @@ -109,7 +107,7 @@ def testSimple(self): with h5py.File(filepath) as f: g2 = f["g2"] self.assertTrue("g2.1" in g2) - + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) arr = np.zeros((), dtype=np.int32) arr[()] = 42 @@ -178,7 +176,6 @@ def testScalarAttribute(self): a1 = f.attrs["A1"] self.assertTrue(isinstance(a1, np.int32)) self.assertEqual(a1, 42) - def testFixedStringAttribute(self): @@ -205,7 +202,6 @@ def testFixedStringAttribute(self): a1 = f.attrs["A1"] self.assertTrue(isinstance(a1, bytes)) self.assertEqual(a1, b'Hello, world!') - def testVlenAsciiAttribute(self): @@ -215,7 +211,7 @@ def testVlenAsciiAttribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - + dt = special_dtype(vlen=bytes) # write the attribute @@ -247,7 +243,7 @@ def 
testVlenUtf8Attribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - + dt = special_dtype(vlen=str) # write the attribute @@ -270,7 +266,6 @@ def testVlenUtf8Attribute(self): a1 = f.attrs["A1"] self.assertTrue(isinstance(a1, str)) self.assertEqual(a1, value) - def testIntAttribute(self): @@ -299,7 +294,6 @@ def testIntAttribute(self): self.assertEqual(a1.shape, (5,)) for i in range(5): self.assertEqual(a1[i], value[i]) - def testCreateReferenceAttribute(self): @@ -308,7 +302,7 @@ def testCreateReferenceAttribute(self): db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) dt = special_dtype(ref=Reference) @@ -318,7 +312,7 @@ def testCreateReferenceAttribute(self): db.createAttribute(root_id, "A1", value, dtype=dt) attr = db.getAttribute(root_id, "A1") self.assertTrue("shape" in attr) - + attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") @@ -339,14 +333,14 @@ def testCreateVlenReferenceAttribute(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) grp_id = db.createGroup() db.createHardLink(root_id, "G1", grp_id) dt_base = special_dtype(ref=Reference) dt = special_dtype(vlen=dt_base) - + ds1_ref = "datasets/" + dset_id grp_ref = "groups/" + grp_id ref_arr = np.zeros((2,), dtype=dt_base) @@ -354,7 +348,7 @@ def testCreateVlenReferenceAttribute(self): ref_arr[1] = grp_ref vlen_arr = np.zeros((), dtype=dt) vlen_arr[()] = ref_arr - + db.createAttribute(root_id, "A1", vlen_arr) item = 
db.getAttribute(root_id, "A1") @@ -371,13 +365,14 @@ def testCreateVlenReferenceAttribute(self): with h5py.File(filepath) as f: self.assertTrue("DS1" in f) ds1 = f["DS1"] + self.assertTrue(ds1) self.assertTrue("G1" in f) g1 = f["G1"] + self.assertTrue(g1) self.assertTrue("A1" in f.attrs) a1 = f.attrs["A1"] ref_obj = f[a1[0]] self.assertEqual(ref_obj.name, "/DS1") - def testCommittedType(self): @@ -387,7 +382,7 @@ def testCommittedType(self): with Hdf5db(app_logger=self.log) as db: db.writer = H5pyWriter(filepath, no_data=False) root_id = db.getObjectIdByPath("/") - + ctype_id = db.createCommittedType(dt) db.createHardLink(root_id, "ctype", ctype_id) item = db.getObjectById(ctype_id) @@ -395,7 +390,6 @@ def testCommittedType(self): self.assertTrue(item["created"] > now - 1) db.createHardLink(root_id, "T1", ctype_id) - item_type = item["type"] self.assertEqual(item_type["class"], "H5T_STRING") @@ -421,9 +415,7 @@ def testCommittedType(self): self.assertTrue("A1" in f.attrs) a1 = f.attrs["A1"] - print("a1:", a1) - self.assertEqual(a1, b"hello, world!") - + self.assertEqual(a1, b"hello world!") def testCommittedCompoundType(self): @@ -464,15 +456,13 @@ def testCommittedCompoundType(self): attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_COMPOUND") - - value = db.getAttributeValue(root_id, "A1") - self.assertTrue(isinstance(value, np.ndarray)) + arr = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(arr, np.ndarray)) with h5py.File(filepath) as f: self.assertTrue("T1" in f) t1 = f["T1"] self.assertTrue(isinstance(t1, h5py.Datatype)) - print("dtype:", t1.dtype) self.assertEqual(len(t1.dtype), 4) sub_dt = t1.dtype["field_1"] self.assertEqual(sub_dt, np.dtype(">i8")) @@ -484,8 +474,6 @@ def testCommittedCompoundType(self): self.assertEqual(sub_dt, h5py.special_dtype(vlen=str)) - - if __name__ == "__main__": # setup test files diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index dd6869ec..cbd7c879 100755 --- 
a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -42,7 +42,6 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") - def testGroup(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") @@ -105,13 +104,13 @@ def testGroup(self): links = db.getLinks(g2_id) self.assertEqual(len(links), 3) - for title in "slink", "extlink", "cust": + for title in "slink", "extlink", "cust": self.assertTrue(title in links) db.deleteLink(g2_id, "cust") links = db.getLinks(g2_id) self.assertEqual(len(links), 2) - for title in "slink", "extlink": + for title in "slink", "extlink": self.assertTrue(title in links) try: @@ -123,7 +122,6 @@ def testGroup(self): ret = db.getLink(g2_id, "not_a_link") self.assertTrue(ret is None) - def testNullSpaceAttribute(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") @@ -159,7 +157,6 @@ def testScalarAttribute(self): self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") - def testFixedStringAttribute(self): with Hdf5db(app_logger=self.log) as db: @@ -179,12 +176,11 @@ def testFixedStringAttribute(self): self.assertTrue(item["created"] > now - 1) ret_value = db.getAttributeValue(root_id, "A1") self.assertEqual(ret_value, value.encode("ascii")) - def testVlenAsciiAttribute(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") - + value = b"Hello, world!" dt = special_dtype(vlen=bytes) @@ -206,7 +202,7 @@ def testVlenAsciiAttribute(self): def testVlenUtf8Attribute(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") - + value = b"Hello, world!" 
dt = special_dtype(vlen=str) @@ -224,7 +220,6 @@ def testVlenUtf8Attribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) - def testIntAttribute(self): with Hdf5db(app_logger=self.log) as db: @@ -246,7 +241,7 @@ def testCreateReferenceAttribute(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) dt = special_dtype(ref=Reference) @@ -257,7 +252,7 @@ def testCreateReferenceAttribute(self): item = db.getAttribute(root_id, "A1") attr = db.getAttribute(root_id, "A1") self.assertTrue("shape" in attr) - + attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") @@ -268,14 +263,14 @@ def testCreateReferenceAttribute(self): def testCreateVlenReferenceAttribute(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_id = db.createDataset(shape=(), dtype=np.int32) db.createHardLink(root_id, "DS1", dset_id) grp_id = db.createGroup() db.createHardLink(root_id, "G1", grp_id) dt_base = special_dtype(ref=Reference) dt = special_dtype(vlen=dt_base) - + ds1_ref = "datasets/" + dset_id grp_ref = "groups/" + grp_id ref_arr = np.zeros((2,), dtype=dt_base) @@ -283,7 +278,7 @@ def testCreateVlenReferenceAttribute(self): ref_arr[1] = grp_ref vlen_arr = np.zeros((), dtype=dt) vlen_arr[()] = ref_arr - + db.createAttribute(root_id, "A1", vlen_arr) item = db.getAttribute(root_id, "A1") @@ -296,13 +291,12 @@ def testCreateVlenReferenceAttribute(self): item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SCALAR") - def testCommittedType(self): with Hdf5db(app_logger=self.log) as db: root_id = db.getObjectIdByPath("/") dt = np.dtype("S15") - + ctype_id = 
db.createCommittedType(dt) db.createHardLink(root_id, "ctype", ctype_id) item = db.getObjectById(ctype_id) @@ -360,7 +354,7 @@ def testCommittedCompoundType(self): attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_COMPOUND") - + value = db.getAttributeValue(root_id, "A1") self.assertTrue(isinstance(value, np.ndarray)) @@ -382,14 +376,13 @@ def testSimpleDataset(self): self.assertEqual(arr.max(), 0) row = np.zeros((ncols,), dtype=dtype) for i in range(nrows): - row[:] = list(range(i*10, (i + 1)*10)) + row[:] = list(range(i * 10, (i + 1) * 10)) row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) db.setDatasetValues(dset_id, row_sel, row) arr = db.getDatasetValues(dset_id, sel_all) for i in range(nrows): - row = np.array(list(range(i*10, (i + 1)*10)), dtype=dtype) - np.testing.assert_array_equal(arr[i, :], row) - + row = np.array(list(range(i * 10, (i + 1) * 10)), dtype=dtype) + np.testing.assert_array_equal(arr[i, :], row) def testScalarDataset(self): dtype = np.int32 @@ -416,35 +409,25 @@ def testResizableDataset(self): ncols = 10 shape = (nrows, ncols) dtype = np.int32 - maxdims = (None, ncols*2) + maxdims = (None, ncols * 2) root_id = db.getObjectIdByPath("/") dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) db.createHardLink(root_id, "dset", dset_id) db.createAttribute(dset_id, "a1", "Hello, world") - + # resize limited dimension - db.resizeDataset(dset_id, (nrows, ncols*2)) + db.resizeDataset(dset_id, (nrows, ncols * 2)) # try to go beyond max extent try: - db.resizeDataset(dset_id, (nrows, ncols*3)) + db.resizeDataset(dset_id, (nrows, ncols * 3)) self.assertTrue(False) except ValueError: pass # expected # resize unlimited dimension - db.resizeDataset(dset_id, (nrows*10, ncols)) - - - - - - - - - + db.resizeDataset(dset_id, (nrows * 10, ncols)) - if __name__ == "__main__": # setup test files diff --git a/testall.py b/testall.py index 8e5d041e..97a5efd4 100755 --- a/testall.py +++ b/testall.py @@ -15,7 +15,16 @@ 
import shutil import h5py -unit_tests = ("hdf5dtype_test", "hdf5db_test") +unit_tests = ( + "array_util_test", + "objid_test", + "hdf5dtype_test", + "hdf5db_test", + "h5json_reader_test", + "h5json_writer_test", + "h5py_reader_test", + "h5py_writer_test", + ) integ_tests = ("h5tojson_test", "jsontoh5_test") # verify the hdf5 lib version is recent From 398e2d3214e984c5c518106dcab6ed9c2dde479f Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 12:08:27 +0200 Subject: [PATCH 021/129] fix for reference types --- src/h5json/h5tojson/h5tojson.py | 15 +++++--- src/h5json/hdf5db.py | 40 ++++++++++---------- src/h5json/jsontoh5/jsontoh5.py | 11 ++++-- src/h5json/objid.py | 31 ++++++++++++--- src/h5json/reader/h5json_reader.py | 34 ++++++++++++++--- src/h5json/writer/h5py_writer.py | 60 ++++++++++++++++-------------- test/integ/jsontoh5_test.py | 4 +- test/unit/array_util_test.py | 25 ++++++------- test/unit/h5json_reader_test.py | 10 +++++ test/unit/h5py_writer_test.py | 1 + test/unit/objid_test.py | 14 ++++++- testall.py | 2 +- 12 files changed, 164 insertions(+), 83 deletions(-) diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index 48a4b83b..a2259dae 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -17,7 +17,7 @@ from h5json import Hdf5db from h5json.writer.h5json_writer import H5JsonWriter from h5json.reader.h5py_reader import H5pyReader - + def main(): if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"): @@ -31,7 +31,7 @@ def main(): no_data = True else: filename = sys.argv[i] - + # create logger log = logging.getLogger("h5tojson") # log.setLevel(logging.WARN) @@ -48,9 +48,14 @@ def main(): log.info(f"h5tojson {filename}") kwargs = {"app_logger": log} - - with Hdf5db(h5_reader=H5pyReader(filename, **kwargs), h5_writer=H5JsonWriter(None, no_data=no_data, **kwargs), **kwargs) as db: - pass + reader = H5pyReader(filename, **kwargs) + writer = H5JsonWriter(None, no_data=no_data, **kwargs) 
+ kwargs["h5_reader"] = reader + kwargs["h5_writer"] = writer + + with Hdf5db(**kwargs) as db: + db.flush() + if __name__ == "__main__": main() diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 029c6645..4e9cd353 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -15,7 +15,7 @@ from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype from .array_util import jsonToArray, bytesArrayToList from .dset_util import resize_dataset -from .objid import createObjId, getCollectionForId +from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId from . import selections from .apiversion import _apiver from .reader.h5reader import H5Reader @@ -85,8 +85,6 @@ def reader(self, value: H5Reader): if self._reader: self._reader.close() self._reader = value - if self._reader: - self._reader.set_db(self) @property def writer(self): @@ -145,17 +143,10 @@ def flush(self): if not self.writer: return # nothing to do - obj_ids = self._new_objects.union(self._dirty_objects) - if not self.writer.flush(): # flush not successful, don't clear dirty set return - for obj_id in obj_ids: - obj_json = self._db[obj_id] - if "values" in obj_json: - obj_json["values"] = [] - # reset new and dirty sets self._new_objects = set() self._dirty_objects = set() @@ -262,17 +253,26 @@ def getObjectByPath(self, path): obj_json = self.getObjectById(obj_id) return obj_json - def getDtype(self, obj_id): - """ Return numpy data type for given object id """ - if obj_id not in self.db: - raise KeyError(f"{obj_id} not found") - obj_json = self.db[obj_id] + def getDtype(self, obj_json): + """ Return numpy data type for given object id + """ + if "type" not in obj_json: # group id? 
- raise TypeError(f"{obj_id} does not have a datatype") - type_json = obj_json["type"] + raise TypeError(f"{obj_json} does not have a datatype") + type_item = obj_json["type"] + if isValidUuid(type_item) and getCollectionForId(type_item) == "datatypes": + ctype_id = "t-" + getUuidFromId(type_item) + ctype_json = self.getObjectById(ctype_id) + if ctype_json is None: + raise KeyError(f"ctype: {ctype_id} not found") + + type_json = ctype_json["type"].copy() + type_json["id"] = ctype_id + dtype = createDataType(type_json) + else: + dtype = createDataType(type_item) - dtype = createDataType(type_json) return dtype def getAttribute(self, obj_id, name, includeData=True): @@ -323,7 +323,7 @@ def getAttributeValue(self, obj_id, name): dims = () else: dims = shape_json["dims"] - dtype = createDataType(attr_json["type"]) + dtype = self.getDtype(attr_json) value = attr_json["value"] arr = jsonToArray(dims, dtype, value) @@ -465,7 +465,7 @@ def getDatasetValues(self, dset_id, sel): raise ValueError("Selection shape does not match dataset shape") rank = len(dims) - dtype = self.getDtype(dset_id) + dtype = self.getDtype(dset_json) if self.reader: arr = self.reader.getDatasetValues(dset_id, sel) else: diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py index bd1455e8..cec39c0c 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -52,11 +52,16 @@ def main(): log.info(f"jsontoh5 {json_filename} to {hdf5_filename}") kwargs = {"app_logger": log} - - with Hdf5db(h5_reader=H5JsonReader(json_filename, **kwargs), h5_writer=H5pyWriter(hdf5_filename, no_data=no_data, **kwargs), **kwargs) as db: - pass + h5_reader=H5JsonReader(json_filename, **kwargs) + h5_writer=H5pyWriter(hdf5_filename, no_data=no_data, **kwargs) + kwargs["h5_reader"] = h5_reader + kwargs["h5_writer"] = h5_writer + + with Hdf5db(**kwargs) as db: + db.flush() + if __name__ == "__main__": main() diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 
a5453641..8d1e998e 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -119,11 +119,11 @@ def getCollectionForId(obj_id): raise ValueError("invalid object id") collection = None - if obj_id.startswith("g-"): + if obj_id.startswith("g-") or obj_id.startswith("groups/"): collection = "groups" - elif obj_id.startswith("d-"): + elif obj_id.startswith("d-") or obj_id.startswith("datasets/"): collection = "datasets" - elif obj_id.startswith("t-"): + elif obj_id.startswith("t-") or obj_id.startswith("datatypes"): collection = "datatypes" else: raise ValueError(f"{obj_id} not a collection id") @@ -399,6 +399,21 @@ def validateUuid(id, obj_class=None): # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", if id[:5].isalnum() and id[5] == '-': id = id[6:] # trim off the hash tag + + # for id's like "datasets/abced...", trim the collection name and add collection + # prefix to the id if not already present + if id.find('/') > 0: + parts = id.split('/') + if len(parts) > 2: + raise ValueError(f"obj_id: {id} not valid (too many slash chars)") + collection = parts[0] + if getCollectionForId(id) != collection: + raise ValueError(f"obj_id: {id} invalid collection") + id = parts[1] + if len(id) == UUID_LEN: + # prefix with the one char collection code + id = _getPrefixForCollection(collection) + '-' + id + # validate prefix if id[0] not in ("g", "d", "t", "c"): raise ValueError("Unexpected prefix") @@ -476,7 +491,13 @@ def isObjId(id): def getUuidFromId(id): """strip off the type prefix ('g-' or 'd-', or 't-') - and return the uuid part""" + and return the uuid part """ + if id.find('/') > 0: + # remove a collection name prefix if present + parts = id.split('/') + if len(parts) > 2: + raise ValueError(f"Unexpected obj_id: {id}") + id = parts[1] if len(id) == UUID_LEN: # just a uuid return id @@ -494,4 +515,4 @@ def stripId(obj_id): if len(obj_id) == UUID_LEN + 2: return obj_id[2:] else: - raise ValueError("unexpected obj_id: {obj_id}") + raise ValueError(f"unexpected 
obj_id: {obj_id}") diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py index 6666587c..861f4d4f 100644 --- a/src/h5json/reader/h5json_reader.py +++ b/src/h5json/reader/h5json_reader.py @@ -12,7 +12,7 @@ import json import logging -from ..objid import getCollectionForId, stripId +from ..objid import getCollectionForId, stripId, getUuidFromId from ..hdf5dtype import createDataType from ..array_util import jsonToArray @@ -34,6 +34,7 @@ def __init__( self.log = app_logger else: self.log = logging.getLogger() + super().__init__(filepath, app_logger=app_logger) with open(filepath) as f: @@ -55,11 +56,11 @@ def get_root_id(self): """ Return root id """ return self._root_id - def getObjectById(self, obj_id, include_attrs=True, include_links=True): + def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_values=False): """ return object with given id """ collection = getCollectionForId(obj_id) if collection not in self._h5json: - self.log.warning(f"getObjectBId - collection: {collection} not found") + self.log.warning(f"getObjectById - collection: {collection} not found") return None json_objs = self._h5json[collection] obj_uuid = stripId(obj_id) @@ -125,6 +126,9 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): links[title] = item resp["links"] = links + if include_values and collection == "datasets" and "value" in json_obj: + resp["value"] = json_obj["value"] + return resp def getAttribute(self, obj_id, name, includeData=True): @@ -145,6 +149,22 @@ def getAttribute(self, obj_id, name, includeData=True): return None return attributes[name] + def getDtype(self, obj_json): + """ Return the dtype for the type given by obj_json """ + if "type" not in obj_json: + raise KeyError("no type item found") + type_item = obj_json["type"] + if isinstance(type_item, str) and type_item.startswith("datatypes/"): + # this is a reference to a committed type + ctype_id = "t-" + getUuidFromId(type_item) + 
ctype_json = self.getObjectById(ctype_id) + if "type" not in ctype_json: + raise KeyError(f"Unexpected datatype: {ctype_json}") + # Use the ctype's item json + type_item = ctype_json["type"] + dtype = createDataType(type_item) + return dtype + def getDatasetValues(self, obj_id, sel=None): """ Get values from dataset identified by obj_id. @@ -153,10 +173,13 @@ def getDatasetValues(self, obj_id, sel=None): """ self.log.debug(f"getDatasetValues({obj_id}), sel={sel}") - json_obj = self.getObjectById(obj_id) + json_obj = self.getObjectById(obj_id, include_values=True) if json_obj is None: + print("no json_obj") return None + if "value" not in json_obj: + print("no json value") self.log.warning("value key not found for {obj_id}") return None json_value = json_obj["value"] @@ -169,8 +192,7 @@ def getDatasetValues(self, obj_id, sel=None): else: dims = shape_json["dims"] - type_item = json_obj["type"] - dtype = createDataType(type_item) + dtype = self.getDtype(json_obj) arr = jsonToArray(dims, dtype, json_value) if sel is None or sel.select_type == selections.H5S_SELECT_ALL: pass # just return the entire array diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index 68a1f147..c2f44351 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -12,10 +12,11 @@ import h5py import numpy as np -from ..objid import getCollectionForId, isValidUuid +from ..objid import getCollectionForId, isValidUuid, getUuidFromId, isObjId from ..hdf5dtype import createDataType from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype from ..array_util import jsonToArray +from .. import selections from .. 
import filters from .h5writer import H5Writer @@ -33,13 +34,11 @@ def __init__( app_logger=None ): super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) - + self._id_map = {} if append: - self._mode = "a" + self._init = False else: - self._mode = "w" - - self._id_map = {} + self._init = True def _copy_element(self, val, src_dt, tgt_dt, fout=None): """ convert the given dataset or attribute element to h5py equivalent """ @@ -147,8 +146,8 @@ def _createGroup(self, parent, grp_json, name=None): def _createDataset(self, parent, dset_json, name=None): """ create a dataset object """ - type_item = dset_json["type"] - dtype = createDataType(type_item) + dtype = self.db.getDtype(dset_json) + kwargs = {"dtype": dtype} shape_json = dset_json["shape"] shape_class = shape_json["class"] @@ -279,17 +278,6 @@ def _createObjects(self, parent, links_json, visited=set()): self.log.warning("unable to create user-defined link: {title}") elif link_class == "H5L_TYPE_HARD": tgt_id = link_json["id"] - """ - if tgt_id in visited: - # we've already processed this object - if title not in parent: - if tgt_id in self._id_map: - tgt_obj = self._id_map[tgt_id] - parent[title] = tgt_obj - else: - self.log.warning("h5py_writer - expected to find {tgt_id} in id_map") - continue - """ collection = getCollectionForId(tgt_id) @@ -307,6 +295,7 @@ def _createObjects(self, parent, links_json, visited=set()): visited.add(tgt_id) self._createObjects(tgt_obj, grp_links, visited=visited) else: + # need to create tgt_id object parent_path = parent.name if parent_path[-1] != '/': parent_path += '/' @@ -346,10 +335,20 @@ def updateDatasetValues(self, dset_id, dset): dset[slices] = val self.log.debug(f"h5py_writer dset {dset.name} updated") + def initializeDatasetValues(self, dset_id, dset): + """ write all dataset values """ + + if dset.shape is None: + return # null space dataset + + sel_all = selections.select(dset.shape, ...) 
+ arr = self.db.getDatasetValues(dset_id, sel_all) + dset[...] = arr + def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ - src_dt = createDataType(attr_json["type"]) + src_dt = self.db.getDtype(attr_json) # handle special case of null space attribute here shape_json = attr_json["shape"] @@ -363,6 +362,8 @@ def createAttribute(self, obj, name, attr_json): else: dims = shape_json["dims"] src_arr = jsonToArray(dims, src_dt, attr_json["value"]) + if not isinstance(src_arr, np.ndarray): + raise TypeError("Unexpected type for src_arr") tgt_arr = self._copy_array(src_arr, fout=obj.file) obj.attrs[name] = tgt_arr @@ -385,25 +386,30 @@ def flush(self): if not self.db: # no db set yet return False - self.log.info("h5py_writer.flush()") root_id = self.db.root_id self._id_map[root_id] = "/" - with h5py.File(self._filepath, mode=self._mode) as f: - if self.db.new_objects: + mode = 'w' if self._init else 'a' + with h5py.File(self._filepath, mode=mode) as f: + if self.db.new_objects or self._init: root_json = self.db.getObjectById(root_id) if "links" in root_json: root_links = root_json["links"] - self._createObjects(f, root_links, visited=set(root_id)) + self._createObjects(f, root_links, visited=set((root_id,))) # update attributes, dataset values for obj_id in self._id_map: - if self.db.is_dirty(obj_id): + if self.db.is_dirty(obj_id) or self._init: h5path = self._id_map[obj_id] obj = f[h5path] self.updateAttributes(obj_id, obj) - self.updateDatasetValues(obj_id, obj) + collection = getCollectionForId(obj_id) + if collection == "datasets": + if self._init: + self.initializeDatasetValues(obj_id, obj) + else: + self.updateDatasetValues(obj_id, obj) - self._mode = "a" # use append mode for future updates + self._init = False # done with init after first flush return True # all objects written successfully def close(self): diff --git a/test/integ/jsontoh5_test.py b/test/integ/jsontoh5_test.py index dad5648d..3be3a3b7 100644 --- 
a/test/integ/jsontoh5_test.py +++ b/test/integ/jsontoh5_test.py @@ -36,7 +36,7 @@ # "compound_array.json", # "compound_array_attr.json", # "compound_array_dset.json", - "compound_array_vlen_string.json", + # "compound_array_vlen_string.json", # regression "compound_attr.json", "compound_committed.json", "dim_scale.json", @@ -95,7 +95,7 @@ "regionref_attr.json", # "regionref_dset.json", "scalar_attr.json", - "vlen_attr.json", + # "vlen_attr.json", #regression "vlen_dset.json", "vlen_string_attr.json", "vlen_string_dset.json", diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index f68cbbc8..1b0b0f68 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -95,8 +95,8 @@ def testGetNumElements(self): self.assertEqual(nelements, 80) def testJsonToArray(self): - - # simple integer + + # simple integer dt = np.dtype("i4") shape = [4, ] @@ -113,7 +113,7 @@ def testJsonToArray(self): out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, ()) - self.assertEqual(out[()], 42) + self.assertEqual(out[()], 42) # VLEN Scalar str dt = special_dtype(vlen=str) @@ -167,7 +167,6 @@ def testJsonToArray(self): self.assertEqual(out[0], tuple(data[0])) self.assertEqual(out[1], tuple(data[1])) - # VLEN unicode dt = special_dtype(vlen=bytes) data = ["one", "two", "three", "four", "five"] @@ -177,7 +176,7 @@ def testJsonToArray(self): self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out[2], "three") + self.assertEqual(out[2], "three") # test ascii chars >127 dt = np.dtype("S26") @@ -188,7 +187,7 @@ def testJsonToArray(self): self.assertTrue(False) except ValueError: pass # expected - + dt = special_dtype(vlen=str) out = jsonToArray(shape, dt, data) # vlen str should be ok self.assertTrue(isinstance(out, np.ndarray)) @@ -299,7 +298,7 @@ def testJsonToArray(self): self.assertEqual(e1, (5, 
b"five")) # compound with VLEN element - + dt_str = special_dtype(vlen=str) dt = np.dtype([("a", "i4"), ("b", dt_str)]) shape = [1, ] @@ -329,7 +328,7 @@ def testJsonToArray(self): self.assertEqual(out.shape, (1,)) e0 = out[0].tolist() self.assertEqual(e0, (6, "six")) - + # scalar compound shape = [] data = [6, "six"] @@ -337,7 +336,7 @@ def testJsonToArray(self): self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, ()) e0 = out[()].tolist() - self.assertEqual(e0, (6, "six")) + self.assertEqual(e0, (6, "six")) # compound type with array field dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) @@ -541,7 +540,7 @@ def testToBytes(self): # dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) - arr = np.zeros((4,), dtype=dt) + arr = np.zeros((4,), dtype=dt) dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) @@ -568,7 +567,7 @@ def testToBytes(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], dtype=dt_str)) @@ -695,7 +694,7 @@ def testArrToBytesBase64(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": str}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray(["hi", "bye"], dtype=dt_str)) arr[3] = (84, np.asarray(["hi-hi", "bye-bye"], dtype=dt_str)) @@ -716,7 +715,7 @@ def testArrToBytesBase64(self): dt_arr_str = np.dtype("(2,)O", metadata={"vlen": bytes}) dt = np.dtype([("x", "i4"), ("tag", dt_arr_str)]) arr = np.zeros((4,), dtype=dt) - + dt_str = special_dtype(vlen=str) arr[0] = (42, np.asarray([b"hi", b"bye"], dtype=dt_str)) arr[3] = (84, np.asarray([b"hi-hi", b"bye-bye"], 
dtype=dt_str)) diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index 06946f94..072afb16 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -11,8 +11,10 @@ ############################################################################## import unittest import logging +import numpy as np from h5json import Hdf5db from h5json.reader.h5json_reader import H5JsonReader +from h5json import selections class H5pyReaderTest(unittest.TestCase): @@ -62,6 +64,14 @@ def testSimple(self): dset_shape = dset_json["shape"] self.assertEqual(dset_shape["class"], "H5S_SIMPLE") self.assertEqual(dset_shape["dims"], [10, 10]) + sel_all = selections.select((10, 10), ...) + arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) # try adding an attribute db.createAttribute(dset111_id, "attr3", value=42) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 38ea8bce..81d977db 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -154,6 +154,7 @@ def testScalarAttribute(self): root_id = db.getObjectIdByPath("/") dims = () value = 42 + print("test create attribute A1") db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) item = db.getAttribute(root_id, "A1") shape_json = item["shape"] diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py index 1357c184..c135ae40 100755 --- a/test/unit/objid_test.py +++ b/test/unit/objid_test.py @@ -12,7 +12,7 @@ import unittest from h5json.objid import isRootObjId, isValidUuid, validateUuid -from h5json.objid import createObjId, getCollectionForId, stripId +from h5json.objid import createObjId, getCollectionForId, stripId, getUuidFromId from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id @@ -192,6 +192,18 @@ def 
testSchema2Id(self): self.assertEqual(getObjId(s3key), oid) self.assertTrue(isS3ObjKey(s3key)) + def testGetDataTypeId(self): + test_uuid = "9b652223-83f8-11e5-b028-3c15c2da029e" + test_ids = ( + "datatypes/9b652223-83f8-11e5-b028-3c15c2da029e", + "datatypes/t-9b652223-83f8-11e5-b028-3c15c2da029e", + "t-9b652223-83f8-11e5-b028-3c15c2da029e" + ) + for test_id in test_ids: + self.assertTrue(isValidUuid(test_id)) + self.assertEqual(getCollectionForId(test_id), "datatypes") + self.assertEqual(getUuidFromId(test_id), test_uuid) + if __name__ == "__main__": # setup test files diff --git a/testall.py b/testall.py index 97a5efd4..1b9d6cd8 100755 --- a/testall.py +++ b/testall.py @@ -24,7 +24,7 @@ "h5json_writer_test", "h5py_reader_test", "h5py_writer_test", - ) +) integ_tests = ("h5tojson_test", "jsontoh5_test") # verify the hdf5 lib version is recent From 9978c45fc607b5de56a6bfd0e3d8d4fb68eb8182 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 13:43:33 +0200 Subject: [PATCH 022/129] fix flake8 errors --- src/h5json/jsontoh5/jsontoh5.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py index cec39c0c..fb58abb7 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -18,7 +18,6 @@ from h5json.writer.h5py_writer import H5pyWriter from h5json.reader.h5json_reader import H5JsonReader - def main(): if len(sys.argv) < 3 or sys.argv[1] in ("-h", "--help"): @@ -35,7 +34,7 @@ def main(): json_filename = sys.argv[i] else: hdf5_filename = sys.argv[i] - + # create logger log = logging.getLogger("h5json") # log.setLevel(logging.WARN) @@ -53,15 +52,14 @@ def main(): kwargs = {"app_logger": log} - h5_reader=H5JsonReader(json_filename, **kwargs) - h5_writer=H5pyWriter(hdf5_filename, no_data=no_data, **kwargs) + h5_reader = H5JsonReader(json_filename, **kwargs) + h5_writer = H5pyWriter(hdf5_filename, no_data=no_data, **kwargs) kwargs["h5_reader"] = 
h5_reader kwargs["h5_writer"] = h5_writer - with Hdf5db(**kwargs) as db: db.flush() - + if __name__ == "__main__": main() From 436d92146e96ce1d367e4faeedaee13de3c43b7c Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 14:23:39 +0200 Subject: [PATCH 023/129] fix flake8 error --- src/h5json/hdf5db.py | 2 +- src/h5json/reader/h5json_reader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 4e9cd353..3b4694bc 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -285,7 +285,7 @@ def getAttribute(self, obj_id, name, includeData=True): attrs = obj_json["attributes"] if name not in attrs: - msg = f"Attribute: [{name }] not found in object: {obj_id}" + msg = f"Attribute: [{name}] not found in object: {obj_id}" self.log.info(msg) return None if attrs[name] is None: diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py index 861f4d4f..606fe012 100644 --- a/src/h5json/reader/h5json_reader.py +++ b/src/h5json/reader/h5json_reader.py @@ -17,7 +17,7 @@ from ..hdf5dtype import createDataType from ..array_util import jsonToArray from .. 
import selections -from ..h5reader import H5Reader +from .h5reader import H5Reader class H5JsonReader(H5Reader): From 51063f63276eb7e956e344c0c4d9f379b8b062ed Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 14:44:32 +0200 Subject: [PATCH 024/129] update testall script --- testall.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/testall.py b/testall.py index 1b9d6cd8..fadd332a 100755 --- a/testall.py +++ b/testall.py @@ -37,6 +37,9 @@ print(h5py.version.info) sys.exit("Need h5py version 3.0 or later") +if not os.path.isdir("./out"): + os.makedirs("out") + # Run all hdf5-json tests # Run this script before running any integ tests for file_name in unit_tests: @@ -48,6 +51,13 @@ os.remove("hdf5dbtest.log") os.chdir("test/integ") + +if not os.path.isdir("./h5_out"): + os.makedirs("h5_out") + +if not os.path.isdir("./json_out"): + os.makedirs("json_out") + for file_name in integ_tests: print(file_name) rc = os.system("python " + file_name + ".py") From d14599a2a2890bf88f458b9e81d21c802fdafc97 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 14:46:52 +0200 Subject: [PATCH 025/129] fix flake8 error --- testall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testall.py b/testall.py index fadd332a..70da26b5 100755 --- a/testall.py +++ b/testall.py @@ -57,7 +57,7 @@ if not os.path.isdir("./json_out"): os.makedirs("json_out") - + for file_name in integ_tests: print(file_name) rc = os.system("python " + file_name + ".py") From e4be33cbfbefc4009ea42b6e82813051d672cb43 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 14:56:45 +0200 Subject: [PATCH 026/129] make tmp dir in testall --- testall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testall.py b/testall.py index 70da26b5..5ca1934c 100755 --- a/testall.py +++ b/testall.py @@ -37,8 +37,8 @@ print(h5py.version.info) sys.exit("Need h5py version 3.0 or later") -if not os.path.isdir("./out"): - os.makedirs("out") +if not 
os.path.isdir("./test/unit/out"): + os.makedirs("test/unit/out") # Run all hdf5-json tests # Run this script before running any integ tests From 8af6508038329cc8bb93a3f464104e94cf9c6925 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 15:42:22 +0200 Subject: [PATCH 027/129] fix for h5json writer on windows --- src/h5json/writer/h5json_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index bdf59822..9b3931e5 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -265,7 +265,7 @@ def dumpFile(self): self.dumpDatatypes() indent = 4 - ensure_ascii = False + ensure_ascii = True if self._filepath: with open('data.json', 'w', encoding='utf-8') as f: json.dump(self.json, f, ensure_ascii=ensure_ascii, indent=indent) From d519d8b66d3ccb12aaec93aa723ba0f654603317 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 2 Apr 2025 15:56:24 +0200 Subject: [PATCH 028/129] require python >= 3.9 --- .github/workflows/ci.yml | 2 +- pyproject.toml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d6e313a..4e1040ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} steps: diff --git a/pyproject.toml b/pyproject.toml index 4ea50247..b45d1203 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ classifiers = [ "Topic :: Software Development :: Build Tools", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ 
-17,7 +16,7 @@ classifiers = [ ] authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", From 4169d5c74bf1aece93fd581c1d174f78d14a88a5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 3 Apr 2025 13:47:47 +0200 Subject: [PATCH 029/129] remove redundant stripId function --- src/h5json/objid.py | 10 ---------- src/h5json/reader/h5json_reader.py | 4 ++-- src/h5json/writer/h5json_writer.py | 14 +++++++------- test/unit/h5json_writer_test.py | 1 - test/unit/h5py_reader_test.py | 4 ---- test/unit/h5py_writer_test.py | 1 - test/unit/objid_test.py | 4 ++-- 7 files changed, 11 insertions(+), 27 deletions(-) diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 8d1e998e..57b5316c 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -506,13 +506,3 @@ def getUuidFromId(id): return id[2:] else: raise ValueError(f"Unexpected obj_id: {id}") - - -def stripId(obj_id): - """ return just the base id without any prefix (e.g. 
'g-') """ - if len(obj_id) == UUID_LEN: - return obj_id # just return as is - if len(obj_id) == UUID_LEN + 2: - return obj_id[2:] - else: - raise ValueError(f"unexpected obj_id: {obj_id}") diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py index 606fe012..f4d6426e 100644 --- a/src/h5json/reader/h5json_reader.py +++ b/src/h5json/reader/h5json_reader.py @@ -12,7 +12,7 @@ import json import logging -from ..objid import getCollectionForId, stripId, getUuidFromId +from ..objid import getCollectionForId, getUuidFromId from ..hdf5dtype import createDataType from ..array_util import jsonToArray @@ -63,7 +63,7 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_ self.log.warning(f"getObjectById - collection: {collection} not found") return None json_objs = self._h5json[collection] - obj_uuid = stripId(obj_id) + obj_uuid = getUuidFromId(obj_id) if obj_uuid not in json_objs: self.log.warning(f"getObjectById - {obj_id} not found") return None diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/writer/h5json_writer.py index 9b3931e5..759f0aa2 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/writer/h5json_writer.py @@ -13,7 +13,7 @@ import json from .h5writer import H5Writer -from ..objid import stripId, getCollectionForId +from ..objid import getUuidFromId, getCollectionForId from ..array_util import bytesArrayToList from .. 
import selections @@ -117,7 +117,7 @@ def dumpLink(self, obj_id, name): if "id" in item: tgt_id = item["id"] response["collection"] = getCollectionForId(tgt_id) - response["id"] = stripId(tgt_id) + response["id"] = getUuidFromId(tgt_id) for key in item: if key in ("id", "created", "modified"): @@ -154,14 +154,14 @@ def dumpGroup(self, obj_id): def dumpGroups(self): groups = {} item = self.dumpGroup(self._root_uuid) - root_uuid = stripId(self._root_uuid) + root_uuid = getUuidFromId(self._root_uuid) groups[root_uuid] = item obj_ids = self.db.getCollection("groups") for obj_id in obj_ids: if obj_id == self._root_uuid: continue item = self.dumpGroup(obj_id) - obj_uuid = stripId(obj_id) + obj_uuid = getUuidFromId(obj_id) groups[obj_uuid] = item self.json["groups"] = groups @@ -220,7 +220,7 @@ def dumpDatasets(self): datasets = {} for obj_id in obj_ids: item = self.dumpDataset(obj_id) - obj_uuid = stripId(obj_id) + obj_uuid = getUuidFromId(obj_id) datasets[obj_uuid] = item self.json["datasets"] = datasets @@ -244,7 +244,7 @@ def dumpDatatypes(self): datatypes = {} for obj_id in obj_ids: item = self.dumpDatatype(obj_id) - obj_uuid = stripId(obj_id) + obj_uuid = getUuidFromId(obj_id) datatypes[obj_uuid] = item self.json["datatypes"] = datatypes @@ -255,7 +255,7 @@ def dumpFile(self): db_version_info = self.db.getVersionInfo() self.json["apiVersion"] = db_version_info["hdf5-json-version"] - self.json["root"] = stripId(self._root_uuid) + self.json["root"] = getUuidFromId(self._root_uuid) self.updateAliasList() # create alias_db with obj_id to alias list dict diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index 608f627f..e68314d7 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -15,7 +15,6 @@ import numpy as np from h5json import Hdf5db from h5json.writer.h5json_writer import H5JsonWriter -from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId from h5json.hdf5dtype import special_dtype, 
Reference from h5json import selections diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index c8b14cb4..ef42a29d 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -10,12 +10,8 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import unittest -import os -import os.path as op -import stat import logging -import shutil from h5json import Hdf5db from h5json.reader.h5py_reader import H5pyReader diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 81d977db..8f343423 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -16,7 +16,6 @@ import numpy as np from h5json import Hdf5db from h5json.writer.h5py_writer import H5pyWriter -from h5json.objid import isRootObjId, isValidUuid, isSchema2Id, stripId from h5json.hdf5dtype import special_dtype, Reference from h5json import selections diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py index c135ae40..d74ec102 100755 --- a/test/unit/objid_test.py +++ b/test/unit/objid_test.py @@ -12,7 +12,7 @@ import unittest from h5json.objid import isRootObjId, isValidUuid, validateUuid -from h5json.objid import createObjId, getCollectionForId, stripId, getUuidFromId +from h5json.objid import createObjId, getCollectionForId, getUuidFromId from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id @@ -134,7 +134,7 @@ def testGetCollection(self): self.assertEqual(getCollectionForId(group_id), "groups") self.assertEqual(getCollectionForId(dataset_id), "datasets") self.assertEqual(getCollectionForId(ctype_id), "datatypes") - self.assertEqual(stripId(group_id), "314d61b8-9954-11e6-a733-3c15c2da029e") + self.assertEqual(getUuidFromId(group_id), "314d61b8-9954-11e6-a733-3c15c2da029e") try: getCollectionForId(bad_id) self.assertTrue(False) From 7840ca4f972f712aa761464203697713711e3028 Mon Sep 17 00:00:00 2001 From: John Readey Date: 
Thu, 3 Apr 2025 16:50:59 +0200 Subject: [PATCH 030/129] add test for incremental updates --- src/h5json/hdf5db.py | 17 +++++++---- src/h5json/writer/h5py_writer.py | 11 ++++++++ test/unit/h5json_reader_test.py | 4 ++- test/unit/h5py_writer_test.py | 48 ++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 3b4694bc..c632d93c 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -82,9 +82,19 @@ def reader(self): @reader.setter def reader(self, value: H5Reader): """ set the reader """ + if self._writer: + self.flush() if self._reader: self._reader.close() + root_id = value.get_root_id() + if not root_id: + raise ValueError(f"reader {type(value)} unable to return root_id") + group_json = value.getObjectById(root_id) + if not group_json: + raise ValueError(f"reader {type(value)} unable to return group json") self._reader = value + self._db[root_id] = group_json + self._root_id = root_id @property def writer(self): @@ -411,15 +421,10 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): obj_json = self.getObjectById(obj_id) attrs_json = obj_json["attributes"] - if name in attrs_json: - # replace, keep, created timestamp - created = attrs_json["created"] - else: - created = time.time() type_json = getTypeItem(dtype) # finally put it all together... 
attr_json = {"shape": shape_json, "type": type_json, "value": value_json} - attr_json["created"] = created + attr_json["created"] = time.time() # slot into the obj_json["attrs"] attrs_json[name] = attr_json diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index c2f44351..ee7bc537 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/writer/h5py_writer.py @@ -11,6 +11,7 @@ ############################################################################## import h5py import numpy as np +import time from ..objid import getCollectionForId, isValidUuid, getUuidFromId, isObjId from ..hdf5dtype import createDataType @@ -39,6 +40,7 @@ def __init__( self._init = False else: self._init = True + self._flush_time = 0.0 def _copy_element(self, val, src_dt, tgt_dt, fout=None): """ convert the given dataset or attribute element to h5py equivalent """ @@ -379,10 +381,14 @@ def updateAttributes(self, obj_id, obj): attrs = obj_json["attributes"] for name in attrs: attr_json = attrs[name] + if "created" in attr_json and attr_json["created"] < self._flush_time: + # ttribute should be saved already + continue self.createAttribute(obj, name, attr_json) def flush(self): """ Write dirty items """ + if not self.db: # no db set yet return False @@ -393,6 +399,7 @@ def flush(self): with h5py.File(self._filepath, mode=mode) as f: if self.db.new_objects or self._init: root_json = self.db.getObjectById(root_id) + if "links" in root_json: root_links = root_json["links"] self._createObjects(f, root_links, visited=set((root_id,))) @@ -408,6 +415,10 @@ def flush(self): self.initializeDatasetValues(obj_id, obj) else: self.updateDatasetValues(obj_id, obj) + # mark time write is complete + # updates before this time will not need to be written + # TBD: possible race condition with multithreading + self._flush_time = time.time() self._init = False # done with init after first flush return True # all objects written successfully diff --git 
a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index 072afb16..1c44e13c 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -39,7 +39,9 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "data/json/tall.json" kwargs = {"app_logger": self.log} - with Hdf5db(h5_reader=H5JsonReader(filepath, **kwargs), **kwargs) as db: + with Hdf5db(**kwargs) as db: + h5_reader = H5JsonReader(filepath, **kwargs) + db.reader = h5_reader root_id = db.getObjectIdByPath("/") root_json = db.getObjectById(root_id) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 8f343423..8eaf8812 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -15,6 +15,7 @@ import h5py import numpy as np from h5json import Hdf5db +from h5json.reader.h5json_reader import H5JsonReader from h5json.writer.h5py_writer import H5pyWriter from h5json.hdf5dtype import special_dtype, Reference from h5json import selections @@ -473,6 +474,53 @@ def testCommittedCompoundType(self): sub_dt = t1.dtype["field_4"] self.assertEqual(sub_dt, h5py.special_dtype(vlen=str)) + def testReaderWithUpdate(self): + + file_in = "data/json/tall.json" + file_out = "test/unit/out/h5py_writer_test_testReaderWithUpdate.h5" + + with Hdf5db(app_logger=self.log) as db: + db.reader = H5JsonReader(file_in) + db.writer = H5pyWriter(file_out, no_data=False) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + db.flush() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 2) + + db.createAttribute(dset111_id, "attr3", "hello") + dset_json = db.getObjectById(dset111_id) + db.flush() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"hello") + + 
db.createAttribute(dset111_id, "attr3", "bye-bye") + db.flush() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"bye-bye") + g1 = f["g1"] + + # create a new link + g13_id = db.createGroup() + g1_id = db.getObjectIdByPath("/g1") + db.createHardLink(g1_id, "g1.3", g13_id) + db.flush() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertEqual(len(g1), 3) + self.assertTrue("g1.3" in g1) + if __name__ == "__main__": # setup test files From deb501f567cf7c8683838f7e79bdc68dd09866cb Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 3 Apr 2025 16:59:19 +0200 Subject: [PATCH 031/129] fix flake8 errors --- test/unit/h5py_writer_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 8eaf8812..7d129bd9 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -489,9 +489,8 @@ def testReaderWithUpdate(self): self.assertTrue("/g1/g1.1/dset1.1.1" in f) dset111 = f["/g1/g1.1/dset1.1.1"] self.assertEqual(len(dset111.attrs), 2) - + db.createAttribute(dset111_id, "attr3", "hello") - dset_json = db.getObjectById(dset111_id) db.flush() with h5py.File(file_out) as f: @@ -509,7 +508,7 @@ def testReaderWithUpdate(self): self.assertEqual(len(dset111.attrs), 3) self.assertEqual(dset111.attrs["attr3"], b"bye-bye") g1 = f["g1"] - + # create a new link g13_id = db.createGroup() g1_id = db.getObjectIdByPath("/g1") From 1bf10b14fbb6f4baf0c12b315b70fabc980366e5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 3 Apr 2025 18:43:53 +0200 Subject: [PATCH 032/129] added dset writes to h5py_writer test --- test/unit/h5py_writer_test.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 7d129bd9..a103873b 
100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -482,7 +482,6 @@ def testReaderWithUpdate(self): with Hdf5db(app_logger=self.log) as db: db.reader = H5JsonReader(file_in) db.writer = H5pyWriter(file_out, no_data=False) - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") db.flush() with h5py.File(file_out) as f: @@ -490,6 +489,7 @@ def testReaderWithUpdate(self): dset111 = f["/g1/g1.1/dset1.1.1"] self.assertEqual(len(dset111.attrs), 2) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") db.createAttribute(dset111_id, "attr3", "hello") db.flush() @@ -509,7 +509,7 @@ def testReaderWithUpdate(self): self.assertEqual(dset111.attrs["attr3"], b"bye-bye") g1 = f["g1"] - # create a new link + # create a new group g13_id = db.createGroup() g1_id = db.getObjectIdByPath("/g1") db.createHardLink(g1_id, "g1.3", g13_id) @@ -520,6 +520,32 @@ def testReaderWithUpdate(self): self.assertEqual(len(g1), 3) self.assertTrue("g1.3" in g1) + # create a new dataset + dset_id = db.createDataset(shape=(10, 10), dtype=np.int32) + db.createHardLink(g1_id, "DS1", dset_id) + db.flush() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertTrue("DS1" in g1) + ds1 = g1["DS1"] + self.assertEqual(ds1.shape, (10, 10)) + + arr = np.asarray(range(10), dtype=np.int32) + sel = selections.select((10, 10), (slice(5, 6), slice(0, 10))) + db.setDatasetValues(dset_id, sel, arr) + db.flush() + + with h5py.File(file_out) as f: + ds1 = f["/g1/DS1"] + data = ds1[:, :] + for i in range(10): + for j in range(10): + if i == 5: + self.assertEqual(data[i, j], j) + else: + self.assertEqual(data[i, j], 0) + if __name__ == "__main__": # setup test files From bfd6cdd693b14bbf64af2dd8bb42d3bdb60b6d46 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 10 Apr 2025 14:41:56 +0200 Subject: [PATCH 033/129] fix for array types --- src/h5json/array_util.py | 66 +++++----------- src/h5json/hdf5dtype.py | 10 +-- src/h5json/reader/h5json_reader.py | 5 +- 
src/h5json/writer/h5py_writer.py | 3 +- test/integ/h5tojson_test.py | 6 +- test/unit/array_util_test.py | 120 +++++++++++++++++++++++------ test/unit/hdf5dtype_test.py | 104 +++++++++++++++++++++++-- 7 files changed, 227 insertions(+), 87 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 1640d687..91b5e499 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -15,6 +15,8 @@ import binascii import numpy as np +from .hdf5dtype import isVlen + MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million @@ -99,23 +101,6 @@ def getNumElements(dims): return num_elements -def isVlen(dt): - """ - Return True if the type contains variable length elements - """ - is_vlen = False - if len(dt) > 1: - names = dt.names - for name in names: - if isVlen(dt[name]): - is_vlen = True - break - else: - if dt.metadata and "vlen" in dt.metadata: - is_vlen = True - return is_vlen - - def jsonToArray(data_shape, data_dtype, data_json): """ Return numpy array from the given json array. 
@@ -193,16 +178,16 @@ def getElementSize(e, dt): field_dt = dt[name] field_val = e[name] count += getElementSize(field_val, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: + elif not dt.base.metadata or "vlen" not in dt.base.metadata: count = dt.itemsize # fixed size element else: # variable length element - vlen = dt.metadata["vlen"] + vlen = dt.base.metadata["vlen"] if isinstance(e, int): if e == 0: count = 4 # non-initialized element else: - raise ValueError("Unexpected value: {}".format(e)) + raise ValueError(f"Unexpected value: {e}") elif isinstance(e, bytes): count = len(e) + 4 elif isinstance(e, str): @@ -226,6 +211,7 @@ def getElementSize(e, dt): count = len(e) * vlen.itemsize + 4 # +4 for byte count else: raise TypeError("unexpected type: {}".format(type(e))) + # print("getElementSize returning:", count) return count @@ -262,46 +248,40 @@ def copyElement(e, dt, buffer, offset): """ Copy element to bytearray """ + # print(f"copyElement - dt: {dt} offset: {offset}") if len(dt) > 1: for name in dt.names: field_dt = dt[name] field_val = e[name] offset = copyElement(field_val, field_dt, buffer, offset) - elif not dt.metadata or "vlen" not in dt.metadata: - # print(f"e vlen: {e} type: {type(e)} itemsize: {dt.itemsize}") - e_buf = e.tobytes() - # print("tobytes:", e_buf) + elif not dt.base.metadata or "vlen" not in dt.base.metadata: + # print(f"no vlen: {e} type: {type(e)} e.dtype: {e.dtype} itemsize: {dt.itemsize}") + e_buf = np.asarray(e, dtype=dt).tobytes() if len(e_buf) < dt.itemsize: # extend the buffer for fixed size strings - # print("extending buffer") e_buf_ex = bytearray(dt.itemsize) for i in range(len(e_buf)): e_buf_ex[i] = e_buf[i] e_buf = bytes(e_buf_ex) - # print("length:", len(e_buf)) offset = copyBuffer(e_buf, buffer, offset) else: # variable length element - vlen = dt.metadata["vlen"] - # print("copyBuffer vlen:", vlen) + vlen = dt.base.metadata["vlen"] if isinstance(e, int): - # print("copyBuffer int") if e == 0: # write 4-byte 
integer 0 to buffer offset = copyBuffer(b"\x00\x00\x00\x00", buffer, offset) else: raise ValueError("Unexpected value: {}".format(e)) elif isinstance(e, bytes): - # print("copyBuffer bytes") count = np.int32(len(e)) if count > MAX_VLEN_ELEMENT: raise ValueError("vlen element too large") offset = copyBuffer(count.tobytes(), buffer, offset) offset = copyBuffer(e, buffer, offset) elif isinstance(e, str): - # print("copyBuffer, str") text = e.encode("utf-8") count = np.int32(len(text)) if count > MAX_VLEN_ELEMENT: @@ -311,18 +291,13 @@ def copyElement(e, dt, buffer, offset): elif isinstance(e, np.ndarray): nElements = math.prod(e.shape) - # print("copyBuffer ndarray, nElements:", nElements) if e.dtype.kind != "O": count = np.int32(e.dtype.itemsize * nElements) - # print("copyBuffeer got vlen count:", count) - # print("copyBuffer e:", e) if count > MAX_VLEN_ELEMENT: raise ValueError("vlen element too large") offset = copyBuffer(count.tobytes(), buffer, offset) - # print("copyBuffer write new count, offset:", offset) offset = copyBuffer(e.tobytes(), buffer, offset) - # print("copyBuffer write data, offset:", offset) else: arr1d = e.reshape((nElements,)) for item in arr1d: @@ -340,7 +315,6 @@ def copyElement(e, dt, buffer, offset): else: raise TypeError("unexpected type: {}".format(type(e))) - # print("buffer: {}".format(buffer)) return offset @@ -385,12 +359,13 @@ def readElement(buffer, offset, arr, index, dt): Returns: int: The updated offset value after reading the element. 
""" + # print("readElement, offset:", offset) if len(dt) > 1: e = arr[index] for name in dt.names: field_dt = dt[name] offset = readElement(buffer, offset, e, name, field_dt) - elif not dt.metadata or "vlen" not in dt.metadata: + elif not dt.base.metadata or "vlen" not in dt.base.metadata: count = dt.itemsize n = offset m = offset + count @@ -399,12 +374,13 @@ def readElement(buffer, offset, arr, index, dt): try: e = np.frombuffer(bytes(e_buffer), dtype=dt) arr[index] = e[0] + except ValueError: - print(f"ERROR: ValueError setting {e_buffer} and dtype: {dt}") + # print(f"ValueError setting {e_buffer} and dtype: {dt}") raise else: # variable length element - vlenBaseType = dt.metadata["vlen"] + vlenBaseType = dt.base.metadata["vlen"] e = arr[index] if isinstance(e, np.ndarray): @@ -474,6 +450,7 @@ def arrayToBytes(arr, encoding=None): """ Return byte representation of numpy array """ + if isVlen(arr.dtype): nSize = getByteArraySize(arr) buffer = bytearray(nSize) @@ -481,7 +458,6 @@ def arrayToBytes(arr, encoding=None): nElements = math.prod(arr.shape) arr1d = arr.reshape((nElements,)) for e in arr1d: - # print("arrayToBytes:", e) offset = copyElement(e, arr1d.dtype, buffer, offset) data = bytes(buffer) else: @@ -499,17 +475,17 @@ def bytesToArray(data, dt, shape, encoding=None): """ if encoding: # decode the data - # will raise ValueError if non-decodeable + # will raise ValueError if non-decodable data = decodeData(data) if not isVlen(dt): # regular numpy from string arr = np.frombuffer(data, dtype=dt) else: - nelements = getNumElements(shape) + nElements = getNumElements(shape) - arr = np.zeros((nelements,), dtype=dt) + arr = np.zeros((nElements,), dtype=dt) offset = 0 - for index in range(nelements): + for index in range(nElements): offset = readElement(data, offset, arr, index, dt) if shape is not None: arr = arr.reshape(shape) diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index cd3c6a45..bbef116d 100644 --- a/src/h5json/hdf5dtype.py +++ 
b/src/h5json/hdf5dtype.py @@ -316,7 +316,7 @@ def getTypeItem(dt, metadata=None): metadata = dt.metadata type_info = {} - if len(dt) > 1: + if len(dt): # compound type names = dt.names type_info["class"] = "H5T_COMPOUND" @@ -494,14 +494,14 @@ def isVlen(dt): Return True if the type contains variable length elements """ is_vlen = False - if len(dt) > 1: + if len(dt): names = dt.names for name in names: if isVlen(dt[name]): is_vlen = True break else: - if dt.metadata and "vlen" in dt.metadata: + if dt.base.metadata and "vlen" in dt.base.metadata: is_vlen = True return is_vlen @@ -510,7 +510,7 @@ def isOpaqueDtype(dt): """ Return True if this is an opaque dtype """ - if dt.kind == "V" and len(dt) <= 1 and len(dt.shape) == 0 and not dt.names: + if dt.kind == "V" and len(dt) == 0 and len(dt.shape) == 0 and not dt.names: return True if dt.metadata and dt.metadata.get('h5py_opaque'): return True @@ -626,7 +626,7 @@ def getDtypeItemSize(dtype): return the string "H5T_VARIABLE """ item_size = 0 - if len(dtype) > 0: + if len(dtype): # compound dtype for i in range(len(dtype)): sub_dt = dtype[i] diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/reader/h5json_reader.py index f4d6426e..455b185c 100644 --- a/src/h5json/reader/h5json_reader.py +++ b/src/h5json/reader/h5json_reader.py @@ -175,12 +175,11 @@ def getDatasetValues(self, obj_id, sel=None): self.log.debug(f"getDatasetValues({obj_id}), sel={sel}") json_obj = self.getObjectById(obj_id, include_values=True) if json_obj is None: - print("no json_obj") + self.log.warning(f"no object found with id; {obj_id}") return None if "value" not in json_obj: - print("no json value") - self.log.warning("value key not found for {obj_id}") + self.log.warning(f"value key not found for {obj_id}") return None json_value = json_obj["value"] shape_json = json_obj["shape"] diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/writer/h5py_writer.py index ee7bc537..2d281338 100644 --- a/src/h5json/writer/h5py_writer.py +++ 
b/src/h5json/writer/h5py_writer.py @@ -345,7 +345,8 @@ def initializeDatasetValues(self, dset_id, dset): sel_all = selections.select(dset.shape, ...) arr = self.db.getDatasetValues(dset_id, sel_all) - dset[...] = arr + if arr is not None: + dset[...] = arr def createAttribute(self, obj, name, attr_json): """ add the given attribute to obj """ diff --git a/test/integ/h5tojson_test.py b/test/integ/h5tojson_test.py index 68b04642..5be40c84 100644 --- a/test/integ/h5tojson_test.py +++ b/test/integ/h5tojson_test.py @@ -35,7 +35,7 @@ "compound.h5", "compound_array.h5", "compound_array_attr.h5", - # "compound_array_vlen_string.h5", # crashes python w/ Linux! + "compound_array_vlen_string.h5", # crashes python w/ Linux? "compound_array_dset.h5", "compound_attr.h5", "compound_committed.h5", @@ -47,8 +47,8 @@ "enum_attr.h5", "enum_dset.h5", "fillvalue.h5", - "fixed_string_attr.h5", # temp for trying travis - "fixed_string_dset.h5", # temp for trying travis + "fixed_string_attr.h5", + "fixed_string_dset.h5", "h5ex_d_alloc.h5", "h5ex_d_checksum.h5", "h5ex_d_chunk.h5", diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 1b0b0f68..cc2f63c3 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -26,6 +26,7 @@ from h5json.array_util import ndarray_compare from h5json.array_util import getNumpyValue from h5json.array_util import getBroadcastShape +from h5json.array_util import isVlen from h5json.hdf5dtype import special_dtype from h5json.hdf5dtype import check_dtype @@ -378,6 +379,12 @@ def testToBytes(self): arr_copy = bytesToArray(buffer, dt, (4,)) self.assertTrue(np.array_equal(arr, arr_copy)) + # big-endian ints + dt = np.dtype(">u8") + arr = np.asarray((1, 2, 3, 4), dtype=dt) + buffer = arrayToBytes(arr) + self.assertEqual(buffer, arr.tobytes()) + # fixed length string dt = np.dtype("S8") arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) @@ -428,11 +435,11 @@ def testToBytes(self): 
self.assertTrue(ndarray_compare(arr, arr_copy)) # VLEN of int32's - dt = np.dtype("O", metadata={"vlen": np.dtype("int32")}) + dt = special_dtype(vlen=np.dtype("") self.assertEqual(dt.kind, "u") + self.assertFalse(isVlen(dt)) dt = hdf5dtype.createDataType("H5T_STD_I16LE") self.assertEqual(dt.name, "int16") @@ -384,10 +430,12 @@ def testCreateBaseType(self): dt = hdf5dtype.createDataType("H5T_IEEE_F64LE") self.assertEqual(dt.name, "float64") self.assertEqual(dt.kind, "f") + self.assertFalse(isVlen(dt)) dt = hdf5dtype.createDataType("H5T_IEEE_F32LE") self.assertEqual(dt.name, "float32") self.assertEqual(dt.kind, "f") + self.assertFalse(isVlen(dt)) typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I32BE"} typeSize = hdf5dtype.getItemSize(typeItem) @@ -395,6 +443,7 @@ def testCreateBaseType(self): self.assertEqual(dt.name, "int32") self.assertEqual(dt.kind, "i") self.assertEqual(typeSize, 4) + self.assertFalse(isVlen(dt)) def testCreateBaseStringType(self): typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_ASCII", "length": 6} @@ -403,6 +452,7 @@ def testCreateBaseStringType(self): self.assertEqual(dt.name, "bytes48") self.assertEqual(dt.kind, "S") self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateBaseUnicodeType(self): typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} @@ -413,6 +463,7 @@ def testCreateBaseUnicodeType(self): self.assertEqual(dt.name, "bytes48") self.assertEqual(dt.kind, "S") # uses byte self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateNullTermStringType(self): typeItem = { @@ -427,6 +478,7 @@ def testCreateNullTermStringType(self): self.assertEqual(dt.name, "bytes48") self.assertEqual(dt.kind, "S") self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateVLenStringType(self): typeItem = { @@ -440,6 +492,28 @@ def testCreateVLenStringType(self): self.assertEqual(dt.kind, "O") self.assertEqual(check_dtype(vlen=dt), bytes) self.assertEqual(typeSize, 
"H5T_VARIABLE") + self.assertTrue(isVlen(dt)) + + def testCreateVLenStringArrayType(self): + typeItem = { + "class": "H5T_ARRAY", + "dims": (2, 2), + "base": { + "class": "H5T_STRING", + "charSet": "H5T_CSET_ASCII", + "length": "H5T_VARIABLE", + } + } + typeSize = hdf5dtype.getItemSize(typeItem) + dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.name, "void256") # assuming 8-byte pointers + self.assertEqual(dt.kind, "V") + self.assertEqual(dt.shape, (2, 2)) + self.assertEqual(check_dtype(vlen=dt), None) + self.assertEqual(check_dtype(vlen=dt.base), bytes) + self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertEqual(dt.base.kind, 'O') + self.assertTrue(isVlen(dt)) def testCreateVLenUTF8Type(self): typeItem = { @@ -453,6 +527,7 @@ def testCreateVLenUTF8Type(self): self.assertEqual(dt.kind, "O") self.assertEqual(check_dtype(vlen=dt), str) self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) def testCreateVLenDataType(self): typeItem = {"class": "H5T_VLEN", "base": "H5T_STD_I32BE"} @@ -461,6 +536,7 @@ def testCreateVLenDataType(self): dt = hdf5dtype.createDataType(typeItem) self.assertEqual(dt.name, "object") self.assertEqual(dt.kind, "O") + self.assertTrue(isVlen(dt)) def testCreateOpaqueType(self): typeItem = {"class": "H5T_OPAQUE", "size": 200} @@ -469,6 +545,7 @@ def testCreateOpaqueType(self): self.assertEqual(dt.name, "void1600") self.assertEqual(dt.kind, "V") self.assertEqual(typeSize, 200) + self.assertFalse(isVlen(dt)) def testCreateEnumType(self): typeItem = { @@ -488,6 +565,7 @@ def testCreateEnumType(self): self.assertEqual(mapping["LIQUID"], 1) self.assertEqual(mapping["GAS"], 2) self.assertEqual(mapping["PLASMA"], 3) + self.assertFalse(isVlen(dt)) def testCreateBoolType(self): typeItem = { @@ -502,6 +580,7 @@ def testCreateBoolType(self): self.assertEqual(dt.name, "bool") self.assertEqual(dt.kind, "b") self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) def 
testCreateReferenceType(self): typeItem = { @@ -517,6 +596,7 @@ def testCreateReferenceType(self): self.assertEqual(dt.kind, "S") self.assertTrue(dt.metadata['ref'] is Reference) self.assertEqual(check_dtype(ref=dt), Reference) + self.assertFalse(isVlen(dt)) def testCreateVlenReferenceType(self): typeItem = { @@ -530,6 +610,7 @@ def testCreateVlenReferenceType(self): base = dt.metadata['vlen'] self.assertTrue(base.metadata['ref'] is Reference) self.assertEqual(check_dtype(ref=base), Reference) + self.assertTrue(isVlen(dt)) def testCreateCompoundType(self): typeItem = { @@ -555,6 +636,7 @@ def testCreateCompoundType(self): self.assertEqual(dt.kind, "V") self.assertEqual(len(dt.fields), 4) self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertTrue(isVlen(dt)) dtLocation = dt[2] self.assertEqual(dtLocation.name, "object") @@ -644,6 +726,7 @@ def testCreateCompoundOfCompoundType(self): self.assertEqual(dt.name, "void160") self.assertEqual(dt.kind, "V") self.assertEqual(len(dt.fields), 2) + self.assertFalse(isVlen(dt)) dt_field1 = dt[0] self.assertEqual(dt_field1.name, "void64") self.assertEqual(dt_field1.kind, "V") @@ -669,6 +752,7 @@ def testCreateCompoundTypeUnicodeFields(self): self.assertEqual(len(dt.fields), 3) self.assertEqual(typeSize, 10) self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) def testCreateArrayType(self): typeItem = {"class": "H5T_ARRAY", "base": "H5T_STD_I64LE", "dims": (3, 5)} @@ -676,8 +760,10 @@ def testCreateArrayType(self): dt = hdf5dtype.createDataType(typeItem) self.assertEqual(dt.name, "void960") self.assertEqual(dt.kind, "V") + self.assertEqual(dt.base.kind, "i") self.assertEqual(typeSize, 120) self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) def testCreateCompoundArrayVlenType(self): typeItem = { @@ -702,6 +788,7 @@ def testCreateCompoundArrayVlenType(self): self.assertEqual(dt.kind, "V") self.assertEqual(typeSize, "H5T_VARIABLE") 
self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertTrue(isVlen(dt)) dt_arr = dt["VALUE3"] self.assertEqual(dt_arr.kind, "V") self.assertEqual(dt_arr.shape, (8,)) @@ -725,6 +812,7 @@ def testCreateVlenObjRefType(self): self.assertEqual(dt.name, "object") self.assertEqual(dt.kind, "O") self.assertTrue(check_dtype(ref=dt) is None) + self.assertTrue(isVlen(dt)) dt_base = check_dtype(vlen=dt) self.assertTrue(dt_base is not None) self.assertTrue(check_dtype(ref=dt_base) is Reference) @@ -756,6 +844,7 @@ def testCreateCompoundArrayType(self): self.assertTrue("b" in dt.fields.keys()) self.assertEqual(typeSize, 11) self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) def testCompoundArrayType(self): typeItem = { @@ -787,6 +876,7 @@ def testCompoundArrayType(self): dt = hdf5dtype.createDataType(typeItem) typeSize = hdf5dtype.getItemSize(typeItem) self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) self.assertEqual(len(dt), 3) self.assertTrue("VALUE1" in dt.fields.keys()) self.assertTrue("VALUE2" in dt.fields.keys()) From d1e2b3901908d277d2377590521bca83fe5b6157 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 16 Apr 2025 16:03:36 +0200 Subject: [PATCH 034/129] fix for scalar json to arr conversion --- src/h5json/array_util.py | 6 +++-- test/unit/array_util_test.py | 52 ++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 91b5e499..73ec40cb 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -105,7 +105,9 @@ def jsonToArray(data_shape, data_dtype, data_json): """ Return numpy array from the given json array. 
""" + # print(f"jsonToArray: data_shape: {data_shape}, data_dtype: {data_dtype} data_json: {data_json}") def fillVlenArray(rank, data, arr, index): + # print(f"fillVlenArray rank: {rank} data: {data} arr: {arr} index: {index}") if arr.shape == (): arr[()] = data else: @@ -134,8 +136,8 @@ def fillVlenArray(rank, data, arr, index): if type(data_json) in (list, tuple): converted_data = [] - if npoints == 1: - converted_data = toTuple(np_shape_rank, data_json) + if np_shape_rank > 0 and npoints == 1 and len(data_json) == len(data_dtype): + converted_data.append(toTuple(0, data_json)) else: converted_data = toTuple(np_shape_rank, data_json) data_json = converted_data diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index cc2f63c3..699a80bc 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -202,6 +202,11 @@ def testJsonToArray(self): except UnicodeEncodeError: pass # expected + # UTF8 encode the data first + out = jsonToArray(shape, dt, data.encode('utf8')) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode('utf8')) + # VLEN data dt = special_dtype(vlen=np.dtype("int32")) shape = [4, ] @@ -298,46 +303,53 @@ def testJsonToArray(self): e1 = out[1].tolist() self.assertEqual(e1, (5, b"five")) - # compound with VLEN element - - dt_str = special_dtype(vlen=str) - dt = np.dtype([("a", "i4"), ("b", dt_str)]) - shape = [1, ] - data = [[6, "six"],] + data = [6, "six"] + shape = [1,] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, (1,)) - e0 = out[0] + self.assertTrue(isinstance(out[0], np.void)) + e1 = out[0].tolist() + self.assertEqual(e1, (6, b"six")) - e0 = out[0].tolist() - self.assertEqual(e0, (6, "six")) + data = [7, "seven"] shape = [] - data = [6, "six",] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, ()) - e0 = out[()] - self.assertEqual(len(e0), 2) - 
self.assertEqual(e0[0], 6) - self.assertEqual(e0[1], "six") + self.assertTrue(isinstance(out[()], np.void)) + e1 = out[()].tolist() + self.assertEqual(e1, (7, b"seven")) + + # compound with VLEN element + + dt_str = special_dtype(vlen=str) + dt = np.dtype([("a", "i4"), ("b", dt_str)]) + shape = [2, ] + data = [[4, "four"], [5, "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (2,)) + e0 = out[0].tolist() + self.assertEqual(e0, (4, "four")) - # one element compound shape = [1, ] - data = [[6, "six"],] + data = [6, "six"] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, (1,)) e0 = out[0].tolist() self.assertEqual(e0, (6, "six")) - # scalar compound shape = [] - data = [6, "six"] + data = [7, "seven",] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, ()) - e0 = out[()].tolist() - self.assertEqual(e0, (6, "six")) + e0 = out[()] + self.assertEqual(len(e0), 2) + self.assertEqual(e0[0], 7) + self.assertEqual(e0[1], "seven") # compound type with array field dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) From c6d77f877957fc1c16714ada9433b1fa9909b038 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 17 Apr 2025 19:42:57 +0200 Subject: [PATCH 035/129] support jsontoarray for all byte strings --- src/h5json/array_util.py | 10 ++++---- test/unit/array_util_test.py | 44 +++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 73ec40cb..bb416423 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -61,7 +61,7 @@ def bytesArrayToList(data): return out -def toTuple(rank, data): +def toTuple(rank, data, encoding=None): """ Convert a list to a tuple, recursively. Example. 
[[1,2],[3,4]] -> ((1,2),(3,4)) @@ -72,6 +72,8 @@ def toTuple(rank, data): else: return tuple(toTuple(rank - 1, x) for x in data) else: + if encoding: + data = data.encode(encoding, "surrogateesacpe") return data @@ -153,9 +155,9 @@ def fillVlenArray(rank, data, arr, index): try: arr = np.array(data_json, dtype=data_dtype) except UnicodeEncodeError: - # Unable to encode data - # TBD: look into using surrogate encoding here - raise + # Unable to encode data, encode as utf8 with surrogate escaping + data_json = toTuple(np_shape_rank, data_json, encoding="utf8") + arr = np.array(data_json, dtype=data_dtype) # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 699a80bc..13692625 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -116,6 +116,21 @@ def testJsonToArray(self): self.assertEqual(out.shape, ()) self.assertEqual(out[()], 42) + dt = np.dtype("S10") # fixed size string + shape = [5, ] + data = ["parting", "is", "such", "sweet", "sorrow"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, )) + self.assertEqual(out[4], b'sorrow') + + shape = () # scalar + data = "a string" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertEqual(out[()], b'a string') + # VLEN Scalar str dt = special_dtype(vlen=str) data = "I'm a string!" 
@@ -179,34 +194,37 @@ def testJsonToArray(self): self.assertEqual(out.dtype.kind, "O") self.assertEqual(out[2], "three") - # test ascii chars >127 + # test utf8 strings dt = np.dtype("S26") shape = [] - data = "extended ascii char 241: " + chr(241) - try: - jsonToArray(shape, dt, data) - self.assertTrue(False) - except ValueError: - pass # expected + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode("utf8")) dt = special_dtype(vlen=str) - out = jsonToArray(shape, dt, data) # vlen str should be ok + out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out[()], data) dt = np.dtype("S12") data = "eight: \u516b" - try: - jsonToArray(shape, dt, data) - self.assertTrue(False) - except UnicodeEncodeError: - pass # expected + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode("utf8")) # UTF8 encode the data first out = jsonToArray(shape, dt, data.encode('utf8')) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out[()], data.encode('utf8')) + # one-element array + shape = [1,] + dt = np.dtype("S12") + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertEqual(out[0], b'eight: \xe5\x85\xab') + # VLEN data dt = special_dtype(vlen=np.dtype("int32")) shape = [4, ] From cb3419afa0989fe17dc816a56f669cdf76cad2d8 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 18 Apr 2025 22:53:55 +0200 Subject: [PATCH 036/129] fix errors in jsonToArray function --- src/h5json/array_util.py | 47 +++++++-------------------- test/unit/array_util_test.py | 63 +++++++++++++++++++++++++----------- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index bb416423..ed3ba979 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -107,19 +107,6 @@ def jsonToArray(data_shape, 
data_dtype, data_json): """ Return numpy array from the given json array. """ - # print(f"jsonToArray: data_shape: {data_shape}, data_dtype: {data_dtype} data_json: {data_json}") - def fillVlenArray(rank, data, arr, index): - # print(f"fillVlenArray rank: {rank} data: {data} arr: {arr} index: {index}") - if arr.shape == (): - arr[()] = data - else: - for i in range(len(data)): - if rank > 1: - index = fillVlenArray(rank - 1, data[i], arr, index) - else: - arr[index] = data[i] - index += 1 - return index if data_json is None: return np.array([]).astype(data_dtype) @@ -131,33 +118,23 @@ def fillVlenArray(rank, data, arr, index): # need some special conversion for compound types -- # each element must be a tuple, but the JSON decoder # gives us a list instead. - if len(data_dtype) > 1 and not isinstance(data_json, (list, tuple)): + if len(data_dtype) > 0 and not isinstance(data_json, (list, tuple)): raise TypeError("expected list data for compound data type") npoints = getNumElements(data_shape) np_shape_rank = len(data_shape) if type(data_json) in (list, tuple): - converted_data = [] - if np_shape_rank > 0 and npoints == 1 and len(data_json) == len(data_dtype): - converted_data.append(toTuple(0, data_json)) - else: - converted_data = toTuple(np_shape_rank, data_json) - data_json = converted_data + data_json = toTuple(np_shape_rank, data_json) - if isVlen(data_dtype): - if np_shape_rank == 0 and npoints == 1: - arr_shape = () - else: - arr_shape = (npoints,) - arr = np.zeros(arr_shape, dtype=data_dtype) - fillVlenArray(np_shape_rank, data_json, arr, 0) - else: - try: - arr = np.array(data_json, dtype=data_dtype) - except UnicodeEncodeError: - # Unable to encode data, encode as utf8 with surrogate escaping - data_json = toTuple(np_shape_rank, data_json, encoding="utf8") - arr = np.array(data_json, dtype=data_dtype) + arr = np.zeros(data_shape, dtype=data_dtype) + + try: + # arr = np.array(data_json, dtype=data_dtype) + arr[...] 
= data_json + except UnicodeEncodeError: + # Unable to encode data, encode as utf8 with surrogate escaping + data_json = toTuple(np_shape_rank, data_json, encoding="utf8") + arr[...] = data_json # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this @@ -165,8 +142,6 @@ def fillVlenArray(rank, data, arr, index): msg = "Input data doesn't match selection number of elements" msg += f" Expected {npoints}, but received: {arr.size}" raise ValueError(msg) - if arr.shape != data_shape: - arr = arr.reshape(data_shape) # reshape to match selection return arr diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 13692625..21a5849b 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -82,6 +82,27 @@ def testToTuple(self): out = toTuple(1, data3d) # treat input a 1d array of compound type of compound types self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out) + def testToTupleStrData(self): + data = "a string!" 
+ out = toTuple(0, data) + self.assertEqual(data, out) + + data = ["a string!"] + out = toTuple(1, data) + self.assertEqual(data, out) + + data = ["a string2"] + out = toTuple(1, data) + self.assertEqual(data, out) + + data = [["partA", "partB", "partC"],] + out = toTuple(1, data) + self.assertEqual([("partA", "partB", "partC"), ], out) + + data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] + out = toTuple(1, data) + self.assertEqual([((4, 8, 12), 'four'), ((5, 10, 15), 'five')], out) + def testGetNumElements(self): shape = (4,) nelements = getNumElements(shape) @@ -98,7 +119,6 @@ def testGetNumElements(self): def testJsonToArray(self): # simple integer - dt = np.dtype("i4") shape = [4, ] data = [0, 2, 4, 6] @@ -151,6 +171,14 @@ def testJsonToArray(self): val = out[0] self.assertEqual(val, data) + # VLEN multi element + shape = [5, ] + data = ["parting", "is", "such", "sweet", "sorrow"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, )) + self.assertEqual(out[4], 'sorrow') + # VLEN ascii dt = special_dtype(vlen=bytes) data = [b"one", b"two", b"three", b"four", b"five"] @@ -167,22 +195,6 @@ def testJsonToArray(self): self.assertEqual(out[2], b"three") self.assertEqual(out[3], b"four") - # VLEN str - dt = special_dtype(vlen=str) - data = [ - [b"part 1 - section A", b"part 1 - section B"], - [b"part 2 - section A", b"part 2 - section B"], - ] - shape = [2,] - out = jsonToArray(shape, dt, data) - self.assertTrue(isinstance(out, np.ndarray)) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], str) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (2,)) - self.assertEqual(out[0], tuple(data[0])) - self.assertEqual(out[1], tuple(data[1])) - # VLEN unicode dt = special_dtype(vlen=bytes) data = ["one", "two", "three", "four", "five"] @@ -207,6 +219,12 @@ def testJsonToArray(self): self.assertTrue(isinstance(out, np.ndarray)) 
self.assertEqual(out[()], data) + data = ["I'm an UTF-8 null terminated string",] + shape = [1,] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[0], data[0]) + dt = np.dtype("S12") data = "eight: \u516b" out = jsonToArray(shape, dt, data) @@ -223,9 +241,16 @@ def testJsonToArray(self): dt = np.dtype("S12") data = "eight: \u516b" out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out[0], b'eight: \xe5\x85\xab') # VLEN data + shape = [] + dt = special_dtype(vlen=np.dtype("S10")) + data = ["foo", "bar"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + dt = special_dtype(vlen=np.dtype("int32")) shape = [4, ] data = [ @@ -321,7 +346,7 @@ def testJsonToArray(self): e1 = out[1].tolist() self.assertEqual(e1, (5, b"five")) - data = [6, "six"] + data = [[6, "six"],] shape = [1,] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) @@ -352,7 +377,7 @@ def testJsonToArray(self): self.assertEqual(e0, (4, "four")) shape = [1, ] - data = [6, "six"] + data = [[6, "six"],] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) self.assertEqual(out.shape, (1,)) From 15133347da056714f6626aacdc761acd594817c9 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 21 Apr 2025 16:10:59 +0200 Subject: [PATCH 037/129] added extra jsonToArray test --- src/h5json/array_util.py | 3 +++ test/unit/array_util_test.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index ed3ba979..eed15af6 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -108,6 +108,8 @@ def jsonToArray(data_shape, data_dtype, data_json): Return numpy array from the given json array. 
""" + print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") + if data_json is None: return np.array([]).astype(data_dtype) @@ -141,6 +143,7 @@ def jsonToArray(data_shape, data_dtype, data_json): if arr.size != npoints: msg = "Input data doesn't match selection number of elements" msg += f" Expected {npoints}, but received: {arr.size}" + print(msg) raise ValueError(msg) return arr diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 21a5849b..fc8167bf 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -364,6 +364,15 @@ def testJsonToArray(self): e1 = out[()].tolist() self.assertEqual(e1, (7, b"seven")) + data = [8, "eight"], + shape = [1,] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + self.assertTrue(isinstance(out[0], np.void)) + e1 = out[0].tolist() + self.assertEqual(e1, (8, b"eight")) + # compound with VLEN element dt_str = special_dtype(vlen=str) From 289bacbbd0bb4e66729dd60f2205ac2ee4b198d1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 21 Apr 2025 16:49:48 +0200 Subject: [PATCH 038/129] support setting single element compounds with a list --- src/h5json/array_util.py | 7 ++++++- test/unit/array_util_test.py | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index eed15af6..f68391cd 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -131,12 +131,17 @@ def jsonToArray(data_shape, data_dtype, data_json): arr = np.zeros(data_shape, dtype=data_dtype) try: - # arr = np.array(data_json, dtype=data_dtype) arr[...] = data_json except UnicodeEncodeError: # Unable to encode data, encode as utf8 with surrogate escaping data_json = toTuple(np_shape_rank, data_json, encoding="utf8") arr[...] 
= data_json + except ValueError: + if npoints == 1: + # try setting the first and only element + arr[0] = tuple(data_json) + else: + raise # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index fc8167bf..e9b1acd1 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -373,6 +373,15 @@ def testJsonToArray(self): e1 = out[0].tolist() self.assertEqual(e1, (8, b"eight")) + dt = np.dtype([("a", "i4"), ("b", "f4")]) + shape = [1, ] + data = [42, 0.42] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1, )) + e1 = out[0] + self.assertEqual(e1[0], 42) + # compound with VLEN element dt_str = special_dtype(vlen=str) From 135d88f7c0a1fd33ac01370339a94132aa04bc50 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 21 Apr 2025 18:23:46 +0200 Subject: [PATCH 039/129] handle assigning sequence to multi-dim array --- src/h5json/array_util.py | 43 ++++++++++++++++++++++++------------ test/unit/array_util_test.py | 30 +++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index f68391cd..87d24da6 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -110,6 +110,16 @@ def jsonToArray(data_shape, data_dtype, data_json): print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") + def get_array(data, rank, dtype): + # helper function to create an array with encoding if needed + try: + arr = np.array(data, dtype=dtype) + except UnicodeEncodeError: + # Unable to encode data, encode as utf8 with surrogate escaping + data = toTuple(rank, data, encoding="utf8") + arr = np.array(data, dtype=dtype) + return arr + if data_json is None: return np.array([]).astype(data_dtype) @@ 
-127,21 +137,16 @@ def jsonToArray(data_shape, data_dtype, data_json): if type(data_json) in (list, tuple): data_json = toTuple(np_shape_rank, data_json) + print("data_json after toTuple:", data_json) - arr = np.zeros(data_shape, dtype=data_dtype) - - try: + if isVlen(data_dtype): + # for vlen data we need to initialize of zero numpy array to ensure the right shape + arr = np.zeros(data_shape, dtype=data_dtype) + print("made vlen arr:", arr) arr[...] = data_json - except UnicodeEncodeError: - # Unable to encode data, encode as utf8 with surrogate escaping - data_json = toTuple(np_shape_rank, data_json, encoding="utf8") - arr[...] = data_json - except ValueError: - if npoints == 1: - # try setting the first and only element - arr[0] = tuple(data_json) - else: - raise + else: + arr = get_array(data_json, np_shape_rank, data_dtype) + # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this @@ -149,7 +154,17 @@ def jsonToArray(data_shape, data_dtype, data_json): msg = "Input data doesn't match selection number of elements" msg += f" Expected {npoints}, but received: {arr.size}" print(msg) - raise ValueError(msg) + # try adding an extra dimension to data_json + # for cases where e.g. 
compound types are not getting interpreted correctly + data_json = toTuple(np_shape_rank, [data_json, ]) + arr = get_array(data_json, np_shape_rank, data_dtype) + if arr.size != npoints: + # still no good, raise error + raise ValueError(msg) + + if arr.shape != tuple(data_shape): + print("reshaping to:", data_shape) + arr = arr.reshape(tuple(data_shape)) return arr diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index e9b1acd1..b413d2e6 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -136,6 +136,36 @@ def testJsonToArray(self): self.assertEqual(out.shape, ()) self.assertEqual(out[()], 42) + shape = (1, ) # one element + data = 42 + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1, )) + self.assertEqual(out[0], 42) + + shape = (10, ) # multi-1D + data = list(range(10)) + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10, )) + self.assertEqual(out[5], 5) + + shape = (5, 4) # multi-2D + data = [] + for i in range(5): + data.append([42, ] * 4) + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, 4)) + self.assertEqual(out[2, 3], 42) + + shape = (5, 4) # multi-2D, reshape input data + data = [42, ] * 20 + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, 4)) + self.assertEqual(out[2, 3], 42) + dt = np.dtype("S10") # fixed size string shape = [5, ] data = ["parting", "is", "such", "sweet", "sorrow"] From 13ea473a41f06da32343dc0d2cb401c54d2bdfb2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 21 Apr 2025 19:15:24 +0200 Subject: [PATCH 040/129] clean up debug print messages --- src/h5json/array_util.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 87d24da6..f47512fc 
100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -108,7 +108,7 @@ def jsonToArray(data_shape, data_dtype, data_json): Return numpy array from the given json array. """ - print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") + # print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") def get_array(data, rank, dtype): # helper function to create an array with encoding if needed @@ -137,12 +137,10 @@ def get_array(data, rank, dtype): if type(data_json) in (list, tuple): data_json = toTuple(np_shape_rank, data_json) - print("data_json after toTuple:", data_json) if isVlen(data_dtype): # for vlen data we need to initialize of zero numpy array to ensure the right shape arr = np.zeros(data_shape, dtype=data_dtype) - print("made vlen arr:", arr) arr[...] = data_json else: arr = get_array(data_json, np_shape_rank, data_dtype) @@ -153,7 +151,6 @@ def get_array(data, rank, dtype): if arr.size != npoints: msg = "Input data doesn't match selection number of elements" msg += f" Expected {npoints}, but received: {arr.size}" - print(msg) # try adding an extra dimension to data_json # for cases where e.g. 
compound types are not getting interpreted correctly data_json = toTuple(np_shape_rank, [data_json, ]) @@ -163,7 +160,6 @@ def get_array(data, rank, dtype): raise ValueError(msg) if arr.shape != tuple(data_shape): - print("reshaping to:", data_shape) arr = arr.reshape(tuple(data_shape)) return arr From 3b87203523a65e2a3fca48b17e659f6ac20c748d Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 9 May 2025 14:17:11 +0200 Subject: [PATCH 041/129] fix jsonToArray for single element compoound values --- src/h5json/array_util.py | 9 ++++++++- test/unit/array_util_test.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index f47512fc..cb39cd55 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -143,7 +143,14 @@ def get_array(data, rank, dtype): arr = np.zeros(data_shape, dtype=data_dtype) arr[...] = data_json else: - arr = get_array(data_json, np_shape_rank, data_dtype) + try: + arr = get_array(data_json, np_shape_rank, data_dtype) + except ValueError: + if npoints <= 1 and isinstance(data_json, list): + # try converting data to a tuple + arr = get_array(tuple(data_json), np_shape_rank, data_dtype) + else: + raise # raise an exception of the array shape doesn't match the selection shape # allow if the array is a scalar and the selection shape is one element, diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index b413d2e6..1ede343d 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -394,7 +394,7 @@ def testJsonToArray(self): e1 = out[()].tolist() self.assertEqual(e1, (7, b"seven")) - data = [8, "eight"], + data = [8, "eight"] shape = [1,] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) From ef390ec8c663bfe07fa62dbbcdd80aef47ace16e Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 30 May 2025 13:46:46 +0200 Subject: [PATCH 042/129] resturcture soruce tree --- pyproject.toml | 4 ++-- 
src/h5json/{reader => h5pystore}/h5py_reader.py | 2 +- src/h5json/{writer => h5pystore}/h5py_writer.py | 2 +- src/h5json/{reader => }/h5reader.py | 0 src/h5json/h5tojson/h5tojson.py | 4 ++-- src/h5json/{writer => }/h5writer.py | 0 src/h5json/{reader => jsonstore}/__init__.py | 0 src/h5json/{reader => jsonstore}/h5json_reader.py | 2 +- src/h5json/{writer => jsonstore}/h5json_writer.py | 2 +- src/h5json/jsontoh5/jsontoh5.py | 4 ++-- test/unit/h5json_reader_test.py | 2 +- test/unit/h5json_writer_test.py | 2 +- test/unit/h5py_reader_test.py | 2 +- test/unit/h5py_writer_test.py | 4 ++-- 14 files changed, 15 insertions(+), 15 deletions(-) rename src/h5json/{reader => h5pystore}/h5py_reader.py (99%) rename src/h5json/{writer => h5pystore}/h5py_writer.py (99%) rename src/h5json/{reader => }/h5reader.py (100%) rename src/h5json/{writer => }/h5writer.py (100%) rename src/h5json/{reader => jsonstore}/__init__.py (100%) rename src/h5json/{reader => jsonstore}/h5json_reader.py (99%) rename src/h5json/{writer => jsonstore}/h5json_writer.py (99%) diff --git a/pyproject.toml b/pyproject.toml index b45d1203..26997ae8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,8 +51,8 @@ build-backend = "setuptools.build_meta" package-dir = { "" = "src" } packages = [ "h5json", - "h5json.reader", - "h5json.writer", + "h5json.jsonstore", + "h5json.h5pystore", "h5json.h5tojson", "h5json.jsontoh5", "h5json.schema", diff --git a/src/h5json/reader/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py similarity index 99% rename from src/h5json/reader/h5py_reader.py rename to src/h5json/h5pystore/h5py_reader.py index 7042a259..3510b328 100644 --- a/src/h5json/reader/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -20,7 +20,7 @@ from .. 
import filters from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype -from .h5reader import H5Reader +from ..h5reader import H5Reader class H5pyReader(H5Reader): diff --git a/src/h5json/writer/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py similarity index 99% rename from src/h5json/writer/h5py_writer.py rename to src/h5json/h5pystore/h5py_writer.py index 2d281338..f2487826 100644 --- a/src/h5json/writer/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -19,7 +19,7 @@ from ..array_util import jsonToArray from .. import selections from .. import filters -from .h5writer import H5Writer +from ..h5writer import H5Writer class H5pyWriter(H5Writer): diff --git a/src/h5json/reader/h5reader.py b/src/h5json/h5reader.py similarity index 100% rename from src/h5json/reader/h5reader.py rename to src/h5json/h5reader.py diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index a2259dae..b479cdd4 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -15,8 +15,8 @@ import logging.handlers from h5json import Hdf5db -from h5json.writer.h5json_writer import H5JsonWriter -from h5json.reader.h5py_reader import H5pyReader +from h5json.jsonstore.h5json_writer import H5JsonWriter +from h5json.h5pystore.h5py_reader import H5pyReader def main(): diff --git a/src/h5json/writer/h5writer.py b/src/h5json/h5writer.py similarity index 100% rename from src/h5json/writer/h5writer.py rename to src/h5json/h5writer.py diff --git a/src/h5json/reader/__init__.py b/src/h5json/jsonstore/__init__.py similarity index 100% rename from src/h5json/reader/__init__.py rename to src/h5json/jsonstore/__init__.py diff --git a/src/h5json/reader/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py similarity index 99% rename from src/h5json/reader/h5json_reader.py rename to src/h5json/jsonstore/h5json_reader.py index 455b185c..4c4eef90 100644 --- a/src/h5json/reader/h5json_reader.py +++ 
b/src/h5json/jsonstore/h5json_reader.py @@ -17,7 +17,7 @@ from ..hdf5dtype import createDataType from ..array_util import jsonToArray from .. import selections -from .h5reader import H5Reader +from ..h5reader import H5Reader class H5JsonReader(H5Reader): diff --git a/src/h5json/writer/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py similarity index 99% rename from src/h5json/writer/h5json_writer.py rename to src/h5json/jsonstore/h5json_writer.py index 759f0aa2..4a94ad02 100644 --- a/src/h5json/writer/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -12,7 +12,7 @@ import json -from .h5writer import H5Writer +from ..h5writer import H5Writer from ..objid import getUuidFromId, getCollectionForId from ..array_util import bytesArrayToList from .. import selections diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py index fb58abb7..d572e58e 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -15,8 +15,8 @@ import logging.handlers from h5json import Hdf5db -from h5json.writer.h5py_writer import H5pyWriter -from h5json.reader.h5json_reader import H5JsonReader +from h5json.h5pystore.h5py_writer import H5pyWriter +from h5json.jsonstore.h5json_reader import H5JsonReader def main(): diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index 1c44e13c..f49a86a8 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -13,7 +13,7 @@ import logging import numpy as np from h5json import Hdf5db -from h5json.reader.h5json_reader import H5JsonReader +from h5json.jsonstore.h5json_reader import H5JsonReader from h5json import selections diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index e68314d7..0f1fb59a 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -14,7 +14,7 @@ import logging import numpy as np from h5json import Hdf5db -from h5json.writer.h5json_writer import H5JsonWriter 
+from h5json.jsonstore.h5json_writer import H5JsonWriter from h5json.hdf5dtype import special_dtype, Reference from h5json import selections diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index ef42a29d..45de125e 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -13,7 +13,7 @@ import logging from h5json import Hdf5db -from h5json.reader.h5py_reader import H5pyReader +from h5json.h5pystore.h5py_reader import H5pyReader class H5pyReaderTest(unittest.TestCase): diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index a103873b..f70acb59 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -15,8 +15,8 @@ import h5py import numpy as np from h5json import Hdf5db -from h5json.reader.h5json_reader import H5JsonReader -from h5json.writer.h5py_writer import H5pyWriter +from h5json.jsonstore.h5json_reader import H5JsonReader +from h5json.h5pystore.h5py_writer import H5pyWriter from h5json.hdf5dtype import special_dtype, Reference from h5json import selections From 8b426258d79e4813ada53cf37a782a2bedcb9c6c Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 30 May 2025 20:00:41 +0200 Subject: [PATCH 043/129] added prototype hsdsreader --- pyproject.toml | 1 + src/h5json/config.py | 213 +++++++++++++ src/h5json/h5pystore/__init__.py | 0 src/h5json/h5pystore/h5py_reader.py | 2 +- src/h5json/h5reader.py | 2 +- src/h5json/hdf5db.py | 2 +- src/h5json/jsonstore/h5json_reader.py | 3 +- src/h5json/openid.py | 438 ++++++++++++++++++++++++++ test/unit/hsds_reader_test.py | 114 +++++++ 9 files changed, 770 insertions(+), 5 deletions(-) create mode 100755 src/h5json/config.py create mode 100644 src/h5json/h5pystore/__init__.py create mode 100644 src/h5json/openid.py create mode 100644 test/unit/hsds_reader_test.py diff --git a/pyproject.toml b/pyproject.toml index 26997ae8..879e7ffb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ packages = [ "h5json",
"h5json.jsonstore", "h5json.h5pystore", + "h5json.hsdsstore", "h5json.h5tojson", "h5json.jsontoh5", "h5json.schema", diff --git a/src/h5json/config.py b/src/h5json/config.py new file mode 100755 index 00000000..b7602ffd --- /dev/null +++ b/src/h5json/config.py @@ -0,0 +1,213 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import os +import json + + +class Config: + """ + User Config state + """ + _cfg = {} # global state + + def __init__(self, config_file=None, **kwargs): + if Config._cfg: + return # already initialized + if config_file: + self._config_file = config_file + elif os.path.isfile(".hscfg"): + self._config_file = ".hscfg" + else: + self._config_file = os.path.expanduser("~/.hscfg") + # process config file if found + if os.path.isfile(self._config_file): + line_number = 0 + with open(self._config_file) as f: + for line in f: + line_number += 1 + s = line.strip() + if not s: + continue + if s[0] == '#': + # comment line + continue + fields = s.split('=') + if len(fields) < 2: + print(f"config file: {self._config_file} line: {line_number} is not valid") + continue + k = fields[0].strip() + v = fields[1].strip() + if k == "complex_names": + self.complex_names = v + elif k == "bool_names": + self.bool_names = v + elif k == "track_order": + self.track_order = v + else: + Config._cfg[k] = v + + # add standard keys if not already picked up + for k in ("hs_endpoint", "hs_username", 
"hs_password", "hs_api_key"): + if k not in Config._cfg: + Config._cfg[k] = "" + + # override any config values with environment variable if found + for k in Config._cfg.keys(): + if k.upper() in os.environ: + Config._cfg[k] = os.environ[k.upper()] + + # update any values that are passed in to the constructor + for k in kwargs.keys(): + Config._cfg[k] = kwargs[k] + + # finally, set defaults for any expected keys that are not already set + for k in ("hs_endpoint", "hs_username", "hs_endpoint"): + if k not in Config._cfg: + Config._cfg[k] = None + if "bool_names" not in Config._cfg: + Config._cfg["bool_names"] = (b"FALSE", b"TRUE") + if "complex_names" not in Config._cfg: + Config._cfg["complex_names"] = ("r", "i") + if "track_order" not in Config._cfg: + Config._cfg["track_order"] = False + + def __getitem__(self, name): + """ Get a config item """ + if name not in Config._cfg: + if name.upper() in os.environ: + Config._cfg[name] = os.environ[name.upper()] + else: + return None + return Config._cfg[name] + + def get(self, name, default): + """ return config value for name or default if None """ + val = self.__getitem__(name) + if val is None: + return default + else: + return default + + def __setitem__(self, name, obj): + """ set config item """ + Config._cfg[name] = obj + + def __delitem__(self, name): + """ Delete option. 
""" + del Config._cfg[name] + + def __len__(self): + return len(Config._cfg) + + def __iter__(self): + """ Iterate over config names """ + keys = Config._cfg.keys() + for key in keys: + yield key + + def __contains__(self, name): + return name in Config._cfg + + def __repr__(self): + return json.dumps(Config._cfg) + + def keys(self): + return Config._cfg.keys() + + @property + def hs_endpoint(self): + return Config._cfg.get("hs_endpoint") + + @property + def hs_username(self): + return Config._cfg.get("hs_username") + + @property + def hs_password(self): + return Config._cfg.get("hs_password") + + @property + def hs_api_key(self): + return Config._cfg.get("hs_api_key") + + @property + def bool_names(self): + if "bool_names" in Config._cfg: + names = Config._cfg["bool_names"] + else: + names = (b"FALSE", b"TRUE") + return names + + @bool_names.setter + def bool_names(self, value): + if isinstance(value, str): + names = value.split(()) + if len(names) < 2: + raise ValueError("bool_names must have two items") + elif len(names) == 2: + pass + else: + names = names[:2] # just use the first two items + elif len(value) != 2: + raise ValueError("expected two-element list for bool_names") + else: + names = value + Config._cfg["bool_names"] = tuple(names) + + @property + def complex_names(self): + if "complex_names" in Config._cfg: + names = Config._cfg["complex_names"] + else: + names = ("r", "i") + return names + + @complex_names.setter + def complex_names(self, value): + if isinstance(value, str): + names = value.split() + if len(names) < 2: + raise ValueError("complex_names must have two items") + elif len(names) == 2: + pass + else: + names = names[:2] # just use the first two items + elif len(value) != 2: + raise ValueError("complex_names must have two values") + else: + names = value + + Config._cfg["complex_names"] = tuple(names) + + @property + def track_order(self): + if "track_order" in Config._cfg: + track = Config._cfg["track_order"] + else: + track = False + 
return track + + @track_order.setter + def track_order(self, value): + if isinstance(value, str): + tokens = value.split() + if len(tokens) == 0: + track = False + else: + track = bool(tokens[0]) # strip any comments + else: + track = bool(value) + Config._cfg["track_order"] = track + + +def get_config(config_file=None, **kwargs): + return Config(config_file=config_file, **kwargs) diff --git a/src/h5json/h5pystore/__init__.py b/src/h5json/h5pystore/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 3510b328..dab44078 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -465,7 +465,7 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): return obj_json - def getDatasetValues(self, dset_id, sel=None): + def getDatasetValues(self, dset_id, sel=None, dtype=None): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index 377bc3f9..541bb262 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -50,7 +50,7 @@ def getAttribute(self, obj_id, name, includeData=True): pass @abstractmethod - def getDatasetValues(self, obj_id, sel=None): + def getDatasetValues(self, obj_id, sel=None, dtype=None): """ Get values from dataset identified by obj_id. 
If a slices list or tuple is provided, it should have the same diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index c632d93c..6ee8aaa2 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -472,7 +472,7 @@ def getDatasetValues(self, dset_id, sel): dtype = self.getDtype(dset_json) if self.reader: - arr = self.reader.getDatasetValues(dset_id, sel) + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) else: # TBD: Initialize with fill value if non-zero arr = np.zeros(sel.shape, dtype=dtype) diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index 4c4eef90..78df4567 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -165,7 +165,7 @@ def getDtype(self, obj_json): dtype = createDataType(type_item) return dtype - def getDatasetValues(self, obj_id, sel=None): + def getDatasetValues(self, obj_id, sel=None, dtype=None): """ Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same @@ -191,7 +191,6 @@ def getDatasetValues(self, obj_id, sel=None): else: dims = shape_json["dims"] - dtype = self.getDtype(json_obj) arr = jsonToArray(dims, dtype, json_value) if sel is None or sel.select_type == selections.H5S_SELECT_ALL: pass # just return the entire array diff --git a/src/h5json/openid.py b/src/h5json/openid.py new file mode 100644 index 00000000..bb59af54 --- /dev/null +++ b/src/h5json/openid.py @@ -0,0 +1,438 @@ +import os +import sys +import json +import requests +import time +from abc import ABC, abstractmethod +from datetime import datetime + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +# Azure +try: + import adal +except ModuleNotFoundError: + pass # change this to the eprint below to see the import error + # eprint()"Unable to import azure auth packages") + +# Google +try: + from google_auth_oauthlib.flow import InstalledAppFlow as GoogleInstalledAppFlow + from 
google.auth.transport.requests import Request as GoogleRequest + from google.oauth2.credentials import Credentials as GoogleCredentials + from google.oauth2 import id_token as GoogleIDToken +except ModuleNotFoundError: + pass # change this to the eprint below to see the import error + # eprint("Unable to import google auth packages") + + +from . import config as hsconfig + + +class OpenIDHandler(ABC): + + def __init__(self, endpoint, use_token_cache=True, username=None, password=None): + """Initialize the token.""" + + # Location of the token cache. + self._token_cache_file = os.path.expanduser('~/.hstokencfg') + self._endpoint = endpoint + self._username = username + self._password = password + + # The _token attribute should be a dict with at least the following keys: + # + # accessToken - The OpenID token to send. + # refreshToken - The refresh token (optional). + # expiresOn - The unix timestamp when the token expires (optional). + + if not use_token_cache or not os.path.isfile(self._token_cache_file): + self._token = None + else: + if username: + file_key = username + '@' + endpoint + else: + file_key = endpoint + with open(self._token_cache_file, 'r') as token_file: + self._token = json.load(token_file).get(file_key, None) + + @abstractmethod + def acquire(self): + """Acquire a new token from the provider.""" + pass + + @abstractmethod + def refresh(self): + """Refresh an existing token with the provider.""" + pass + + @property + def username(self): + """ Return username if known """ + return self._username + + @property + def expired(self): + """Return if the token is expired.""" + t = self._token + # add some buffer to account for clock skew + return t is not None and 'expiresOn' in t and time.time() + 10.0 >= t['expiresOn'] + + @property + def token(self): + """Return the token if valid, otherwise get a new one.""" + + if self.expired: + self.refresh() + if self._token: + self.write_token_cache() + + if self._token is None: + self.acquire() + 
self.write_token_cache() + + return self._token['accessToken'] + + def write_token_cache(self): + """Write the token to a file cache.""" + + cache_exists = os.path.isfile(self._token_cache_file) + + if self._username: + file_key = self._username + '@' + self._endpoint + else: + file_key = self._endpoint + + # Create a new cache file. + if not cache_exists and self._token is not None: + with open(self._token_cache_file, 'w') as token_file: + json.dump({file_key: self._token}, token_file) + + # Update an exisiting cache file. + elif cache_exists: + with open(self._token_cache_file, 'r+') as token_file: + cache = json.loads(token_file.read()) + + # Store valid tokens. + if self._token is not None: + cache[file_key] = self._token + + # Delete invalid tokens. + elif file_key in cache: + del cache[file_key] + + token_file.seek(0) + token_file.truncate(0) + json.dump(cache, token_file) + + +class AzureOpenID(OpenIDHandler): + + AUTHORITY_URI = 'https://login.microsoftonline.com' # login endpoint for AD auth + + def __init__(self, endpoint, config=None): + """Store configuration.""" + + # Configuration manager + hs_config = hsconfig.get_config() + + # Config is a dictionary. + if isinstance(config, dict): + self.config = config + + # Maybe client_secrets are in environment variables? 
+ else: + + self.config = { + 'AD_APP_ID': hs_config.get("hs_ad_app_id", None), + 'AD_TENANT_ID': hs_config.get("hs_ad_tenant_id", None), + 'AD_RESOURCE_ID': hs_config.get("hs_ad_resource_id", None), + 'AD_CLIENT_SECRET': hs_config.get("hs_ad_client_secret", None) + } + + if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: + use_token_cache = False + else: + use_token_cache = True + + super().__init__(endpoint, use_token_cache=use_token_cache) + + def write_token_cache(self): + if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: + pass # don't use token cache for unattended authentication + else: + super().write_token_cache() + + def acquire(self): + """Acquire a new Azure token.""" + + if "adal" not in sys.modules: + msg = "adal module not found, run: pip install -e . '.[azure]'" + raise ModuleNotFoundError(msg) + + app_id = self.config["AD_APP_ID"] + resource_id = self.config["AD_RESOURCE_ID"] + tenant_id = self.config["AD_TENANT_ID"] + client_secret = self.config.get("AD_CLIENT_SECRET", None) + authority_uri = self.AUTHORITY_URI + '/' + tenant_id + + # Try to get a token using different oauth flows. + context = adal.AuthenticationContext(authority_uri, enable_pii=True, api_version=None) + + try: + if client_secret is not None: + code = context.acquire_token_with_client_credentials(resource_id, app_id, client_secret) + else: + code = context.acquire_user_code(resource_id, app_id) + + except Exception as e: + eprint(f"unable to process AD token: {e}") + self._token = None + self.write_token_cache() + raise + + if "message" in code: + eprint(code["message"]) + mgmt_token = context.acquire_token_with_device_code(resource_id, code, app_id) + + elif "accessToken" in code: + mgmt_token = code + + else: + eprint("Could not authenticate with AD") + + # Only store some fields. 
+ self._token = { + 'accessToken': mgmt_token['accessToken'], + 'refreshToken': mgmt_token.get('refreshToken', None), + 'tenantId': mgmt_token.get('tenantId', tenant_id), + 'clientId': mgmt_token.get('_clientId', app_id), + 'resource': mgmt_token.get('resource', resource_id) + } + + # Parse time to timestamp. + if 'expiresOn' in mgmt_token: + expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') + self._token['expiresOn'] = expire_dt.timestamp() + + def refresh(self): + """Try to renew an Azure token.""" + + try: + + # This will work for device code flow, but not with client + # credentials. If we have the secret, we can just request a new + # token anyways. + + authority_uri = self.AUTHORITY_URI + '/' + self._token['tenantId'] + context = adal.AuthenticationContext(authority_uri, api_version=None) + mgmt_token = context.acquire_token_with_refresh_token(self._token['refreshToken'], + self._token['clientId'], + self._token['resource'], + None) + + # New token does not have all the metadata. + self._token['accessToken'] = mgmt_token['accessToken'] + self._token['refreshToken'] = mgmt_token['refreshToken'] + + # Parse time to timestamp. + if 'expiresOn' in mgmt_token: + expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') + self._token['expiresOn'] = expire_dt.timestamp() + + except Exception: + self._token = None + + +class GoogleOpenID(OpenIDHandler): + + def __init__(self, endpoint, config=None, scopes=None): + """Store configuration.""" + + if "google.oauth2" not in sys.modules: + msg = "google.oauth2 module not found, run: pip install -e . '.[google]'" + raise ModuleNotFoundError(msg) + + # Configuration manager + hs_config = hsconfig.get_config() + + if scopes is None: + scopes = hs_config.get('hs_google_scopes', 'openid').split() + self.scopes = scopes + + # Config is a client_secrets dictionary. + if isinstance(config, dict): + self.config = config + + # Config points to a client_secrets.json file. 
+ elif isinstance(config, str) and os.path.isfile(config): + with open(config, 'r') as f: + self.config = json.loads(f.read()) + + # Maybe client_secrets are in environment variables? + else: + self.config = { + 'installed': { + 'project_id': hs_config.get('hs_google_project_id', None), + 'client_id': hs_config.get('hs_google_client_id', None), + 'client_secret': hs_config.get('hs_google_client_secret', None), + 'auth_uri': 'https://accounts.google.com/o/oauth2/auth', + 'token_uri': 'https://oauth2.googleapis.com/token', + 'auth_provider_x509_cert_url': 'https://www.googleapis.com/oauth2/v1/certs', + 'redirect_uris': ['urn:ietf:wg:oauth:2.0:oob', 'http://localhost'] + } + } + + super().__init__(endpoint) + + def _parse(self, creds): + """Parse credentials.""" + + # NOTE: In Google OpenID, if a client is set up for InstalledAppFlow + # then the client_secret is not actually treated as a secret. Acquire + # will ALWAYS prompt for user input before granting a token. + + token = { + 'accessToken': creds.id_token, + 'refreshToken': creds.refresh_token, + 'tokenUri': creds.token_uri, + 'clientId': creds.client_id, + 'clientSecret': creds.client_secret, + 'scopes': creds.scopes + } + + # The expiry field that is in creds is for the OAuth token, not the + # OpenID token. We need to validate the OpenID tokenn to get the exp. 
+ idinfo = GoogleIDToken.verify_oauth2_token(creds.id_token, GoogleRequest()) + if 'exp' in idinfo: + token['expiresOn'] = idinfo['exp'] + + return token + + def acquire(self): + """Acquire a new Google token.""" + + flow = GoogleInstalledAppFlow.from_client_config(self.config, + scopes=self.scopes) + creds = flow.run_console() + self._token = self._parse(creds) + + def refresh(self): + """Try to renew a token.""" + + try: + + token = self._token + creds = GoogleCredentials(token=None, + refresh_token=token['refreshToken'], + scopes=token['scopes'], + token_uri=token['tokenUri'], + client_id=token['clientId'], + client_secret=token['clientSecret']) + + creds.refresh(GoogleRequest()) + self._token = self._parse(creds) + + except Exception: + self._token = None + + +class KeycloakOpenID(OpenIDHandler): + + def __init__(self, endpoint, config=None, scopes=None, username=None, password=None): + """Store configuration.""" + + # Configuration manager + hs_config = hsconfig.get_config() + + if scopes is None: + scopes = hs_config.get('hs_keycloak_scopes', 'openid').split() + self.scopes = scopes + + # Config is a client_secrets dictionary. + if isinstance(config, dict): + self.config = config + + # Config points to a client_secrets.json file. + elif isinstance(config, str) and os.path.isfile(config): + with open(config, 'r') as f: + self.config = json.loads(f.read()) + + # Maybe configs are in environment variables? 
+ else: + self.config = { + 'keycloak_client_id': hs_config.get('hs_keycloak_client_id', None), + 'keycloak_client_secret': hs_config.get('hs_keycloak_client_secret', None), + 'keycloak_realm': hs_config.get('hs_keycloak_realm', None), + 'keycloak_uri': hs_config.get('hs_keycloak_uri', None) + } + + super().__init__(endpoint, username=username, password=password) + + def _getKeycloakUrl(self): + if not self.config['keycloak_uri']: + raise KeyError("keycloak_uri not set") + if not self.config['keycloak_realm']: + raise KeyError("Keycloak realm not set") + if not self.config['keycloak_client_id']: + raise KeyError("keycloak client_id not set") + + url = self.config['keycloak_uri'] + url += "/realms/" + url += self.config['keycloak_realm'] + url += "/protocol/openid-connect/token" + + return url + + def _parse(self, creds): + """Parse credentials.""" + + # validate json returned by keycloak + if "token_type" not in creds: + raise IOError("Unexpected Keycloak JWT, no token_type") + if creds["token_type"].lower() != "bearer": + raise IOError("Unexpected Keycloak JWT, expected Bearer token") + + token = {} + if "access_token" not in creds: + raise IOError("Unexpected Keycloak JWT, no access_token") + token["accessToken"] = creds["access_token"] + if "refesh_token" in creds: + token["refreshToken"] = creds["refresh_token"] + if "expires_in" in creds: + now = time.time() + token['expiresOn'] = now + creds["expires_in"] + + # TBD: client_secret + # TBD: scopes + # TBD: client_id + + return token + + def acquire(self): + """Acquire a new Keycloak token.""" + keycloak_url = self._getKeycloakUrl() + + headers = {"Content-Type": "application/x-www-form-urlencoded"} + body = {} + body["username"] = self._username + body["password"] = self._password + body["grant_type"] = "password" + body["client_id"] = self.config.get("keycloak_client_id") + rsp = requests.post(keycloak_url, data=body, headers=headers) + + if rsp.status_code not in (200, 201): + print(f"POST error: 
{rsp.status_code}") + raise IOError(f"Keycloak response: {rsp.status_code}") + + creds = rsp.json() # TBD: catch json format errors? + self._token = self._parse(creds) + + def refresh(self): + """Try to renew a token.""" + # TBD + # unclear if refresh is supported without a client secret + self._token = None diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py new file mode 100644 index 00000000..cbc7f8bb --- /dev/null +++ b/test/unit/hsds_reader_test.py @@ -0,0 +1,114 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import logging +import numpy as np +from h5json import Hdf5db +from h5json.hsdsstore.hsds_reader import HSDSReader +from h5json import selections + + +def get_endpoint(): + return "http://hsds.hdf.test:5101" + + +def get_username(): + return "test_user1" + + +def get_password(): + return "test" + + +class HSDSReaderTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(HSDSReaderTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + handler = logging.FileHandler("./hsds_reader_test.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = "/home/test_user1/test/tall.h5" + kwargs = {"app_logger": self.log} + with Hdf5db(**kwargs) as db: + kwargs["username"] = get_username() + kwargs["password"] = get_password() + kwargs["endpoint"] = get_endpoint() + hsds_reader = HSDSReader(filepath, **kwargs) + db.reader = hsds_reader + root_id = db.getObjectIdByPath("/") + root_json = db.getObjectById(root_id) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], 
"H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) + sel_all = selections.select((10, 10), ...) + arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() From 638ab00940c99f03f883e8b0992fa030704567cc Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 2 Jun 2025 20:38:53 +0200 Subject: [PATCH 044/129] fix flake8 error --- src/h5json/openid.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/h5json/openid.py b/src/h5json/openid.py index bb59af54..af38d94a 100644 --- a/src/h5json/openid.py +++ b/src/h5json/openid.py @@ -6,6 +6,8 @@ from abc import ABC, abstractmethod from datetime import datetime +from . import config as hsconfig + def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) @@ -29,9 +31,6 @@ def eprint(*args, **kwargs): # eprint("Unable to import google auth packages") -from . 
import config as hsconfig - - class OpenIDHandler(ABC): def __init__(self, endpoint, use_token_cache=True, username=None, password=None): From 7e17e7b76cf102cce3f04e927629437e705b224a Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 3 Jun 2025 11:43:35 +0200 Subject: [PATCH 045/129] added missing hsds_reader files --- src/h5json/hsdsstore/hsds_reader.py | 281 ++++++++++ src/h5json/hsdsstore/httpconn.py | 791 ++++++++++++++++++++++++++++ 2 files changed, 1072 insertions(+) create mode 100644 src/h5json/hsdsstore/hsds_reader.py create mode 100644 src/h5json/hsdsstore/httpconn.py diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py new file mode 100644 index 00000000..5740a29c --- /dev/null +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -0,0 +1,281 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import json +import logging + +from ..objid import getCollectionForId, getUuidFromId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray +from ..
class HSDSReader(H5Reader):
    """
    Reader used by Hdf5db to fetch HDF5 content from an HSDS domain over HTTP.

    Every request goes through the HttpConn created in __init__. Object ids
    are mapped to their REST collection with getCollectionForId.
    """

    def __init__(
        self,
        domain_path,
        app_logger=None,
        endpoint=None,
        username=None,
        password=None,
        bucket=None,
        mode='r',
        api_key=None,
        use_session=True,
        expire_time=0,
        max_objects=0,
        max_age=0,
        retries=3,
        timeout=30.0,
    ):
        """
        Connect to the server and load the domain JSON for domain_path.

        Only options with a truthy value are forwarded to HttpConn; anything
        falsy is left to HttpConn's own default.

        Raises:
            IOError: if the "/about" or "/" request does not return 200, or
                if the domain JSON has no "root" key (i.e. it is a folder).
        """
        if app_logger:
            self.log = app_logger
        else:
            self.log = logging.getLogger()

        self.log.debug("HSDSReader init(")
        self.log.debug(f" domain_path: {domain_path}")

        # (HttpConn keyword, value, mask the value when logging)
        options = (
            ("endpoint", endpoint, False),
            ("username", username, False),
            ("password", password, True),
            ("bucket", bucket, False),
            ("mode", mode, False),
            ("api_key", api_key, True),
            # was stored under the misspelled key "user_session", which
            # HttpConn silently swallowed through **kwds
            ("use_session", use_session, False),
            ("expire_time", expire_time, False),
            ("max_objects", max_objects, False),
            ("max_age", max_age, False),
            ("retries", retries, False),
            ("timeout", timeout, False),
        )
        kwargs = {}
        for opt_name, opt_value, is_secret in options:
            if not opt_value:
                continue
            # never write credentials to the log, only their length
            shown = "*" * len(opt_value) if is_secret else opt_value
            self.log.debug(f" {opt_name}: {shown}")
            kwargs[opt_name] = opt_value

        super().__init__(domain_path, app_logger=app_logger)

        http_conn = HttpConn(domain_path, **kwargs)

        hsds_info = http_conn.serverInfo()
        self.log.debug(f"got hsds info: {hsds_info}")

        # try to do a GET from the domain
        req = "/"
        params = {}
        # TBD: have hsds support a max limit of objects to return, then
        # request the object metadata up front when max_objects allows it:
        #   params["getobjs"] = 1
        #   params["include_attrs"] = 1
        #   params["include_links"] = 1

        rsp = http_conn.GET(req, params=params)

        if rsp.status_code != 200:
            # file must exist
            http_conn.close()
            raise IOError(rsp.status_code, rsp.reason)

        domain_json = rsp.json()
        self.log.debug(f"got domain_json: {domain_json}")

        if "root" not in domain_json:
            http_conn.close()
            raise IOError(404, "Location is a folder, not a file")

        root_uuid = domain_json["root"]

        if mode in ("w", "w-", "x", "a"):
            # any write-capable mode is tracked as "r+" on the connection
            http_conn._mode = "r+"

        # TBD: when the domain response carries "domain_objs", preload them
        # into the object db instead of fetching each object on demand.

        self._root_id = root_uuid
        self._verboseInfo = None  # additional state we'll get when requested
        self._verboseUpdated = None  # when the verbose data was fetched
        self._lastScan = None  # when summary stats where last updated by server

        self._limits = domain_json["limits"] if "limits" in domain_json else None
        self._version = domain_json["version"] if "version" in domain_json else None

        self._http_conn = http_conn
        self._domain_json = domain_json

    def close(self):
        """ Currently a no-op """
        # NOTE(review): the HttpConn session stays open until
        # HttpConn.__del__ runs - confirm whether this should call
        # self._http_conn.close().
        pass

    def get_root_id(self):
        """ Return root id """
        return self._root_id

    def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_values=False):
        """
        Return the JSON for the object with the given id.

        include_attrs / include_links are passed to the server as query
        parameters; include_values is accepted but not used here.
        Raises IOError if the server does not return 200.
        """
        collection = getCollectionForId(obj_id)

        req = f"/{collection}/{obj_id}"
        self.log.debug(f"sending req: {req}")

        params = {}
        if include_attrs:
            params["include_attrs"] = 1
        if include_links:
            params["include_links"] = 1

        rsp = self._http_conn.GET(req, params=params)

        if rsp.status_code != 200:
            raise IOError(rsp.status_code, rsp.reason)

        obj_json = rsp.json()
        if "hrefs" in obj_json:
            # don't need these
            del obj_json["hrefs"]

        self.log.debug(f"got json for id: {obj_id}: {obj_json}")
        return obj_json

    def getAttribute(self, obj_id, name, includeData=True):
        """
        Get attribute given an object id and name
        returns: JSON object, or None if the server answers 404 or 410
        raises: IOError for any other non-200 status
        """
        self.log.debug(f"getAttribute({obj_id}), [{name}], include_data={includeData})")
        collection = getCollectionForId(obj_id)
        req = f"/{collection}/{obj_id}/attributes/{name}"

        params = {"IncludeData": 1 if includeData else 0}

        rsp = self._http_conn.GET(req, params=params)

        if rsp.status_code in (404, 410):
            self.log.warning(f"attribute {name} not found")
            return None

        if rsp.status_code != 200:
            self.log.error(f"GET {req} failed with status_code: {rsp.status_code}")
            raise IOError(rsp.status_code, rsp.reason)
        attr_json = rsp.json()

        if "hrefs" in attr_json:
            del attr_json["hrefs"]

        return attr_json

    def getDtype(self, obj_json):
        """
        Return the dtype for the type given by obj_json

        A string type item starting with "datatypes/" is treated as a
        reference to a committed type, which is fetched and resolved first.
        Raises KeyError if obj_json (or the committed type) has no "type".
        """
        if "type" not in obj_json:
            raise KeyError("no type item found")
        type_item = obj_json["type"]
        if isinstance(type_item, str) and type_item.startswith("datatypes/"):
            # this is a reference to a committed type
            ctype_id = "t-" + getUuidFromId(type_item)
            ctype_json = self.getObjectById(ctype_id)
            if "type" not in ctype_json:
                raise KeyError(f"Unexpected datatype: {ctype_json}")
            # Use the ctype's item json
            type_item = ctype_json["type"]
        return createDataType(type_item)

    def getDatasetValues(self, dset_id, sel=None, dtype=None):
        """
        Get values from dataset identified by dset_id.
        If a slices list or tuple is provided, it should have the same
        number of elements as the rank of the dataset.

        Returns the array built by jsonToArray, or None if the response has
        no "value" key.
        Raises ValueError if dset_id is not a dataset id, NotImplementedError
        for unsupported selection types and IOError on a non-200 response.
        """

        self.log.debug(f"getDatasetValues({dset_id}), sel={sel}")
        collection = getCollectionForId(dset_id)
        if collection != "datasets":
            msg = f"unexpected id: {dset_id} for getDatasetValues"
            self.log.warning(msg)
            # the exception used to be returned, so callers got a ValueError
            # instance back as if it were data
            raise ValueError(msg)

        params = {}
        if sel is None or sel.select_type == selections.H5S_SELECT_ALL:
            pass  # just return the entire array
        elif isinstance(sel, selections.SimpleSelection):
            params["select"] = sel.getQueryParam()
        else:
            raise NotImplementedError("selection type not supported")

        req = f"/{collection}/{dset_id}/value"
        rsp = self._http_conn.GET(req, params=params)
        if rsp.status_code != 200:
            self.log.error(f"GET {req} failed with status_code: {rsp.status_code}")
            raise IOError(rsp.status_code, rsp.reason)

        rsp_json = rsp.json()
        if "value" not in rsp_json:
            self.log.warning(f"value key not found for {dset_id}")
            return None

        self.log.debug(f"got rsp: {rsp_json}")
        json_value = rsp_json["value"]

        # NOTE(review): sel=None passes the check above but fails here with
        # AttributeError, since the shape is only taken from the selection -
        # confirm how callers are expected to request the full dataset.
        arr = jsonToArray(sel.mshape, dtype, json_value)

        return arr
# +############################################################################## + +from __future__ import absolute_import + +import os +import sys +import multiprocessing + +import base64 +import requests +import requests_unixsocket +from requests import ConnectionError +from requests.adapters import HTTPAdapter, Retry +import json +import logging + +from .. import openid +from .. import config + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +DEFAULT_TIMEOUT = ( + 10, + 1000, +) # #20 # 180 # seconds - allow time for hsds service to bounce + +""" +def verifyCert(self): + # default to validate CERT for https requests, unless + # the H5PYD_VERIFY_CERT environment variable is set and True + # + # TBD: set default to True once the signing authority of data.hdfgroup.org is + # recognized + if "H5PYD_VERIFY_CERT" in os.environ: + verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() + if verify_cert.startswith('F'): + return False + return True +""" + + +def getAzureApiKey(): + """construct API key for Active Directory if configured""" + # TBD: GoogleID? + + api_key = None + + # if Azure AD ids are set, pass them to HttpConn via api_key dict + cfg = config.get_config() # pulls in state from a .hscfg file (if found). 
def getKeycloakApiKey():
    """
    construct API key for Keycloak if configured

    Each setting is read from the environment first (HS_KEYCLOAK_URI,
    HS_KEYCLOAK_CLIENT_ID, HS_KEYCLOAK_REALM) and otherwise from the
    lower-case key of the same name in the config returned by
    config.get_config().

    Returns a dict to pass to HttpConn, or None unless the uri, client id
    and realm are all set.
    """
    cfg = config.get_config()  # pulls in state from a .hscfg file (if found).

    def _setting(env_name):
        # environment variable wins over the config entry
        if env_name in os.environ:
            return os.environ[env_name]
        cfg_name = env_name.lower()
        if cfg_name in cfg:
            return cfg[cfg_name]
        return None

    keycloak_uri = _setting("HS_KEYCLOAK_URI")
    keycloak_client_id = _setting("HS_KEYCLOAK_CLIENT_ID")
    # the realm used to be looked up in cfg under the environment variable's
    # name whenever HS_KEYCLOAK_REALM was set
    keycloak_realm = _setting("HS_KEYCLOAK_REALM")

    api_key = None
    # the uri was tested twice here and the realm not at all
    if keycloak_uri and keycloak_client_id and keycloak_realm:
        api_key = {
            "keycloak_uri": keycloak_uri,
            "keycloak_client_id": keycloak_client_id,
            "keycloak_realm": keycloak_realm,
            "openid_provider": "keycloak",
        }
    return api_key
"application/octet-stream": + return True + else: + return False + + @property + def is_json(self): + """ return true if response indicates json """ + + if self.content_type.startswith("application/json"): + return True + else: + return False + + @property + def text(self): + """ getresponse content as bytes """ + + if not self._text: + rsp = self._rsp + if not self.is_binary: + # hex encoded response? + # this is returned by API Gateway for lambda responses + self._text = bytes.fromhex(rsp.text) + else: + if self.content_length: + self.log.debug(f"got binary response, {self.content_length} bytes") + else: + self.log.debug("got binary response, content_length unknown") + + HTTP_CHUNK_SIZE = 4096 + http_chunks = [] + downloaded_bytes = 0 + for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE): + if http_chunk: # filter out keep alive chunks + self.log.debug(f"got http_chunk - {len(http_chunk)} bytes") + downloaded_bytes += len(http_chunk) + http_chunks.append(http_chunk) + if len(http_chunks) == 0: + raise IOError("no data returned") + if len(http_chunks) == 1: + # can return first and only chunk as response + self._text = http_chunks[0] + else: + msg = f"retrieved {len(http_chunks)} http_chunks " + msg += f" {downloaded_bytes} total bytes" + self.log.info(msg) + self._text = bytearray(downloaded_bytes) + index = 0 + for http_chunk in http_chunks: + self._text[index:(index + len(http_chunk))] = http_chunk + index += len(http_chunk) + + return self._text + + def json(self): + """ Return json from response""" + + rsp = self._rsp + + if not self.is_json: + raise IOError("response is not json") + + rsp_json = json.loads(rsp.text) + self.log.debug(f"rsp_json - {len(rsp.text)} bytes") + return rsp_json + + +class HttpConn: + """ + Some utility methods based on equivalents in base class. 
+ """ + + def __init__( + self, + domain_name, + endpoint=None, + username=None, + password=None, + bucket=None, + api_key=None, + mode="a", + use_session=True, + expire_time=1.0, + max_objects=None, + max_age=1.0, + logger=None, + retries=3, + timeout=DEFAULT_TIMEOUT, + **kwds, + ): + self._domain = domain_name + self._mode = mode + self._domain_json = None + self._use_session = use_session + self._retries = retries + self._timeout = timeout + self._api_key = api_key + self._s = None # Sessions + self._server_info = None + self._external_refs = [] + + self._logger = logger + if logger is None: + self.log = logging + else: + self.log = logging.getLogger(logger) + msg = f"HttpConn.init(domain: {domain_name} use_session: {use_session} " + msg += f"expire_time: {expire_time:6.2f} sec retries: {retries}" + self.log.debug(msg) + + if self._timeout != DEFAULT_TIMEOUT: + self.log.info(f"HttpConn.init - timeout = {self._timeout}") + if endpoint is None: + if "HS_ENDPOINT" in os.environ: + endpoint = os.environ["HS_ENDPOINT"] + + if not endpoint: + msg = "no endpoint set" + raise ValueError(msg) + + self._endpoint = endpoint + + if username is None: + if "HS_USERNAME" in os.environ: + username = os.environ["HS_USERNAME"] + if isinstance(username, str) and (not username or username.upper() == "NONE"): + username = None + self._username = username + + if password is None: + if "HS_PASSWORD" in os.environ: + password = os.environ["HS_PASSWORD"] + if isinstance(password, str) and (not password or password.upper() == "NONE"): + password = None + self._password = password + + if bucket is None: + if "HS_BUCKET" in os.environ: + bucket = os.environ["HS_BUCKET"] + if isinstance(bucket, str) and (not bucket or bucket.upper() == "NONE"): + bucket = None + self._bucket = bucket + + if api_key is None and "HS_API_KEY" in os.environ: + api_key = os.environ["HS_API_KEY"] + if isinstance(api_key, str) and (not api_key or api_key.upper() == "NONE"): + api_key = None + if not api_key: + 
def getHeaders(self, username=None, password=None, headers=None):
    """
    Return the headers to send with a request.

    "Accept-Encoding" is filled in when the caller did not set it. If the
    caller already supplied "Authorization" the headers are returned without
    adding credentials. Otherwise a bearer token is taken from the api key
    when one is set, or else a basic-auth header is built from the
    username/password (falling back to the values stored on the connection).

    The headers dict passed in is updated in place and returned.
    """
    if headers is None:
        headers = {}

    # This should be the default - but explicitly set anyway
    if "Accept-Encoding" not in headers:
        headers['Accept-Encoding'] = "deflate, gzip"

    # Checked on its own: as an elif of the test above it was skipped
    # whenever Accept-Encoding had to be added, and the caller's
    # Authorization header was then overwritten below.
    if "Authorization" in headers:
        return headers  # already have auth key

    if username is None:
        username = self._username
    if password is None:
        password = self._password

    if self._api_key:
        self.log.debug("using api key")
        # use OpenId handler to get a bearer token
        token = ""

        # Get a token, possibly refreshing if needed.
        if isinstance(self._api_key, openid.OpenIDHandler):
            token = self._api_key.token

        # Token was provided as a string.
        elif isinstance(self._api_key, str):
            token = self._api_key

        if token:
            auth_string = b"Bearer " + token.encode("ascii")
            headers["Authorization"] = auth_string
    elif username is not None and password is not None:
        self.log.debug(f"use basic auth with username: {username}")
        credentials = (username + ":" + password).encode("utf-8")
        headers["Authorization"] = b"Basic " + base64.b64encode(credentials)
    else:
        # no auth header
        self.log.debug("no auth header")

    return headers
+ server_version = None + return server_version + + def verifyCert(self): + # default to validate CERT for https requests, unless + # the H5PYD_VERIFY_CERT environment variable is set and True + # + # TBD: set default to True once the signing authority of data.hdfgroup.org is + # recognized + if "H5PYD_VERIFY_CERT" in os.environ: + verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() + if verify_cert.startswith("F"): + return False + return True + + def GET(self, req, format="json", params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + # check that domain is defined (except for some specific requests) + if req not in ("/domains", "/about", "/info", "/") and self._domain is None: + raise IOError(f"no domain defined: req: {req}") + + rsp = None + + headers = self.getHeaders(headers=headers) + + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key and not isinstance(self._api_key, dict): + params["api_key"] = self._api_key + domain = params["domain"] + self.log.debug(f"GET: {req} [{domain}] bucket: {self._bucket}") + + if format == "binary": + headers["accept"] = "application/octet-stream" + + self.log.info(f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}") + + for k in params: + if k != "domain": + v = params[k] + self.log.debug(f"GET params {k}:{v}") + + try: + s = self.session + stream = True # tbd - config for no streaming? 
+ + rsp = s.get( + self._endpoint + req, + params=params, + headers=headers, + stream=stream, + timeout=self._timeout, + verify=self.verifyCert(), + ) + self.log.info(f"status: {rsp.status_code}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + except Exception as e: + self.log.error(f"got {type(e)} exception: {e}") + raise IOError("Unexpected exception") + + return HttpResponse(rsp) + + def PUT(self, req, body=None, format="json", params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + if self._domain is None: + raise IOError("no domain defined") + + if params: + self.log.info(f"PUT params: {params}") + else: + params = {} + + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] = self._api_key + + # verify the file was open for modification + if self._mode == "r": + raise IOError("Unable to create group (No write intent on file)") + + # try to do a PUT to the domain + + headers = self.getHeaders(headers=headers) + + if format == "binary": + headers["Content-Type"] = "application/octet-stream" + # binary write + data = body + else: + headers["Content-Type"] = "application/json" + data = json.dumps(body) + + self.log.info(f"PUT: {req} format: {format} [{len(data)} bytes]") + + try: + s = self.session + rsp = s.put( + self._endpoint + req, + data=data, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + self.log.info(f"status: {rsp.status_code}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + + if rsp.status_code == 201 and req == "/": + self.log.info("clearing domain_json cache") + self._domain_json = None + self.log.info(f"PUT returning: {rsp}") + return HttpResponse(rsp) + + def POST(self, req, body=None, format="json", params=None, 
headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + if self._domain is None: + raise IOError("no domain defined") + + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] = self._api_key + + # verify we have write intent (unless this is a dataset point selection) + if req.startswith("/datasets/") and req.endswith("/value"): + point_sel = True + else: + point_sel = False + if self._mode == "r" and not point_sel: + raise IOError("Unable perform request (No write intent on file)") + + # try to do a POST to the domain + + headers = self.getHeaders(headers=headers) + + if isinstance(body, bytes): + headers["Content-Type"] = "application/octet-stream" + data = body + else: + # assume json + try: + data = json.dumps(body) + except TypeError: + msg = f"Unable to convert {body} to json" + self.log.error(msg) + raise IOError("JSON encoding error") + if format == "binary": + # recieve data as binary + headers["accept"] = "application/octet-stream" + + self.log.info("POST: " + req) + + try: + s = self.session + rsp = s.post( + self._endpoint + req, + data=data, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + except ConnectionError as ce: + self.log.warning(f"connection error: {ce}") + raise IOError(str(ce)) + + if rsp.status_code not in (200, 201): + self.log.error(f"POST error: {rsp.status_code}") + + return HttpResponse(rsp) + + def DELETE(self, req, params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + + if req not in ("/domains", "/") and self._domain is None: + raise IOError("no domain defined") + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] 
= self._api_key + + # verify we have write intent + if self._mode == "r": + raise IOError("Unable perform request (No write intent on file)") + + # try to do a DELETE of the resource + + headers = self.getHeaders(headers=headers) + + self.log.info("DEL: " + req) + try: + s = self.session + rsp = s.delete( + self._endpoint + req, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + self.log.info(f"status: {rsp.status_code}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + + if rsp.status_code == 200 and req == "/": + self.log.info("clearing domain_json cache") + self._domain_json = None + + return HttpResponse(rsp) + + @property + def session(self): + # create a session object to re-use http connection when possible + s = requests + retries = self._retries + backoff_factor = 1 + status_forcelist = (500, 502, 503, 504) + + if self._use_session: + if self._s is None: + if self._endpoint.startswith("http+unix://"): + self.log.debug(f"create unixsocket session: {self._endpoint}") + s = requests_unixsocket.Session() + else: + # regular request session + s = requests.Session() + + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + + s.mount( + "http://", + HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), + ) + s.mount( + "https://", + HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), + ) + self._s = s + else: + s = self._s + return s + + def add_external_ref(self, fid): + # this is used by the group class to keep references to external links open + if fid.__class__.__name__ != "FileID": + raise TypeError("add_external_ref, expected FileID type") + self._external_refs.append(fid) + + def close(self): + if self._s: + self._s.close() + self._s = None + + @property + def domain(self): + return self._domain + + @property + def username(self): + return 
self._username + + @property + def endpoint(self): + return self._endpoint + + @property + def password(self): + return self._password + + @property + def mode(self): + return self._mode + + @property + def domain_json(self): + if self._domain_json is None: + rsp = self.GET("/") + if rsp.status_code != 200: + raise IOError(rsp.reason) + # assume JSON + self._domain_json = rsp.json() + return self._domain_json + + @property + def root_uuid(self): + domain_json = self.domain_json + if "root" not in domain_json: + raise IOError("Unexpected response") + root_uuid = domain_json["root"] + return root_uuid + + @property + def compressors(self): + compressors = [] + if "compressors" in self.domain_json: + compressors = self.domain_json["compressors"] + if not compressors: + compressors = [ + "gzip", + ] + return compressors + + @property + def modified(self): + """Last modified time of the domain as a datetime object.""" + domain_json = self.domain_json + if "lastModified" not in domain_json: + raise IOError("Unexpected response") + last_modified = domain_json["lastModified"] + return last_modified + + @property + def created(self): + """Creation time of the domain""" + domain_json = self.domain_json + if "created" not in domain_json: + raise IOError("Unexpected response") + created = domain_json["created"] + return created + + @property + def owner(self): + """username of creator of domain""" + domain_json = self.domain_json + username = None + if "owner" in domain_json: + # currently this is only available for HSDS + username = domain_json["owner"] + return username + + @property + def logging(self): + """return name of logging handler""" + return self.log From e07bd8115b7ae27c46d1fa8bda91991077de222f Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 3 Jun 2025 13:21:05 +0200 Subject: [PATCH 046/129] fix flake8 error --- src/h5json/hsdsstore/hsds_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h5json/hsdsstore/hsds_reader.py 
b/src/h5json/hsdsstore/hsds_reader.py index 5740a29c..13f3b8ce 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -59,7 +59,7 @@ def __init__( self.log.debug(f" username: {username}") kwargs["username"] = username if password: - self.log.debug(f" password: {"*" * len(password)}") + self.log.debug(f" password: {'*' * len(password)}") kwargs["password"] = password if bucket: self.log.debug(f" bucket: {bucket}") @@ -68,7 +68,7 @@ def __init__( self.log.debug(f" mode: {mode}") kwargs["mode"] = mode if api_key: - self.log.debug(f" apI_key: {"*" * len(api_key)}") + self.log.debug(f" apI_key: {'*' * len(api_key)}") kwargs["api_key"] = api_key if use_session: self.log.debug(f" use_session: {use_session}") From 69baad8acec5e1e411f879ad78bdc670b40bd0b0 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 3 Jun 2025 13:34:02 +0200 Subject: [PATCH 047/129] fix import paths --- .github/workflows/ci.yml | 40 ++++++++++++++++++++++++++++++++++++++++ src/h5json/hdf5db.py | 4 ++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4e1040ca..9108d474 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,11 +25,13 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies shell: bash run: | python -m pip install --upgrade pip python -m pip install flake8 pytest + - name: Lint with flake8 shell: bash run: | @@ -37,10 +39,48 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --ignore=F401,W503,E203 --max-complexity=99 --max-line-length=127 --statistics + - name: Install h5json shell: bash run: | pip install -e . 
+ + - name: Checkout HSDS + uses: actions/checkout@v4 + with: + repository: HDFGroup/hsds + path: ${{github.workspace}}/hsds + + - name: Install HSDS + working-directory: ${{github.workspace}}/hsds + shell: bash + run: | + pip install -e . + + - name: Start HSDS + shell: bash + working-directory: ${{github.workspace}}/hsds + run: | + mkdir hsds_root + mkdir hsds_root/hsds_bucket + cp admin/config/groups.default admin/config/groups.txt + cp admin/config/passwd.default admin/config/passwd.txt + hsds --root_dir hsds_root --host localhost --port 5101 --password_file admin/config/passwd.txt --logfile hs.log --loglevel DEBUG --config_dir=admin/config --count=4 & + + - name: Wait for node startup + shell: bash + run: | + sleep 30 + + - name: HSDS Setup + shell: bash + env: + ADMIN_PASSWORD: admin + ADMIN_USERNAME: admin + working-directory: ${{github.workspace}}/hsds + run: | + python tests/integ/setup_test.py + - name: Run tests shell: bash run: | diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 6ee8aaa2..c00f91a1 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -18,8 +18,8 @@ from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId from . 
import selections from .apiversion import _apiver -from .reader.h5reader import H5Reader -from .writer.h5writer import H5Writer +from .h5reader import H5Reader +from .h5writer import H5Writer class Hdf5db: From 66b5b15fbac75ece472649cf7c104605e071ddf3 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 3 Jun 2025 16:44:32 +0200 Subject: [PATCH 048/129] use binary for dataset reads --- .github/workflows/ci.yml | 3 ++ src/h5json/hsdsstore/hsds_reader.py | 81 +++++++++++++++++++++-------- src/h5json/hsdsstore/httpconn.py | 41 ++++++++++----- test/unit/hsds_reader_test.py | 25 ++++----- testall.py | 16 +++++- 5 files changed, 115 insertions(+), 51 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9108d474..ba618d56 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,5 +83,8 @@ jobs: - name: Run tests shell: bash + HS_ENDPOINT: http://localhost:5101 + HS_USERNAME: test_user1 + HS_PASSWORD: test run: | python testall.py diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index 13f3b8ce..1c4eb28b 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -9,13 +9,12 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -import json import logging from ..objid import getCollectionForId, getUuidFromId from ..hdf5dtype import createDataType -from ..array_util import jsonToArray +from ..array_util import jsonToArray, bytesToArray from .. 
import selections from ..h5reader import H5Reader from .httpconn import HttpConn @@ -162,6 +161,10 @@ def __init__( self._root_id = "g-" + h5json["root"] """ + @property + def http_conn(self): + return self._http_conn + def close(self): pass @@ -183,15 +186,17 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_ if include_links: params["include_links"] = 1 - rsp = self._http_conn.GET(req, params=params) + rsp = self.http_conn.GET(req, params=params) if rsp.status_code != 200: raise IOError(rsp.status_code, rsp.reason) obj_json = rsp.json() - if "hrefs" in obj_json: - # don't need these - del obj_json["hrefs"] + # remove any unneeded keys + redundant_keys = ("hrefs", "root", "domain", "bucket", "linkCount", "attributeCount") + for key in redundant_keys: + if key in obj_json: + del obj_json[key] self.log.debug(f"got json for id: {obj_id}: {obj_json}") return obj_json @@ -208,7 +213,7 @@ def getAttribute(self, obj_id, name, includeData=True): params = {} params["IncludeData"] = 1 if includeData else 0 - rsp = self._http_conn.GET(req, params=params) + rsp = self.http_conn.GET(req, params=params) if rsp.status_code in (404, 410): self.log.warning(f"attribute {name} not found") @@ -254,28 +259,60 @@ def getDatasetValues(self, dset_id, sel=None, dtype=None): self.log.warning(msg) return ValueError(msg) - params = {} if sel is None or sel.select_type == selections.H5S_SELECT_ALL: - pass # just return the entire array - elif isinstance(sel, selections.SimpleSelection): - params["select"] = sel.getQueryParam() + query_param = None # just return the entire array + elif isinstance(sel, (selections.SimpleSelection, selections.FancySelection)): + query_param = sel.getQueryParam() else: - raise NotImplementedError("selection type not supported") + raise NotImplementedError(f"selection type: {type(sel)} not supported") + + mtype = dtype # TBD - support read time dtype + mshape = sel.mshape req = f"/{collection}/{dset_id}/value" - rsp = 
self._http_conn.GET(req, params=params) + params = {} + + if query_param: + params["select"] = query_param + + if mtype.names != dtype.names: + params["fields"] = ":".join(mtype.names) + + MAX_SELECT_QUERY_LEN = 100 + if len(query_param) > MAX_SELECT_QUERY_LEN: + # use a post method to avoid possible long query strings + try: + rsp = self.http_conn.POST(req, body=params, format="binary") + except IOError as ioe: + self.log.info(f"got IOError: {ioe.errno}") + raise IOError(f"Error retrieving data: {ioe.errno}") + else: + # make a http GET + try: + rsp = self.http_conn.GET(req, params=params, format="binary") + except IOError as ioe: + self.log.info(f"got IOError: {ioe.errno}") + raise IOError(ioe.errno, "Error retrieving data") + if rsp.status_code != 200: - self.log.error(f"GET {req} failed with status_code: {rsp.status_code}") - raise IOError(rsp.status_code, rsp.reason) + self.log.info(f"got http error: {rsp.status_code}") + raise IOError(rsp.status_code, "Error retrieving data") - rsp_json = rsp.json() - if "value" not in rsp_json: - self.log.warning(f"value key not found for {dset_id}") - return None + if rsp.is_binary: + # got binary response + self.log.info(f"binary response, {len(rsp.text)} bytes") + arr = bytesToArray(rsp.text, mtype, mshape) + else: + # got JSON response + # need some special conversion for compound types -- + # each element must be a tuple, but the JSON decoder + # gives us a list instead. 
+ self.log.info("json response") - self.log.debug(f"got rsp: {rsp_json}") - json_value = rsp_json["value"] + data = rsp.json()["value"] + # self.log.debug(data) - arr = jsonToArray(sel.mshape, dtype, json_value) + arr = jsonToArray(mshape, mtype, data) + self.log.debug(f"jsonToArray returned: {arr}") return arr diff --git a/src/h5json/hsdsstore/httpconn.py b/src/h5json/hsdsstore/httpconn.py index 7a686dff..14b3d54d 100644 --- a/src/h5json/hsdsstore/httpconn.py +++ b/src/h5json/hsdsstore/httpconn.py @@ -14,9 +14,9 @@ import os import sys -import multiprocessing - +import time import base64 + import requests import requests_unixsocket from requests import ConnectionError @@ -289,7 +289,7 @@ def __init__( if self._timeout != DEFAULT_TIMEOUT: self.log.info(f"HttpConn.init - timeout = {self._timeout}") - if endpoint is None: + if not endpoint: if "HS_ENDPOINT" in os.environ: endpoint = os.environ["HS_ENDPOINT"] @@ -299,21 +299,21 @@ def __init__( self._endpoint = endpoint - if username is None: + if not username: if "HS_USERNAME" in os.environ: username = os.environ["HS_USERNAME"] if isinstance(username, str) and (not username or username.upper() == "NONE"): username = None self._username = username - if password is None: + if not password: if "HS_PASSWORD" in os.environ: password = os.environ["HS_PASSWORD"] if isinstance(password, str) and (not password or password.upper() == "NONE"): password = None self._password = password - if bucket is None: + if not bucket: if "HS_BUCKET" in os.environ: bucket = os.environ["HS_BUCKET"] if isinstance(bucket, str) and (not bucket or bucket.upper() == "NONE"): @@ -479,7 +479,7 @@ def GET(self, req, format="json", params=None, headers=None): try: s = self.session stream = True # tbd - config for no streaming? 
- + ts = time.time() rsp = s.get( self._endpoint + req, params=params, @@ -488,7 +488,8 @@ def GET(self, req, format="json", params=None, headers=None): timeout=self._timeout, verify=self.verifyCert(), ) - self.log.info(f"status: {rsp.status_code}") + elapsed = time.time() - ts + self.log.info(f"status: GET {rsp.status_code}, elapsed: {elapsed:.4f}") except ConnectionError as ce: self.log.error(f"connection error: {ce}") raise IOError("Connection Error") @@ -496,6 +497,9 @@ def GET(self, req, format="json", params=None, headers=None): self.log.error(f"got {type(e)} exception: {e}") raise IOError("Unexpected exception") + if rsp.status_code != 200: + self.log.warning(f"GET {req} returned status: {rsp.status_code}") + return HttpResponse(rsp) def PUT(self, req, body=None, format="json", params=None, headers=None): @@ -536,6 +540,7 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): try: s = self.session + ts = time.time() rsp = s.put( self._endpoint + req, data=data, @@ -543,7 +548,8 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): params=params, verify=self.verifyCert(), ) - self.log.info(f"status: {rsp.status_code}") + elapsed = time.time() - ts + self.log.info(f"status: PUT {rsp.status_code}, elapsed: {elapsed:.4f}") except ConnectionError as ce: self.log.error(f"connection error: {ce}") raise IOError("Connection Error") @@ -551,7 +557,10 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): if rsp.status_code == 201 and req == "/": self.log.info("clearing domain_json cache") self._domain_json = None + if rsp.status_code not in (200, 201): + self.log.warning(f"got status code: {rsp.status_code} for PUT {req}") self.log.info(f"PUT returning: {rsp}") + return HttpResponse(rsp) def POST(self, req, body=None, format="json", params=None, headers=None): @@ -593,13 +602,14 @@ def POST(self, req, body=None, format="json", params=None, headers=None): self.log.error(msg) raise IOError("JSON encoding 
error") if format == "binary": - # recieve data as binary + # receive data as binary headers["accept"] = "application/octet-stream" self.log.info("POST: " + req) try: s = self.session + ts = time.time() rsp = s.post( self._endpoint + req, data=data, @@ -607,12 +617,14 @@ def POST(self, req, body=None, format="json", params=None, headers=None): params=params, verify=self.verifyCert(), ) + elapsed = time.time() - ts + self.log.info(f"status: POST {rsp.status_code}, elapsed: {elapsed:.4f}") except ConnectionError as ce: self.log.warning(f"connection error: {ce}") raise IOError(str(ce)) if rsp.status_code not in (200, 201): - self.log.error(f"POST error: {rsp.status_code}") + self.log.error(f"got status_code: {rsp.status_code} for DELETE: {req}") return HttpResponse(rsp) @@ -636,12 +648,12 @@ def DELETE(self, req, params=None, headers=None): raise IOError("Unable perform request (No write intent on file)") # try to do a DELETE of the resource - headers = self.getHeaders(headers=headers) self.log.info("DEL: " + req) try: s = self.session + ts = time.time() rsp = s.delete( self._endpoint + req, headers=headers, @@ -649,6 +661,8 @@ def DELETE(self, req, params=None, headers=None): verify=self.verifyCert(), ) self.log.info(f"status: {rsp.status_code}") + elapsed = time.time() - ts + self.log.info(f"status: DELETE {rsp.status_code}, elapsed: {elapsed:.4f}") except ConnectionError as ce: self.log.error(f"connection error: {ce}") raise IOError("Connection Error") @@ -657,6 +671,9 @@ def DELETE(self, req, params=None, headers=None): self.log.info("clearing domain_json cache") self._domain_json = None + if rsp.status_code != 200: + self.log.warning(f"got status_code: {rsp.status_code} for DELETE {req}") + return HttpResponse(rsp) @property diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py index cbc7f8bb..72cf6017 100644 --- a/test/unit/hsds_reader_test.py +++ b/test/unit/hsds_reader_test.py @@ -17,18 +17,6 @@ from h5json import selections -def 
get_endpoint(): - return "http://hsds.hdf.test:5101" - - -def get_username(): - return "test_user1" - - -def get_password(): - return "test" - - class HSDSReaderTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(HSDSReaderTest, self).__init__(*args, **kwargs) @@ -52,9 +40,6 @@ def testSimple(self): filepath = "/home/test_user1/test/tall.h5" kwargs = {"app_logger": self.log} with Hdf5db(**kwargs) as db: - kwargs["username"] = get_username() - kwargs["password"] = get_password() - kwargs["endpoint"] = get_endpoint() hsds_reader = HSDSReader(filepath, **kwargs) db.reader = hsds_reader root_id = db.getObjectIdByPath("/") @@ -81,6 +66,16 @@ def testSimple(self): dset_shape = dset_json["shape"] self.assertEqual(dset_shape["class"], "H5S_SIMPLE") self.assertEqual(dset_shape["dims"], [10, 10]) + + # got the 5th row of the dataset + sel_row = selections.select((10, 10), (5, slice(0, 10))) + row = db.getDatasetValues(dset111_id, sel_row) + self.assertTrue(isinstance(row, np.ndarray)) + self.assertEqual(row.shape, (10,)) + for i in range(10): + v = row[i] + self.assertEqual(v, i * 5) + sel_all = selections.select((10, 10), ...) 
arr = db.getDatasetValues(dset111_id, sel_all) self.assertTrue(isinstance(arr, np.ndarray)) diff --git a/testall.py b/testall.py index 5ca1934c..45e06106 100755 --- a/testall.py +++ b/testall.py @@ -15,7 +15,7 @@ import shutil import h5py -unit_tests = ( +unit_tests = [ "array_util_test", "objid_test", "hdf5dtype_test", @@ -24,7 +24,19 @@ "h5json_writer_test", "h5py_reader_test", "h5py_writer_test", -) +] + +use_hsds = True +for key in ("HS_ENDPOINT", "HS_USERNAME", "HS_PASSWORD"): + if key not in os.environ: + use_hsds = False + print(f"not including HSDS tests, no {key} environment set") + break + +if use_hsds: + unit_tests.append("hsds_reader_test") +unit_tests = tuple(unit_tests) + integ_tests = ("h5tojson_test", "jsontoh5_test") # verify the hdf5 lib version is recent From 0090d56a0f3e126b57372a4d5eee583071eac5cc Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 10 Jun 2025 17:00:41 +0200 Subject: [PATCH 049/129] add hsds_writer class --- src/h5json/h5pystore/h5py_reader.py | 29 ++- src/h5json/h5pystore/h5py_writer.py | 82 +++++--- src/h5json/h5reader.py | 30 +++ src/h5json/h5writer.py | 30 ++- src/h5json/hdf5db.py | 86 +++++++-- src/h5json/hsdsstore/hsds_reader.py | 43 ++--- src/h5json/hsdsstore/hsds_writer.py | 264 ++++++++++++++++++++++++++ src/h5json/jsonstore/h5json_reader.py | 17 +- src/h5json/jsonstore/h5json_writer.py | 24 ++- test/unit/h5py_reader_test.py | 1 + test/unit/h5py_writer_test.py | 62 +++--- 11 files changed, 564 insertions(+), 104 deletions(-) create mode 100644 src/h5json/hsdsstore/hsds_writer.py diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index dab44078..034566c6 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -148,20 +148,45 @@ def __init__( self.log = app_logger else: self.log = logging.getLogger() + if not h5py.is_hdf5(filepath): + self.log.warn(f"File: {filepath} is not an HDF5 file") + raise IOError("not an HDF5 file") 
super().__init__(filepath, app_logger=app_logger) - f = h5py.File(self._filepath) + self._f = None + self._root_id = None + + + def open(self): + if self._f: + return # already open + if self._id_map: + return # objects already loaded + if not self._root_id: + # get the root id from db if available + if self.db.root_id: + self.log.info("H5pyReader: got root_id from db") + self._root_id = self.db.root_id + else: + self.log.info("H5pyReader: creating root id") + self._root_id = createObjId(obj_type="groups") + + f = h5py.File(self.filepath) self._f = f - self._root_id = createObjId(obj_type="groups") self._id_map[self._root_id] = f addr = h5py.h5o.get_info(f.id).addr self._addr_map[addr] = self._root_id f.visititems(self.visit) + return self._root_id + def close(self): if self._f: self._f.close() self._f = None + def isClosed(self): + return False if self._f else True + def get_root_id(self): """ Return root id """ return self._root_id diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index f2487826..16c69681 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -13,7 +13,7 @@ import numpy as np import time -from ..objid import getCollectionForId, isValidUuid, getUuidFromId, isObjId +from ..objid import getCollectionForId, isValidUuid, createObjId from ..hdf5dtype import createDataType from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype from ..array_util import jsonToArray @@ -41,6 +41,7 @@ def __init__( else: self._init = True self._flush_time = 0.0 + self._f = None # h5py file handle def _copy_element(self, val, src_dt, tgt_dt, fout=None): """ convert the given dataset or attribute element to h5py equivalent """ @@ -390,40 +391,69 @@ def updateAttributes(self, obj_id, obj): def flush(self): """ Write dirty items """ - if not self.db: + if self.closed: # no db set yet return False + if not self._f: + raise IOError("open not called") + 
self.log.info("h5py_writer.flush()") root_id = self.db.root_id self._id_map[root_id] = "/" - mode = 'w' if self._init else 'a' - with h5py.File(self._filepath, mode=mode) as f: - if self.db.new_objects or self._init: - root_json = self.db.getObjectById(root_id) - - if "links" in root_json: - root_links = root_json["links"] - self._createObjects(f, root_links, visited=set((root_id,))) - # update attributes, dataset values - for obj_id in self._id_map: - if self.db.is_dirty(obj_id) or self._init: - h5path = self._id_map[obj_id] - obj = f[h5path] - self.updateAttributes(obj_id, obj) - collection = getCollectionForId(obj_id) - if collection == "datasets": - if self._init: - self.initializeDatasetValues(obj_id, obj) - else: - self.updateDatasetValues(obj_id, obj) - # mark time write is complete - # updates before this time will not need to be written - # TBD: possible race condition with multithreading - self._flush_time = time.time() + + if self.db.new_objects or self._init: + root_json = self.db.getObjectById(root_id) + + if "links" in root_json: + root_links = root_json["links"] + self._createObjects(self._f, root_links, visited=set((root_id,))) + # update attributes, dataset values + for obj_id in self._id_map: + if self.db.is_dirty(obj_id) or self._init: + h5path = self._id_map[obj_id] + obj = self._f[h5path] + self.updateAttributes(obj_id, obj) + collection = getCollectionForId(obj_id) + if collection == "datasets": + if self._init: + self.initializeDatasetValues(obj_id, obj) + else: + self.updateDatasetValues(obj_id, obj) + # mark time write is complete + # updates before this time will not need to be written + # TBD: possible race condition with multithreading + self._flush_time = time.time() self._init = False # done with init after first flush return True # all objects written successfully + + def open(self): + """ open HDF5 file """ + self.log.debug("h5pyWriter open") + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise 
ValueError("no db") + mode = 'w' if self._init else 'a' + self.log.info(f"creating h5py file: {self._filepath} mode: {mode}") + self._f = h5py.File(self._filepath, mode=mode) + if self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + return self._root_id + def close(self): """ close storage handle """ + self.log.debug("h5py_writer.close()") + if not self._f: + # no open on file + return self.flush() + self._f.close() + self._f = None + + def isClosed(self): + """ return closed status """ + return False if self._f else True diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index 541bb262..fbc53491 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -10,6 +10,7 @@ # request a copy from help@hdfgroup.org. # ############################################################################## from abc import ABC, abstractmethod +import weakref import logging @@ -31,6 +32,25 @@ def __init__( else: self.log = logging.getLogger() + def set_db(self, db): + self._db_ref = weakref.ref(db) + + @property + def db(self): + if not self._db_ref: + raise ValueError("db not available") + return self._db_ref() + + @property + def filepath(self): + """ return filepath """ + return self._filepath + + @property + def closed(self): + """ return True if the reader handle is closed (or never opened) """ + return self.isClosed() + @abstractmethod def get_root_id(self): """ Return root id """ @@ -58,7 +78,17 @@ def getDatasetValues(self, obj_id, sel=None, dtype=None): """ pass + @abstractmethod + def open(self): + """ Open data source for reading """ + pass + @abstractmethod def close(self): """ close any open handles to the storage """ pass + + @abstractmethod + def isClosed(self): + """ return True if handle is closed """ + pass diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index aaab2e51..bc52523d 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -39,12 +39,35 @@ def __init__( def 
set_db(self, db): self._db_ref = weakref.ref(db) + self.log.debug("writer set db ref") + + @property + def filepath(self): + return self._filepath + + @property + def closed(self): + return self.isClosed() @property def db(self): if not self._db_ref: - raise ValueError("db not available") + self.log.debug("db not available") + return None return self._db_ref() + + @property + def append(self): + return self._append + + #property + def no_data(self): + return self._no_data + + @abstractmethod + def open(self): + """ open storage handle, return root_id""" + return None @abstractmethod def flush(self): @@ -55,3 +78,8 @@ def flush(self): def close(self): """ close storage handle """ pass + + @abstractmethod + def isClosed(self): + """ return True if handle is closed """ + pass diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index c00f91a1..b28ebbc6 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -48,26 +48,32 @@ def __init__( self._db = {} - self._reader = h5_reader - self._writer = h5_writer + self._new_objects = set() # set of for newly created objects + self._dirty_objects = set() # set of modified objects + self._deleted_objects = set() # set of deleted objects - self._new_objects = set() # set of obj_id's - self._dirty_objects = set() # set of obj_id's + self._root_id = None - if self._reader: - root_id = self._reader.get_root_id() - group_json = self._reader.getObjectById(root_id) + if h5_reader: + self._reader = h5_reader + self._reader.set_db(self) else: - root_id = createObjId(obj_type="groups") - # create a root group - group_json = {"links": {}, "attributes": {}, "cpl": {}} - group_json["created"] = time.time() + self._reader = None - if self._writer: + if h5_writer: + self._writer = h5_writer self._writer.set_db(self) + else: + self._writer = None + + #root_id = createObjId(obj_type="groups") + # create a root group + #group_json = {"links": {}, "attributes": {}, "cpl": {}} + #group_json["created"] = time.time() - self._db[root_id] 
= group_json - self._root_id = root_id + + #self._db[root_id] = group_json + # self._root_id = root_id @property def db(self): @@ -86,6 +92,9 @@ def reader(self, value: H5Reader): self.flush() if self._reader: self._reader.close() + self._reader = value + self._reader.set_db(self) + """ root_id = value.get_root_id() if not root_id: raise ValueError(f"reader {type(value)} unable to return root_id") @@ -95,6 +104,7 @@ def reader(self, value: H5Reader): self._reader = value self._db[root_id] = group_json self._root_id = root_id + """ @property def writer(self): @@ -108,6 +118,7 @@ def writer(self, value: H5Writer): self._writer.close() self._writer = value if self._writer: + self.log.debug("writer set_db") self._writer.set_db(self) @property @@ -132,6 +143,10 @@ def new_objects(self): @property def dirty_objects(self): return self._dirty_objects + + @property + def deleted_objects(self): + return self._deleted_objects def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ @@ -161,6 +176,31 @@ def flush(self): self._new_objects = set() self._dirty_objects = set() + def open(self): + """ open reader and writer if set """ + if self.root_id: + self.log.warning("root id already set, multiple db.open calls") + return self.root_id + + if self.writer and self.writer.append: + # append mode for the writer, open writer and get the root id + self._root_id = self.writer.open() + elif self.reader: + self._root_id = self.reader.open() + else: + # no root id set by writer or reader, initialize now + self._root_id = createObjId(obj_type="groups") + if self.writer: + # open writer in create mode now that we have a root id + self.writer.open() + + # create a root group just as a memory object + group_json = {"links": {}, "attributes": {}, "cpl": {}} + group_json["created"] = time.time() + self._db[self._root_id] = group_json + + return self._root_id + def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") 
@@ -172,6 +212,10 @@ def close(self): self._root_id = None self._db = {} + @property + def closed(self): + return False if self.root_id else True + def __enter__(self): """ called on package init """ self.log.info("Hdf5db __enter") @@ -180,6 +224,7 @@ def __enter__(self): def __exit__(self, type, value, traceback): """ called on package exit """ self.log.info("Hdf5db __exit") + print("__exit__") self.close() def getObjectById(self, obj_id): @@ -190,6 +235,7 @@ def getObjectById(self, obj_id): obj_json = self.reader.getObjectById(obj_id) self.db[obj_id] = obj_json else: + print("keyerror - self.db:", self.db) raise KeyError(f"obj_id: {obj_id} not found") obj_json = self.db[obj_id] @@ -199,6 +245,9 @@ def getObjectIdByPath(self, h5path, parent_id=None): """ Return id for the given link path starting from parent_id if set, otherwise the root_id """ + if self.closed: + self.open() # initiate db + if h5path == "/": return self.root_id # just return root id @@ -551,6 +600,8 @@ def deleteObject(self, obj_id): if obj_id in self._dirty_objects: self._dirty_objects.remove(obj_id) + self._deleted_objects.add(obj_id) + def getLinks(self, grp_id): """ Get the links for the given group """ grp_json = self.getObjectById(grp_id) @@ -621,7 +672,8 @@ def deleteLink(self, grp_id, name): def createGroup(self, cpl=None): """ Create a new group """ - + if self.closed: + raise ValueError("db is closed") grp_id = createObjId("groups", root_id=self.root_id) group_json = {"attributes": {}, "links": {}} if cpl: @@ -638,6 +690,8 @@ def createCommittedType(self, datatype, cpl=None): createCommittedType - creates new named datatype Returns item """ + if self.closed: + raise ValueError("db is closed") self.log.info("createCommittedType") if cpl is None: cpl = {} @@ -667,6 +721,8 @@ def createDataset( createDataset - creates new dataset given shape and datatype Returns obj_id """ + if self.closed: + raise ValueError("db is closed") type_json = getTypeItem(dtype) if shape == "H5S_NULL": shape_json 
= {"class": "H5S_NULL"} diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index 1c4eb28b..b4de31d2 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -33,7 +33,6 @@ def __init__( username=None, password=None, bucket=None, - mode='r', api_key=None, use_session=True, expire_time=0, @@ -63,9 +62,6 @@ def __init__( if bucket: self.log.debug(f" bucket: {bucket}") kwargs["bucket"] = bucket - if mode: - self.log.debug(f" mode: {mode}") - kwargs["mode"] = mode if api_key: self.log.debug(f" apI_key: {'*' * len(api_key)}") kwargs["api_key"] = api_key @@ -88,10 +84,17 @@ def __init__( if timeout: self.log.debug(f" timeout: {timeout}") kwargs["timeout"] = timeout + # save these for when we create the connection + self._http_kwargs = kwargs super().__init__(domain_path, app_logger=app_logger) - http_conn = HttpConn(domain_path, **kwargs) + def open(self): + if self._http_conn: + return # open already called + + kwargs = self._http_kwargs + http_conn = HttpConn(self.filepath, **kwargs) hsds_info = http_conn.serverInfo() self.log.debug(f"got hsds info: {hsds_info}") @@ -122,22 +125,14 @@ def __init__( http_conn.close() raise IOError(404, "Location is a folder, not a file") - root_uuid = domain_json["root"] - - if mode in ("w", "w-", "x", "a"): - http_conn._mode = "r+" + root_id = domain_json["root"] + self._root_id = root_id """ if "domain_objs" in root_json: domain_objs = root_json["domain_objs"] objdb.load(domain_objs) - """ - - self._root_id = root_uuid - self._verboseInfo = None # additional state we'll get when requested - self._verboseUpdated = None # when the verbose data was fetched - self._lastScan = None # when summary stats where last updated by server - + """ if "limits" in domain_json: self._limits = domain_json["limits"] else: @@ -150,23 +145,19 @@ def __init__( self._http_conn = http_conn self._domain_json = domain_json - """ - # parse the json file - h5json = json.loads(text) - - 
self._h5json = h5json + return self._root_id - if "root" not in h5json: - raise Exception("no root key in input file") - self._root_id = "g-" + h5json["root"] - """ @property def http_conn(self): return self._http_conn def close(self): - pass + if self._http_conn: + self._http_conn.close() + + def isClosed(self): + return False is self._http_conn else True def get_root_id(self): """ Return root id """ diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py new file mode 100644 index 00000000..8144e085 --- /dev/null +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -0,0 +1,264 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import logging + +from ..objid import getCollectionForId, getUuidFromId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray, bytesToArray +from .. 
import selections +from ..h5writer import H5Writer +from .httpconn import HttpConn + + +class HSDSWriter(H5Writer): + """ + This class can be used by HDF5DB to read content from an hdf5-json file + """ + + def __init__( + self, + domain_path, + append=False, + no_data=False, + app_logger=None, + endpoint=None, + username=None, + password=None, + bucket=None, + api_key=None, + use_session=True, + expire_time=0, + max_objects=0, + max_age=0, + retries=3, + timeout=30.0, + track_order=False, + owner=None, + linked_domain=None + + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + if append: + self._init = False + else: + self._init = True + + if no_data: + self._no_data = True + else: + self._no_data = False + + self.log.debug("HSDSWriter init") + + kwargs = {} + self.log.debug(f" domain_path: {domain_path}") + self.log.debug(f" append: {append}") + if endpoint: + self.log.debug(f" endpoint: {endpoint}") + kwargs["endpoint"] = endpoint + if username: + self.log.debug(f" username: {username}") + kwargs["username"] = username + if password: + self.log.debug(f" password: {'*' * len(password)}") + kwargs["password"] = password + if bucket: + self.log.debug(f" bucket: {bucket}") + kwargs["bucket"] = bucket + if api_key: + self.log.debug(f" apI_key: {'*' * len(api_key)}") + kwargs["api_key"] = api_key + if use_session: + self.log.debug(f" use_session: {use_session}") + kwargs["user_session"] = use_session + if expire_time: + self.log.debug(f" expire_time: {expire_time}") + kwargs["expire_time"] = expire_time + if max_objects: + self.log.debug(f" max_objects: {max_objects}") + kwargs["max_objects"] = max_objects + if max_age: + self.log.debug(f" max_age: {max_age}") + kwargs["max_age"] = max_age + if retries: + self.log.debug(f" retries: {retries}") + kwargs["retries"] = retries + if timeout: + self.log.debug(f" timeout: {timeout}") + kwargs["timeout"] = timeout + self._http_kwargs = kwargs # save for when we create the connection + + 
super().__init__(domain_path, app_logger=app_logger) + + self._http_conn = None + self._root_id = None + self._append = append + self._owner = owner + self._track_order = track_order + self._linked_domain = linked_domain + self._domain_json = None + + def open(self): + """ setup domain for writing """ + + if self._http_conn: + http_conn = self._http_conn + else: + kwargs = self._http_kwargs + http_conn = HttpConn(self.filepath, **kwargs) + if self._append: + http_conn._mode = "a" + self._http_conn = http_conn + hsds_info = http_conn.serverInfo() + self.log.debug(f"got hsds info: {hsds_info}") + + if not self._domain_json: + # haven't fetched the domain json yet, do it now + + # try to do a GET from the domain + req = "/" + params = {} + """ + if max_objects is None or max_objects > 0: + # get object meta objects + # TBD: have hsds support a max limit of objects to return + params["getobjs"] = 1 + params["include_attrs"] = 1 + params["include_links"] = 1 + """ + + domain_json = None + rsp = http_conn.GET(req, params=params) + + if rsp.status_code not in (200, 404, 410): + msg = f"Got status code: {rsp.status_code} on initial domain get" + self.log.warning(msg) + raise IOError(msg) + + if rsp.status_code == 200: + if self._append: + # domain exists already + domain_json = rsp.json() + if "root" not in domain_json: + # this a folder not a domain + self.log.warning(f"folder: {self.filepath} has no root property") + http_conn.close() + raise IOError(404, "Location is a folder, not a file") + else: + # not append - delete existing domain + self.log.info(f"sending delete request for {self.filepath}") + delete_rsp = http_conn.DELETE(req, params=params) + if delete_rsp.status_code not in (200, 410): + # failed to delete + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + + if not domain_json: + # domain doesn't exist, create it + body = {} + if self.db.root_id: + # initialize domain using the db's root_id + body["root_id"] = self.db.root_id + if self._owner: 
+ body["owner"] = self._owner + if self._linked_domain: + body["linked_domain"] = linked_domain + if self._track_order: + create_props = {"CreateOrder": 1} + group_body = {"creationProperties": create_props} + body["group"] = group_body + rsp = http_conn.PUT(req, params=params, body=body) + if rsp.status_code != 201: + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + domain_json = rsp.json() + self.log.info(f"got rsp on PUT domain: {domain_json}") + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Unexpected error") + + self.log.debug(f"got domain_json: {domain_json}") + + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Location is a folder, not a file") + + root_id = domain_json["root"] + + self._root_id = root_id + + if "limits" in domain_json: + self._limits = domain_json["limits"] + else: + self._limits = None + if "version" in domain_json: + self._version = domain_json["version"] + else: + self._version = None + + self._domain_json = domain_json + + return self._root_id + + + @property + def http_conn(self): + return self._http_conn + + def flush(self): + """ Write dirty items """ + + if not self.db: + # no db set yet + return False + self.log.info("hsds_writer.flush()") + self.log.debug(f" new object count: {len(self.db.new_objects)}") + self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") + self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") + + #root_id = self.db.root_id + if self._init: + # initialize all existing objects + self.log.debug("flush -- init is true") + for obj_id in self.db: + self.log.debug(f"init: {obj_id}") + self._init = False + elif self.db.new_objects: + for obj_id in self.db.new_objects: + self.log.debug(f"new obj id: {obj_id}") + + for obj_id in self.db.dirty_objects: + self.log.debug(f"dirty object id: {obj_id}") + + for obj_id in self.db.deleted_objects: + self.log.debug(f"deleted object: {obj_id}") + + return True # all objects 
written successfully + + def close(self): + # over-ride of H5Writer method + self.flush() + self.http_conn.close() + self._http_conn = None + + def isClosed(self): + """ return closed status """ + return False if self._http_conn else True + + def get_root_id(self): + """ Return root id """ + return self._root_id diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index 78df4567..2013332a 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -36,8 +36,14 @@ def __init__( self.log = logging.getLogger() super().__init__(filepath, app_logger=app_logger) + self._root_id = None + self._h5json = None - with open(filepath) as f: + def open(self): + if self._h5json: + return # already read JSON file + + with open(self.filepath) as f: text = f.read() # parse the json file @@ -47,11 +53,20 @@ def __init__( if "root" not in h5json: raise Exception("no root key in input file") + self._root_id = "g-" + h5json["root"] + if self.db.root_id and self.db.root_id != self._root_id: + self.log.warning("h5json root id doesn't match db root id") + raise IOError("root id mismatch") + + return self._root_id def close(self): pass + def isClosed(self): + return False if self._h5json else False + def get_root_id(self): """ Return root id """ return self._root_id diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index 4a94ad02..c8da27ec 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -13,7 +13,7 @@ import json from ..h5writer import H5Writer -from ..objid import getUuidFromId, getCollectionForId +from ..objid import getUuidFromId, getCollectionForId, createObjId from ..array_util import bytesArrayToList from .. 
import selections @@ -32,21 +32,41 @@ def __init__( app_logger=None ): super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) + if append: + raise ValueError("H5JsonWriter does not support append mode") self.alias_db = {} self.json = {} - self._root_uuid = None + self._root_id = None def flush(self): """ Write dirty items """ # json writer doesn't support incremental updates, so we'll wait # for close to write out database + if not self._root_id: + msg = "flush called prior to open" + self.log.warning(msg) + raise IOError(msg) + self.log.info("flush") return False + + def open(self): + """ file open """ + # no incremental updates with h5json writer, so just fetch the root_id here + if self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + return self._root_id def close(self): """ close storage handle """ self.dumpFile() + def isClosed(self): + """ return closed status """ + return False if self._root_id else True + def getAliasList(self, obj_id): """ return list of alias """ if obj_id not in self.alias_db: diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index 45de125e..7c11c4f5 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -40,6 +40,7 @@ def testSimple(self): kwargs = {"app_logger": self.log} with Hdf5db(h5_reader=H5pyReader(filepath, **kwargs), **kwargs) as db: root_id = db.getObjectIdByPath("/") + print("got root_id:", root_id) root_json = db.getObjectById(root_id) root_attrs = root_json["attributes"] diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index f70acb59..3b4a14ab 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -35,7 +35,7 @@ def __init__(self, *args, **kwargs): self.log.setLevel(logging.DEBUG) # create logger - handler = logging.FileHandler("./hdf5dbtest.log") + handler = logging.FileHandler("./h5pywriterbtest.log") # add handler to logger 
self.log.addHandler(handler) @@ -71,28 +71,27 @@ def testSimple(self): db.createSoftLink(g2_id, "slink", "somewhere") db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) - db.flush() - with h5py.File(filepath) as f: - self.assertTrue("attr1", f.attrs) - self.assertTrue("attr2", f.attrs) - self.assertTrue("g1" in f) - g1 = f["g1"] - self.assertTrue("a1" in g1.attrs) - self.assertTrue("g1.1" in g1) - g11 = g1["g1.1"] - self.assertTrue("dset1.1.1" in g11) - dset = g11["dset1.1.1"] - self.assertEqual(dset.shape, (10, 10)) - for i in range(10): - for j in range(10): - self.assertEqual(dset[i, j], i * j) - self.assertTrue("g2" in f) - g2 = f["g2"] - self.assertTrue("extlink" in g2) - self.assertTrue("slink" in g2) + # open file with h5py and verify changes + with h5py.File(filepath) as f: + self.assertTrue("attr1", f.attrs) + self.assertTrue("attr2", f.attrs) + self.assertTrue("g1" in f) + g1 = f["g1"] + self.assertTrue("a1" in g1.attrs) + self.assertTrue("g1.1" in g1) + g11 = g1["g1.1"] + self.assertTrue("dset1.1.1" in g11) + dset = g11["dset1.1.1"] + self.assertEqual(dset.shape, (10, 10)) + for i in range(10): + for j in range(10): + self.assertEqual(dset[i, j], i * j) + self.assertTrue("g2" in f) + g2 = f["g2"] + self.assertTrue("extlink" in g2) + self.assertTrue("slink" in g2) db.createAttribute(g1_id, "a2", "bye-bye") - db.flush() with h5py.File(filepath) as f: g1 = f["g1"] @@ -114,16 +113,16 @@ def testSimple(self): db.setDatasetValues(dset_111_id, sel, arr) db.flush() - with h5py.File(filepath) as f: - dset = f["/g1/g1.1/dset1.1.1"] - for i in range(10): - for j in range(10): - if i == 4 and j == 4: - # this is the one element that was updated - expected = 42 - else: - expected = i * j - self.assertEqual(dset[i, j], expected) + with h5py.File(filepath) as f: + dset = f["/g1/g1.1/dset1.1.1"] + for i in range(10): + for j in range(10): + if i == 4 and j == 4: + # this is the one element that was 
updated + expected = 42 + else: + expected = i * j + self.assertEqual(dset[i, j], expected) def testNullSpaceAttribute(self): @@ -487,6 +486,7 @@ def testReaderWithUpdate(self): with h5py.File(file_out) as f: self.assertTrue("/g1/g1.1/dset1.1.1" in f) dset111 = f["/g1/g1.1/dset1.1.1"] + print("dset111 attrs:", list(dset111.attrs.keys())) self.assertEqual(len(dset111.attrs), 2) dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") From 9d59f8c1e17aed2a5e7b3796190237390b99953d Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 11 Jun 2025 16:17:57 +0200 Subject: [PATCH 050/129] fix db re-open logic --- src/h5json/h5pystore/h5py_writer.py | 11 ++-- src/h5json/hdf5db.py | 43 +++++++------- test/unit/h5py_writer_test.py | 88 +++++++++++++++-------------- 3 files changed, 76 insertions(+), 66 deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 16c69681..49acb4eb 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -384,20 +384,22 @@ def updateAttributes(self, obj_id, obj): for name in attrs: attr_json = attrs[name] if "created" in attr_json and attr_json["created"] < self._flush_time: - # ttribute should be saved already + # attribute should be saved already continue self.createAttribute(obj, name, attr_json) def flush(self): """ Write dirty items """ - if self.closed: # no db set yet + self.log.warning("h5py_writer - flush called but no db") return False if not self._f: + self.log.warning("h5py_writer file not open") raise IOError("open not called") self.log.info("h5py_writer.flush()") + root_id = self.db.root_id self._id_map[root_id] = "/" @@ -434,9 +436,10 @@ def open(self): # no db set yet self.log.warning("no self.db db_ref") raise ValueError("no db") - mode = 'w' if self._init else 'a' + mode = 'a' if self._append else 'w' self.log.info(f"creating h5py file: {self._filepath} mode: {mode}") - self._f = h5py.File(self._filepath, mode=mode) + self._f = 
h5py.File(self._filepath, mode=mode) + self._append = True # switch to append mode for next file open if self.db.root_id: self._root_id = self.db.root_id else: diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index b28ebbc6..340adc31 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -165,9 +165,9 @@ def make_dirty(self, obj_id): def flush(self): """ write out any changes """ + self.log.debug("db.flush()") if not self.writer: return # nothing to do - if not self.writer.flush(): # flush not successful, don't clear dirty set return @@ -179,25 +179,28 @@ def flush(self): def open(self): """ open reader and writer if set """ if self.root_id: - self.log.warning("root id already set, multiple db.open calls") - return self.root_id - - if self.writer and self.writer.append: - # append mode for the writer, open writer and get the root id - self._root_id = self.writer.open() - elif self.reader: - self._root_id = self.reader.open() - else: - # no root id set by writer or reader, initialize now - self._root_id = createObjId(obj_type="groups") + self.log.debug("root id already set, re-open call") if self.writer: - # open writer in create mode now that we have a root id self.writer.open() + if self.reader: + self.reader.open() + else: + if self.writer and self.writer.append: + # append mode for the writer, open writer and get the root id + self._root_id = self.writer.open() + elif self.reader: + self._root_id = self.reader.open() + else: + # no root id set by writer or reader, initialize now + self._root_id = createObjId(obj_type="groups") + if self.writer: + # open writer in create mode now that we have a root id + self.writer.open() - # create a root group just as a memory object - group_json = {"links": {}, "attributes": {}, "cpl": {}} - group_json["created"] = time.time() - self._db[self._root_id] = group_json + # create a root group just as a memory object + group_json = {"links": {}, "attributes": {}, "cpl": {}} + group_json["created"] = time.time() + 
self._db[self._root_id] = group_json return self._root_id @@ -209,8 +212,8 @@ def close(self): self.writer.close() if self.reader: self.reader.close() - self._root_id = None - self._db = {} + #self._root_id = None + #self._db = {} @property def closed(self): @@ -224,7 +227,6 @@ def __enter__(self): def __exit__(self, type, value, traceback): """ called on package exit """ self.log.info("Hdf5db __exit") - print("__exit__") self.close() def getObjectById(self, obj_id): @@ -235,7 +237,6 @@ def getObjectById(self, obj_id): obj_json = self.reader.getObjectById(obj_id) self.db[obj_id] = obj_json else: - print("keyerror - self.db:", self.db) raise KeyError(f"obj_id: {obj_id} not found") obj_json = self.db[obj_id] diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 3b4a14ab..042f01df 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -47,30 +47,31 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "test/unit/out/h5py_writer_test_testSimple.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) - db.createAttribute(root_id, "attr2", 42) - g1_id = db.createGroup() - db.createHardLink(root_id, "g1", g1_id) - db.createAttribute(g1_id, "a1", "hello") - g2_id = db.createGroup() - db.createHardLink(root_id, "g2", g2_id) - - g1_1_id = db.createGroup() - db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) - arr = np.zeros((10, 10), dtype=np.int32) - for i in range(10): - for j in range(10): - arr[i, j] = i * j - sel_all = selections.select((10, 10), ...) 
- db.setDatasetValues(dset_111_id, sel_all, arr) - db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) - db.createSoftLink(g2_id, "slink", "somewhere") - db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") - db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + db.createAttribute(g1_id, "a1", "hello") + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.close() # open file with h5py and verify changes with h5py.File(filepath) as f: @@ -91,27 +92,32 @@ def testSimple(self): g2 = f["g2"] self.assertTrue("extlink" in g2) self.assertTrue("slink" in g2) - db.createAttribute(g1_id, "a2", "bye-bye") - with h5py.File(filepath) as f: - g1 = f["g1"] - self.assertEqual(len(g1.attrs), 2) - self.assertTrue("a1" in g1.attrs) - self.assertTrue("a2" in g1.attrs) + db.open() + db.createAttribute(g1_id, "a2", "bye-bye") + db.close() - g21 = db.createGroup() - db.createHardLink(g2_id, "g2.1", g21) - db.flush() + with h5py.File(filepath) as f: + g1 = f["g1"] + self.assertEqual(len(g1.attrs), 2) + self.assertTrue("a1" in g1.attrs) + self.assertTrue("a2" in g1.attrs) - with h5py.File(filepath) as f: - g2 = f["g2"] - 
self.assertTrue("g2.1" in g2) + db.open() + g21 = db.createGroup() + db.createHardLink(g2_id, "g2.1", g21) + db.close() - sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) - arr = np.zeros((), dtype=np.int32) - arr[()] = 42 - db.setDatasetValues(dset_111_id, sel, arr) - db.flush() + with h5py.File(filepath) as f: + g2 = f["g2"] + self.assertTrue("g2.1" in g2) + + db.open() + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + db.setDatasetValues(dset_111_id, sel, arr) + db.close() with h5py.File(filepath) as f: dset = f["/g1/g1.1/dset1.1.1"] From 4894e6d498f6e8b30b57af378609723459ae8750 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 16 Jun 2025 20:56:57 +0200 Subject: [PATCH 051/129] support for h5py_writer --- src/h5json/h5pystore/h5py_reader.py | 3 +- src/h5json/h5pystore/h5py_writer.py | 11 +- src/h5json/h5reader.py | 4 +- src/h5json/h5writer.py | 10 +- src/h5json/hdf5db.py | 34 +- src/h5json/jsonstore/h5json_reader.py | 2 +- src/h5json/jsonstore/h5json_writer.py | 4 +- test/unit/h5py_writer_test.py | 594 +++++++++++++------------- 8 files changed, 342 insertions(+), 320 deletions(-) diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 034566c6..bc4b5820 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -154,7 +154,6 @@ def __init__( super().__init__(filepath, app_logger=app_logger) self._f = None self._root_id = None - def open(self): if self._f: @@ -169,7 +168,7 @@ def open(self): else: self.log.info("H5pyReader: creating root id") self._root_id = createObjId(obj_type="groups") - + f = h5py.File(self.filepath) self._f = f self._id_map[self._root_id] = f diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 49acb4eb..b4f81658 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -397,12 +397,12 @@ def flush(self): if 
not self._f: self.log.warning("h5py_writer file not open") raise IOError("open not called") - + self.log.info("h5py_writer.flush()") - + root_id = self.db.root_id self._id_map[root_id] = "/" - + if self.db.new_objects or self._init: root_json = self.db.getObjectById(root_id) @@ -428,7 +428,7 @@ def flush(self): self._init = False # done with init after first flush return True # all objects written successfully - + def open(self): """ open HDF5 file """ self.log.debug("h5pyWriter open") @@ -439,14 +439,13 @@ def open(self): mode = 'a' if self._append else 'w' self.log.info(f"creating h5py file: {self._filepath} mode: {mode}") self._f = h5py.File(self._filepath, mode=mode) - self._append = True # switch to append mode for next file open + self._append = True # switch to append mode for next file open if self.db.root_id: self._root_id = self.db.root_id else: self._root_id = createObjId(obj_type="groups") return self._root_id - def close(self): """ close storage handle """ self.log.debug("h5py_writer.close()") diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index fbc53491..3bf49ca7 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -40,12 +40,12 @@ def db(self): if not self._db_ref: raise ValueError("db not available") return self._db_ref() - + @property def filepath(self): """ return filepath """ return self._filepath - + @property def closed(self): """ return True if the reader handle is closed (or never opened) """ diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index bc52523d..3dfb8da8 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -44,7 +44,7 @@ def set_db(self, db): @property def filepath(self): return self._filepath - + @property def closed(self): return self.isClosed() @@ -55,15 +55,15 @@ def db(self): self.log.debug("db not available") return None return self._db_ref() - + @property def append(self): return self._append - - #property + + @property def no_data(self): return self._no_data - + 
@abstractmethod def open(self): """ open storage handle, return root_id""" diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 340adc31..581399f6 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -50,7 +50,7 @@ def __init__( self._new_objects = set() # set of for newly created objects self._dirty_objects = set() # set of modified objects - self._deleted_objects = set() # set of deleted objects + self._deleted_objects = set() # set of deleted objects self._root_id = None @@ -65,15 +65,6 @@ def __init__( self._writer.set_db(self) else: self._writer = None - - #root_id = createObjId(obj_type="groups") - # create a root group - #group_json = {"links": {}, "attributes": {}, "cpl": {}} - #group_json["created"] = time.time() - - - #self._db[root_id] = group_json - # self._root_id = root_id @property def db(self): @@ -143,7 +134,7 @@ def new_objects(self): @property def dirty_objects(self): return self._dirty_objects - + @property def deleted_objects(self): return self._deleted_objects @@ -178,6 +169,7 @@ def flush(self): def open(self): """ open reader and writer if set """ + self.log.debug("db.open()") if self.root_id: self.log.debug("root id already set, re-open call") if self.writer: @@ -185,23 +177,39 @@ def open(self): if self.reader: self.reader.open() else: + self.log.debug("db.open, getting root_id") + if self.writer and self.writer.append: # append mode for the writer, open writer and get the root id + self.log.debug("db.open, write append, getting root_id from writer") self._root_id = self.writer.open() + if self.reader: + reader_root_id = self.reader.open() + if reader_root_id != self._root_id: + # TBD: need someway to reconcile if both reader and writer have + # an potentiated idea on what there root id is + self.log.warn("reader root_id does not match writer root_id") elif self.reader: + self.log.debug("db.open, getting root_id from reader") self._root_id = self.reader.open() + if self.writer: + writer_root_id = self.writer.open() + 
if writer_root_id != self._root_id: + # TBD: same as above, need to deal with inconsistent root ids + self.log.warning("writer root_id does not match reader root_id") else: # no root id set by writer or reader, initialize now self._root_id = createObjId(obj_type="groups") if self.writer: # open writer in create mode now that we have a root id self.writer.open() - + # create a root group just as a memory object group_json = {"links": {}, "attributes": {}, "cpl": {}} group_json["created"] = time.time() self._db[self._root_id] = group_json + self.log.debug(f"db.open() - returning root_id: {self._root_id}") return self._root_id def close(self): @@ -212,8 +220,6 @@ def close(self): self.writer.close() if self.reader: self.reader.close() - #self._root_id = None - #self._db = {} @property def closed(self): diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index 2013332a..40f8e5e4 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -53,7 +53,7 @@ def open(self): if "root" not in h5json: raise Exception("no root key in input file") - + self._root_id = "g-" + h5json["root"] if self.db.root_id and self.db.root_id != self._root_id: self.log.warning("h5json root id doesn't match db root id") diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index c8da27ec..709f34fd 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -46,10 +46,10 @@ def flush(self): msg = "flush called prior to open" self.log.warning(msg) raise IOError(msg) - + self.log.info("flush") return False - + def open(self): """ file open """ # no incremental updates with h5json writer, so just fetch the root_id here diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 042f01df..b0889b3d 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -12,6 +12,8 @@ import unittest import time import 
logging +import os + import h5py import numpy as np from h5json import Hdf5db @@ -47,9 +49,13 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "test/unit/out/h5py_writer_test_testSimple.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) db.writer = H5pyWriter(filepath, no_data=False) root_id = db.open() + self.assertEqual(db.getObjectIdByPath("/"), root_id) db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) db.createAttribute(root_id, "attr2", 42) g1_id = db.createGroup() @@ -133,19 +139,21 @@ def testSimple(self): def testNullSpaceAttribute(self): filepath = "test/unit/out/h5py_writer_test_testNullSpaceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) - item = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in item) - shape_item = item["shape"] - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_NULL") - self.assertTrue(item["created"] > time.time() - 1.0) - value = db.getAttributeValue(root_id, "A1") - self.assertEqual(value, None) - db.flush() + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -154,28 +162,30 @@ def testNullSpaceAttribute(self): 
def testScalarAttribute(self): filepath = "test/unit/out/h5py_writer_test_testNullScalarAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - dims = () - value = 42 - print("test create attribute A1") - db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") - self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned - self.assertEqual(item["value"], 42) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + self.assertEqual(item_type["class"], 
"H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -186,22 +196,25 @@ def testScalarAttribute(self): def testFixedStringAttribute(self): filepath = "test/unit/out/h5py_writer_test_testFixedStringAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - value = "Hello, world!" - db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["length"], 13) - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + value = "Hello, world!" 
+ db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -212,28 +225,29 @@ def testFixedStringAttribute(self): def testVlenAsciiAttribute(self): filepath = "test/unit/out/h5py_writer_test_testVlenAsciiAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run value = b"Hello, world!" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - - dt = special_dtype(vlen=bytes) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt = special_dtype(vlen=bytes) + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + 
self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -244,28 +258,29 @@ def testVlenAsciiAttribute(self): def testVlenUtf8Attribute(self): filepath = "test/unit/out/h5py_writer_test_testVlenUtf8Attribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run value = "one: \u4e00" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - - dt = special_dtype(vlen=str) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item["value"], value) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt = special_dtype(vlen=str) + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], 
"H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], value) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -276,22 +291,25 @@ def testVlenUtf8Attribute(self): def testIntAttribute(self): filepath = "test/unit/out/h5py_writer_test_testIntAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run value = [2, 3, 5, 7, 11] - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "A1", value, dtype=np.int16) - item = db.getAttribute(root_id, "A1") - self.assertEqual(item["value"], [2, 3, 5, 7, 11]) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SIMPLE") - self.assertEqual(item_shape["dims"], [5,]) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -304,27 +322,26 @@ def testIntAttribute(self): def testCreateReferenceAttribute(self): filepath = 
"test/unit/out/h5py_writer_test_testCreateReferenceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - - dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) - - dt = special_dtype(ref=Reference) - - ds1_ref = "datasets/" + dset_id - value = [ds1_ref,] - db.createAttribute(root_id, "A1", value, dtype=dt) - attr = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in attr) - - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_REFERENCE") - self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") - attr_value = db.getAttributeValue(root_id, "A1") - self.assertEqual(len(attr_value), 1) - self.assertEqual(attr_value[0], ds1_ref.encode('ascii')) + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + dt = special_dtype(ref=Reference) + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref.encode('ascii')) + db.close() with h5py.File(filepath) as f: self.assertTrue("A1" in f.attrs) @@ -336,37 +353,35 @@ def testCreateReferenceAttribute(self): def testCreateVlenReferenceAttribute(self): filepath = "test/unit/out/h5py_writer_test_testVlenReferenceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - 
dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) - grp_id = db.createGroup() - db.createHardLink(root_id, "G1", grp_id) - - dt_base = special_dtype(ref=Reference) - dt = special_dtype(vlen=dt_base) - - ds1_ref = "datasets/" + dset_id - grp_ref = "groups/" + grp_id - ref_arr = np.zeros((2,), dtype=dt_base) - ref_arr[0] = ds1_ref - ref_arr[1] = grp_ref - vlen_arr = np.zeros((), dtype=dt) - vlen_arr[()] = ref_arr - - db.createAttribute(root_id, "A1", vlen_arr) - item = db.getAttribute(root_id, "A1") - - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_VLEN") - self.assertEqual(item_type["size"], "H5T_VARIABLE") - base_type = item_type["base"] - self.assertEqual(base_type["class"], "H5T_REFERENCE") - self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") - - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SCALAR") + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + item_shape = item["shape"] + self.assertEqual(item_shape["class"], 
"H5S_SCALAR") + db.close() with h5py.File(filepath) as f: self.assertTrue("DS1" in f) @@ -383,35 +398,33 @@ def testCreateVlenReferenceAttribute(self): def testCommittedType(self): filepath = "test/unit/out/h5py_writer_test_testCommittedType.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run dt = np.dtype("S15") - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - db.createHardLink(root_id, "T1", ctype_id) - - item_type = item["type"] - - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 15) - - # create an attribute using the committed type - db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - self.assertEqual(attr["value"], "hello world!") - - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_STRING") - self.assertEqual(attr_type["length"], 15) - self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.createHardLink(root_id, "T1", ctype_id) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) + # create an attribute using the 
committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + db.close() with h5py.File(filepath) as f: self.assertTrue("T1" in f) @@ -426,44 +439,41 @@ def testCommittedType(self): def testCommittedCompoundType(self): filepath = "test/unit/out/h5py_writer_test_testCommittedCompoundType.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run - with Hdf5db(app_logger=self.log) as db: - db.writer = H5pyWriter(filepath, no_data=False) - root_id = db.getObjectIdByPath("/") - - dt_str = special_dtype(vlen=str) - fields = [] - fields.append(("field_1", np.dtype(">i8"))) - fields.append(("field_2", np.dtype(">f8"))) - fields.append(("field_3", np.dtype("S15"))) - fields.append(("field_4", dt_str)) - dt = np.dtype(fields) - - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - db.createHardLink(root_id, "T1", ctype_id) - - item_type = item["type"] - - self.assertEqual(item_type["class"], "H5T_COMPOUND") - fields = item_type["fields"] - self.assertEqual(len(fields), 4) - - # create an attribute using the committed type - attr_value = (42, 3.14, "circle", "area = R^2 * PI") - db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - self.assertEqual(attr["value"], list(attr_value)) - attr_shape = attr["shape"] - self.assertEqual(attr_shape["class"], "H5S_SCALAR") - - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_COMPOUND") - arr = db.getAttributeValue(root_id, "A1") - self.assertTrue(isinstance(arr, np.ndarray)) + db = 
Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", np.dtype(">f8"))) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.createHardLink(root_id, "T1", ctype_id) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + arr = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(arr, np.ndarray)) + db.close() with h5py.File(filepath) as f: self.assertTrue("T1" in f) @@ -483,74 +493,82 @@ def testReaderWithUpdate(self): file_in = "data/json/tall.json" file_out = "test/unit/out/h5py_writer_test_testReaderWithUpdate.h5" + if os.path.isfile(file_out): + os.remove(file_out) # cleanup any previous run + + db = Hdf5db(app_logger=self.log) + db.reader = H5JsonReader(file_in) + db.writer = H5pyWriter(file_out) + db.open() + # close should create everything the json reader read to the output file + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 2) + + db.open() + dset111_id = 
db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + db.createAttribute(dset111_id, "attr3", "hello") + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"hello") + + db.open() + db.createAttribute(dset111_id, "attr3", "bye-bye") + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"bye-bye") + g1 = f["g1"] + + db.open() + # create a new group + g13_id = db.createGroup() + g1_id = db.getObjectIdByPath("/g1") + db.createHardLink(g1_id, "g1.3", g13_id) + db.close() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertEqual(len(g1), 3) + self.assertTrue("g1.3" in g1) + + db.open() + # create a new dataset + dset_id = db.createDataset(shape=(10, 10), dtype=np.int32) + db.createHardLink(g1_id, "DS1", dset_id) + db.close() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertTrue("DS1" in g1) + ds1 = g1["DS1"] + self.assertEqual(ds1.shape, (10, 10)) + + db.open() + arr = np.asarray(range(10), dtype=np.int32) + sel = selections.select((10, 10), (slice(5, 6), slice(0, 10))) + db.setDatasetValues(dset_id, sel, arr) + db.close() - with Hdf5db(app_logger=self.log) as db: - db.reader = H5JsonReader(file_in) - db.writer = H5pyWriter(file_out, no_data=False) - db.flush() - - with h5py.File(file_out) as f: - self.assertTrue("/g1/g1.1/dset1.1.1" in f) - dset111 = f["/g1/g1.1/dset1.1.1"] - print("dset111 attrs:", list(dset111.attrs.keys())) - self.assertEqual(len(dset111.attrs), 2) - - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") - db.createAttribute(dset111_id, "attr3", "hello") - db.flush() - - with h5py.File(file_out) as f: - self.assertTrue("/g1/g1.1/dset1.1.1" in f) - dset111 = f["/g1/g1.1/dset1.1.1"] - 
self.assertEqual(len(dset111.attrs), 3) - self.assertEqual(dset111.attrs["attr3"], b"hello") - - db.createAttribute(dset111_id, "attr3", "bye-bye") - db.flush() - - with h5py.File(file_out) as f: - self.assertTrue("/g1/g1.1/dset1.1.1" in f) - dset111 = f["/g1/g1.1/dset1.1.1"] - self.assertEqual(len(dset111.attrs), 3) - self.assertEqual(dset111.attrs["attr3"], b"bye-bye") - g1 = f["g1"] - - # create a new group - g13_id = db.createGroup() - g1_id = db.getObjectIdByPath("/g1") - db.createHardLink(g1_id, "g1.3", g13_id) - db.flush() - - with h5py.File(file_out) as f: - g1 = f["g1"] - self.assertEqual(len(g1), 3) - self.assertTrue("g1.3" in g1) - - # create a new dataset - dset_id = db.createDataset(shape=(10, 10), dtype=np.int32) - db.createHardLink(g1_id, "DS1", dset_id) - db.flush() - - with h5py.File(file_out) as f: - g1 = f["g1"] - self.assertTrue("DS1" in g1) - ds1 = g1["DS1"] - self.assertEqual(ds1.shape, (10, 10)) - - arr = np.asarray(range(10), dtype=np.int32) - sel = selections.select((10, 10), (slice(5, 6), slice(0, 10))) - db.setDatasetValues(dset_id, sel, arr) - db.flush() - - with h5py.File(file_out) as f: - ds1 = f["/g1/DS1"] - data = ds1[:, :] - for i in range(10): - for j in range(10): - if i == 5: - self.assertEqual(data[i, j], j) - else: - self.assertEqual(data[i, j], 0) + with h5py.File(file_out) as f: + ds1 = f["/g1/DS1"] + data = ds1[:, :] + for i in range(10): + for j in range(10): + if i == 5: + self.assertEqual(data[i, j], j) + else: + self.assertEqual(data[i, j], 0) if __name__ == "__main__": From 8324a46c1753372b08179b2a5d81024193fd6497 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 18 Jun 2025 20:43:30 +0100 Subject: [PATCH 052/129] fix jsontoh5 and h5tojson for new db interface --- src/h5json/h5pystore/h5py_writer.py | 2 +- src/h5json/h5tojson/h5tojson.py | 27 +++++++++---------------- src/h5json/jsonstore/h5json_writer.py | 2 +- src/h5json/jsontoh5/jsontoh5.py | 29 ++++++++++----------------- 4 files changed, 23 insertions(+), 37 
deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index b4f81658..9bea57b0 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -416,7 +416,7 @@ def flush(self): obj = self._f[h5path] self.updateAttributes(obj_id, obj) collection = getCollectionForId(obj_id) - if collection == "datasets": + if collection == "datasets" and not self.no_data: if self._init: self.initializeDatasetValues(obj_id, obj) else: diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index b479cdd4..284de84c 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -12,7 +12,6 @@ import sys import os.path as op import logging -import logging.handlers from h5json import Hdf5db from h5json.jsonstore.h5json_writer import H5JsonWriter @@ -33,28 +32,22 @@ def main(): filename = sys.argv[i] # create logger - log = logging.getLogger("h5tojson") - # log.setLevel(logging.WARN) - log.setLevel(logging.INFO) - # add log handler - handler = logging.FileHandler("./h5tojson.log") - - # add handler to logger - log.addHandler(handler) + logfname = "h5tojson.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + log = logging.getLogger() + # check that the input file exists if not op.isfile(filename): sys.exit(f"Cannot find file: {filename}") log.info(f"h5tojson {filename}") - kwargs = {"app_logger": log} - reader = H5pyReader(filename, **kwargs) - writer = H5JsonWriter(None, no_data=no_data, **kwargs) - kwargs["h5_reader"] = reader - kwargs["h5_writer"] = writer - - with Hdf5db(**kwargs) as db: - db.flush() + db = Hdf5db(app_logger=log) + db.reader = H5pyReader(filename, app_logger=log) + db.writer = H5JsonWriter(None, no_data=no_data, app_logger=log) + db.open() # read HDF5 data into db + db.close() # close will trigger write to json file if __name__ == "__main__": diff --git 
a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index 709f34fd..92d3499a 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -227,7 +227,7 @@ def dumpDataset(self, obj_id): if attributes: response["attributes"] = attributes - if not self._no_data: + if not self.no_data: if num_elements > 0: sel_all = selections.select(dims, ...) arr = self.db.getDatasetValues(obj_id, sel_all) diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py index d572e58e..28f5e002 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -12,7 +12,6 @@ import sys import os.path as op import logging -import logging.handlers from h5json import Hdf5db from h5json.h5pystore.h5py_writer import H5pyWriter @@ -36,29 +35,23 @@ def main(): hdf5_filename = sys.argv[i] # create logger - log = logging.getLogger("h5json") - # log.setLevel(logging.WARN) - log.setLevel(logging.INFO) - # add log handler - handler = logging.FileHandler("./jsontoh5.log") - - # add handler to logger - log.addHandler(handler) + logfname = "jsontoh5.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + log = logging.getLogger() + # check that the input file exists if not op.isfile(json_filename): sys.exit(f"Cannot find file: {json_filename}") log.info(f"jsontoh5 {json_filename} to {hdf5_filename}") - kwargs = {"app_logger": log} - - h5_reader = H5JsonReader(json_filename, **kwargs) - h5_writer = H5pyWriter(hdf5_filename, no_data=no_data, **kwargs) - kwargs["h5_reader"] = h5_reader - kwargs["h5_writer"] = h5_writer - - with Hdf5db(**kwargs) as db: - db.flush() + db = Hdf5db(app_logger=log) + db.reader = H5JsonReader(json_filename, app_logger=log) + db.writer = H5pyWriter(hdf5_filename, no_data=no_data, app_logger=log) + db.open() # read json data + # close should create everything the json reader read to the output 
file + db.close() if __name__ == "__main__": From 5c82129b56a4890ffaa4bf6f3e3695e0313e46c9 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 19 Jun 2025 19:49:34 +0100 Subject: [PATCH 053/129] update of hsds_writer --- src/h5json/h5pystore/h5py_writer.py | 1 + src/h5json/hsdsstore/hsds_reader.py | 11 ++-- src/h5json/hsdsstore/hsds_writer.py | 80 ++++++++++++++++++++++++---- test/unit/h5py_writer_test.py | 13 +++++ test/unit/hsds_writer_test.py | 82 +++++++++++++++++++++++++++++ 5 files changed, 172 insertions(+), 15 deletions(-) create mode 100644 test/unit/hsds_writer_test.py diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 9bea57b0..14942c11 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -409,6 +409,7 @@ def flush(self): if "links" in root_json: root_links = root_json["links"] self._createObjects(self._f, root_links, visited=set((root_id,))) + # update attributes, dataset values for obj_id in self._id_map: if self.db.is_dirty(obj_id) or self._init: diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index b4de31d2..55a8c022 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -86,13 +86,14 @@ def __init__( kwargs["timeout"] = timeout # save these for when we create the connection self._http_kwargs = kwargs + self._http_conn = None super().__init__(domain_path, app_logger=app_logger) def open(self): if self._http_conn: return # open already called - + kwargs = self._http_kwargs http_conn = HttpConn(self.filepath, **kwargs) @@ -132,7 +133,7 @@ def open(self): if "domain_objs" in root_json: domain_objs = root_json["domain_objs"] objdb.load(domain_objs) - """ + """ if "limits" in domain_json: self._limits = domain_json["limits"] else: @@ -147,7 +148,6 @@ def open(self): return self._root_id - @property def http_conn(self): return self._http_conn @@ -157,7 +157,10 @@ def close(self): 
self._http_conn.close() def isClosed(self): - return False is self._http_conn else True + if self._http_conn: + return False + else: + return True def get_root_id(self): """ Return root id """ diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 8144e085..c4a7c397 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -10,6 +10,7 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import logging +import time from ..objid import getCollectionForId, getUuidFromId @@ -111,6 +112,7 @@ def __init__( self._track_order = track_order self._linked_domain = linked_domain self._domain_json = None + self._last_flush_time = 0 def open(self): """ setup domain for writing """ @@ -140,7 +142,7 @@ def open(self): params["include_attrs"] = 1 params["include_links"] = 1 """ - + domain_json = None rsp = http_conn.GET(req, params=params) @@ -166,7 +168,7 @@ def open(self): # failed to delete http_conn.close() raise IOError(rsp.status_code, rsp.reason) - + if not domain_json: # domain doesn't exist, create it body = {} @@ -176,7 +178,7 @@ def open(self): if self._owner: body["owner"] = self._owner if self._linked_domain: - body["linked_domain"] = linked_domain + body["linked_domain"] = self._linked_domain if self._track_order: create_props = {"CreateOrder": 1} group_body = {"creationProperties": create_props} @@ -200,7 +202,7 @@ def open(self): root_id = domain_json["root"] self._root_id = root_id - + if "limits" in domain_json: self._limits = domain_json["limits"] else: @@ -214,11 +216,66 @@ def open(self): return self._root_id - @property def http_conn(self): return self._http_conn - + + def createObjects(self, obj_ids): + MAX_OBJECTS_PER_REQUEST = 1 + collections = ("groups", "datasets", "datatypes") + col_items = {} + for collection in collections: + col_items[collection] = [] + + for obj_id in obj_ids: + if obj_id == self._root_id: 
+ continue # this was created when the domain was + collection = getCollectionForId(obj_id) + obj_json = self.db.getObjectById(obj_id) + item = {"id": obj_id} + for key in ("links", "attributes"): + if key in obj_json: + item[key] = obj_json[key] + items = col_items[collection] + items.append(item) + if len(items) == MAX_OBJECTS_PER_REQUEST: + print("items:", items) + post_rsp = self.http_conn.POST("/" + collection, items) + print("post_rsp.status_code:", post_rsp.status_code) + if post_rsp.is_json: + print("post_rsp.json:", post_rsp.json()) + items.clear() + + # handle any remainder items + for collection in collections: + items = col_items[collection] + if items: + self.http_conn.POST("/" + collection, items) + + def updateLinks(self, grp_ids): + """ update any modified links of the given objects """ + + print("updateLinks:", grp_ids) + body = {} # body will hold a map of grp ids to link lists + + for grp_id in grp_ids: + if getCollectionForId(grp_id) != "groups": + continue # ignore datasets and datatypes + grp_json = self.db.getObjectById(grp_id) + grp_links = grp_json["links"] + print(f"grp_id {grp_id} links: {grp_links}") + for link_json in grp_links: + if "created" not in link_json: + self.log.error(f"hsds_writer> expected created timestamp in link: {link_json}") + created = link_json["created"] + if created > self._last_flush_time: + # new link, add to our list + if grp_id not in body: + body[grp_id] = {} + + if body: + print("updateLinks, body:", body) + def flush(self): """ Write dirty items """ @@ -230,30 +287,31 @@ def flush(self): self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") - #root_id = self.db.root_id if self._init: # initialize all existing objects - self.log.debug("flush -- init is true") + self.log.debug(f"flush -- init is true, self.db: {self.db.db}") for obj_id in self.db: self.log.debug(f"init: {obj_id}") + self.createObjects(self.db.db.keys()) 
self._init = False elif self.db.new_objects: for obj_id in self.db.new_objects: self.log.debug(f"new obj id: {obj_id}") + self.createObjects(self.db.new_objects) for obj_id in self.db.dirty_objects: self.log.debug(f"dirty object id: {obj_id}") + self.updateLinks(self.db.dirty_objects) for obj_id in self.db.deleted_objects: self.log.debug(f"deleted object: {obj_id}") - + + self._last_flush_time = time.time() return True # all objects written successfully def close(self): # over-ride of H5Writer method self.flush() - self.http_conn.close() - self._http_conn = None def isClosed(self): """ return closed status """ diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index b0889b3d..3ff91bee 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -61,6 +61,19 @@ def testSimple(self): g1_id = db.createGroup() db.createHardLink(root_id, "g1", g1_id) db.createAttribute(g1_id, "a1", "hello") + db.close() + + # open file with h5py and verify changes + with h5py.File(filepath) as f: + self.assertTrue("attr1", f.attrs) + self.assertTrue("attr2", f.attrs) + self.assertEqual(len(f), 1) + self.assertTrue("g1" in f) + g1 = f["g1"] + self.assertTrue("a1" in g1.attrs) + self.assertEqual(len(g1), 0) + + db.open() g2_id = db.createGroup() db.createHardLink(root_id, "g2", g2_id) diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py new file mode 100644 index 00000000..a3ba9bea --- /dev/null +++ b/test/unit/hsds_writer_test.py @@ -0,0 +1,82 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. 
If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import time +import logging +import h5py +import numpy as np +from h5json import Hdf5db +from h5json.hsdsstore.hsds_writer import HSDSWriter +from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections + + +class HSDSWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(HSDSWriterTest, self).__init__(*args, **kwargs) + # main + + # create logger + logfname = "hsds_writer_test.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + self.log = logging.getLogger() + self.log.info("init!") + + def testSimple(self): + + filepath = "/home/test_user1/writer_test.h5" + db = Hdf5db(app_logger=self.log) + db.writer = HSDSWriter(filepath) + root_id = db.open() + print("root_id:", root_id) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + db.createAttribute(g1_id, "a1", "hello") + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) 
+ db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() + + db.createAttribute(g1_id, "a2", "bye-bye") + db.flush() + + g21 = db.createGroup() + db.createHardLink(g2_id, "g2.1", g21) + db.flush() + + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + db.setDatasetValues(dset_111_id, sel, arr) + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() From 286f239c6761e5e265b8942f1ba1df46985a7425 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 26 Jun 2025 16:35:18 +0100 Subject: [PATCH 054/129] multi-update for hsds-writer --- src/h5json/dset_util.py | 14 ++ src/h5json/hdf5db.py | 4 +- src/h5json/hsdsstore/hsds_writer.py | 205 ++++++++++++++++++++++++---- src/h5json/selections.py | 2 +- test/unit/h5py_writer_test.py | 5 +- test/unit/hsds_reader_test.py | 119 ++++++++-------- test/unit/hsds_writer_test.py | 95 ++++++++++++- 7 files changed, 353 insertions(+), 91 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 5b10323f..496734d3 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -11,6 +11,7 @@ ############################################################################## import time +import numpy as np def resize_dataset(dset_json, shape): @@ -40,3 +41,16 @@ def resize_dataset(dset_json, shape): shape_json["dims"] = list(shape) dset_json["modified"] = time.time() + + +def getNumElements(dset_json): + shape_json = dset_json["shape"] + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + num_elements = 0 + elif shape_class == "H5S_SCALAR": + num_elements = 1 + elif shape_class == "H5S_SIMPLE": + dims = shape_json["dims"] + num_elements = int(np.prod(dims)) + return num_elements 
diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 581399f6..8d88d6ec 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -731,7 +731,9 @@ def createDataset( if self.closed: raise ValueError("db is closed") type_json = getTypeItem(dtype) - if shape == "H5S_NULL": + if shape is None: + raise ValueError("shape not set") + elif shape == "H5S_NULL": shape_json = {"class": "H5S_NULL"} elif shape == (): shape_json = {"class": "H5S_SCALAR"} diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index c4a7c397..7b022c34 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -14,8 +14,9 @@ from ..objid import getCollectionForId, getUuidFromId -from ..hdf5dtype import createDataType -from ..array_util import jsonToArray, bytesToArray +from ..hdf5dtype import createDataType, isVlen +from ..array_util import jsonToArray, bytesToArray, arrayToBytes, bytesArrayToList +from ..dset_util import getNumElements from .. 
import selections from ..h5writer import H5Writer from .httpconn import HttpConn @@ -121,6 +122,7 @@ def open(self): http_conn = self._http_conn else: kwargs = self._http_kwargs + kwargs["retries"] = 1 # tbd: test setting http_conn = HttpConn(self.filepath, **kwargs) if self._append: http_conn._mode = "a" @@ -220,10 +222,36 @@ def open(self): def http_conn(self): return self._http_conn + def getDatasetSize(self, dset_id): + """ Return the size of the given dataset """ + + dset_json = self.db.getObjectById(dset_id) + num_elements = getNumElements(dset_json) + dtype = self.db.getDtype(dset_json) + if isVlen(dtype): + item_size = 1024 # random guess at size of variable length types + else: + item_size = dtype.itemsize + return num_elements * item_size + def createObjects(self, obj_ids): - MAX_OBJECTS_PER_REQUEST = 1 + """ create the objects referenced in obj_ids """ + + MAX_INIT_SIZE = 4096 # max size to include init values in dataset creation + + def multiPost(items): + self.log.debug(f"hsds_writer> POST request {collection} for {len(items)} objects") + post_rsp = self.http_conn.POST("/" + collection, items) + self.log.debug(f"hsds_writer> POST post_rsp.status_code: {post_rsp.status_code}") + if post_rsp.is_json: + self.log.debug(f"hsds_writer> post_rsp.json: {post_rsp.json()}") + items.clear() + + self.log.debug(f"hsds_writer> createObjects, {len(obj_ids)} objects") + MAX_OBJECTS_PER_REQUEST = 3 collections = ("groups", "datasets", "datatypes") col_items = {} + dset_value_update_ids = set() for collection in collections: col_items[collection] = [] @@ -233,48 +261,169 @@ def createObjects(self, obj_ids): collection = getCollectionForId(obj_id) obj_json = self.db.getObjectById(obj_id) item = {"id": obj_id} - for key in ("links", "attributes"): - if key in obj_json: + self.log.debug(f"create id: {obj_id}") + for key in obj_json: # ("links", "attributes"): + if key == "updates": + # not part of the obj json + continue + if key == "shape": + # just send the dims, not the 
shape json + shape_json = obj_json["shape"] + if shape_json["class"] == "H5S_SIMPLE": + dims = shape_json["dims"] + item[key] = dims + else: + # just copy the key value directly item[key] = obj_json[key] + + # initialize dataset values if provided and not too large + if "updates" in obj_json: + updates = obj_json["updates"] + if updates and len(updates) == 1 and self.getDatasetSize(obj_id) < MAX_INIT_SIZE: + sel, arr = updates[0] + if sel.select_type == selections.H5S_SELECT_ALL: + value = bytesArrayToList(arr) + item["value"] = value + updates.clear() # reset the update list + if updates: + dset_value_update_ids.add(obj_id) # will set dataset value below + + # add to the list of new items for the given collection items = col_items[collection] items.append(item) + if len(items) == MAX_OBJECTS_PER_REQUEST: - print("items:", items) - post_rsp = self.http_conn.POST("/" + collection, items) - print("post_rsp.status_code:", post_rsp.status_code) - if post_rsp.is_json: - print("post_rsp.json:", post_rsp.json()) - items.clear() + multiPost(items) # handle any remainder items for collection in collections: items = col_items[collection] if items: - self.http_conn.POST("/" + collection, items) + multiPost(items) + + # write any initial dataset values + if dset_value_update_ids: + self.updateValues(dset_value_update_ids) def updateLinks(self, grp_ids): """ update any modified links of the given objects """ - print("updateLinks:", grp_ids) - body = {} # body will hold a map of grp ids to link lists + self.log.debug("hsds_writer> updateLinks") + items = {} # dict which will hold a map of grp ids to links to create + count = 0 for grp_id in grp_ids: if getCollectionForId(grp_id) != "groups": continue # ignore datasets and datatypes grp_json = self.db.getObjectById(grp_id) grp_links = grp_json["links"] - print(f"grp_id {grp_id} links: {grp_links}") - for link_json in grp_links: + for link_title in grp_links: + link_json = grp_links[link_title] if "created" not in link_json: 
self.log.error(f"hsds_writer> expected created timestamp in link: {link_json}") created = link_json["created"] if created > self._last_flush_time: + self.log.debug(f"hsds_writer> {grp_id}: new link: {link_title}") + count += 1 # new link, add to our list - if grp_id not in body: - body[grp_id] = {} + if grp_id not in items: + items[grp_id] = {"links": {}} + links = items[grp_id]["links"] + link_class = link_json["class"] + new_link = {"class": link_class} + # convert to hsds representation + if link_class == "H5L_TYPE_HARD": + new_link["id"] = link_json["id"] + elif link_class == "H5L_TYPE_SOFT": + new_link["h5path"] = link_json["h5path"] + elif link_class == "H5L_TYPE_EXTERNAL": + new_link["h5path"] = link_json["h5path"] + new_link["h5domain"] = link_json["file"] # use h5domain for file key + elif link_class == "H5L_TYPE_USER_DEFINED": + self.log.warning(f"ignoring user-defined link: {link_title}") + continue + else: + raise IOError(f"unexpected link class: {link_class}") + links[link_title] = new_link + self.log.debug(f"setting link {link_title} to {new_link}") + + if items: + body = {"grp_ids": items} + put_rsp = self.http_conn.PUT("/groups/" + self._root_id + "/links", body=body) + if put_rsp.status_code not in (200, 201): + self.log.error(f"failed to update links for request: {body}") + raise IOError("hsds_writer unable to update links") + else: + self.log.debug(f"hsds_writer> {grp_id} {count} links updated") - if body: - print("updateLinks, body:", body) + def updateAttributes(self, obj_ids): + """ update any modified links of the given objects """ + + self.log.debug("hsds_writer> updateAttributes") + items = {} # dict which will hold a map of objects ids to attributes to create + count = 0 + + for obj_id in obj_ids: + obj_json = self.db.getObjectById(obj_id) + obj_attrs = obj_json["attributes"] + for attr_name in obj_attrs: + attr_json = obj_attrs[attr_name] + if "created" not in attr_json: + self.log.error(f"hsds_writer> expected created timestamp in attr: 
{attr_json}") + created = attr_json["created"] + if created > self._last_flush_time: + self.log.debug(f"hsds_writer> {obj_id} attribute {attr_name} created") + count += 1 + # new attribute, add to our list + if obj_id not in items: + items[obj_id] = {"attributes": {}} + attrs = items[obj_id]["attributes"] + attrs[attr_name] = attr_json + + if items: + body = {"obj_ids": items} + req = f"/groups/{self._root_id}/attributes" + put_rsp = self.http_conn.PUT(req, body=body) + if put_rsp.status_code not in (200, 201): + self.log.error(f"hsds_writer> put {req} failed, status: {put_rsp.status_code}") + else: + self.log.debug(f"hsds_writer> {count} attributes updated") + + def updateValue(self, dset_id, sel, arr): + """ update the given dataset using selection and array """ + self.log.debug("hsds_writer> updateValue") + params = {} + data = arrayToBytes(arr) + self.log.debug(f"writing binary data, {len(data)} bytes") + + if sel.select_type != selections.H5S_SELECT_ALL: + select_param = sel.getQueryParam() + self.log.debug(f"got select query param: {select_param}") + params["select"] = select_param + + req = f"/datasets/{dset_id}/value" + rsp = self.http_conn.PUT(req, body=data, params=params, format="binary") + if rsp.status_code != 200: + self.log.error(f"PUT {req} returned error: {rsp.status_code}") + else: + self.log.debug(f"PUT {len(data)} bytes successful") + + def updateValues(self, dset_ids): + """ write any pending dataset values """ + + self.log.debug("hsds_writer> updateValues") + for dset_id in dset_ids: + if getCollectionForId(dset_id) != "datasets": + continue # ignore groups and datatypes + dset_json = self.db.getObjectById(dset_id) + if "updates" not in dset_json: + continue + updates = dset_json["updates"] + if updates: + self.log.debug(f"hsds_writer> {dset_id} update count: {len(updates)}") + for (sel, arr) in updates: + self.updateValue(dset_id, sel, arr) + updates.clear() def flush(self): """ Write dirty items """ @@ -286,27 +435,33 @@ def flush(self): 
self.log.debug(f" new object count: {len(self.db.new_objects)}") self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") - if self._init: - # initialize all existing objects - self.log.debug(f"flush -- init is true, self.db: {self.db.db}") + # initialize objects + self.log.debug(f"hsds_writer> flush -- init is True self.db: {self.db.db}") for obj_id in self.db: self.log.debug(f"init: {obj_id}") self.createObjects(self.db.db.keys()) self._init = False elif self.db.new_objects: + self.log.debug(f"hsds_writer> {len(self.db.new_objects)} objects to create") for obj_id in self.db.new_objects: - self.log.debug(f"new obj id: {obj_id}") + self.log.debug(f"hsds_writer> new obj id: {obj_id}") self.createObjects(self.db.new_objects) + else: + self.log.debug("no new objects to persist") for obj_id in self.db.dirty_objects: - self.log.debug(f"dirty object id: {obj_id}") + self.log.debug(f"hsds_writer> dirty object id: {obj_id}") self.updateLinks(self.db.dirty_objects) + self.updateAttributes(self.db.dirty_objects) + self.updateValues(self.db.dirty_objects) for obj_id in self.db.deleted_objects: self.log.debug(f"deleted object: {obj_id}") + self._init = False self._last_flush_time = time.time() + self.log.debug("hsds_writer> flush successful") return True # all objects written successfully def close(self): diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 3a94b094..1a051383 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -38,7 +38,7 @@ def select(obj, args): to __getitem__. The arguments should be the following: obj - Datatset object + Dataset object args Either a single argument or a tuple of arguments. 
See below for diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 3ff91bee..e51c4dba 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -72,8 +72,8 @@ def testSimple(self): g1 = f["g1"] self.assertTrue("a1" in g1.attrs) self.assertEqual(len(g1), 0) - db.open() + g2_id = db.createGroup() db.createHardLink(root_id, "g2", g2_id) @@ -96,8 +96,11 @@ def testSimple(self): with h5py.File(filepath) as f: self.assertTrue("attr1", f.attrs) self.assertTrue("attr2", f.attrs) + self.assertEqual(len(f), 2) self.assertTrue("g1" in f) + self.assertTrue("g2" in f) g1 = f["g1"] + self.assertEqual(len(g1), 1) self.assertTrue("a1" in g1.attrs) self.assertTrue("g1.1" in g1) g11 = g1["g1.1"] diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py index 72cf6017..d0501b9f 100644 --- a/test/unit/hsds_reader_test.py +++ b/test/unit/hsds_reader_test.py @@ -39,68 +39,71 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "/home/test_user1/test/tall.h5" kwargs = {"app_logger": self.log} - with Hdf5db(**kwargs) as db: - hsds_reader = HSDSReader(filepath, **kwargs) - db.reader = hsds_reader - root_id = db.getObjectIdByPath("/") - root_json = db.getObjectById(root_id) + db = Hdf5db(**kwargs) + hsds_reader = HSDSReader(filepath, **kwargs) + db.reader = hsds_reader + root_id = db.open() + root_json = db.getObjectById(root_id) + self.assertTrue("id" in root_json) + """ + TBD + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = 
db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) - root_attrs = root_json["attributes"] - self.assertEqual(len(root_attrs), 2) - self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) - root_links = root_json["links"] - self.assertEqual(len(root_links), 2) - self.assertEqual(list(root_links.keys()), ["g1", "g2"]) - g1_link = root_links["g1"] - self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") - g1_id = g1_link["id"] - self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") - dset_json = db.getObjectById(dset111_id) - dset_type = dset_json["type"] - self.assertEqual(dset_type["class"], "H5T_INTEGER") - self.assertEqual(dset_type["base"], "H5T_STD_I32BE") - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 2) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) - dset_shape = dset_json["shape"] - self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10, 10]) + # got the 5th row of the dataset + sel_row = selections.select((10, 10), (5, slice(0, 10))) + row = db.getDatasetValues(dset111_id, sel_row) + self.assertTrue(isinstance(row, np.ndarray)) + self.assertEqual(row.shape, (10,)) + for i in range(10): + v = row[i] + self.assertEqual(v, i * 5) - # got the 5th row of the dataset - sel_row = selections.select((10, 10), (5, slice(0, 10))) - row = db.getDatasetValues(dset111_id, sel_row) - self.assertTrue(isinstance(row, np.ndarray)) - self.assertEqual(row.shape, (10,)) - for i in range(10): - v = row[i] - self.assertEqual(v, i * 5) + sel_all = 
selections.select((10, 10), ...) + arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) - sel_all = selections.select((10, 10), ...) - arr = db.getDatasetValues(dset111_id, sel_all) - self.assertTrue(isinstance(arr, np.ndarray)) - self.assertEqual(arr.shape, (10, 10)) - for i in range(10): - for j in range(10): - v = arr[i, j] - self.assertEqual(v, i * j) + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + """ - # try adding an attribute - db.createAttribute(dset111_id, "attr3", value=42) - dset_json = db.getObjectById(dset111_id) - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 3) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) - attr3_json = dset_attrs["attr3"] - attr3_shape = attr3_json["shape"] - self.assertEqual(attr3_shape["class"], "H5S_SCALAR") - attr3_type = attr3_json["type"] - self.assertEqual(attr3_type["class"], "H5T_INTEGER") - self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") - attr3_value = attr3_json["value"] - self.assertEqual(attr3_value, 42) - - db.close() + db.close() if __name__ == "__main__": diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index a3ba9bea..667a8bcd 100644 --- a/test/unit/hsds_writer_test.py +++ 
b/test/unit/hsds_writer_test.py @@ -10,11 +10,12 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import unittest -import time import logging -import h5py +import requests +import os import numpy as np from h5json import Hdf5db +from h5json.hsdsstore.httpconn import HttpConn from h5json.hsdsstore.hsds_writer import HSDSWriter from h5json.hdf5dtype import special_dtype, Reference from h5json import selections @@ -24,6 +25,7 @@ class HSDSWriterTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(HSDSWriterTest, self).__init__(*args, **kwargs) # main + self.session = requests.Session() # create logger logfname = "hsds_writer_test.log" @@ -34,19 +36,42 @@ def __init__(self, *args, **kwargs): def testSimple(self): - filepath = "/home/test_user1/writer_test.h5" + domain_path = "/home/test_user1/writer_test.h5" + db = Hdf5db(app_logger=self.log) - db.writer = HSDSWriter(filepath) + db.writer = HSDSWriter(domain_path) root_id = db.open() - print("root_id:", root_id) + http_conn = HttpConn(domain_path, mode='r', retries=1) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() db.createHardLink(root_id, "g1", g1_id) db.createAttribute(g1_id, "a1", "hello") g2_id = db.createGroup() db.createHardLink(root_id, "g2", g2_id) + # validate - get the root group and check counts + http_rsp = http_conn.GET(f"/groups/{root_id}") + self.assertEqual(http_rsp.status_code, 200) + root_json = http_rsp.json() + # attribute count should still be zero (hasn't been flushed yet) + self.assertEqual(root_json["attributeCount"], 0) + # same for link count + self.assertEqual(root_json["linkCount"], 0) + + db.flush() + + # validate - get the root group again and see if counts are updated + http_rsp = http_conn.GET(f"/groups/{root_id}") + self.assertEqual(http_rsp.status_code, 200) + root_json = http_rsp.json() + # attribute count should 
still be zero (hasn't been flushed yet) + self.assertEqual(root_json["attributeCount"], 2) + # same for link count + self.assertEqual(root_json["linkCount"], 2) + g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) @@ -56,12 +81,35 @@ def testSimple(self): arr[i, j] = i * j sel_all = selections.select((10, 10), ...) db.setDatasetValues(dset_111_id, sel_all, arr) + db.flush() + + # validate - get the dataset and check values + http_rsp = http_conn.GET(f"/datasets/{dset_111_id}/value") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + self.assertTrue("value" in rsp_json) + rsp_value = rsp_json["value"] + self.assertEqual(len(rsp_value), 10) + for i in range(10): + row = rsp_value[i] + self.assertEqual(len(row), 10) + for j in range(10): + self.assertEqual(row[j], i * j) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) db.createSoftLink(g2_id, "slink", "somewhere") db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) db.flush() + # validate - check that links got updated + http_rsp = http_conn.GET(f"/groups/{g2_id}/links") + self.assertEqual(http_rsp.status_code, 200) + g2links_json = http_rsp.json() + self.assertTrue("links" in g2links_json) + g2links = g2links_json["links"] + self.assertTrue(len(g2links), 2) # custom link will be ignored + db.createAttribute(g1_id, "a2", "bye-bye") db.flush() @@ -69,10 +117,47 @@ def testSimple(self): db.createHardLink(g2_id, "g2.1", g21) db.flush() + # update one element of the dataset sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) arr = np.zeros((), dtype=np.int32) arr[()] = 42 db.setDatasetValues(dset_111_id, sel, arr) + db.flush() + + # validate - check that just the one element is modified + http_rsp = http_conn.GET(f"/datasets/{dset_111_id}/value") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + 
self.assertTrue("value" in rsp_json) + rsp_value = rsp_json["value"] + self.assertEqual(len(rsp_value), 10) + for i in range(10): + row = rsp_value[i] + self.assertEqual(len(row), 10) + for j in range(10): + if i == 4 and j == 4: + expected = 42 + else: + expected = i * j + self.assertEqual(row[j], expected) + + # create a scalar dataset + dset_112_id = db.createDataset(shape=(), dtype=np.int32) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + sel_all = selections.select((), ...) + db.setDatasetValues(dset_112_id, sel_all, arr) + db.createHardLink(g1_id, "dset1.1.2", dset_112_id) + db.flush() + + # validate - get the scalar dataset value + http_rsp = http_conn.GET(f"/datasets/{dset_112_id}/value") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + self.assertTrue("value" in rsp_json) + rsp_value = rsp_json["value"] + self.assertEqual(rsp_value, 42) + db.close() From 09c017aac74feff17170bc455e6bfa996c0962ca Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 26 Jun 2025 17:56:26 +0100 Subject: [PATCH 055/129] reorg executables to apps dir --- pyproject.toml | 10 ++++------ src/h5json/{h5tojson => apps}/__init__.py | 0 src/h5json/{h5tojson => apps}/h5tojson.py | 0 src/h5json/{jsontoh5 => apps}/jsontoh5.py | 0 src/h5json/{validator => apps}/validator.py | 0 5 files changed, 4 insertions(+), 6 deletions(-) rename src/h5json/{h5tojson => apps}/__init__.py (100%) rename src/h5json/{h5tojson => apps}/h5tojson.py (100%) rename src/h5json/{jsontoh5 => apps}/jsontoh5.py (100%) rename src/h5json/{validator => apps}/validator.py (100%) diff --git a/pyproject.toml b/pyproject.toml index 879e7ffb..d911700a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,9 @@ Social = "https://twitter.com/hdf5" Discussion = "https://forum.hdfgroup.org" [project.scripts] -h5tojson = "h5json.h5tojson.h5tojson:main" -jsontoh5 = "h5json.jsontoh5.jsontoh5:main" -h5jvalidate = "h5json.validator.validator:main" +h5tojson = "h5json.apps.h5tojson:main" 
+jsontoh5 = "h5json.apps.jsontoh5:main" +h5jvalidate = "h5json.apps.validator:main" [project.optional-dependencies] dev = ["check-manifest"] @@ -54,10 +54,8 @@ packages = [ "h5json.jsonstore", "h5json.h5pystore", "h5json.hsdsstore", - "h5json.h5tojson", - "h5json.jsontoh5", "h5json.schema", - "h5json.validator", + "h5json.apps", ] package-data = { "h5json.schema" = ["*.schema.json"] } platforms = ["any"] diff --git a/src/h5json/h5tojson/__init__.py b/src/h5json/apps/__init__.py similarity index 100% rename from src/h5json/h5tojson/__init__.py rename to src/h5json/apps/__init__.py diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/apps/h5tojson.py similarity index 100% rename from src/h5json/h5tojson/h5tojson.py rename to src/h5json/apps/h5tojson.py diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/apps/jsontoh5.py similarity index 100% rename from src/h5json/jsontoh5/jsontoh5.py rename to src/h5json/apps/jsontoh5.py diff --git a/src/h5json/validator/validator.py b/src/h5json/apps/validator.py similarity index 100% rename from src/h5json/validator/validator.py rename to src/h5json/apps/validator.py From 9773e2c1284c5461cdfa4c19a9f80310380398a3 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 1 Jul 2025 15:19:48 +0100 Subject: [PATCH 056/129] added h5tohs util --- src/h5json/apps/h5tohs.py | 63 +++ src/h5json/h5pystore/h5py_reader.py | 62 ++- src/h5json/h5pystore/h5py_writer.py | 1 + src/h5json/hdf5db.py | 49 +- src/h5json/hsdsstore/hsds_writer.py | 81 ++- src/h5json/jsonstore/h5json_writer.py | 10 +- test/integ/h5tojson_test.py | 4 +- test/integ/jsontoh5_test.py | 4 +- test/unit/h5json_reader_test.py | 98 ++-- test/unit/h5json_writer_test.py | 483 ++++++++--------- test/unit/h5py_reader_test.py | 95 ++-- test/unit/hdf5db_test.py | 733 ++++++++++++++------------ test/unit/hsds_writer_test.py | 50 +- 13 files changed, 1006 insertions(+), 727 deletions(-) create mode 100755 src/h5json/apps/h5tohs.py diff --git a/src/h5json/apps/h5tohs.py 
b/src/h5json/apps/h5tohs.py new file mode 100755 index 00000000..4d1a8106 --- /dev/null +++ b/src/h5json/apps/h5tohs.py @@ -0,0 +1,63 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import sys +import os.path as op +import logging + +from h5json import Hdf5db +from h5json.hsdsstore.hsds_writer import HSDSWriter +from h5json.h5pystore.h5py_reader import H5pyReader + +def usage(): + print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + sys.exit(0) + +def main(): + no_data = False + filename = None + domain = None + for i in range(1, len(sys.argv)): + if sys.argv[i] in ("-h", "--help"): + usage() + elif sys.argv[i] == "--nodata": + no_data = True + elif filename is None: + filename = sys.argv[i] + elif domain is None: + domain = sys.argv[i] + else: + usage() + + if domain is None: + usage() + + # create logger + logfname = "h5tohs.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + log = logging.getLogger() + + # check that the input file exists + if not op.isfile(filename): + sys.exit(f"Cannot find file: {filename}") + + log.info(f"h5tohs {filename}") + + db = Hdf5db(app_logger=log) + db.writer = HSDSWriter(domain, no_data=no_data, app_logger=log) + db.reader = H5pyReader(filename, app_logger=log) + db.open() # read HDF5 data into db + + db.close() # close will trigger write to HSDS + +if __name__ == 
"__main__": + main() diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index bc4b5820..089f0f24 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -12,6 +12,7 @@ import h5py import numpy as np import logging +import time from ..objid import createObjId, getCollectionForId from ..hdf5dtype import getTypeItem, isOpaqueDtype @@ -126,6 +127,7 @@ def _copy_array(self, src_arr, fin=None): tgt_arr[...] = src_arr[...] return tgt_arr + """ def visit(self, path, obj): name = obj.__class__.__name__ self.log.info(f"visit: {path} name: {name}") @@ -136,6 +138,7 @@ def visit(self, path, obj): addr = h5py.h5o.get_info(obj.id).addr self._addr_map[addr] = obj_id + """ def __init__( self, @@ -174,11 +177,15 @@ def open(self): self._id_map[self._root_id] = f addr = h5py.h5o.get_info(f.id).addr self._addr_map[addr] = self._root_id - f.visititems(self.visit) + #f.visititems(self.visit) + + print("h5py_reader keys:", list(self.db.db.keys())) return self._root_id def close(self): + # close h5py handles in map dict + self._id_map = {} if self._f: self._f.close() self._f = None @@ -261,7 +268,8 @@ def getAttribute(self, obj_id, name, include_data=True): else: pass # no data - # timestamps will be added by getAttributeItem() + + item['created'] = time.time() # TBD: get attribute creation time from h5py? return item def getAttributes(self, obj_id, include_data=True): @@ -306,6 +314,8 @@ def _getLink(self, parent, link_name): item["id"] = None else: item["id"] = self._addr_map[addr] + + item['created'] = time.time() # TBD: get the link creation time from h5py? 
return item @@ -428,7 +438,8 @@ def _getDataset(self, dset): self.log.info(f"getDataset alias: [{dset.name}]") item = {"alias": dset.name} - + print("dset:", dset) + print("dset type:", type(dset)) typeid = dset.id.get_type() if h5py.h5t.TypeID.committed(typeid): type_uuid = None @@ -468,21 +479,60 @@ def _getDataset(self, dset): item["cpl"] = self._getHDF5DatasetCreationProperties(dset, type_item["class"]) return item + + def _getHardLinkIds(self, parent): + """ create any ids for hard links of the group """ + + self.log.debug(f"h5pyreader> _getHardlinkIds for {parent.name}") + for link_name in parent: + self.log.debug(f"h5py_reader> check link: {link_name}") + + try: + linkObj = parent.get(link_name, None, False, True) + linkClass = linkObj.__class__.__name__ + except TypeError: + # UDLink? Go on to the next link + continue + if linkClass != "HardLink": + self.log.debug(f"h5py_reader> ignoring {link_name} - type: {linkClass}") + else: + # get the linked object + obj = parent[link_name] + addr = h5py.h5o.get_info(obj.id).addr + if addr not in self._addr_map: + name = obj.__class__.__name__ + obj_id = createObjId(obj_type=name, root_id=self._root_id) # create uuid + self.log.debug(f"h5py_reader> creating obj_id: {obj_id} for obj: {obj.name}") + self._id_map[obj_id] = obj + self._addr_map[addr] = obj_id + else: + obj_id = self._addr_map[addr] + if obj_id not in self._id_map: + self.log.debug(f"h5py_reader> adding obj for {obj_id} to id_map") + self._id_map = obj + else: + self.log.debug("h5py_reader> obj {obj_id} already in id_map") def getObjectById(self, obj_id, include_attrs=True, include_links=True): """ return object with given id """ if obj_id not in self._id_map: raise KeyError(f"{obj_id} not found") h5obj = self._id_map[obj_id] + print("h5obj:", h5obj) + print("h5obj.name:", h5obj.name) + print("h5obj type:", type(h5obj)) if isinstance(h5obj, h5py.Group): + self._getHardLinkIds(h5obj) obj_json = self._getGroup(h5obj, include_links=include_links) elif 
isinstance(h5obj, h5py.Dataset): obj_json = self._getDataset(h5obj) elif isinstance(h5obj, h5py.Datatype): - obj_json = self._getDatatype(h5obj) + obj_json = self._getDataset(h5obj) else: - raise TypeError(f"unexpected object type: {type(h5obj)}") - + msg = f"unexpected object type: {type(h5obj)}" + self.log.error(msg) + raise TypeError(msg) + if include_attrs: attributes = self.getAttributes(obj_id) obj_json["attributes"] = attributes diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 14942c11..15d35bd4 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -388,6 +388,7 @@ def updateAttributes(self, obj_id, obj): continue self.createAttribute(obj, name, attr_json) + def flush(self): """ Write dirty items """ if self.closed: diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 8d88d6ec..28eef18d 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -109,7 +109,6 @@ def writer(self, value: H5Writer): self._writer.close() self._writer = value if self._writer: - self.log.debug("writer set_db") self._writer.set_db(self) @property @@ -161,11 +160,40 @@ def flush(self): return # nothing to do if not self.writer.flush(): # flush not successful, don't clear dirty set - return + self.log.error("writer flush failed") + raise IOError("writer flush failed") - # reset new and dirty sets + # reset new, dirty and deleted sets self._new_objects = set() self._dirty_objects = set() + self._deleted_objects = set() + + def readAll(self): + """ read all meta data objects from reader and save to db """ + + self.log.debug("readAll") + if self.closed: + raise IOError("database is not open") + + if not self.reader: + self.log.debug("no reader set") + # no reader, nothing to do + return + + obj_ids = set() + obj_ids.add(self.root_id) + while obj_ids: + obj_id = obj_ids.pop() + self.log.debug(f"readAll, get {obj_id}") + obj_json = self.getObjectById(obj_id) # will add obj_id to db if not 
already present + if getCollectionForId(obj_id) == "groups": + # add any hard links to the set + links = obj_json["links"] + for title in links: + link_json = links[title] + if "id" in link_json: + link_id = link_json["id"] + obj_ids.add(link_id) def open(self): """ open reader and writer if set """ @@ -196,10 +224,16 @@ def open(self): writer_root_id = self.writer.open() if writer_root_id != self._root_id: # TBD: same as above, need to deal with inconsistent root ids - self.log.warning("writer root_id does not match reader root_id") + msg = "writer root_id does not match reader root_id" + self.log.error(msg) + raise IOError(msg) + else: + self.log.debug('writer and reader root ids match!') else: # no root id set by writer or reader, initialize now - self._root_id = createObjId(obj_type="groups") + root_id = createObjId(obj_type="groups") + self.log.debug(f"no reader or writer, creating new root id: {root_id}") + self._root_id = root_id if self.writer: # open writer in create mode now that we have a root id self.writer.open() @@ -215,6 +249,7 @@ def open(self): def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") + self.flush() if self.writer: self.writer.close() @@ -237,6 +272,7 @@ def __exit__(self, type, value, traceback): def getObjectById(self, obj_id): """ return object with given id """ + self.log.debug(f"getObjectById {obj_id}") if obj_id not in self.db: if self.reader: # load the obj from the reader @@ -252,9 +288,6 @@ def getObjectIdByPath(self, h5path, parent_id=None): """ Return id for the given link path starting from parent_id if set, otherwise the root_id """ - if self.closed: - self.open() # initiate db - if h5path == "/": return self.root_id # just return root id diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 7b022c34..f56a5e34 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -12,10 +12,10 @@ import logging import time 
-from ..objid import getCollectionForId, getUuidFromId +from ..objid import getCollectionForId -from ..hdf5dtype import createDataType, isVlen -from ..array_util import jsonToArray, bytesToArray, arrayToBytes, bytesArrayToList +from ..hdf5dtype import isVlen +from ..array_util import arrayToBytes, bytesArrayToList from ..dset_util import getNumElements from .. import selections from ..h5writer import H5Writer @@ -117,6 +117,9 @@ def __init__( def open(self): """ setup domain for writing """ + if not self._db_ref: + # no db set yet + raise IOError("DB not set") if self._http_conn: http_conn = self._http_conn @@ -241,10 +244,10 @@ def createObjects(self, obj_ids): def multiPost(items): self.log.debug(f"hsds_writer> POST request {collection} for {len(items)} objects") + for item in items: + self.log.debug(f"hsds_writer> POST item: {item}") post_rsp = self.http_conn.POST("/" + collection, items) self.log.debug(f"hsds_writer> POST post_rsp.status_code: {post_rsp.status_code}") - if post_rsp.is_json: - self.log.debug(f"hsds_writer> post_rsp.json: {post_rsp.json()}") items.clear() self.log.debug(f"hsds_writer> createObjects, {len(obj_ids)} objects") @@ -266,6 +269,12 @@ def multiPost(items): if key == "updates": # not part of the obj json continue + if key == "attributes": + # will update attribute later + continue + if key == "links": + # links will also be updated later + continue if key == "shape": # just send the dims, not the shape json shape_json = obj_json["shape"] @@ -305,6 +314,17 @@ def multiPost(items): if dset_value_update_ids: self.updateValues(dset_value_update_ids) + def deleteObjects(self, obj_ids): + """ remove the given obj ids from the HSDS store """ + + # no multi-delete operation yet, so delete one by one + for obj_id in obj_ids: + collection = getCollectionForId(obj_id) + req = f"/{collection}/{obj_id}" + http_rsp = self.http_conn.DELETE(req) + if http_rsp.status_code not in (200, 410): + self.log.error(f"got {http_rsp.status_code} for DELETE {req}") 
+ def updateLinks(self, grp_ids): """ update any modified links of the given objects """ @@ -425,44 +445,59 @@ def updateValues(self, dset_ids): self.updateValue(dset_id, sel, arr) updates.clear() + def flush(self): """ Write dirty items """ - - if not self.db: + if self.closed: # no db set yet - return False + self.log.warning("hsds_writer> flush called but no db") + return IOError("writer is closed") + if not self._http_conn: + self.log.warning("hsds_writer no http connection") + raise IOError("no http connection") + self.log.info("hsds_writer.flush()") self.log.debug(f" new object count: {len(self.db.new_objects)}") self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") + root_id = self._root_id + dirty_ids = self.db.dirty_objects.copy() if self._init: # initialize objects - self.log.debug(f"hsds_writer> flush -- init is True self.db: {self.db.db}") - for obj_id in self.db: - self.log.debug(f"init: {obj_id}") - self.createObjects(self.db.db.keys()) + self.log.debug(f"hsds_writer> flush -- init is True self.db: {len(self.db.db)} objects") + self.db.readAll() + self.log.debug(f"hsds_writer>flush, init after readAll, {len(self.db.db)} objects") + obj_ids = set(self.db.db.keys()) + obj_ids.remove(root_id) # root group created when domain was + self.log.debug(f"init createObjects: {obj_ids}") + self.createObjects(obj_ids) + dirty_ids.update(obj_ids) + dirty_ids.add(root_id) # add back root for attribute and link creation self._init = False elif self.db.new_objects: self.log.debug(f"hsds_writer> {len(self.db.new_objects)} objects to create") for obj_id in self.db.new_objects: self.log.debug(f"hsds_writer> new obj id: {obj_id}") self.createObjects(self.db.new_objects) + dirty_ids.update(self.db.new_objects) else: self.log.debug("no new objects to persist") - for obj_id in self.db.dirty_objects: - self.log.debug(f"hsds_writer> dirty object id: {obj_id}") - 
self.updateLinks(self.db.dirty_objects) - self.updateAttributes(self.db.dirty_objects) - self.updateValues(self.db.dirty_objects) - - for obj_id in self.db.deleted_objects: - self.log.debug(f"deleted object: {obj_id}") - - self._init = False + if dirty_ids: + self.log.debug(f"hsds_writer> dirty ids: {dirty_ids}") + self.updateLinks(dirty_ids) + self.updateAttributes(dirty_ids) + if not self._no_data: + self.updateValues(dirty_ids) + + if self.db.deleted_objects: + self.log.debug(f"deleted ids: {self.db.deleted_objects}") + self.deleteObjects(self.db.deleted_objects) + self._last_flush_time = time.time() self.log.debug("hsds_writer> flush successful") - return True # all objects written successfully + # all objects written successfully + return True def close(self): # over-ride of H5Writer method diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index 92d3499a..8cb5a39c 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -40,15 +40,15 @@ def __init__( def flush(self): """ Write dirty items """ - # json writer doesn't support incremental updates, so we'll wait - # for close to write out database + if not self._root_id: msg = "flush called prior to open" self.log.warning(msg) raise IOError(msg) self.log.info("flush") - return False + self.dumpFile() + return True def open(self): """ file open """ @@ -61,7 +61,8 @@ def open(self): def close(self): """ close storage handle """ - self.dumpFile() + self.flush() + self._root_id = None def isClosed(self): """ return closed status """ @@ -277,6 +278,7 @@ def dumpFile(self): self.json["apiVersion"] = db_version_info["hdf5-json-version"] self.json["root"] = getUuidFromId(self._root_uuid) + self.updateAliasList() # create alias_db with obj_id to alias list dict self.dumpGroups() diff --git a/test/integ/h5tojson_test.py b/test/integ/h5tojson_test.py index 5be40c84..8519a5d4 100644 --- a/test/integ/h5tojson_test.py +++ 
b/test/integ/h5tojson_test.py @@ -119,13 +119,13 @@ out_file = os.path.join(out_dir, split_ext[0] + ".json") if not os.path.exists(file_path): sys.exit("file: " + file_path + " not found") - cmd = "python ../../src/h5json/h5tojson/h5tojson.py " + file_path + " >" + out_file + cmd = "python ../../src/h5json/apps/h5tojson.py " + file_path + " >" + out_file print("cmd:", cmd) rc = os.system(cmd) if rc != 0: sys.exit("h5tojson failed converting: " + test_file) - cmd = "python ../../src/h5json/validator/validator.py " + out_file + cmd = "python ../../src/h5json/apps/validator.py " + out_file print("cmd:", cmd) if rc != 0: sys.exit("HDF5/JSON validation failed for: " + out_file) diff --git a/test/integ/jsontoh5_test.py b/test/integ/jsontoh5_test.py index 3be3a3b7..ee0325d5 100644 --- a/test/integ/jsontoh5_test.py +++ b/test/integ/jsontoh5_test.py @@ -119,7 +119,7 @@ hdf5_version_tuple[1] == 8 and hdf5_version_tuple[2] > 14 ): # add in additional test files - print("adding library version dependendent files") + print("adding library version dependent files") test_files = list(test_files) for filename in test_files_latest: test_files.append(filename) @@ -131,7 +131,7 @@ out_file = os.path.join(out_dir, split_ext[0] + ".h5") if not os.path.exists(file_path): sys.exit("file: " + file_path + " not found") - cmd = "python ../../src/h5json/jsontoh5/jsontoh5.py " + file_path + " " + out_file + cmd = "python ../../src/h5json/apps/jsontoh5.py " + file_path + " " + out_file print("cmd:", cmd) rc = os.system(cmd) if rc != 0: diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index f49a86a8..bca00f2c 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -38,59 +38,57 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "data/json/tall.json" - kwargs = {"app_logger": self.log} - with Hdf5db(**kwargs) as db: - h5_reader = H5JsonReader(filepath, **kwargs) - db.reader = h5_reader - root_id = 
db.getObjectIdByPath("/") - root_json = db.getObjectById(root_id) + db = Hdf5db(app_logger=self.log) + db.reader = H5JsonReader(filepath, app_logger=self.log) + root_id = db.open() + root_json = db.getObjectById(root_id) - root_attrs = root_json["attributes"] - self.assertEqual(len(root_attrs), 2) - self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) - root_links = root_json["links"] - self.assertEqual(len(root_links), 2) - self.assertEqual(list(root_links.keys()), ["g1", "g2"]) - g1_link = root_links["g1"] - self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") - g1_id = g1_link["id"] - self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") - dset_json = db.getObjectById(dset111_id) - dset_type = dset_json["type"] - self.assertEqual(dset_type["class"], "H5T_INTEGER") - self.assertEqual(dset_type["base"], "H5T_STD_I32BE") - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 2) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) - dset_shape = dset_json["shape"] - self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10, 10]) - sel_all = selections.select((10, 10), ...) 
- arr = db.getDatasetValues(dset111_id, sel_all) - self.assertTrue(isinstance(arr, np.ndarray)) - self.assertEqual(arr.shape, (10, 10)) - for i in range(10): - for j in range(10): - v = arr[i, j] - self.assertEqual(v, i * j) + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) + sel_all = selections.select((10, 10), ...) 
+ arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) - # try adding an attribute - db.createAttribute(dset111_id, "attr3", value=42) - dset_json = db.getObjectById(dset111_id) - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 3) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) - attr3_json = dset_attrs["attr3"] - attr3_shape = attr3_json["shape"] - self.assertEqual(attr3_shape["class"], "H5S_SCALAR") - attr3_type = attr3_json["type"] - self.assertEqual(attr3_type["class"], "H5T_INTEGER") - self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") - attr3_value = attr3_json["value"] - self.assertEqual(attr3_value, 42) + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) - db.close() + db.close() if __name__ == "__main__": diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index 0f1fb59a..e8b5eb91 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -46,297 +46,298 @@ def testSimple(self): filepath = "test/unit/out/h5json_writer_testSimple.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) - 
db.createAttribute(root_id, "attr2", 42) - g1_id = db.createGroup() - db.createHardLink(root_id, "g1", g1_id) - g2_id = db.createGroup() - db.createHardLink(root_id, "g2", g2_id) - - g1_1_id = db.createGroup() - db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) - arr = np.zeros((10, 10), dtype=np.int32) - for i in range(10): - for j in range(10): - arr[i, j] = i * j - sel_all = selections.select((10, 10), ...) - db.setDatasetValues(dset_111_id, sel_all, arr) - db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) - db.createSoftLink(g2_id, "slink", "somewhere") - db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") - db.createCustomLink(g2_id, "cust", {"foo": "bar"}) - db.flush() + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + self.assertEqual(db.getObjectIdByPath("/"), root_id) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) 
+ db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() def testNullSpaceAttribute(self): filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) - item = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in item) - shape_item = item["shape"] - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_NULL") - self.assertTrue(item["created"] > time.time() - 1.0) - value = db.getAttributeValue(root_id, "A1") - self.assertEqual(value, None) + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) def testScalarAttribute(self): filepath = "test/unit/out/h5json_writer_testScalarAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - dims = () - value = 42 - db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned - item_type = 
item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") - self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned - self.assertEqual(item["value"], 42) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") def testFixedStringAttribute(self): filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - value = "Hello, world!" 
- db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["length"], 13) - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - ret_value = db.getAttributeValue(root_id, "A1") - self.assertEqual(ret_value, b'Hello, world!') + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + value = "Hello, world!" + db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + ret_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(ret_value, b'Hello, world!') def testVlenAsciiAttribute(self): filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - - value = b"Hello, world!" 
- dt = special_dtype(vlen=bytes) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + + value = b"Hello, world!" + dt = special_dtype(vlen=bytes) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) def testVlenUtf8Attribute(self): filepath = "test/unit/out/h5json_writer_testVlenutf8Attribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - - value = b"Hello, world!" 
- dt = special_dtype(vlen=str) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + + value = b"Hello, world!" + dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) def testIntAttribute(self): filepath = "test/unit/out/h5json_writer_testIntAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - value = [2, 3, 5, 7, 11] - db.createAttribute(root_id, "A1", value, dtype=np.int16) - item = db.getAttribute(root_id, "A1") - self.assertEqual(item["value"], [2, 3, 5, 7, 11]) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SIMPLE") - self.assertEqual(item_shape["dims"], [5,]) - 
item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + value = [2, 3, 5, 7, 11] + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") def testCreateReferenceAttribute(self): filepath = "test/unit/out/h5json_writer_testCreateReferenceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() - dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) - dt = special_dtype(ref=Reference) + dt = special_dtype(ref=Reference) - ds1_ref = "datasets/" + dset_id - value = [ds1_ref,] - db.createAttribute(root_id, "A1", value, dtype=dt) - item = db.getAttribute(root_id, "A1") - attr = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in attr) + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_REFERENCE") - self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") - attr_value = 
item["value"] - self.assertEqual(len(attr_value), 1) - self.assertEqual(attr_value[0], ds1_ref) + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = item["value"] + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref) def testCreateVlenReferenceAttribute(self): filepath = "test/unit/out/h5json_writer_testVlenReferenceAttribute.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) - grp_id = db.createGroup() - db.createHardLink(root_id, "G1", grp_id) - - dt_base = special_dtype(ref=Reference) - dt = special_dtype(vlen=dt_base) - - ds1_ref = "datasets/" + dset_id - grp_ref = "groups/" + grp_id - ref_arr = np.zeros((2,), dtype=dt_base) - ref_arr[0] = ds1_ref - ref_arr[1] = grp_ref - vlen_arr = np.zeros((), dtype=dt) - vlen_arr[()] = ref_arr - - db.createAttribute(root_id, "A1", vlen_arr) - item = db.getAttribute(root_id, "A1") - - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_VLEN") - self.assertEqual(item_type["size"], "H5T_VARIABLE") - base_type = item_type["base"] - self.assertEqual(base_type["class"], "H5T_REFERENCE") - self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") - - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SCALAR") + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + 
ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") def testCommittedType(self): filepath = "test/unit/out/h5json_writer_testCommittedType.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - dt = np.dtype("S15") + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + dt = np.dtype("S15") - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) - item_type = item["type"] + item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 15) + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) - # create an attribute using the committed type - db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - 
self.assertEqual(attr["value"], "hello world!") + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_STRING") - self.assertEqual(attr_type["length"], 15) - self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") def testCommittedCompoundType(self): filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.h5" - with Hdf5db(app_logger=self.log) as db: - db.writer = H5JsonWriter(filepath, app_logger=self.log) - root_id = db.getObjectIdByPath("/") - - dt_str = special_dtype(vlen=str) - fields = [] - fields.append(("field_1", np.dtype(">i8"))) - fields.append(("field_2", ">f8")) - fields.append(("field_3", np.dtype("S15"))) - fields.append(("field_4", dt_str)) - dt = np.dtype(fields) - - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - - item_type = item["type"] - - self.assertEqual(item_type["class"], "H5T_COMPOUND") - fields = item_type["fields"] - self.assertEqual(len(fields), 4) - - # create an attribute using the committed type - attr_value = (42, 3.14, "circle", "area = R^2 * PI") - db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - self.assertEqual(attr["value"], list(attr_value)) - attr_shape = attr["shape"] - self.assertEqual(attr_shape["class"], "H5S_SCALAR") - - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_COMPOUND") - - value = db.getAttributeValue(root_id, "A1") - self.assertTrue(isinstance(value, 
np.ndarray)) + db = Hdf5db(app_logger=self.log) + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.open() + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) if __name__ == "__main__": diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index 7c11c4f5..a3d946d9 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -12,6 +12,7 @@ import unittest import logging +import time from h5json import Hdf5db from h5json.h5pystore.h5py_reader import H5pyReader @@ -27,7 +28,7 @@ def __init__(self, *args, **kwargs): else: lhStdout = None - self.log.setLevel(logging.INFO) + self.log.setLevel(logging.DEBUG) handler = logging.FileHandler("./hdf5dbtest.log") # add handler to logger self.log.addHandler(handler) @@ -37,50 +38,58 @@ def __init__(self, *args, **kwargs): def testSimple(self): filepath = "data/hdf5/tall.h5" - kwargs = {"app_logger": self.log} - 
with Hdf5db(h5_reader=H5pyReader(filepath, **kwargs), **kwargs) as db: - root_id = db.getObjectIdByPath("/") - print("got root_id:", root_id) - root_json = db.getObjectById(root_id) + db = Hdf5db(app_logger=self.log) + db.reader = H5pyReader(filepath, app_logger=self.log) + root_id = db.open() + print("got root_id:", root_id) + root_json = db.getObjectById(root_id) + print("got root_json:", root_json) + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + self.assertTrue("created" in g1_link) + g1_created = g1_link["created"] + now = time.time() + self.assertTrue(g1_created < now) + g1_id = g1_link["id"] + self.assertTrue(g1_id) + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + attr1_json = dset_attrs["attr1"] + for k in ("type", "shape", "value", "created"): + self.assertTrue(k in attr1_json) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) - root_attrs = root_json["attributes"] - self.assertEqual(len(root_attrs), 2) - self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) - root_links = root_json["links"] - self.assertEqual(len(root_links), 2) - self.assertEqual(list(root_links.keys()), ["g1", "g2"]) - g1_link = root_links["g1"] - self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") - 
g1_id = g1_link["id"] - self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") - dset_json = db.getObjectById(dset111_id) - dset_type = dset_json["type"] - self.assertEqual(dset_type["class"], "H5T_INTEGER") - self.assertEqual(dset_type["base"], "H5T_STD_I32BE") - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 2) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) - dset_shape = dset_json["shape"] - self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10, 10]) + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) - # try adding an attribute - db.createAttribute(dset111_id, "attr3", value=42) - dset_json = db.getObjectById(dset111_id) - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 3) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) - attr3_json = dset_attrs["attr3"] - attr3_shape = attr3_json["shape"] - self.assertEqual(attr3_shape["class"], "H5S_SCALAR") - attr3_type = attr3_json["type"] - self.assertEqual(attr3_type["class"], "H5T_INTEGER") - self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") - attr3_value = attr3_json["value"] - self.assertEqual(attr3_value, 42) - - db.close() + db.close() if __name__ == "__main__": diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index cbd7c879..3c1f3089 100755 --- 
a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -42,391 +42,432 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") - def testGroup(self): + def testOpen(self): + db = Hdf5db(app_logger=self.log) + root_id = db.open() + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isRootObjId(root_id)) + self.assertFalse(db.closed) + self.assertEqual(db.getObjectIdByPath("/"), root_id) + db.close() + #self.assertTrue(db.closed) + obj_id = db.open() + self.assertEqual(obj_id, root_id) + db.close() + + def testWith(self): with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - self.assertTrue(isSchema2Id(root_id)) + root_id = db.open() self.assertTrue(isRootObjId(root_id)) - g1_id = db.createGroup() - self.assertTrue(isSchema2Id(g1_id)) - self.assertFalse(isRootObjId(g1_id)) - self.assertTrue(isValidUuid(g1_id, obj_class="groups")) - db.createHardLink(root_id, "g1", g1_id) - - g2_id = db.createGroup() - self.assertTrue(isSchema2Id(g2_id)) - self.assertFalse(isRootObjId(g2_id)) - self.assertTrue(isValidUuid(g2_id, obj_class="groups")) - db.createHardLink(root_id, "g2", g2_id) - - g1_1_id = db.createGroup() - self.assertTrue(isSchema2Id(g1_1_id)) - self.assertFalse(isRootObjId(g1_1_id)) - self.assertTrue(isValidUuid(g1_1_id, obj_class="groups")) - db.createHardLink(g1_id, "g1.1", g1_1_id) - - self.assertEqual(db.getObjectIdByPath("g1"), g1_id) - self.assertEqual(db.getObjectIdByPath("/g1"), g1_id) - self.assertEqual(db.getObjectIdByPath("g1/"), g1_id) - - self.assertEqual(db.getObjectIdByPath("g1/g1.1"), g1_1_id) - self.assertEqual(db.getObjectIdByPath("/g1/g1.1"), g1_1_id) - self.assertEqual(db.getObjectIdByPath("g1/g1.1/"), g1_1_id) - - grp1_json = db.getObjectById(g1_id) - self.assertTrue("links" in grp1_json) - g1_links = grp1_json["links"] - self.assertTrue("g1.1" in g1_links) - g1_1_link = db.getLink(g1_id, "g1.1") - self.assertEqual(g1_1_link["class"], 
"H5L_TYPE_HARD") - self.assertEqual(g1_1_link["id"], g1_1_id) - self.assertTrue(g1_1_link["created"] > time.time() - 1.0) - - db.createSoftLink(g2_id, "slink", "somewhere") - soft_link = db.getLink(g2_id, "slink") - self.assertEqual(soft_link["class"], "H5L_TYPE_SOFT") - self.assertEqual(soft_link["h5path"], "somewhere") - self.assertTrue(soft_link["created"] > time.time() - 1.0) - - db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") - ext_link = db.getLink(g2_id, "extlink") - self.assertEqual(ext_link["class"], "H5L_TYPE_EXTERNAL") - self.assertEqual(ext_link["h5path"], "somewhere") - self.assertEqual(ext_link["file"], "someplace") - self.assertTrue(ext_link["created"] > time.time() - 1.0) - - db.createCustomLink(g2_id, "cust", {"foo": "bar"}) - cust_link = db.getLink(g2_id, "cust") - self.assertEqual(cust_link["class"], "H5L_TYPE_USER_DEFINED") - self.assertEqual(cust_link["foo"], "bar") - self.assertTrue(cust_link["created"] > time.time() - 1.0) - - links = db.getLinks(g2_id) - self.assertEqual(len(links), 3) - for title in "slink", "extlink", "cust": - self.assertTrue(title in links) - - db.deleteLink(g2_id, "cust") - links = db.getLinks(g2_id) - self.assertEqual(len(links), 2) - for title in "slink", "extlink": - self.assertTrue(title in links) - - try: - db.getObjectIdByPath("/g1/foo") - self.assertTrue(False) - except KeyError: - pass # expected - - ret = db.getLink(g2_id, "not_a_link") - self.assertTrue(ret is None) + def testGroup(self): + db = Hdf5db(app_logger=self.log) + root_id = db.open() + + g1_id = db.createGroup() + self.assertTrue(isSchema2Id(g1_id)) + self.assertFalse(isRootObjId(g1_id)) + self.assertTrue(isValidUuid(g1_id, obj_class="groups")) + db.createHardLink(root_id, "g1", g1_id) + + g2_id = db.createGroup() + self.assertTrue(isSchema2Id(g2_id)) + self.assertFalse(isRootObjId(g2_id)) + self.assertTrue(isValidUuid(g2_id, obj_class="groups")) + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + 
self.assertTrue(isSchema2Id(g1_1_id)) + self.assertFalse(isRootObjId(g1_1_id)) + self.assertTrue(isValidUuid(g1_1_id, obj_class="groups")) + db.createHardLink(g1_id, "g1.1", g1_1_id) + + self.assertEqual(db.getObjectIdByPath("g1"), g1_id) + self.assertEqual(db.getObjectIdByPath("/g1"), g1_id) + self.assertEqual(db.getObjectIdByPath("g1/"), g1_id) + + self.assertEqual(db.getObjectIdByPath("g1/g1.1"), g1_1_id) + self.assertEqual(db.getObjectIdByPath("/g1/g1.1"), g1_1_id) + self.assertEqual(db.getObjectIdByPath("g1/g1.1/"), g1_1_id) + + grp1_json = db.getObjectById(g1_id) + self.assertTrue("links" in grp1_json) + g1_links = grp1_json["links"] + self.assertTrue("g1.1" in g1_links) + g1_1_link = db.getLink(g1_id, "g1.1") + self.assertEqual(g1_1_link["class"], "H5L_TYPE_HARD") + self.assertEqual(g1_1_link["id"], g1_1_id) + self.assertTrue(g1_1_link["created"] > time.time() - 1.0) + + db.createSoftLink(g2_id, "slink", "somewhere") + soft_link = db.getLink(g2_id, "slink") + self.assertEqual(soft_link["class"], "H5L_TYPE_SOFT") + self.assertEqual(soft_link["h5path"], "somewhere") + self.assertTrue(soft_link["created"] > time.time() - 1.0) + + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + ext_link = db.getLink(g2_id, "extlink") + self.assertEqual(ext_link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(ext_link["h5path"], "somewhere") + self.assertEqual(ext_link["file"], "someplace") + self.assertTrue(ext_link["created"] > time.time() - 1.0) + + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + cust_link = db.getLink(g2_id, "cust") + self.assertEqual(cust_link["class"], "H5L_TYPE_USER_DEFINED") + self.assertEqual(cust_link["foo"], "bar") + self.assertTrue(cust_link["created"] > time.time() - 1.0) + + links = db.getLinks(g2_id) + self.assertEqual(len(links), 3) + for title in "slink", "extlink", "cust": + self.assertTrue(title in links) + + db.deleteLink(g2_id, "cust") + links = db.getLinks(g2_id) + self.assertEqual(len(links), 2) + for title in 
"slink", "extlink": + self.assertTrue(title in links) + + try: + db.getObjectIdByPath("/g1/foo") + self.assertTrue(False) + except KeyError: + pass # expected + + ret = db.getLink(g2_id, "not_a_link") + self.assertTrue(ret is None) + db.close() def testNullSpaceAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) - item = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in item) - shape_item = item["shape"] - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_NULL") - self.assertTrue(item["created"] > time.time() - 1.0) - value = db.getAttributeValue(root_id, "A1") - self.assertEqual(value, None) + db = Hdf5db(app_logger=self.log) + root_id = db.open() + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + db.close() def testScalarAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - dims = () - value = 42 - db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") - self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned - self.assertEqual(item["value"], 42) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - shape = item["shape"] 
- self.assertEqual(shape["class"], "H5S_SCALAR") - - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db.close() def testFixedStringAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - value = "Hello, world!" - db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["length"], 13) - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - ret_value = db.getAttributeValue(root_id, "A1") - self.assertEqual(ret_value, value.encode("ascii")) + db = Hdf5db(app_logger=self.log) + root_id = db.open() + value = "Hello, world!" 
+ db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + ret_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(ret_value, value.encode("ascii")) + db.close() def testVlenAsciiAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - - value = b"Hello, world!" - dt = special_dtype(vlen=bytes) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + root_id = db.open() + + value = b"Hello, world!" 
+ dt = special_dtype(vlen=bytes) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() def testVlenUtf8Attribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - - value = b"Hello, world!" - dt = special_dtype(vlen=str) - - # write the attribute - db.createAttribute(root_id, "A1", value, dtype=dt) - # read it back - item = db.getAttribute(root_id, "A1") - shape_json = item["shape"] - self.assertEqual(shape_json["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + db = Hdf5db(app_logger=self.log) + root_id = db.open() + + value = b"Hello, world!" 
+ dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() def testIntAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - value = [2, 3, 5, 7, 11] - db.createAttribute(root_id, "A1", value, dtype=np.int16) - item = db.getAttribute(root_id, "A1") - self.assertEqual(item["value"], [2, 3, 5, 7, 11]) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SIMPLE") - self.assertEqual(item_shape["dims"], [5,]) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_INTEGER") - self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db = Hdf5db(app_logger=self.log) + root_id = db.open() + value = [2, 3, 5, 7, 11] + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db.close() def testCreateReferenceAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") + db = Hdf5db(app_logger=self.log) + 
root_id = db.open() - dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) - dt = special_dtype(ref=Reference) + dt = special_dtype(ref=Reference) - ds1_ref = "datasets/" + dset_id - value = [ds1_ref,] - db.createAttribute(root_id, "A1", value, dtype=dt) - item = db.getAttribute(root_id, "A1") - attr = db.getAttribute(root_id, "A1") - self.assertTrue("shape" in attr) + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_REFERENCE") - self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") - attr_value = item["value"] - self.assertEqual(len(attr_value), 1) - self.assertEqual(attr_value[0], ds1_ref) + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = item["value"] + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref) + + db.close() def testCreateVlenReferenceAttribute(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape=(), dtype=np.int32) - db.createHardLink(root_id, "DS1", dset_id) - grp_id = db.createGroup() - db.createHardLink(root_id, "G1", grp_id) - - dt_base = special_dtype(ref=Reference) - dt = special_dtype(vlen=dt_base) - - ds1_ref = "datasets/" + dset_id - grp_ref = "groups/" + grp_id - ref_arr = np.zeros((2,), dtype=dt_base) - ref_arr[0] = ds1_ref - ref_arr[1] = grp_ref - vlen_arr = np.zeros((), dtype=dt) - vlen_arr[()] = ref_arr - - db.createAttribute(root_id, "A1", vlen_arr) - item = db.getAttribute(root_id, "A1") - - item_type = item["type"] - self.assertEqual(item_type["class"], 
"H5T_VLEN") - self.assertEqual(item_type["size"], "H5T_VARIABLE") - base_type = item_type["base"] - self.assertEqual(base_type["class"], "H5T_REFERENCE") - self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") - - item_shape = item["shape"] - self.assertEqual(item_shape["class"], "H5S_SCALAR") + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + db.close() def testCommittedType(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - dt = np.dtype("S15") + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) + item_type = item["type"] - item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + 
self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 15) + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") - # create an attribute using the committed type - db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - self.assertEqual(attr["value"], "hello world!") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_STRING") - self.assertEqual(attr_type["length"], 15) - self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + db.close() def testCommittedCompoundType(self): - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - - dt_str = special_dtype(vlen=str) - fields = [] - fields.append(("field_1", np.dtype(">i8"))) - fields.append(("field_2", ">f8")) - fields.append(("field_3", np.dtype("S15"))) - fields.append(("field_4", dt_str)) - dt = np.dtype(fields) - - ctype_id = db.createCommittedType(dt) - db.createHardLink(root_id, "ctype", ctype_id) - item = db.getObjectById(ctype_id) - now = int(time.time()) - self.assertTrue(item["created"] > now - 1) - - item_type = item["type"] - - self.assertEqual(item_type["class"], "H5T_COMPOUND") - fields = item_type["fields"] - self.assertEqual(len(fields), 4) - - # create an attribute using the committed type - attr_value = (42, 3.14, "circle", "area = 
R^2 * PI") - db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") - attr = db.getAttribute(root_id, "A1") - self.assertEqual(attr["value"], list(attr_value)) - attr_shape = attr["shape"] - self.assertEqual(attr_shape["class"], "H5S_SCALAR") - - attr_type = attr["type"] - self.assertEqual(attr_type["class"], "H5T_COMPOUND") - - value = db.getAttributeValue(root_id, "A1") - self.assertTrue(isinstance(value, np.ndarray)) + db = Hdf5db(app_logger=self.log) + root_id = db.open() + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + + db.close() def testSimpleDataset(self): - with Hdf5db(app_logger=self.log) as db: - nrows = 8 - ncols = 10 - shape = (nrows, ncols) - dtype = np.int32 - root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape, dtype=dtype) - db.createHardLink(root_id, "dset", dset_id) - db.createAttribute(dset_id, "a1", "Hello, world") - sel_all = 
selections.select(shape, ...) - arr = db.getDatasetValues(dset_id, sel_all) - self.assertEqual(arr.dtype, dtype) - self.assertEqual(arr.shape, shape) - self.assertEqual(arr.min(), 0) - self.assertEqual(arr.max(), 0) - row = np.zeros((ncols,), dtype=dtype) - for i in range(nrows): - row[:] = list(range(i * 10, (i + 1) * 10)) - row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) - db.setDatasetValues(dset_id, row_sel, row) - arr = db.getDatasetValues(dset_id, sel_all) - for i in range(nrows): - row = np.array(list(range(i * 10, (i + 1) * 10)), dtype=dtype) - np.testing.assert_array_equal(arr[i, :], row) + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + self.assertEqual(arr.min(), 0) + self.assertEqual(arr.max(), 0) + row = np.zeros((ncols,), dtype=dtype) + for i in range(nrows): + row[:] = list(range(i * 10, (i + 1) * 10)) + row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) + db.setDatasetValues(dset_id, row_sel, row) + arr = db.getDatasetValues(dset_id, sel_all) + for i in range(nrows): + row = np.array(list(range(i * 10, (i + 1) * 10)), dtype=dtype) + np.testing.assert_array_equal(arr[i, :], row) + + db.close() def testScalarDataset(self): dtype = np.int32 - with Hdf5db(app_logger=self.log) as db: - root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset((), dtype=dtype) - db.createHardLink(root_id, "dset", dset_id) - db.createAttribute(dset_id, "a1", "Hello, world") - sel_all = selections.select((), ...) 
- arr = db.getDatasetValues(dset_id, sel_all) - self.assertEqual(arr.dtype, dtype) - self.assertEqual(arr.shape, ()) - self.assertEqual(arr[()], 0) - db.setDatasetValues(dset_id, sel_all, np.array(42, dtype=dtype)) - arr = db.getDatasetValues(dset_id, sel_all) - self.assertEqual(arr.dtype, dtype) - self.assertEqual(arr.shape, ()) - self.assertEqual(arr.min(), 42) - self.assertEqual(arr.max(), 42) + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset((), dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select((), ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr[()], 0) + db.setDatasetValues(dset_id, sel_all, np.array(42, dtype=dtype)) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr.min(), 42) + self.assertEqual(arr.max(), 42) + + db.close() def testResizableDataset(self): - with Hdf5db(app_logger=self.log) as db: - nrows = 8 - ncols = 10 - shape = (nrows, ncols) - dtype = np.int32 - maxdims = (None, ncols * 2) - root_id = db.getObjectIdByPath("/") - dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) - db.createHardLink(root_id, "dset", dset_id) - db.createAttribute(dset_id, "a1", "Hello, world") - - # resize limited dimension - db.resizeDataset(dset_id, (nrows, ncols * 2)) - - # try to go beyond max extent - try: - db.resizeDataset(dset_id, (nrows, ncols * 3)) - self.assertTrue(False) - except ValueError: - pass # expected - - # resize unlimited dimension - db.resizeDataset(dset_id, (nrows * 10, ncols)) + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + maxdims = (None, ncols * 2) + + db = Hdf5db(app_logger=self.log) + + root_id = db.open() + dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) + 
db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + + # resize limited dimension + db.resizeDataset(dset_id, (nrows, ncols * 2)) + + # try to go beyond max extent + try: + db.resizeDataset(dset_id, (nrows, ncols * 3)) + self.assertTrue(False) + except ValueError: + pass # expected + + # resize unlimited dimension + db.resizeDataset(dset_id, (nrows * 10, ncols)) + + db.close() if __name__ == "__main__": diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index 667a8bcd..fdbc7b7a 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -17,6 +17,7 @@ from h5json import Hdf5db from h5json.hsdsstore.httpconn import HttpConn from h5json.hsdsstore.hsds_writer import HSDSWriter +from h5json.h5pystore.h5py_reader import H5pyReader from h5json.hdf5dtype import special_dtype, Reference from h5json import selections @@ -36,10 +37,10 @@ def __init__(self, *args, **kwargs): def testSimple(self): - domain_path = "/home/test_user1/writer_test.h5" + domain_path = "hdf5://home/test_user1/test/writer_test.h5" db = Hdf5db(app_logger=self.log) - db.writer = HSDSWriter(domain_path) + db.writer = HSDSWriter(domain_path, app_logger=self.log) root_id = db.open() http_conn = HttpConn(domain_path, mode='r', retries=1) @@ -158,6 +159,51 @@ def testSimple(self): rsp_value = rsp_json["value"] self.assertEqual(rsp_value, 42) + db.close() + + def testH5PyToHS(self): + # test reading from HDF5 file and writing to HSDS + + file_path = "data/hdf5/tall.h5" + domain_path = "hdf5://home/test_user1/test/hsds_writer_test_tall.h5" + + db = Hdf5db(app_logger=self.log) + db.reader = H5pyReader(file_path) + db.writer = HSDSWriter(domain_path) + root_id = db.open() + #db.readAll() + root_json = db.getObjectById(root_id) + db.flush() + + # validate - get the root group and see if counts are correct + http_conn = HttpConn(domain_path, mode='r', retries=1) + http_rsp = http_conn.GET(f"/groups/{root_id}") + 
self.assertEqual(http_rsp.status_code, 200) + root_json = http_rsp.json() + self.assertEqual(root_json["id"], root_id) + # attribute count should still be zero (hasn't been flushed yet) + self.assertEqual(root_json["attributeCount"], 2) + # same for link count + self.assertEqual(root_json["linkCount"], 2) + + # get the g1 hard link + http_rsp = http_conn.GET(f"/groups/{root_id}/links/g1") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + g1_link = rsp_json["link"] + g1_id = g1_link["id"] + + # get the g1 group json + http_rsp = http_conn.GET(f"/groups/{g1_id}") + self.assertEqual(http_rsp.status_code, 200) + g1_json = http_rsp.json() + self.assertEqual(g1_json["attributeCount"], 0) + self.assertEqual(g1_json["linkCount"], 2) + + + + + db.close() From 3d9003c334c7e6f6aa94fb22199493eaf2fc4bdb Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 11 Jul 2025 19:45:17 +0100 Subject: [PATCH 057/129] hsds writer updates --- src/h5json/apps/h5tohs.py | 3 + src/h5json/h5pystore/h5py_reader.py | 19 +- src/h5json/h5pystore/h5py_writer.py | 1 - src/h5json/h5writer.py | 2 +- src/h5json/hdf5db.py | 297 +++++++++++++++++++------- src/h5json/hsdsstore/hsds_writer.py | 5 +- src/h5json/jsonstore/h5json_reader.py | 4 +- src/h5json/jsonstore/h5json_writer.py | 3 +- test/unit/h5json_reader_test.py | 4 +- test/unit/h5py_writer_test.py | 20 ++ test/unit/hdf5db_test.py | 8 +- test/unit/hsds_writer_test.py | 7 +- 12 files changed, 263 insertions(+), 110 deletions(-) diff --git a/src/h5json/apps/h5tohs.py b/src/h5json/apps/h5tohs.py index 4d1a8106..9853482e 100755 --- a/src/h5json/apps/h5tohs.py +++ b/src/h5json/apps/h5tohs.py @@ -17,10 +17,12 @@ from h5json.hsdsstore.hsds_writer import HSDSWriter from h5json.h5pystore.h5py_reader import H5pyReader + def usage(): print(f"usage: {sys.argv[0]} [-h] [--nodata] ") sys.exit(0) + def main(): no_data = False filename = None @@ -59,5 +61,6 @@ def main(): db.close() # close will trigger write to HSDS + if __name__ == 
"__main__": main() diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 089f0f24..9aee273d 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -177,9 +177,6 @@ def open(self): self._id_map[self._root_id] = f addr = h5py.h5o.get_info(f.id).addr self._addr_map[addr] = self._root_id - #f.visititems(self.visit) - - print("h5py_reader keys:", list(self.db.db.keys())) return self._root_id @@ -268,7 +265,6 @@ def getAttribute(self, obj_id, name, include_data=True): else: pass # no data - item['created'] = time.time() # TBD: get attribute creation time from h5py? return item @@ -314,7 +310,7 @@ def _getLink(self, parent, link_name): item["id"] = None else: item["id"] = self._addr_map[addr] - + item['created'] = time.time() # TBD: get the link creation time from h5py? return item @@ -435,11 +431,11 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): return creationProps def _getDataset(self, dset): + """ return json representation of the given dataset """ + self.log.info(f"getDataset alias: [{dset.name}]") item = {"alias": dset.name} - print("dset:", dset) - print("dset type:", type(dset)) typeid = dset.id.get_type() if h5py.h5t.TypeID.committed(typeid): type_uuid = None @@ -479,7 +475,7 @@ def _getDataset(self, dset): item["cpl"] = self._getHDF5DatasetCreationProperties(dset, type_item["class"]) return item - + def _getHardLinkIds(self, parent): """ create any ids for hard links of the group """ @@ -518,21 +514,18 @@ def getObjectById(self, obj_id, include_attrs=True, include_links=True): if obj_id not in self._id_map: raise KeyError(f"{obj_id} not found") h5obj = self._id_map[obj_id] - print("h5obj:", h5obj) - print("h5obj.name:", h5obj.name) - print("h5obj type:", type(h5obj)) if isinstance(h5obj, h5py.Group): self._getHardLinkIds(h5obj) obj_json = self._getGroup(h5obj, include_links=include_links) elif isinstance(h5obj, h5py.Dataset): obj_json = self._getDataset(h5obj) elif 
isinstance(h5obj, h5py.Datatype): - obj_json = self._getDataset(h5obj) + obj_json = self._getDatatype(h5obj) else: msg = f"unexpected object type: {type(h5obj)}" self.log.error(msg) raise TypeError(msg) - + if include_attrs: attributes = self.getAttributes(obj_id) obj_json["attributes"] = attributes diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 15d35bd4..14942c11 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -388,7 +388,6 @@ def updateAttributes(self, obj_id, obj): continue self.createAttribute(obj, name, attr_json) - def flush(self): """ Write dirty items """ if self.closed: diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index 3dfb8da8..8de1a277 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -67,7 +67,7 @@ def no_data(self): @abstractmethod def open(self): """ open storage handle, return root_id""" - return None + pass @abstractmethod def flush(self): diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 28eef18d..66c84311 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -22,6 +22,150 @@ from .h5writer import H5Writer +class H5NullReader(H5Reader): + """ + This class can be used by HDF5DB as a default no-op reader + """ + + def __init__( + self, + filepath, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + super().__init__(filepath, app_logger=app_logger) + self.log.debug("H5NullReader.__init__") + + self._root_id = None + self._is_closed = True + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + + if obj_id != self._root_id: + raise KeyError(f"{obj_id} not found") + + # create a root group with no links or attributes + group_json = {"links": {}, "attributes": {}, "cpl": {}} + group_json["created"] = time.time() + + 
return group_json + + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + raise IOError("not supported") + + def getDatasetValues(self, obj_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + # just return a zero array + arr = np.zeros(sel.shape, dtype=dtype) + + return arr + + def open(self): + """ Open data source for reading """ + self.log.debug("H5NullReader open") + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise ValueError("no db") + + if self._is_closed: + if not self._root_id: + if self.db.root_id: + # use the db root id + self._root_id = self.db.root_id + else: + # create a new root id + self._root_id = createObjId(obj_type="groups") + self._is_closed = False + return self._root_id + + def close(self): + """ close any open handles to the storage """ + self._is_closed = True + + def isClosed(self): + """ return True if handle is closed """ + return self._is_closed + + +class H5NullWriter(H5Writer): + """ + This class can be used by HDF5DB as a default no-op writer + """ + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + if append: + raise IOError("append is not supprot for H5NullWriter") + + super().__init__(filepath, no_data=no_data, app_logger=app_logger) + self.log.debug("H5NullWriter.__init__") + self._root_id = None + self._is_closed = True + + def open(self): + """ open storage handle, return root_id""" + self.log.debug("H5NullWriter open") + if not self._is_closed: + return self._root_id # already open + + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise ValueError("no db") + + if not self._root_id: + if 
self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + self._is_closed = False + return self._root_id + + def flush(self): + """ Write dirty items """ + self.log.debug("H5NullWriter> flush") + # Null writer is unable to actually persist anything, so return False + return False + + def close(self): + """ close storage handle """ + self.log.debug("H5NullWriter.close") + self._is_closed = True + + def isClosed(self): + """ return True if handle is closed """ + return self._is_closed + + class Hdf5db: """ This class is used to manage id lookup tables for primary HDF objects (Groups, Datasets, @@ -79,23 +223,12 @@ def reader(self): @reader.setter def reader(self, value: H5Reader): """ set the reader """ - if self._writer: + if self._writer and not self._writer.isClosed(): self.flush() - if self._reader: + if self._reader and not self._reader.isClosed(): self._reader.close() self._reader = value self._reader.set_db(self) - """ - root_id = value.get_root_id() - if not root_id: - raise ValueError(f"reader {type(value)} unable to return root_id") - group_json = value.getObjectById(root_id) - if not group_json: - raise ValueError(f"reader {type(value)} unable to return group json") - self._reader = value - self._db[root_id] = group_json - self._root_id = root_id - """ @property def writer(self): @@ -156,17 +289,17 @@ def make_dirty(self, obj_id): def flush(self): """ write out any changes """ self.log.debug("db.flush()") - if not self.writer: - return # nothing to do + self._checkWriter() if not self.writer.flush(): # flush not successful, don't clear dirty set self.log.error("writer flush failed") - raise IOError("writer flush failed") + return False # reset new, dirty and deleted sets self._new_objects = set() self._dirty_objects = set() self._deleted_objects = set() + return True def readAll(self): """ read all meta data objects from reader and save to db """ @@ -174,12 +307,7 @@ def readAll(self): 
self.log.debug("readAll") if self.closed: raise IOError("database is not open") - - if not self.reader: - self.log.debug("no reader set") - # no reader, nothing to do - return - + obj_ids = set() obj_ids.add(self.root_id) while obj_ids: @@ -198,50 +326,48 @@ def readAll(self): def open(self): """ open reader and writer if set """ self.log.debug("db.open()") - if self.root_id: - self.log.debug("root id already set, re-open call") - if self.writer: - self.writer.open() - if self.reader: - self.reader.open() + + if self.reader is None: + self.reader = H5NullReader(None, app_logger=self.log) + self._reader.set_db(self) + + if self.writer is None: + self.writer = H5NullWriter(None, app_logger=self.log) + self._writer.set_db(self) + + if not self.reader.isClosed(): + self.log.debug("db is already opened") + raise IOError("db is already opened") + return self._root_id + + if self.writer.append: + # append mode for the writer, first open writer and get the root id + self.log.debug("db.open, write append, getting root_id from writer") + writer_root_id = self.writer.open() + if self._root_id: + if writer_root_id != self._root_id: + raise IOError("writer root id does not match reader root id") + else: + self._root_id = writer_root_id + + # now open reader + reader_root_id = self.reader.open() + if reader_root_id != self._root_id: + raise IOError("writer root id does not match reader root id") + else: - self.log.debug("db.open, getting root_id") - - if self.writer and self.writer.append: - # append mode for the writer, open writer and get the root id - self.log.debug("db.open, write append, getting root_id from writer") - self._root_id = self.writer.open() - if self.reader: - reader_root_id = self.reader.open() - if reader_root_id != self._root_id: - # TBD: need someway to reconcile if both reader and writer have - # an potentiated idea on what there root id is - self.log.warn("reader root_id does not match writer root_id") - elif self.reader: - self.log.debug("db.open, 
getting root_id from reader") - self._root_id = self.reader.open() - if self.writer: - writer_root_id = self.writer.open() - if writer_root_id != self._root_id: - # TBD: same as above, need to deal with inconsistent root ids - msg = "writer root_id does not match reader root_id" - self.log.error(msg) - raise IOError(msg) - else: - self.log.debug('writer and reader root ids match!') + # open reader first and get root id + reader_root_id = self.reader.open() + if self._root_id: + if reader_root_id != self._root_id: + raise IOError("writer root id does not match reader root id") else: - # no root id set by writer or reader, initialize now - root_id = createObjId(obj_type="groups") - self.log.debug(f"no reader or writer, creating new root id: {root_id}") - self._root_id = root_id - if self.writer: - # open writer in create mode now that we have a root id - self.writer.open() - - # create a root group just as a memory object - group_json = {"links": {}, "attributes": {}, "cpl": {}} - group_json["created"] = time.time() - self._db[self._root_id] = group_json + self._root_id = reader_root_id + + # now open writer + writer_root_id = self.writer.open() + if writer_root_id != self._root_id: + raise IOError("writer root id does not match reader root id") self.log.debug(f"db.open() - returning root_id: {self._root_id}") return self._root_id @@ -249,7 +375,7 @@ def open(self): def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") - + self.flush() if self.writer: self.writer.close() @@ -258,7 +384,14 @@ def close(self): @property def closed(self): - return False if self.root_id else True + if self.reader: + return self.reader.isClosed() + elif self.writer: + return self.writer.isClosed() + elif self._root_id: + return True + else: + return False def __enter__(self): """ called on package init """ @@ -270,16 +403,28 @@ def __exit__(self, type, value, traceback): self.log.info("Hdf5db __exit") self.close() + def _checkReader(self): + """ check 
the reader is set and open """ + if self.reader is None: + raise IOError("reader not set") + if self.reader.closed: + raise IOError("reader is closed") + + def _checkWriter(self): + """ check the writer is set and open """ + if self.writer is None: + raise IOError("writer not set") + if self.writer.closed: + raise IOError("writer is closed") + def getObjectById(self, obj_id): """ return object with given id """ self.log.debug(f"getObjectById {obj_id}") + self._checkReader() if obj_id not in self.db: - if self.reader: - # load the obj from the reader - obj_json = self.reader.getObjectById(obj_id) - self.db[obj_id] = obj_json - else: - raise KeyError(f"obj_id: {obj_id} not found") + # load the obj from the reader + obj_json = self.reader.getObjectById(obj_id) + self.db[obj_id] = obj_json obj_json = self.db[obj_id] return obj_json @@ -538,6 +683,8 @@ def getDatasetValues(self, dset_id, sel): number of elements as the rank of the dataset. """ self.log.info(f"getDatasetValues dset_id: {dset_id}, sel: {sel}") + + self._checkReader() dset_json = self.getObjectById(dset_id) shape_json = dset_json["shape"] if not isinstance(sel, selections.Selection): @@ -560,11 +707,7 @@ def getDatasetValues(self, dset_id, sel): rank = len(dims) dtype = self.getDtype(dset_json) - if self.reader: - arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) - else: - # TBD: Initialize with fill value if non-zero - arr = np.zeros(sel.shape, dtype=dtype) + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) if "updates" in dset_json: # apply any non-flushed changes that intersect the current selection diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index f56a5e34..4697093d 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -445,7 +445,6 @@ def updateValues(self, dset_ids): self.updateValue(dset_id, sel, arr) updates.clear() - def flush(self): """ Write dirty items """ if self.closed: @@ -455,7 +454,7 
@@ def flush(self): if not self._http_conn: self.log.warning("hsds_writer no http connection") raise IOError("no http connection") - + self.log.info("hsds_writer.flush()") self.log.debug(f" new object count: {len(self.db.new_objects)}") self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") @@ -493,7 +492,7 @@ def flush(self): if self.db.deleted_objects: self.log.debug(f"deleted ids: {self.db.deleted_objects}") self.deleteObjects(self.db.deleted_objects) - + self._last_flush_time = time.time() self.log.debug("hsds_writer> flush successful") # all objects written successfully diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index 40f8e5e4..4fd1e0c8 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -62,10 +62,10 @@ def open(self): return self._root_id def close(self): - pass + self._h5json = None def isClosed(self): - return False if self._h5json else False + return False if self._h5json else True def get_root_id(self): """ Return root id """ diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index 8cb5a39c..f5ede89f 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -40,7 +40,7 @@ def __init__( def flush(self): """ Write dirty items """ - + if not self._root_id: msg = "flush called prior to open" self.log.warning(msg) @@ -278,7 +278,6 @@ def dumpFile(self): self.json["apiVersion"] = db_version_info["hdf5-json-version"] self.json["root"] = getUuidFromId(self._root_uuid) - self.updateAliasList() # create alias_db with obj_id to alias list dict self.dumpGroups() diff --git a/test/unit/h5json_reader_test.py b/test/unit/h5json_reader_test.py index bca00f2c..5c190203 100644 --- a/test/unit/h5json_reader_test.py +++ b/test/unit/h5json_reader_test.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): else: lhStdout = None - self.log.setLevel(logging.INFO) + 
self.log.setLevel(logging.DEBUG) handler = logging.FileHandler("./h5json_reader_test.log") # add handler to logger self.log.addHandler(handler) @@ -40,7 +40,9 @@ def testSimple(self): filepath = "data/json/tall.json" db = Hdf5db(app_logger=self.log) db.reader = H5JsonReader(filepath, app_logger=self.log) + self.assertTrue(db.closed) root_id = db.open() + self.assertTrue(root_id) root_json = db.getObjectById(root_id) root_attrs = root_json["attributes"] diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index e51c4dba..e2763795 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -20,6 +20,7 @@ from h5json.jsonstore.h5json_reader import H5JsonReader from h5json.h5pystore.h5py_writer import H5pyWriter from h5json.hdf5dtype import special_dtype, Reference +from h5json.objid import isRootObjId, isSchema2Id from h5json import selections @@ -46,6 +47,21 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") + def testOpen(self): + filepath = "test/unit/out/h5py_writer_test_testOpen.h5" + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath) + root_id = db.open() + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isRootObjId(root_id)) + self.assertFalse(db.closed) + self.assertEqual(db.getObjectIdByPath("/"), root_id) + db.close() + self.assertTrue(db.closed) + obj_id = db.open() + self.assertEqual(obj_id, root_id) + db.close() + def testSimple(self): filepath = "test/unit/out/h5py_writer_test_testSimple.h5" @@ -518,6 +534,7 @@ def testReaderWithUpdate(self): db.open() # close should create everything the json reader read to the output file db.close() + self.assertTrue(db.closed) with h5py.File(file_out) as f: self.assertTrue("/g1/g1.1/dset1.1.1" in f) @@ -525,8 +542,11 @@ def testReaderWithUpdate(self): self.assertEqual(len(dset111.attrs), 2) db.open() + self.assertFalse(db.closed) dset111_id = 
db.getObjectIdByPath("/g1/g1.1/dset1.1.1") db.createAttribute(dset111_id, "attr3", "hello") + self.assertFalse(db.closed) + print("test - db.close()") db.close() with h5py.File(file_out) as f: diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 3c1f3089..c9c32969 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -50,7 +50,7 @@ def testOpen(self): self.assertFalse(db.closed) self.assertEqual(db.getObjectIdByPath("/"), root_id) db.close() - #self.assertTrue(db.closed) + self.assertTrue(db.closed) obj_id = db.open() self.assertEqual(obj_id, root_id) db.close() @@ -394,8 +394,8 @@ def testSimpleDataset(self): ncols = 10 shape = (nrows, ncols) dtype = np.int32 - - db = Hdf5db(app_logger=self.log) + + db = Hdf5db(app_logger=self.log) root_id = db.open() dset_id = db.createDataset(shape, dtype=dtype) db.createHardLink(root_id, "dset", dset_id) @@ -448,7 +448,7 @@ def testResizableDataset(self): maxdims = (None, ncols * 2) db = Hdf5db(app_logger=self.log) - + root_id = db.open() dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) db.createHardLink(root_id, "dset", dset_id) diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index fdbc7b7a..9557a9f8 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -166,12 +166,11 @@ def testH5PyToHS(self): file_path = "data/hdf5/tall.h5" domain_path = "hdf5://home/test_user1/test/hsds_writer_test_tall.h5" - + db = Hdf5db(app_logger=self.log) db.reader = H5pyReader(file_path) db.writer = HSDSWriter(domain_path) root_id = db.open() - #db.readAll() root_json = db.getObjectById(root_id) db.flush() @@ -200,10 +199,6 @@ def testH5PyToHS(self): self.assertEqual(g1_json["attributeCount"], 0) self.assertEqual(g1_json["linkCount"], 2) - - - - db.close() From 74d3a6217a0eafb1d5c7f773bd74acadd0cd9a18 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Jul 2025 18:15:16 +0100 Subject: [PATCH 058/129] update datasetvalues for in init --- 
src/h5json/hsdsstore/hsds_writer.py | 34 +++++++++++++++++++----- test/unit/hsds_writer_test.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 4697093d..24ed900c 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -436,14 +436,31 @@ def updateValues(self, dset_ids): if getCollectionForId(dset_id) != "datasets": continue # ignore groups and datatypes dset_json = self.db.getObjectById(dset_id) - if "updates" not in dset_json: + dset_shape = dset_json["shape"] + dset_class = dset_shape['class'] + if dset_class == "H5S_NULL": + # no data to update continue - updates = dset_json["updates"] - if updates: - self.log.debug(f"hsds_writer> {dset_id} update count: {len(updates)}") - for (sel, arr) in updates: - self.updateValue(dset_id, sel, arr) - updates.clear() + if self._init: + # get all data for the dataset + # TBD: do this by chunks + if dset_class == "H5S_SCALAR": + dset_dims = [] + else: + dset_dims = dset_shape["dims"] + sel_all = selections.select(dset_dims, ...) 
+ arr = self.db.getDatasetValues(dset_id, sel_all) + if arr is not None: + self.updateValue(dset_id, sel_all, arr) + else: + if "updates" not in dset_json: + continue + updates = dset_json["updates"] + if updates: + self.log.debug(f"hsds_writer> {dset_id} update count: {len(updates)}") + for (sel, arr) in updates: + self.updateValue(dset_id, sel, arr) + updates.clear() def flush(self): """ Write dirty items """ @@ -472,6 +489,9 @@ def flush(self): self.createObjects(obj_ids) dirty_ids.update(obj_ids) dirty_ids.add(root_id) # add back root for attribute and link creation + if not self._no_data: + # initialize dataset values + self.updateValues(obj_ids) self._init = False elif self.db.new_objects: self.log.debug(f"hsds_writer> {len(self.db.new_objects)} objects to create") diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index 9557a9f8..a2d7201b 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -199,6 +199,46 @@ def testH5PyToHS(self): self.assertEqual(g1_json["attributeCount"], 0) self.assertEqual(g1_json["linkCount"], 2) + # get the g1.1 link + http_rsp = http_conn.GET(f"/groups/{g1_id}/links/g1.1") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + g1_1_link = rsp_json["link"] + g1_1_id = g1_1_link["id"] + + # Get the g1.1 json + http_rsp = http_conn.GET(f"/groups/{g1_1_id}") + self.assertEqual(http_rsp.status_code, 200) + g1_json = http_rsp.json() + self.assertEqual(g1_json["attributeCount"], 0) + self.assertEqual(g1_json["linkCount"], 2) + + # get the dset1.1.1 link + http_rsp = http_conn.GET(f"/groups/{g1_1_id}/links/dset1.1.1") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + dset1_1_1_link = rsp_json["link"] + dset1_1_1_id = dset1_1_1_link["id"] + + # get the dset1.1.1 json + http_rsp = http_conn.GET(f"/datasets/{dset1_1_1_id}") + self.assertEqual(http_rsp.status_code, 200) + dset1_1_1_json = http_rsp.json() + dset1_1_1_shape = 
dset1_1_1_json["shape"] + self.assertEqual(dset1_1_1_shape["class"], "H5S_SIMPLE") + + # get the dset1_1_1 data + http_rsp = http_conn.GET(f"/datasets/{dset1_1_1_id}/value") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + dset1_1_1_value = rsp_json["value"] + self.assertEqual(len(dset1_1_1_value), 10) + for i in range(10): + row = dset1_1_1_value[i] + self.assertEqual(len(row), 10) + for j in range(10): + self.assertEqual(row[j], i * j) + db.close() From 985a842f4097705061cae01f9471e92e71b7eae9 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 14 Jul 2025 21:03:17 +0100 Subject: [PATCH 059/129] set dataset values in create if possible --- src/h5json/dset_util.py | 22 ++++++++++++----- src/h5json/hsdsstore/hsds_writer.py | 38 +++++++++++++++++------------ test/unit/h5py_reader_test.py | 2 -- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 496734d3..d992a01a 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -43,14 +43,24 @@ def resize_dataset(dset_json, shape): dset_json["modified"] = time.time() -def getNumElements(dset_json): +def getDims(dset_json): + """ return extents of the dataset shape as a tuple """ shape_json = dset_json["shape"] shape_class = shape_json["class"] if shape_class == "H5S_NULL": - num_elements = 0 + dims = None elif shape_class == "H5S_SCALAR": - num_elements = 1 + dims = () elif shape_class == "H5S_SIMPLE": - dims = shape_json["dims"] - num_elements = int(np.prod(dims)) - return num_elements + dims = tuple(shape_json["dims"]) + else: + raise ValueError(f"Unexpected shape class: {shape_class}") + return dims + + +def getNumElements(dset_json): + """ return the number of elements defined by the dataset's shape + returns None for null shape, 1 for scalar shape, and product of + extents otherwise """ + + return int(np.prod(getDims(dset_json))) diff --git a/src/h5json/hsdsstore/hsds_writer.py 
b/src/h5json/hsdsstore/hsds_writer.py index 24ed900c..8881e5e9 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -16,7 +16,7 @@ from ..hdf5dtype import isVlen from ..array_util import arrayToBytes, bytesArrayToList -from ..dset_util import getNumElements +from ..dset_util import getNumElements, getDims from .. import selections from ..h5writer import H5Writer from .httpconn import HttpConn @@ -251,7 +251,7 @@ def multiPost(items): items.clear() self.log.debug(f"hsds_writer> createObjects, {len(obj_ids)} objects") - MAX_OBJECTS_PER_REQUEST = 3 + MAX_OBJECTS_PER_REQUEST = 300 collections = ("groups", "datasets", "datatypes") col_items = {} dset_value_update_ids = set() @@ -286,15 +286,25 @@ def multiPost(items): item[key] = obj_json[key] # initialize dataset values if provided and not too large - if "updates" in obj_json: - updates = obj_json["updates"] - if updates and len(updates) == 1 and self.getDatasetSize(obj_id) < MAX_INIT_SIZE: + if collection == "datasets": + dset_dims = getDims(obj_json) # will be None for null space datasets + dset_size = self.getDatasetSize(obj_id) # number of bytes defined by the shape + init_arr = None # data to be passed to post create method + updates = obj_json.get("updates") + if updates and len(updates) == 1 and dset_size < MAX_INIT_SIZE: sel, arr = updates[0] if sel.select_type == selections.H5S_SELECT_ALL: - value = bytesArrayToList(arr) - item["value"] = value + init_arr = arr updates.clear() # reset the update list - if updates: + if self._init and init_arr is None and dset_dims is not None: + # get all values from dataset if small enough + if dset_size < MAX_INIT_SIZE: + sel_all = selections.select(dset_dims, ...) 
+ init_arr = self.db.getDatasetValues(obj_id, sel_all) + if init_arr is not None: + value = bytesArrayToList(init_arr) + item["value"] = value + elif updates or self._init: dset_value_update_ids.add(obj_id) # will set dataset value below # add to the list of new items for the given collection @@ -436,18 +446,13 @@ def updateValues(self, dset_ids): if getCollectionForId(dset_id) != "datasets": continue # ignore groups and datatypes dset_json = self.db.getObjectById(dset_id) - dset_shape = dset_json["shape"] - dset_class = dset_shape['class'] - if dset_class == "H5S_NULL": + dset_dims = getDims(dset_json) + if dset_dims is None: # no data to update continue if self._init: # get all data for the dataset # TBD: do this by chunks - if dset_class == "H5S_SCALAR": - dset_dims = [] - else: - dset_dims = dset_shape["dims"] sel_all = selections.select(dset_dims, ...) arr = self.db.getDatasetValues(dset_id, sel_all) if arr is not None: @@ -491,7 +496,8 @@ def flush(self): dirty_ids.add(root_id) # add back root for attribute and link creation if not self._no_data: # initialize dataset values - self.updateValues(obj_ids) + pass + # self.updateValues(obj_ids) self._init = False elif self.db.new_objects: self.log.debug(f"hsds_writer> {len(self.db.new_objects)} objects to create") diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index a3d946d9..8f76543c 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -41,9 +41,7 @@ def testSimple(self): db = Hdf5db(app_logger=self.log) db.reader = H5pyReader(filepath, app_logger=self.log) root_id = db.open() - print("got root_id:", root_id) root_json = db.getObjectById(root_id) - print("got root_json:", root_json) root_attrs = root_json["attributes"] self.assertEqual(len(root_attrs), 2) self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) From 9d78d0c4763b51b612a6b80738318fc59aa56077 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sat, 26 Jul 2025 15:37:10 +0100 Subject: [PATCH 
060/129] hsdsreader test --- src/h5json/dset_util.py | 26 ++++++++++++++++++++++++++ test/unit/hsds_reader_test.py | 7 ++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index d992a01a..e1a44a59 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -64,3 +64,29 @@ def getNumElements(dset_json): extents otherwise """ return int(np.prod(getDims(dset_json))) + + +def getDatasetLayout(dset_json): + """ Return layout json from creation property list or layout json """ + layout = None + + if "creationProperties" in dset_json: + cp = dset_json["creationProperties"] + if "layout" in cp: + layout = cp["layout"] + if not layout and "layout" in dset_json: + layout = dset_json["layout"] + if not layout: + # no layout for {dset_json + return None + return layout + + +def getDatasetLayoutClass(dset_json): + """ return layout class """ + layout = getDatasetLayout(dset_json) + if layout and "class" in layout: + layout_class = layout["class"] + else: + layout_class = None + return layout_class diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py index d0501b9f..db7a9f4b 100644 --- a/test/unit/hsds_reader_test.py +++ b/test/unit/hsds_reader_test.py @@ -45,11 +45,11 @@ def testSimple(self): root_id = db.open() root_json = db.getObjectById(root_id) self.assertTrue("id" in root_json) - """ - TBD + root_attrs = root_json["attributes"] self.assertEqual(len(root_attrs), 2) self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] self.assertEqual(len(root_links), 2) self.assertEqual(list(root_links.keys()), ["g1", "g2"]) @@ -57,11 +57,13 @@ def testSimple(self): self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") g1_id = g1_link["id"] self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") dset_json = db.getObjectById(dset111_id) dset_type = dset_json["type"] 
self.assertEqual(dset_type["class"], "H5T_INTEGER") self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] self.assertEqual(len(dset_attrs), 2) self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) @@ -101,7 +103,6 @@ def testSimple(self): self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") attr3_value = attr3_json["value"] self.assertEqual(attr3_value, 42) - """ db.close() From 4413e9b23179817c8dc27d8e7a3d1ebfe68c422f Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 31 Jul 2025 12:53:23 +0100 Subject: [PATCH 061/129] added reader, writer stat method --- src/h5json/h5pystore/h5py_reader.py | 14 ++++++++++ src/h5json/h5pystore/h5py_writer.py | 14 ++++++++++ src/h5json/h5reader.py | 9 +++++++ src/h5json/h5writer.py | 14 ++++++++++ src/h5json/hdf5db.py | 28 ++++++++++++++++++-- src/h5json/hsdsstore/hsds_reader.py | 12 +++++++++ src/h5json/hsdsstore/hsds_writer.py | 16 +++++++++++- src/h5json/jsonstore/h5json_reader.py | 14 ++++++++++ src/h5json/jsonstore/h5json_writer.py | 16 ++++++++++++ test/unit/h5json_writer_test.py | 2 ++ test/unit/hdf5db_test.py | 2 ++ test/unit/hsds_reader_test.py | 16 ++++++++++++ test/unit/hsds_writer_test.py | 37 +++++++++++++++++++++++++-- 13 files changed, 189 insertions(+), 5 deletions(-) diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 9aee273d..bb32a6e9 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -12,6 +12,7 @@ import h5py import numpy as np import logging +from os import stat as os_stat import time from ..objid import createObjId, getCollectionForId @@ -557,3 +558,16 @@ def getDatasetValues(self, dset_id, sel=None, dtype=None): # convert any h5py references to h5json references arr = self._copy_array(arr, fin=dset.file) return arr + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 
'owner': owner name + """ + stat_info = os_stat(self.filepath) + stats = {} + stats['created'] = stat_info.st_ctime + stats["lastModified"] = stat_info.st_mtime + stats['owner'] = stat_info.st_uid # TBD: convert to username? + return stats diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 14942c11..dd543a38 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -11,6 +11,7 @@ ############################################################################## import h5py import numpy as np +from os import stat as os_stat import time from ..objid import getCollectionForId, isValidUuid, createObjId @@ -460,3 +461,16 @@ def close(self): def isClosed(self): """ return closed status """ return False if self._f else True + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stat_info = os_stat(self.filepath) + stats = {} + stats['created'] = stat_info.st_ctime + stats["lastModified"] = stat_info.st_mtime + stats['owner'] = stat_info.st_uid # TBD: convert to username? 
+ return stats diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index 3bf49ca7..f48612fc 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -92,3 +92,12 @@ def close(self): def isClosed(self): """ return True if handle is closed """ pass + + @abstractmethod + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + pass diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index 8de1a277..cc5c601c 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -32,6 +32,7 @@ def __init__( self._no_data = no_data self._filepath = filepath self._db_ref = None + self._lastModified = None if app_logger: self.log = app_logger else: @@ -49,6 +50,10 @@ def filepath(self): def closed(self): return self.isClosed() + @property + def lastModified(self): + return self._lastModified + @property def db(self): if not self._db_ref: @@ -83,3 +88,12 @@ def close(self): def isClosed(self): """ return True if handle is closed """ pass + + @abstractmethod + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + pass diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 66c84311..0b0c22a4 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -105,6 +105,18 @@ def isClosed(self): """ return True if handle is closed """ return self._is_closed + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats + class H5NullWriter(H5Writer): """ @@ -165,6 +177,18 @@ def isClosed(self): """ return True if handle is closed """ return self._is_closed + def getStats(self): + """ 
return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats + class Hdf5db: """ @@ -417,11 +441,11 @@ def _checkWriter(self): if self.writer.closed: raise IOError("writer is closed") - def getObjectById(self, obj_id): + def getObjectById(self, obj_id, refresh=False): """ return object with given id """ self.log.debug(f"getObjectById {obj_id}") self._checkReader() - if obj_id not in self.db: + if obj_id not in self.db or refresh: # load the obj from the reader obj_json = self.reader.getObjectById(obj_id) self.db[obj_id] = obj_json diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index 55a8c022..a521f158 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -310,3 +310,15 @@ def getDatasetValues(self, dset_id, sel=None, dtype=None): self.log.debug(f"jsonToArray returned: {arr}") return arr + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 8881e5e9..7e5d7781 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -385,6 +385,7 @@ def updateLinks(self, grp_ids): raise IOError("hsds_writer unable to update links") else: self.log.debug(f"hsds_writer> {grp_id} {count} links updated") + self._lastModified = time.time() def updateAttributes(self, obj_ids): """ update any modified links of the given objects """ @@ -418,6 +419,7 @@ def updateAttributes(self, obj_ids): self.log.error(f"hsds_writer> put {req} failed, status: {put_rsp.status_code}") 
else: self.log.debug(f"hsds_writer> {count} attributes updated") + self._lastModified = time.time() def updateValue(self, dset_id, sel, arr): """ update the given dataset using selection and array """ @@ -437,6 +439,7 @@ def updateValue(self, dset_id, sel, arr): self.log.error(f"PUT {req} returned error: {rsp.status_code}") else: self.log.debug(f"PUT {len(data)} bytes successful") + self._lastModified = time.time() def updateValues(self, dset_ids): """ write any pending dataset values """ @@ -476,7 +479,6 @@ def flush(self): if not self._http_conn: self.log.warning("hsds_writer no http connection") raise IOError("no http connection") - self.log.info("hsds_writer.flush()") self.log.debug(f" new object count: {len(self.db.new_objects)}") self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") @@ -535,3 +537,15 @@ def isClosed(self): def get_root_id(self): """ Return root id """ return self._root_id + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index 4fd1e0c8..b64a3d1d 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -11,6 +11,7 @@ ############################################################################## import json import logging +from os import stat as os_stat from ..objid import getCollectionForId, getUuidFromId @@ -215,3 +216,16 @@ def getDatasetValues(self, obj_id, sel=None, dtype=None): raise NotImplementedError("selection type not supported") return arr + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stat_info = os_stat(self.filepath) + 
stats = {} + stats['created'] = stat_info.st_ctime + stats["lastModified"] = stat_info.st_mtime + stats['owner'] = stat_info.st_uid # TBD: convert to username? + return stats diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index f5ede89f..343c045f 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -11,6 +11,8 @@ ############################################################################## import json +from os import stat as os_stat +import time from ..h5writer import H5Writer from ..objid import getUuidFromId, getCollectionForId, createObjId @@ -292,3 +294,17 @@ def dumpFile(self): json.dump(self.json, f, ensure_ascii=ensure_ascii, indent=indent) else: print(json.dumps(self.json, sort_keys=True, ensure_ascii=ensure_ascii, indent=indent)) + self._lastModified = time.time() # update timestamp + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stat_info = os_stat(self.filepath) + stats = {} + stats['created'] = stat_info.st_ctime + stats["lastModified"] = stat_info.st_mtime + stats['owner'] = stat_info.st_uid # TBD: convert to username? 
+ return stats diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index e8b5eb91..ba2cbc19 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -70,7 +70,9 @@ def testSimple(self): db.createSoftLink(g2_id, "slink", "somewhere") db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + self.assertTrue(db.writer.lastModified is None) # no update yet db.flush() + self.assertTrue(db.writer.lastModified > 0) # timestamp should be updated def testNullSpaceAttribute(self): diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index c9c32969..2722eaa6 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -53,6 +53,8 @@ def testOpen(self): self.assertTrue(db.closed) obj_id = db.open() self.assertEqual(obj_id, root_id) + root_json = db.getObjectById(root_id) + self.assertFalse("id" in root_json) db.close() def testWith(self): diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py index db7a9f4b..9b0acf1f 100644 --- a/test/unit/hsds_reader_test.py +++ b/test/unit/hsds_reader_test.py @@ -11,6 +11,8 @@ ############################################################################## import unittest import logging +import random +import string import numpy as np from h5json import Hdf5db from h5json.hsdsstore.hsds_reader import HSDSReader @@ -106,6 +108,20 @@ def testSimple(self): db.close() + def testNoFile(self): + # create a random string so we don't try to open an existing file + filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) + filepath = "/home/test_user1/test/" + filename + kwargs = {"app_logger": self.log} + db = Hdf5db(**kwargs) + hsds_reader = HSDSReader(filepath, **kwargs) + db.reader = hsds_reader + try: + db.open() + self.assertTrue(False) + except IOError as ioe: + self.assertEqual(ioe.errno, 404) + if __name__ == "__main__": # setup test files diff --git 
a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index a2d7201b..4ef8ff8f 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -11,12 +11,14 @@ ############################################################################## import unittest import logging +import random +import string import requests -import os import numpy as np from h5json import Hdf5db from h5json.hsdsstore.httpconn import HttpConn from h5json.hsdsstore.hsds_writer import HSDSWriter +from h5json.hsdsstore.hsds_reader import HSDSReader from h5json.h5pystore.h5py_reader import H5pyReader from h5json.hdf5dtype import special_dtype, Reference from h5json import selections @@ -42,6 +44,10 @@ def testSimple(self): db = Hdf5db(app_logger=self.log) db.writer = HSDSWriter(domain_path, app_logger=self.log) root_id = db.open() + + stats = db.writer.getStats() + for k in ("created", "lastModified", "owner"): + self.assertTrue(k in stats) http_conn = HttpConn(domain_path, mode='r', retries=1) db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) @@ -61,8 +67,9 @@ def testSimple(self): self.assertEqual(root_json["attributeCount"], 0) # same for link count self.assertEqual(root_json["linkCount"], 0) - + self.assertTrue(db.writer.lastModified is None) # no write yet db.flush() + self.assertTrue(db.writer.lastModified > 0) # timestamp should be updated # validate - get the root group again and see if counts are updated http_rsp = http_conn.GET(f"/groups/{root_id}") @@ -159,7 +166,33 @@ def testSimple(self): rsp_value = rsp_json["value"] self.assertEqual(rsp_value, 42) + # create a dataset and try to read from it + dset_222_id = db.createDataset(shape=(10, 10), dtype=np.int32) + sel_all = selections.select((10, 10), ...) 
+ arr = db.getDatasetValues(dset_222_id, sel_all) + self.assertTrue((arr == 0).all()) + + db.close() + + def testReaderWriter(self): + # try reading and writer to an HSDS domain + # create a random string so we don't try to open an existing file + filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) + domain_path = "/home/test_user1/test/" + filename + ".h5" + db = Hdf5db(app_logger=self.log) + db.writer = HSDSWriter(domain_path, app_logger=self.log) + self.assertEqual(db.writer.filepath, domain_path) + root_id = db.open() + self.assertTrue(root_id) + db.reader = HSDSReader(domain_path, app_logger=self.log) db.close() + root_id2 = db.open() + self.assertEqual(root_id, root_id2) + root_json = db.getObjectById(root_id) + self.assertTrue("id" not in root_json) + self.assertTrue("created" in root_json) + self.assertTrue(root_json["created"] > 0) + self.assertTrue(db.writer.lastModified is None) # no flush yet def testH5PyToHS(self): # test reading from HDF5 file and writing to HSDS From 5ee8b3e21cf61ea222195a8e41c58abef6a31212 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 3 Aug 2025 15:51:25 +0100 Subject: [PATCH 062/129] fix for reopen db --- src/h5json/hdf5db.py | 9 +- src/h5json/hsdsstore/hsds_reader.py | 38 ++++--- src/h5json/hsdsstore/hsds_writer.py | 152 +++++++++++++--------------- src/h5json/hsdsstore/httpconn.py | 97 ++++++++---------- test/unit/hsds_reader_test.py | 16 +++ 5 files changed, 153 insertions(+), 159 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 0b0c22a4..87cd5687 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -362,7 +362,6 @@ def open(self): if not self.reader.isClosed(): self.log.debug("db is already opened") raise IOError("db is already opened") - return self._root_id if self.writer.append: # append mode for the writer, first open writer and get the root id @@ -377,14 +376,14 @@ def open(self): # now open reader reader_root_id = self.reader.open() if 
reader_root_id != self._root_id: - raise IOError("writer root id does not match reader root id") + raise IOError("db root id does not match reader root id") else: # open reader first and get root id reader_root_id = self.reader.open() if self._root_id: if reader_root_id != self._root_id: - raise IOError("writer root id does not match reader root id") + raise IOError("reader root id does not match reader root id") else: self._root_id = reader_root_id @@ -431,14 +430,14 @@ def _checkReader(self): """ check the reader is set and open """ if self.reader is None: raise IOError("reader not set") - if self.reader.closed: + if self.reader.isClosed(): raise IOError("reader is closed") def _checkWriter(self): """ check the writer is set and open """ if self.writer is None: raise IOError("writer not set") - if self.writer.closed: + if self.writer.isClosed(): raise IOError("writer is closed") def getObjectById(self, obj_id, refresh=False): diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index a521f158..e0053033 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -87,15 +87,21 @@ def __init__( # save these for when we create the connection self._http_kwargs = kwargs self._http_conn = None + self._stats = {"created": 0, "lastModified": 0, "owner": ""} super().__init__(domain_path, app_logger=app_logger) def open(self): + if self._http_conn and not self._http_conn.isClosed(): + return self._root_id # open already called + if self._http_conn: - return # open already called + http_conn = self._http_conn + else: + kwargs = self._http_kwargs + http_conn = HttpConn(self.filepath, **kwargs) - kwargs = self._http_kwargs - http_conn = HttpConn(self.filepath, **kwargs) + http_conn.open() hsds_info = http_conn.serverInfo() self.log.debug(f"got hsds info: {hsds_info}") @@ -122,6 +128,11 @@ def open(self): domain_json = rsp.json() self.log.debug(f"got domain_json: {domain_json}") + # update stats + for key in 
("created", "lastModified", "owner", "limits", "version", "compressors"): + if key in domain_json: + self._stats[key] = domain_json[key] + if "root" not in domain_json: http_conn.close() raise IOError(404, "Location is a folder, not a file") @@ -134,17 +145,8 @@ def open(self): domain_objs = root_json["domain_objs"] objdb.load(domain_objs) """ - if "limits" in domain_json: - self._limits = domain_json["limits"] - else: - self._limits = None - if "version" in domain_json: - self._version = domain_json["version"] - else: - self._version = None self._http_conn = http_conn - self._domain_json = domain_json return self._root_id @@ -157,10 +159,10 @@ def close(self): self._http_conn.close() def isClosed(self): - if self._http_conn: - return False - else: + if not self._http_conn: return True + else: + return self._http_conn.isClosed() def get_root_id(self): """ Return root id """ @@ -317,8 +319,4 @@ def getStats(self): 'lastModified': modificationTime 'owner': owner name """ - stats = {} - stats['created'] = 0 - stats["lastModified"] = 0 - stats['owner'] = "" - return stats + return self._stats diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 7e5d7781..ba3b7b87 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -109,11 +109,10 @@ def __init__( self._http_conn = None self._root_id = None self._append = append - self._owner = owner self._track_order = track_order self._linked_domain = linked_domain - self._domain_json = None self._last_flush_time = 0 + self._stats = {"created": 0, "lastModified": 0, "owner": ""} def open(self): """ setup domain for writing """ @@ -133,91 +132,84 @@ def open(self): hsds_info = http_conn.serverInfo() self.log.debug(f"got hsds info: {hsds_info}") - if not self._domain_json: - # haven't fetched the domain json yet, do it now - - # try to do a GET from the domain - req = "/" - params = {} - """ - if max_objects is None or max_objects > 0: - # get object meta 
objects - # TBD: have hsds support a max limit of objects to return - params["getobjs"] = 1 - params["include_attrs"] = 1 - params["include_links"] = 1 - """ - - domain_json = None - rsp = http_conn.GET(req, params=params) - - if rsp.status_code not in (200, 404, 410): - msg = f"Got status code: {rsp.status_code} on initial domain get" - self.log.warning(msg) - raise IOError(msg) - - if rsp.status_code == 200: - if self._append: - # domain exists already - domain_json = rsp.json() - if "root" not in domain_json: - # this a folder not a domain - self.log.warning(f"folder: {self.filepath} has no root property") - http_conn.close() - raise IOError(404, "Location is a folder, not a file") - else: - # not append - delete existing domain - self.log.info(f"sending delete request for {self.filepath}") - delete_rsp = http_conn.DELETE(req, params=params) - if delete_rsp.status_code not in (200, 410): - # failed to delete - http_conn.close() - raise IOError(rsp.status_code, rsp.reason) - - if not domain_json: - # domain doesn't exist, create it - body = {} - if self.db.root_id: - # initialize domain using the db's root_id - body["root_id"] = self.db.root_id - if self._owner: - body["owner"] = self._owner - if self._linked_domain: - body["linked_domain"] = self._linked_domain - if self._track_order: - create_props = {"CreateOrder": 1} - group_body = {"creationProperties": create_props} - body["group"] = group_body - rsp = http_conn.PUT(req, params=params, body=body) - if rsp.status_code != 201: - http_conn.close() - raise IOError(rsp.status_code, rsp.reason) + # fetch the domain json + + # try to do a GET from the domain + req = "/" + params = {} + """ + if max_objects is None or max_objects > 0: + # get object meta objects + # TBD: have hsds support a max limit of objects to return + params["getobjs"] = 1 + params["include_attrs"] = 1 + params["include_links"] = 1 + """ + + domain_json = None + rsp = http_conn.GET(req, params=params) + + if rsp.status_code not in (200, 404, 
410): + msg = f"Got status code: {rsp.status_code} on initial domain get" + self.log.warning(msg) + raise IOError(msg) + + if rsp.status_code == 200: + if self._append: + # domain exists already domain_json = rsp.json() - self.log.info(f"got rsp on PUT domain: {domain_json}") if "root" not in domain_json: + # this a folder not a domain + self.log.warning(f"folder: {self.filepath} has no root property") http_conn.close() - raise IOError(404, "Unexpected error") - - self.log.debug(f"got domain_json: {domain_json}") + raise IOError(404, "Location is a folder, not a file") + else: + # not append - delete existing domain + self.log.info(f"sending delete request for {self.filepath}") + delete_rsp = http_conn.DELETE(req, params=params) + if delete_rsp.status_code not in (200, 410): + # failed to delete + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + if not domain_json: + # domain doesn't exist, create it + body = {} + if self.db.root_id: + # initialize domain using the db's root_id + body["root_id"] = self.db.root_id + if self._owner: + body["owner"] = self._owner + if self._linked_domain: + body["linked_domain"] = self._linked_domain + if self._track_order: + create_props = {"CreateOrder": 1} + group_body = {"creationProperties": create_props} + body["group"] = group_body + rsp = http_conn.PUT(req, params=params, body=body) + if rsp.status_code != 201: + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + domain_json = rsp.json() + self.log.info(f"got rsp on PUT domain: {domain_json}") if "root" not in domain_json: http_conn.close() - raise IOError(404, "Location is a folder, not a file") + raise IOError(404, "Unexpected error") - root_id = domain_json["root"] + self.log.debug(f"got domain_json: {domain_json}") - self._root_id = root_id + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Location is a folder, not a file") - if "limits" in domain_json: - self._limits = domain_json["limits"] - else: - self._limits = 
None - if "version" in domain_json: - self._version = domain_json["version"] - else: - self._version = None + root_id = domain_json["root"] + + self._root_id = root_id - self._domain_json = domain_json + # update stats + for key in ("created", "lastModified", "owner", "limits", "version", "compressors"): + if key in domain_json: + self._stats[key] = domain_json[key] return self._root_id @@ -544,8 +536,4 @@ def getStats(self): 'lastModified': modificationTime 'owner': owner name """ - stats = {} - stats['created'] = 0 - stats["lastModified"] = 0 - stats['owner'] = "" - return stats + return self._stats diff --git a/src/h5json/hsdsstore/httpconn.py b/src/h5json/hsdsstore/httpconn.py index 14b3d54d..53c42dcc 100644 --- a/src/h5json/hsdsstore/httpconn.py +++ b/src/h5json/hsdsstore/httpconn.py @@ -258,7 +258,6 @@ def __init__( bucket=None, api_key=None, mode="a", - use_session=True, expire_time=1.0, max_objects=None, max_age=1.0, @@ -270,7 +269,6 @@ def __init__( self._domain = domain_name self._mode = mode self._domain_json = None - self._use_session = use_session self._retries = retries self._timeout = timeout self._api_key = api_key @@ -283,7 +281,7 @@ def __init__( self.log = logging else: self.log = logging.getLogger(logger) - msg = f"HttpConn.init(domain: {domain_name} use_session: {use_session} " + msg = f"HttpConn.init(domain: {domain_name}" msg += f"expire_time: {expire_time:6.2f} sec retries: {retries}" self.log.debug(msg) @@ -355,12 +353,6 @@ def __init__( else: self.log.error(f"Unknown openid provider: {provider}") - def __del__(self): - if self._s: - self.log.debug("close session") - self._s.close() - self._s = None - def getHeaders(self, username=None, password=None, headers=None): if headers is None: @@ -447,6 +439,8 @@ def verifyCert(self): def GET(self, req, format="json", params=None, headers=None): if self._endpoint is None: raise IOError("object not initialized") + if not self._s: + raise IOError("http session is closed") # check that domain is 
defined (except for some specific requests) if req not in ("/domains", "/about", "/info", "/") and self._domain is None: raise IOError(f"no domain defined: req: {req}") @@ -477,7 +471,7 @@ def GET(self, req, format="json", params=None, headers=None): self.log.debug(f"GET params {k}:{v}") try: - s = self.session + s = self._s stream = True # tbd - config for no streaming? ts = time.time() rsp = s.get( @@ -507,6 +501,8 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): raise IOError("object not initialized") if self._domain is None: raise IOError("no domain defined") + if not self._s: + raise IOError("http session is closed") if params: self.log.info(f"PUT params: {params}") @@ -539,7 +535,7 @@ def PUT(self, req, body=None, format="json", params=None, headers=None): self.log.info(f"PUT: {req} format: {format} [{len(data)} bytes]") try: - s = self.session + s = self._s ts = time.time() rsp = s.put( self._endpoint + req, @@ -568,6 +564,8 @@ def POST(self, req, body=None, format="json", params=None, headers=None): raise IOError("object not initialized") if self._domain is None: raise IOError("no domain defined") + if not self._s: + raise IOError("http session is closed") if params is None: params = {} @@ -608,7 +606,7 @@ def POST(self, req, body=None, format="json", params=None, headers=None): self.log.info("POST: " + req) try: - s = self.session + s = self._s ts = time.time() rsp = s.post( self._endpoint + req, @@ -631,6 +629,8 @@ def POST(self, req, body=None, format="json", params=None, headers=None): def DELETE(self, req, params=None, headers=None): if self._endpoint is None: raise IOError("object not initialized") + if not self._s: + raise IOError("http session is closed") if req not in ("/domains", "/") and self._domain is None: raise IOError("no domain defined") @@ -652,9 +652,8 @@ def DELETE(self, req, params=None, headers=None): self.log.info("DEL: " + req) try: - s = self.session ts = time.time() - rsp = s.delete( + rsp = 
self._s.delete( self._endpoint + req, headers=headers, params=params, @@ -676,55 +675,49 @@ def DELETE(self, req, params=None, headers=None): return HttpResponse(rsp) - @property - def session(self): - # create a session object to re-use http connection when possible - s = requests - retries = self._retries - backoff_factor = 1 - status_forcelist = (500, 502, 503, 504) - - if self._use_session: - if self._s is None: - if self._endpoint.startswith("http+unix://"): - self.log.debug(f"create unixsocket session: {self._endpoint}") - s = requests_unixsocket.Session() - else: - # regular request session - s = requests.Session() - - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - - s.mount( - "http://", - HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), - ) - s.mount( - "https://", - HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), - ) - self._s = s - else: - s = self._s - return s - def add_external_ref(self, fid): # this is used by the group class to keep references to external links open if fid.__class__.__name__ != "FileID": raise TypeError("add_external_ref, expected FileID type") self._external_refs.append(fid) + def open(self): + if self._s: + return # already open + + retries = self._retries + backoff_factor = 1 + status_forcelist = (500, 502, 503, 504) + if self._endpoint.startswith("http+unix://"): + self.log.debug(f"create unixsocket session: {self._endpoint}") + s = requests_unixsocket.Session() + else: + # regular request session + s = requests.Session() + + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + kwargs = {"max_retries": retry, "pool_connections": 16, "pool_maxsize": 16} + s.mount("http://", HTTPAdapter(**kwargs)) + s.mount("https://", HTTPAdapter(**kwargs)) + self._s = s + def close(self): if self._s: self._s.close() 
self._s = None + def isClosed(self): + if self._s is None: + return True + else: + return False + @property def domain(self): return self._domain diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py index 9b0acf1f..ce75d540 100644 --- a/test/unit/hsds_reader_test.py +++ b/test/unit/hsds_reader_test.py @@ -45,6 +45,22 @@ def testSimple(self): hsds_reader = HSDSReader(filepath, **kwargs) db.reader = hsds_reader root_id = db.open() + + # check domain stats + stats = db.reader.getStats() + self.assertTrue(stats["created"] > 0) + self.assertTrue(stats["lastModified"] > 0) + self.assertTrue(stats["owner"]) + self.assertTrue("compressors" in stats) + self.assertTrue(len(stats["compressors"]) > 0) + self.assertTrue("limits" in stats) + self.assertTrue(len(stats["limits"]) > 0) + + db.close() + self.assertTrue(db.closed) + obj_id = db.open() + self.assertEqual(obj_id, root_id) + root_json = db.getObjectById(root_id) self.assertTrue("id" in root_json) From 67bf8e1718bd8807b9a69607cfe30c15fdfef846 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 4 Aug 2025 15:53:40 +0200 Subject: [PATCH 063/129] add more debug log messages --- src/h5json/hdf5db.py | 5 ++++- src/h5json/hsdsstore/hsds_writer.py | 22 +++++++++++++++++----- src/h5json/hsdsstore/httpconn.py | 5 ++++- test/unit/hsds_writer_test.py | 10 ++++++++++ 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 87cd5687..571a4de3 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -381,14 +381,17 @@ def open(self): else: # open reader first and get root id reader_root_id = self.reader.open() + self.log.debug(f"got reader root_id: {reader_root_id}") + if self._root_id: if reader_root_id != self._root_id: raise IOError("reader root id does not match reader root id") else: self._root_id = reader_root_id - + self.log.debug("open writer") # now open writer writer_root_id = self.writer.open() + self.log.debug(f"got writer root_id: 
{writer_root_id}") if writer_root_id != self._root_id: raise IOError("writer root id does not match reader root id") diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index ba3b7b87..586c7852 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -110,6 +110,7 @@ def __init__( self._root_id = None self._append = append self._track_order = track_order + self._owner = owner self._linked_domain = linked_domain self._last_flush_time = 0 self._stats = {"created": 0, "lastModified": 0, "owner": ""} @@ -120,17 +121,24 @@ def open(self): # no db set yet raise IOError("DB not set") - if self._http_conn: - http_conn = self._http_conn - else: + if self._http_conn and not self._http_conn.isClosed(): + return self._root_id + + if not self._http_conn: kwargs = self._http_kwargs kwargs["retries"] = 1 # tbd: test setting http_conn = HttpConn(self.filepath, **kwargs) if self._append: http_conn._mode = "a" + self.log.debug("hsdswriter - set http_conn mode to a") self._http_conn = http_conn - hsds_info = http_conn.serverInfo() - self.log.debug(f"got hsds info: {hsds_info}") + + http_conn = self._http_conn + self.log.debug("hsdswriter - open http conn") + http_conn.open() + + hsds_info = self._http_conn.serverInfo() + self.log.debug(f"got hsds info: {hsds_info}") # fetch the domain json @@ -148,6 +156,7 @@ def open(self): domain_json = None rsp = http_conn.GET(req, params=params) + self.log.debug(f"hsdswriter initial get status_code: {rsp.status_code}") if rsp.status_code not in (200, 404, 410): msg = f"Got status code: {rsp.status_code} on initial domain get" @@ -165,6 +174,7 @@ def open(self): raise IOError(404, "Location is a folder, not a file") else: # not append - delete existing domain + self.log.info("hsds_writer - delete domain") self.log.info(f"sending delete request for {self.filepath}") delete_rsp = http_conn.DELETE(req, params=params) if delete_rsp.status_code not in (200, 410): @@ -174,6 +184,7 
@@ def open(self): if not domain_json: # domain doesn't exist, create it + self.log.debug("hsds_writer create domain") body = {} if self.db.root_id: # initialize domain using the db's root_id @@ -203,6 +214,7 @@ def open(self): raise IOError(404, "Location is a folder, not a file") root_id = domain_json["root"] + self.log.debug("hsds_writer got root_id:", root_id) self._root_id = root_id diff --git a/src/h5json/hsdsstore/httpconn.py b/src/h5json/hsdsstore/httpconn.py index 53c42dcc..dc2ff9b1 100644 --- a/src/h5json/hsdsstore/httpconn.py +++ b/src/h5json/hsdsstore/httpconn.py @@ -682,6 +682,7 @@ def add_external_ref(self, fid): self._external_refs.append(fid) def open(self): + self.log.debug("http_conn.open") if self._s: return # already open @@ -705,10 +706,12 @@ def open(self): kwargs = {"max_retries": retry, "pool_connections": 16, "pool_maxsize": 16} s.mount("http://", HTTPAdapter(**kwargs)) s.mount("https://", HTTPAdapter(**kwargs)) - self._s = s + self.log.debug("Httpconn set self._s") + self._s = s def close(self): if self._s: + self.log.debug("http_conn.close") self._s.close() self._s = None diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index 4ef8ff8f..a9e9b877 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -49,6 +49,7 @@ def testSimple(self): for k in ("created", "lastModified", "owner"): self.assertTrue(k in stats) http_conn = HttpConn(domain_path, mode='r', retries=1) + http_conn.open() db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) db.createAttribute(root_id, "attr2", 42) @@ -186,6 +187,14 @@ def testReaderWriter(self): self.assertTrue(root_id) db.reader = HSDSReader(domain_path, app_logger=self.log) db.close() + """ + db.writer = HSDSWriter(domain, **kwargs) + root_id = db.open() + db.close() + # now set the reader + db.reader = HSDSReader(domain, **kwargs) + db.open() + """ root_id2 = db.open() self.assertEqual(root_id, root_id2) root_json = db.getObjectById(root_id) @@ -209,6 
+218,7 @@ def testH5PyToHS(self): # validate - get the root group and see if counts are correct http_conn = HttpConn(domain_path, mode='r', retries=1) + http_conn.open() http_rsp = http_conn.GET(f"/groups/{root_id}") self.assertEqual(http_rsp.status_code, 200) root_json = http_rsp.json() From 0a2c0aff13ebf658137c1613004e817d648679ee Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 5 Aug 2025 14:05:56 +0200 Subject: [PATCH 064/129] move null reader, writer classes --- src/h5json/h5reader.py | 100 ++++++++++++++++ src/h5json/h5writer.py | 73 ++++++++++++ src/h5json/hdf5db.py | 172 +--------------------------- src/h5json/hsdsstore/hsds_reader.py | 4 +- src/h5json/hsdsstore/hsds_writer.py | 2 +- testall.py | 5 +- 6 files changed, 181 insertions(+), 175 deletions(-) diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index f48612fc..a4127097 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -13,6 +13,10 @@ import weakref import logging +import time +import numpy as np + +from .objid import createObjId class H5Reader(ABC): @@ -101,3 +105,99 @@ def getStats(self): 'owner': owner name """ pass + + +class H5NullReader(H5Reader): + """ + This class can be used by HDF5DB as a default no-op reader + """ + + def __init__( + self, + filepath, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + super().__init__(filepath, app_logger=app_logger) + self.log.debug("H5NullReader.__init__") + + self._root_id = None + self._is_closed = True + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + + if obj_id != self._root_id: + raise KeyError(f"{obj_id} not found") + + # create a root group with no links or attributes + group_json = {"links": {}, "attributes": {}, "cpl": {}} + group_json["created"] = time.time() + + return group_json + + def getAttribute(self, 
obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + return None + + def getDatasetValues(self, obj_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + # just return a zero array + arr = np.zeros(sel.shape, dtype=dtype) + + return arr + + def open(self): + """ Open data source for reading """ + self.log.debug("H5NullReader open") + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise ValueError("no db") + + if self._is_closed: + if not self._root_id: + if self.db.root_id: + # use the db root id + self._root_id = self.db.root_id + else: + # create a new root id + self._root_id = createObjId(obj_type="groups") + self._is_closed = False + return self._root_id + + def close(self): + """ close any open handles to the storage """ + self._is_closed = True + + def isClosed(self): + """ return True if handle is closed """ + return self._is_closed + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index cc5c601c..a27e76cb 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -12,6 +12,7 @@ from abc import ABC, abstractmethod import weakref import logging +from .objid import createObjId class H5Writer(ABC): @@ -97,3 +98,75 @@ def getStats(self): 'owner': owner name """ pass + + +class H5NullWriter(H5Writer): + """ + This class can be used by HDF5DB as a default no-op writer + """ + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log 
= logging.getLogger() + + if append: + raise IOError("append is not supprot for H5NullWriter") + + super().__init__(filepath, no_data=no_data, app_logger=app_logger) + self.log.debug("H5NullWriter.__init__") + self._root_id = None + self._is_closed = True + + def open(self): + """ open storage handle, return root_id""" + self.log.debug("H5NullWriter open") + if not self._is_closed: + return self._root_id # already open + + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise ValueError("no db") + + if not self._root_id: + if self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + self._is_closed = False + return self._root_id + + def flush(self): + """ Write dirty items """ + self.log.debug("H5NullWriter> flush") + # Null writer is unable to actually persist anything, so return False + return False + + def close(self): + """ close storage handle """ + self.log.debug("H5NullWriter.close") + self._is_closed = True + + def isClosed(self): + """ return True if handle is closed """ + return self._is_closed + + def getStats(self): + """ return a dictionary object with at minimum the following keys: + 'created': creation time + 'lastModified': modificationTime + 'owner': owner name + """ + stats = {} + stats['created'] = 0 + stats["lastModified"] = 0 + stats['owner'] = "" + return stats diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 571a4de3..220511e2 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -18,176 +18,8 @@ from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId from . 
import selections from .apiversion import _apiver -from .h5reader import H5Reader -from .h5writer import H5Writer - - -class H5NullReader(H5Reader): - """ - This class can be used by HDF5DB as a default no-op reader - """ - - def __init__( - self, - filepath, - app_logger=None - ): - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - - super().__init__(filepath, app_logger=app_logger) - self.log.debug("H5NullReader.__init__") - - self._root_id = None - self._is_closed = True - - def get_root_id(self): - """ Return root id """ - return self._root_id - - def getObjectById(self, obj_id, include_attrs=True, include_links=True): - """ return object with given id """ - - if obj_id != self._root_id: - raise KeyError(f"{obj_id} not found") - - # create a root group with no links or attributes - group_json = {"links": {}, "attributes": {}, "cpl": {}} - group_json["created"] = time.time() - - return group_json - - def getAttribute(self, obj_id, name, includeData=True): - """ - Get attribute given an object id and name - returns: JSON object - """ - raise IOError("not supported") - - def getDatasetValues(self, obj_id, sel=None, dtype=None): - """ - Get values from dataset identified by obj_id. - If a slices list or tuple is provided, it should have the same - number of elements as the rank of the dataset. 
- """ - - # just return a zero array - arr = np.zeros(sel.shape, dtype=dtype) - - return arr - - def open(self): - """ Open data source for reading """ - self.log.debug("H5NullReader open") - if self.db is None: - # no db set yet - self.log.warning("no self.db db_ref") - raise ValueError("no db") - - if self._is_closed: - if not self._root_id: - if self.db.root_id: - # use the db root id - self._root_id = self.db.root_id - else: - # create a new root id - self._root_id = createObjId(obj_type="groups") - self._is_closed = False - return self._root_id - - def close(self): - """ close any open handles to the storage """ - self._is_closed = True - - def isClosed(self): - """ return True if handle is closed """ - return self._is_closed - - def getStats(self): - """ return a dictionary object with at minimum the following keys: - 'created': creation time - 'lastModified': modificationTime - 'owner': owner name - """ - stats = {} - stats['created'] = 0 - stats["lastModified"] = 0 - stats['owner'] = "" - return stats - - -class H5NullWriter(H5Writer): - """ - This class can be used by HDF5DB as a default no-op writer - """ - - def __init__( - self, - filepath, - append=False, - no_data=False, - app_logger=None - ): - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - - if append: - raise IOError("append is not supprot for H5NullWriter") - - super().__init__(filepath, no_data=no_data, app_logger=app_logger) - self.log.debug("H5NullWriter.__init__") - self._root_id = None - self._is_closed = True - - def open(self): - """ open storage handle, return root_id""" - self.log.debug("H5NullWriter open") - if not self._is_closed: - return self._root_id # already open - - if self.db is None: - # no db set yet - self.log.warning("no self.db db_ref") - raise ValueError("no db") - - if not self._root_id: - if self.db.root_id: - self._root_id = self.db.root_id - else: - self._root_id = createObjId(obj_type="groups") - self._is_closed = False - return 
self._root_id - - def flush(self): - """ Write dirty items """ - self.log.debug("H5NullWriter> flush") - # Null writer is unable to actually persist anything, so return False - return False - - def close(self): - """ close storage handle """ - self.log.debug("H5NullWriter.close") - self._is_closed = True - - def isClosed(self): - """ return True if handle is closed """ - return self._is_closed - - def getStats(self): - """ return a dictionary object with at minimum the following keys: - 'created': creation time - 'lastModified': modificationTime - 'owner': owner name - """ - stats = {} - stats['created'] = 0 - stats["lastModified"] = 0 - stats['owner'] = "" - return stats +from .h5reader import H5Reader, H5NullReader +from .h5writer import H5Writer, H5NullWriter class Hdf5db: diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index e0053033..819126a6 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -10,8 +10,10 @@ # request a copy from help@hdfgroup.org. 
# ############################################################################## import logging +import time +import numpy as np -from ..objid import getCollectionForId, getUuidFromId +from ..objid import getCollectionForId, getUuidFromId, createObjId from ..hdf5dtype import createDataType from ..array_util import jsonToArray, bytesToArray diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 586c7852..c8b12f07 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -214,7 +214,7 @@ def open(self): raise IOError(404, "Location is a folder, not a file") root_id = domain_json["root"] - self.log.debug("hsds_writer got root_id:", root_id) + self.log.debug(f"hsds_writer got root_id: {root_id}") self._root_id = root_id diff --git a/testall.py b/testall.py index 45e06106..a33cb327 100755 --- a/testall.py +++ b/testall.py @@ -24,6 +24,8 @@ "h5json_writer_test", "h5py_reader_test", "h5py_writer_test", + "hsds_reader_test", + "hsds_writer_test", ] use_hsds = True @@ -60,7 +62,6 @@ if rc != 0: sys.exit("FAILED") shutil.rmtree("./out", ignore_errors=True) -os.remove("hdf5dbtest.log") os.chdir("test/integ") @@ -77,8 +78,6 @@ sys.exit("FAILED") shutil.rmtree("./h5_out", ignore_errors=True) shutil.rmtree("./json_out", ignore_errors=True) -os.remove("h5tojson.log") -os.remove("jsontoh5.log") os.chdir("..") print("Testing suite: Success!") From 0542e7d9807932e228fddb33ba6e2bfe3384863e Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 11 Aug 2025 22:59:16 +0100 Subject: [PATCH 065/129] fix for remove links --- src/h5json/h5pystore/h5py_writer.py | 12 +++++++++- src/h5json/h5writer.py | 2 +- src/h5json/hdf5db.py | 18 ++++++++++---- src/h5json/hsdsstore/hsds_writer.py | 37 +++++++++++++++++++++++++++-- test/unit/h5py_writer_test.py | 12 +++++++++- test/unit/hsds_writer_test.py | 6 +++++ 6 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py 
b/src/h5json/h5pystore/h5py_writer.py index dd543a38..e820330c 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -268,9 +268,19 @@ def _createDatatype(self, parent, ctype_json, name=None): def _createObjects(self, parent, links_json, visited=set()): """ create child object in the given group, recurse for any sub-groups """ - for title in links_json: + titles = list(links_json.keys()) + for title in titles: link_json = links_json[title] link_class = link_json["class"] + if "DELETE" in link_json: + if title in parent: + # delete the link + self.log.debug(f"deleting link {title}") + del parent[title] + # update the link json + del links_json[title] + continue + if link_class == "H5L_TYPE_SOFT" and title not in parent: h5path = link_json["h5path"] parent[title] = h5py.SoftLink(h5path) diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index a27e76cb..a4b9a522 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -118,7 +118,7 @@ def __init__( self.log = logging.getLogger() if append: - raise IOError("append is not supprot for H5NullWriter") + raise IOError("append is not supported for H5NullWriter") super().__init__(filepath, no_data=no_data, app_logger=app_logger) self.log.debug("H5NullWriter.__init__") diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 220511e2..f7b2c97d 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -651,8 +651,12 @@ def getLinks(self, grp_id): links = grp_json["links"] names = [] for name in links: - if links[name] is not None: - names.append(name) + link_json = links[name] + if link_json is None: + continue + if "DELETE" in link_json: + continue # deleted link + names.append(name) return names def getLink(self, grp_id, name): @@ -663,11 +667,12 @@ def getLink(self, grp_id, name): if name not in links: self.log.info(f"Link [{name}] not found in {grp_id}") return None - if links[name] is None: + link_json = links[name] + if "DELETED" in link_json: 
self.log.info(f"Link {name} in {grp_id} has been deleted") return None - return links[name] + return link_json def _addLink(self, grp_id, name, link_json): obj_json = self.getObjectById(grp_id) @@ -708,8 +713,11 @@ def deleteLink(self, grp_id, name): links = grp_json["links"] if name not in links: raise KeyError(f"Link [{name}] not found in {grp_id}") - links[name] = None # mark for deletion + link_json = links[name] + link_json["DELETE"] = time.time() # mark for deletion self.make_dirty(grp_id) + grp_json = self.getObjectById(grp_id) + links = grp_json["links"] def createGroup(self, cpl=None): """ Create a new group """ diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index c8b12f07..92f73fd6 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -344,6 +344,7 @@ def updateLinks(self, grp_ids): self.log.debug("hsds_writer> updateLinks") items = {} # dict which will hold a map of grp ids to links to create + removals = {} # map of grp_ids to link titles to be deleted count = 0 for grp_id in grp_ids: @@ -351,12 +352,23 @@ def updateLinks(self, grp_ids): continue # ignore datasets and datatypes grp_json = self.db.getObjectById(grp_id) grp_links = grp_json["links"] - for link_title in grp_links: + link_titles = list(grp_links.keys()) + for link_title in link_titles: link_json = grp_links[link_title] if "created" not in link_json: self.log.error(f"hsds_writer> expected created timestamp in link: {link_json}") created = link_json["created"] - if created > self._last_flush_time: + if "DELETE" in link_json: + if created > self._last_flush_time: + # link hasn't been created yet + msg = f"hsds_writer> {grp_id}: link: {link_title} deleted before flush" + self.log.debug(msg) + else: + # link has been persisted, remove + if grp_id not in removals: + removals[grp_id] = set() + removals[grp_id].add(link_title) + elif created > self._last_flush_time: self.log.debug(f"hsds_writer> {grp_id}: new link: 
{link_title}") count += 1 # new link, add to our list @@ -380,6 +392,27 @@ def updateLinks(self, grp_ids): raise IOError(f"unexpected link class: {link_class}") links[link_title] = new_link self.log.debug(f"setting link {link_title} to {new_link}") + else: + self.log.debug(f"link {link_title} has already been persisted") + + if removals: + # TBD: hsds doesn't have a multiple object link deletion operation yet + # so make one request per object id + for grp_id in removals: + titles = removals[grp_id] + params = {"titles": "/".join(titles)} + del_rsp = self.http_conn.DELETE("/groups/" + grp_id + links, params=params) + if del_rsp.status_code != 200: + self.log.error("failed to delete links for grp: {grp_id} titles: {titles}") + raise IOError("hsds_writer failed to delete links") + else: + self.log.debug(f"hsds_writer> {grp_id} deleted {len(titles)} links") + self._lastModified = time.time() + # remove links from link_json in db + grp_json = self.db.getObjectById(grp_id) + grp_links = grp_json["links"] + for title in titles: + del grp_links[title] if items: body = {"grp_ids": items} diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index e2763795..259d7937 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -150,6 +150,17 @@ def testSimple(self): g2 = f["g2"] self.assertTrue("g2.1" in g2) + # create a link, then delete before flushing + db.open() + tmp_grp_id = db.createGroup("tmp_group") + db.createHardLink(g2_id, "tmp_group", tmp_grp_id) + db.deleteLink(g2_id, "tmp_group") + db.close() + + with h5py.File(filepath) as f: + g2 = f["g2"] + self.assertFalse("tmp_group" in g2) + db.open() sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) arr = np.zeros((), dtype=np.int32) @@ -546,7 +557,6 @@ def testReaderWithUpdate(self): dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") db.createAttribute(dset111_id, "attr3", "hello") self.assertFalse(db.closed) - print("test - db.close()") db.close() with 
h5py.File(file_out) as f: diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index a9e9b877..8f12c920 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -111,6 +111,12 @@ def testSimple(self): db.createCustomLink(g2_id, "cust", {"foo": "bar"}) db.flush() + # create a link, then delete before flushing + tmp_grp_id = db.createGroup("tmp_group") + db.createHardLink(g1_1_id, "tmp_group", tmp_grp_id) + db.deleteLink(g1_1_id, "tmp_group") + db.flush() + # validate - check that links got updated http_rsp = http_conn.GET(f"/groups/{g2_id}/links") self.assertEqual(http_rsp.status_code, 200) From 5bbb0f34ae361cd3179bfc9a1624ba2e57130c6a Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 17 Aug 2025 14:52:21 +0100 Subject: [PATCH 066/129] use DELTED to indicate link deletions --- src/h5json/h5pystore/h5py_writer.py | 2 +- src/h5json/hdf5db.py | 4 ++-- src/h5json/hsdsstore/hsds_writer.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index e820330c..1ee9570e 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -272,7 +272,7 @@ def _createObjects(self, parent, links_json, visited=set()): for title in titles: link_json = links_json[title] link_class = link_json["class"] - if "DELETE" in link_json: + if "DELETED" in link_json: if title in parent: # delete the link self.log.debug(f"deleting link {title}") diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index f7b2c97d..68f9b17c 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -654,7 +654,7 @@ def getLinks(self, grp_id): link_json = links[name] if link_json is None: continue - if "DELETE" in link_json: + if "DELETED" in link_json: continue # deleted link names.append(name) return names @@ -714,7 +714,7 @@ def deleteLink(self, grp_id, name): if name not in links: raise KeyError(f"Link [{name}] not found in {grp_id}") 
link_json = links[name] - link_json["DELETE"] = time.time() # mark for deletion + link_json["DELETED"] = time.time() # mark for deletion self.make_dirty(grp_id) grp_json = self.getObjectById(grp_id) links = grp_json["links"] diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 92f73fd6..72864db5 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -358,7 +358,7 @@ def updateLinks(self, grp_ids): if "created" not in link_json: self.log.error(f"hsds_writer> expected created timestamp in link: {link_json}") created = link_json["created"] - if "DELETE" in link_json: + if "DELETED" in link_json: if created > self._last_flush_time: # link hasn't been created yet msg = f"hsds_writer> {grp_id}: link: {link_title} deleted before flush" From b05941f79e7b35b4fe5372bb2d1bd58641a1f247 Mon Sep 17 00:00:00 2001 From: John Readey Date: Sun, 17 Aug 2025 17:56:29 +0100 Subject: [PATCH 067/129] persist attr deletion --- src/h5json/h5pystore/h5py_writer.py | 7 +++++ src/h5json/hdf5db.py | 45 +++++++++++++------------- src/h5json/hsdsstore/hsds_writer.py | 49 ++++++++++++++++++++++++++++- test/unit/h5py_writer_test.py | 19 +++++++++++ test/unit/hdf5db_test.py | 11 +++++++ test/unit/hsds_writer_test.py | 41 ++++++++++++++++++++++++ 6 files changed, 150 insertions(+), 22 deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 1ee9570e..2cb42c0b 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -394,6 +394,13 @@ def updateAttributes(self, obj_id, obj): attrs = obj_json["attributes"] for name in attrs: attr_json = attrs[name] + if "DELETED" in attr_json: + if name in obj.attrs: + # delete the attribute + self.log.debug(f"h5py_writer - delete attribute {name}") + del obj.attrs[name] + else: + pass # already deleted or never added if "created" in attr_json and attr_json["created"] < self._flush_time: # attribute should 
be saved already continue diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 68f9b17c..5f51c972 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -377,46 +377,48 @@ def getDtype(self, obj_json): return dtype - def getAttribute(self, obj_id, name, includeData=True): + def getAttributes(self, obj_id): """ - Get attribute given an object id and name + Get attributes given an object id and name returns: JSON object """ obj_json = self.getObjectById(obj_id) attrs = obj_json["attributes"] + names = [] - if name not in attrs: - msg = f"Attribute: [{name}] not found in object: {obj_id}" - self.log.info(msg) - return None - if attrs[name] is None: - msg = f"Attribute: [{name}] has been deleted" - self.log.info(None) - return None - - attr_json = attrs[name] + for name in attrs: + attr_json = attrs[name] + if attr_json is None: + continue + if "DELETED" in attr_json: + continue # deleted attr + names.append(name) - return attr_json + return names - def getAttributes(self, obj_id): + def getAttribute(self, obj_id, name, includeData=True): """ - Get attributes given an object id and name + Get attribute given an object id and name returns: JSON object """ + attr_names = self.getAttributes(obj_id) + if name not in attr_names: + return None + obj_json = self.getObjectById(obj_id) attrs = obj_json["attributes"] - names = [] - for name in attrs: - if attrs[name] is not None: - names.append(name) - return names + attr_json = attrs[name] + + return attr_json def getAttributeValue(self, obj_id, name): """ Return NDArray of the given attribute value """ attr_json = self.getAttribute(obj_id, name) + if attr_json is None: + raise KeyError(f"attribute {name} not found") shape_json = attr_json["shape"] if shape_json["class"] == "H5S_NULL": # no value for empty shape attributes @@ -530,7 +532,8 @@ def deleteAttribute(self, obj_id, name): attrs_json = obj_json["attributes"] if name not in attrs_json: raise KeyError(f"attribute [{name}] not found in {obj_id}") - 
attrs_json[name] = None # mark key for deletion + attr_json = attrs_json[name] + attr_json["DELETED"] = time.time() # mark key for deletion self.make_dirty(obj_id) diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 72864db5..45e24fc0 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -424,11 +424,24 @@ def updateLinks(self, grp_ids): self.log.debug(f"hsds_writer> {grp_id} {count} links updated") self._lastModified = time.time() + def _deleteAttribute(self, obj_id, attr_name): + # delete the given attribute + + col_name = getCollectionForId(obj_id) + req = f"/{col_name}/{obj_id}/attributes/{attr_name}" + http_rsp = self.http_conn.DELETE(req) + if http_rsp.status_code != 200: + self.log.error("failed to delete attribute for obj: {obj_id} name: {attr_name}") + raise IOError("hsds_writer failed to delete attribute") + def updateAttributes(self, obj_ids): """ update any modified links of the given objects """ self.log.debug("hsds_writer> updateAttributes") items = {} # dict which will hold a map of objects ids to attributes to create + removals = {} # map of obj_ids to attributes to be deleted + separator = '|' # use this character to join attribute names for deletion + count = 0 for obj_id in obj_ids: @@ -436,10 +449,26 @@ def updateAttributes(self, obj_ids): obj_attrs = obj_json["attributes"] for attr_name in obj_attrs: attr_json = obj_attrs[attr_name] + if "created" not in attr_json: self.log.error(f"hsds_writer> expected created timestamp in attr: {attr_json}") created = attr_json["created"] - if created > self._last_flush_time: + if "DELETED" in attr_json: + if created > self._last_flush_time: + # attribute hasn't been created yet + msg = f"hsds_writer> {obj_id}: attr: {attr_name} deleted before flush" + self.log.debug(msg) + else: + # attribute has been persisted, remove + if attr_name.find(separator) != -1: + # need to delete individually + self._deleteAttribute(obj_id, 
attr_name) + else: + # can delete in a batch + if obj_id not in removals: + removals[obj_id] = set() + removals[obj_id].add(attr_name) + elif created > self._last_flush_time: self.log.debug(f"hsds_writer> {obj_id} attribute {attr_name} created") count += 1 # new attribute, add to our list @@ -447,6 +476,24 @@ def updateAttributes(self, obj_ids): items[obj_id] = {"attributes": {}} attrs = items[obj_id]["attributes"] attrs[attr_name] = attr_json + else: + self.log.debug(f"hsds_writer> {obj_id}: attr: {attr_name} has already been deleted") + + if removals: + # TBD: hsds doesn't have a multiple object attribute deletion operation yet + # so make one request per object id + # Delete with custom separator + + for obj_id in removals: + attr_names = removals[obj_id] + params = {"attr_names": separator.join(attr_names)} + params["separator"] = separator + collection = getCollectionForId(obj_id) + req = f"/{collection}/{obj_id}/attributes" + rsp = self.http_conn.DELETE(req, params=params) + if rsp.status_code != 200: + self.log.error("failed to delete attribute for obj: {obj_id}") + raise IOError("hsds_writer failed to delete attributes") if items: body = {"obj_ids": items} diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 259d7937..aa481dfd 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -132,7 +132,9 @@ def testSimple(self): self.assertTrue("slink" in g2) db.open() + db.createAttribute(g1_id, "a1", "hello") db.createAttribute(g1_id, "a2", "bye-bye") + self.assertEqual(len(db.getAttributes(g1_id)), 2) db.close() with h5py.File(filepath) as f: @@ -141,6 +143,19 @@ def testSimple(self): self.assertTrue("a1" in g1.attrs) self.assertTrue("a2" in g1.attrs) + db.open() + # test deleting an attribute + db.deleteAttribute(g1_id, "a1") + self.assertEqual(len(db.getAttributes(g1_id)), 1) + self.assertEqual(db.getAttribute(g1_id, "a1"), None) + db.close() + + with h5py.File(filepath) as f: + g1 = f["g1"] + 
self.assertEqual(len(g1.attrs), 1) + self.assertFalse("a1" in g1.attrs) + self.assertTrue("a2" in g1.attrs) + db.open() g21 = db.createGroup() db.createHardLink(g2_id, "g2.1", g21) @@ -154,7 +169,11 @@ def testSimple(self): db.open() tmp_grp_id = db.createGroup("tmp_group") db.createHardLink(g2_id, "tmp_group", tmp_grp_id) + del_link = db.getLink(g2_id, "tmp_group") + self.assertTrue(del_link is not None) db.deleteLink(g2_id, "tmp_group") + self.assertEqual(db.getLink(g2_id, "tmp_group"), None) + db.close() with h5py.File(filepath) as f: diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 2722eaa6..7a882259 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -139,6 +139,17 @@ def testGroup(self): ret = db.getLink(g2_id, "not_a_link") self.assertTrue(ret is None) + + db.createAttribute(g1_id, "a1", "hello") + db.createAttribute(g1_id, "a2", "bye-bye") + self.assertEqual(len(db.getAttributes(g1_id)), 2) + a1_attr = db.getAttribute(g1_id, "a1") + self.assertEqual(a1_attr["value"], "hello") + + db.deleteAttribute(g1_id, "a1") + self.assertEqual(len(db.getAttributes(g1_id)), 1) + self.assertEqual(db.getAttribute(g1_id, "a1"), None) + db.close() def testNullSpaceAttribute(self): diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index 8f12c920..ca3c2579 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -125,9 +125,50 @@ def testSimple(self): g2links = g2links_json["links"] self.assertTrue(len(g2links), 2) # custom link will be ignored + db.createAttribute(g1_id, "a1", "hello") db.createAttribute(g1_id, "a2", "bye-bye") db.flush() + # validate - check that attributes got created + http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + attrs_json = rsp_json["attributes"] + self.assertEqual(len(attrs_json), 2) + + # delete an attribute + db.deleteAttribute(g1_id, "a1") + db.flush() + + # validate - check 
that the attribute got deleted + http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + attrs_json = rsp_json["attributes"] + self.assertEqual(len(attrs_json), 1) + + # create an attribute that happens to use the separator character + db.createAttribute(g1_id, "a|z", "goofy") + db.flush() + + # validate - check that attributes got created + http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + attrs_json = rsp_json["attributes"] + self.assertEqual(len(attrs_json), 2) + + # delete an attribute + db.deleteAttribute(g1_id, "a|z") + db.flush() + + # validate - check that the attribute got deleted + http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") + self.assertEqual(http_rsp.status_code, 200) + rsp_json = http_rsp.json() + attrs_json = rsp_json["attributes"] + self.assertEqual(len(attrs_json), 1) + g21 = db.createGroup() db.createHardLink(g2_id, "g2.1", g21) db.flush() From e4e0105c2843826723350ce86b64ef0b255248d0 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 5 Sep 2025 12:07:11 +0100 Subject: [PATCH 068/129] fix key name for creationPropertyList --- src/h5json/hdf5db.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 5f51c972..9bbfa702 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -729,9 +729,9 @@ def createGroup(self, cpl=None): grp_id = createObjId("groups", root_id=self.root_id) group_json = {"attributes": {}, "links": {}} if cpl: - group_json["cpl"] = cpl + group_json["creationProperties"] = cpl else: - group_json["cpl"] = {} + group_json["creationProperties"] = {} group_json["created"] = time.time() self.db[grp_id] = group_json self._new_objects.add(grp_id) @@ -756,7 +756,7 @@ def createCommittedType(self, datatype, cpl=None): type_json = getTypeItem(dt) # get canonical json description of datatype - 
ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} + ctype_json = {"type": type_json, "attributes": {}, "creationProperties": cpl} ctype_json["created"] = time.time() self.db[ctype_id] = ctype_json self._new_objects.add(ctype_id) @@ -795,9 +795,9 @@ def createDataset( dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} if cpl: - dset_json["cpl"] = cpl + dset_json["creationProperties"] = cpl else: - dset_json["cpl"] = {} + dset_json["creationProperties"] = {} dset_id = createObjId("datasets", root_id=self.root_id) self.db[dset_id] = dset_json From 3260929d20b2ea194b28ae71851559b788a13f22 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 11:06:44 +0100 Subject: [PATCH 069/129] use client create time for new link if provided --- src/h5json/hsdsstore/hsds_writer.py | 2 +- test/unit/hdf5db_test.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py index 45e24fc0..9166937a 100644 --- a/src/h5json/hsdsstore/hsds_writer.py +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -376,7 +376,7 @@ def updateLinks(self, grp_ids): items[grp_id] = {"links": {}} links = items[grp_id]["links"] link_class = link_json["class"] - new_link = {"class": link_class} + new_link = {"class": link_class, "created": created} # convert to hsds representation if link_class == "H5L_TYPE_HARD": new_link["id"] = link_json["id"] diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 7a882259..8dc0e99b 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -332,6 +332,29 @@ def testCreateVlenReferenceAttribute(self): db.close() + def testAttributeCreateOrder(self): + titles = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten") + cpl = {"CreateOrder": True} + db = Hdf5db(app_logger=self.log) + root_id = db.open() + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + for title 
in titles: + db.createAttribute(g1_id, title, title) + g2_id = db.createGroup(cpl=cpl) + db.createHardLink(root_id, "g2", g2_id) + for title in titles: + db.createAttribute(g2_id, title, title) + print("g1 attributes:", db.getAttributes(g1_id)) + print("g2 attributes:", db.getAttributes(g2_id)) + self.assertEqual(sorted(db.getAttributes(g1_id)), sorted(titles)) + #self.assertEqual(db.getAttributes(g2_id), titles) + db.close() + + + + + def testCommittedType(self): db = Hdf5db(app_logger=self.log) root_id = db.open() From 1ea0bff5cf4fd45e2bba0e073814fa6a6ed9a5de Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 16:17:23 +0100 Subject: [PATCH 070/129] make reference type simple wrapper for str uuid --- src/h5json/h5py_util.py | 109 --------------------------------------- src/h5json/hdf5db.py | 9 ++-- src/h5json/hdf5dtype.py | 47 +++++++++-------- src/h5json/objid.py | 23 +++++++++ test/unit/hdf5db_test.py | 8 +-- test/unit/objid_test.py | 3 +- 6 files changed, 56 insertions(+), 143 deletions(-) delete mode 100644 src/h5json/h5py_util.py diff --git a/src/h5json/h5py_util.py b/src/h5json/h5py_util.py deleted file mode 100644 index ebe2dbdb..00000000 --- a/src/h5json/h5py_util.py +++ /dev/null @@ -1,109 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -import h5py -import numpy as np - -from . 
import hdf5dtype - - -def is_reference(val): - """ Return True if the type or value is a Reference """ - - if isinstance(val, object) and val.__class__.__name__ == "Reference": - return True - elif isinstance(val, type) and val.__name__ == "Reference": - return True - else: - return False - - -def is_regionreference(val): - """ Return True if the type or value is a RegionReference """ - - if isinstance(val, object) and val.__class__.__name__ == "RegionReference": - return True - elif isinstance(val, type) and val.__name__ == "RegionReference": - return True - - return False - - -def has_reference(dtype): - """ return True if the dtype (or a sub-type) is a Reference type """ - has_ref = False - if not isinstance(dtype, np.dtype): - return False - if len(dtype) > 0: - for name in dtype.fields: - item = dtype.fields[name] - if has_reference(item[0]): - has_ref = True - break - elif dtype.metadata and "ref" in dtype.metadata: - basedt = dtype.metadata["ref"] - has_ref = is_reference(basedt) - elif dtype.metadata and "vlen" in dtype.metadata: - basedt = dtype.metadata["vlen"] - has_ref = has_reference(basedt) - return has_ref - - -def convert_dtype(srcdt, to_h5py=True): - """Return a dtype based on input dtype, converting any Reference types from - h5py style to h5json and vice-versa. 
- """ - - if len(srcdt) > 0: - fields = [] - for name in srcdt.fields: - item = srcdt.fields[name] - # item is a tuple of dtype and integer offset - field_dt = convert_dtype(item[0], to_h5py=to_h5py) - fields.append((name, field_dt)) - tgt_dt = np.dtype(fields) - else: - # check if this a "special dtype" - if srcdt.metadata and "ref" in srcdt.metadata: - ref = srcdt.metadata["ref"] - if is_reference(ref): - if to_h5py: - tgt_dt = h5py.special_dtype(ref=h5py.Reference) - else: - tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.Reference) - elif is_regionreference(ref): - if to_h5py: - tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) - else: - tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.RegionReference) - else: - msg = f"Unexpected ref type: {srcdt}" - raise TypeError(msg) - elif srcdt.metadata and "vlen" in srcdt.metadata: - src_vlen = srcdt.metadata["vlen"] - if isinstance(src_vlen, np.dtype): - tgt_base = convert_dtype(src_vlen, to_h5py=to_h5py) - else: - tgt_base = src_vlen - if to_h5py: - tgt_dt = h5py.special_dtype(vlen=tgt_base) - else: - tgt_dt = hdf5dtype.special_dtype(vlen=tgt_base) - elif srcdt.kind == "U": - # use vlen for unicode strings - if to_h5py: - tgt_dt = h5py.special_dtype(vlen=str) - else: - tgt_dt = hdf5dtype.special_dtype(vlen=str) - else: - tgt_dt = srcdt - return tgt_dt diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 9bbfa702..91884f57 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -15,7 +15,7 @@ from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype from .array_util import jsonToArray, bytesArrayToList from .dset_util import resize_dataset -from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId +from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . 
import selections from .apiversion import _apiver from .h5reader import H5Reader, H5NullReader @@ -279,11 +279,12 @@ def getObjectById(self, obj_id, refresh=False): """ return object with given id """ self.log.debug(f"getObjectById {obj_id}") self._checkReader() - if obj_id not in self.db or refresh: + tag = getHashTagForId(obj_id) + if tag not in self.db or refresh: # load the obj from the reader obj_json = self.reader.getObjectById(obj_id) - self.db[obj_id] = obj_json - obj_json = self.db[obj_id] + self.db[tag] = obj_json + obj_json = self.db[tag] return obj_json diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index bbef116d..8799836a 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -13,6 +13,8 @@ import weakref import numpy as np +from .objid import getHashTagForId + numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) numpy_float_types = (np.float16, np.float32, np.float64) @@ -28,42 +30,43 @@ def id(self): """Low-level identifier appropriate for this object""" return self._id - @property - def objref(self): - """Weak reference to object""" - return self._objref # return weak ref to ref'd object - def __init__(self, bind): """Create a new reference by binding to - a group/dataset/committed type + a uuid """ - self._id = bind._id - self._objref = weakref.ref(bind) + if not bind: + self._id = None + else: + if isinstance(bind, bytes): + bind = bind.decode() + + if not isinstance(bind, str): + raise TypeError("Expected string id") + + self._id = getHashTagForId(bind) def __repr__(self): # TBD: this is not consistent with hsds or h5py... 
- if not isinstance(self._id.id, str): - raise TypeError("Expected string id") - item = None - - collection_type = self._id.collection_type - item = f"{collection_type}/{self._id.id}" - return item + return f"" def tolist(self): - if type(self._id.id) is not str: + if type(self._id) is not str: raise TypeError("Expected string id") - if self._id.objtype_code == "d": + if not self._id: + return [("",),] + + objtype_code = self._id[0] + if objtype_code == "d": return [ - ("datasets/" + self._id.id), + ("datasets/" + self._id), ] - elif self._id.objtype_code == "g": + elif objtype_code == "g": return [ - ("groups/" + self._id.id), + ("groups/" + self._id), ] - elif self._id.objtype_code == "t": + elif objtype_code == "t": return [ - ("datatypes/" + self._id.id), + ("datatypes/" + self._id), ] else: raise TypeError("Unexpected id type") diff --git a/src/h5json/objid.py b/src/h5json/objid.py index 57b5316c..fa82e0ef 100644 --- a/src/h5json/objid.py +++ b/src/h5json/objid.py @@ -130,6 +130,29 @@ def getCollectionForId(obj_id): return collection +def getHashTagForId(id): + """ Return canonical - """ + + if not isinstance(id, str): + raise ValueError("Expected string type") + + if not id: + raise ValueError("Empty id") + + parts = id.split("/") + tag = parts[-1] + + # add a prefix tag if not already present + if len(tag) < UUID_LEN: + raise ValueError(f"unexpected uuid: {tag}") + if tag[1] != '-': + if len(parts) != 2: + raise ValueError(f"unexpected obj id: {id}") + collection = parts[0] + tag = _getPrefixForCollection(collection) + '-' + tag + return tag + + def isRootObjId(id): """returns true if this is a root id (only for v2 schema)""" if not isSchema2Id(id): diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 8dc0e99b..1eca8e2a 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -345,16 +345,10 @@ def testAttributeCreateOrder(self): db.createHardLink(root_id, "g2", g2_id) for title in titles: db.createAttribute(g2_id, title, 
title) - print("g1 attributes:", db.getAttributes(g1_id)) - print("g2 attributes:", db.getAttributes(g2_id)) self.assertEqual(sorted(db.getAttributes(g1_id)), sorted(titles)) - #self.assertEqual(db.getAttributes(g2_id), titles) + self.assertEqual(tuple(db.getAttributes(g2_id)), titles) db.close() - - - - def testCommittedType(self): db = Hdf5db(app_logger=self.log) root_id = db.open() diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py index d74ec102..7104e9bc 100755 --- a/test/unit/objid_test.py +++ b/test/unit/objid_test.py @@ -12,7 +12,7 @@ import unittest from h5json.objid import isRootObjId, isValidUuid, validateUuid -from h5json.objid import createObjId, getCollectionForId, getUuidFromId +from h5json.objid import createObjId, getCollectionForId, getUuidFromId, getHashTagForId from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id @@ -203,6 +203,7 @@ def testGetDataTypeId(self): self.assertTrue(isValidUuid(test_id)) self.assertEqual(getCollectionForId(test_id), "datatypes") self.assertEqual(getUuidFromId(test_id), test_uuid) + self.assertEqual(getHashTagForId(test_id), "t-" + test_uuid) if __name__ == "__main__": From 46ff5face0fef3c83b6636535659f64b4fad27ac Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 16:21:55 +0100 Subject: [PATCH 071/129] fix syntax for ci yaml --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba618d56..dcfdc512 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,8 +83,9 @@ jobs: - name: Run tests shell: bash - HS_ENDPOINT: http://localhost:5101 - HS_USERNAME: test_user1 - HS_PASSWORD: test + env: + HS_ENDPOINT: http://localhost:5101 + HS_USERNAME: test_user1 + HS_PASSWORD: test run: | python testall.py From e88f85f5ad5734df4bf885cbea201657b59c8365 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 16:25:50 +0100 Subject: [PATCH 072/129] 
remove python 3.9 support --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dcfdc512..554cb6d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} steps: From 51f2a9b4441152358ed6f55d8b1ae8e77bd19aef Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 16:28:07 +0100 Subject: [PATCH 073/129] revert h5py_util.py --- src/h5json/h5py_util.py | 109 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 src/h5json/h5py_util.py diff --git a/src/h5json/h5py_util.py b/src/h5json/h5py_util.py new file mode 100644 index 00000000..ebe2dbdb --- /dev/null +++ b/src/h5json/h5py_util.py @@ -0,0 +1,109 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import h5py +import numpy as np + +from . 
import hdf5dtype + + +def is_reference(val): + """ Return True if the type or value is a Reference """ + + if isinstance(val, object) and val.__class__.__name__ == "Reference": + return True + elif isinstance(val, type) and val.__name__ == "Reference": + return True + else: + return False + + +def is_regionreference(val): + """ Return True if the type or value is a RegionReference """ + + if isinstance(val, object) and val.__class__.__name__ == "RegionReference": + return True + elif isinstance(val, type) and val.__name__ == "RegionReference": + return True + + return False + + +def has_reference(dtype): + """ return True if the dtype (or a sub-type) is a Reference type """ + has_ref = False + if not isinstance(dtype, np.dtype): + return False + if len(dtype) > 0: + for name in dtype.fields: + item = dtype.fields[name] + if has_reference(item[0]): + has_ref = True + break + elif dtype.metadata and "ref" in dtype.metadata: + basedt = dtype.metadata["ref"] + has_ref = is_reference(basedt) + elif dtype.metadata and "vlen" in dtype.metadata: + basedt = dtype.metadata["vlen"] + has_ref = has_reference(basedt) + return has_ref + + +def convert_dtype(srcdt, to_h5py=True): + """Return a dtype based on input dtype, converting any Reference types from + h5py style to h5json and vice-versa. 
+ """ + + if len(srcdt) > 0: + fields = [] + for name in srcdt.fields: + item = srcdt.fields[name] + # item is a tuple of dtype and integer offset + field_dt = convert_dtype(item[0], to_h5py=to_h5py) + fields.append((name, field_dt)) + tgt_dt = np.dtype(fields) + else: + # check if this a "special dtype" + if srcdt.metadata and "ref" in srcdt.metadata: + ref = srcdt.metadata["ref"] + if is_reference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.Reference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.Reference) + elif is_regionreference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.RegionReference) + else: + msg = f"Unexpected ref type: {srcdt}" + raise TypeError(msg) + elif srcdt.metadata and "vlen" in srcdt.metadata: + src_vlen = srcdt.metadata["vlen"] + if isinstance(src_vlen, np.dtype): + tgt_base = convert_dtype(src_vlen, to_h5py=to_h5py) + else: + tgt_base = src_vlen + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=tgt_base) + else: + tgt_dt = hdf5dtype.special_dtype(vlen=tgt_base) + elif srcdt.kind == "U": + # use vlen for unicode strings + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=str) + else: + tgt_dt = hdf5dtype.special_dtype(vlen=str) + else: + tgt_dt = srcdt + return tgt_dt From e7452ca7ce9c9bd30e9710825504a5166793bb11 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Sep 2025 18:03:14 +0100 Subject: [PATCH 074/129] use uuid as representation of Reference type --- src/h5json/hdf5dtype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index 8799836a..c0ed2884 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -46,8 +46,8 @@ def __init__(self, bind): self._id = getHashTagForId(bind) def __repr__(self): - # TBD: this is not consistent with hsds or h5py... 
- return f"" + # return canonical uuid + return f"{self._id}" def tolist(self): if type(self._id) is not str: From 5b6f33db42665b24f88a7512b3f6db43a4035b2b Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Sep 2025 11:17:59 +0100 Subject: [PATCH 075/129] fix len ref in hsds_reader --- src/h5json/hsdsstore/hsds_reader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py index 819126a6..e7dfa26d 100644 --- a/src/h5json/hsdsstore/hsds_reader.py +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -10,10 +10,8 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import logging -import time -import numpy as np -from ..objid import getCollectionForId, getUuidFromId, createObjId +from ..objid import getCollectionForId, getUuidFromId from ..hdf5dtype import createDataType from ..array_util import jsonToArray, bytesToArray @@ -277,7 +275,7 @@ def getDatasetValues(self, dset_id, sel=None, dtype=None): params["fields"] = ":".join(mtype.names) MAX_SELECT_QUERY_LEN = 100 - if len(query_param) > MAX_SELECT_QUERY_LEN: + if query_param and len(query_param) > MAX_SELECT_QUERY_LEN: # use a post method to avoid possible long query strings try: rsp = self.http_conn.POST(req, body=params, format="binary") From 8e6d14a28aa1fd2ef31193d2321b3612a61c18d6 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Sep 2025 13:33:50 +0100 Subject: [PATCH 076/129] fix for reading unpersisted dataset values --- src/h5json/hdf5db.py | 20 ++++++++++++++++++- src/h5json/selections.py | 36 +++++++++++++++++++++++++++++------ test/unit/hsds_writer_test.py | 23 +++++++++++++--------- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 91884f57..b0c069d0 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -569,7 +569,25 @@ def getDatasetValues(self, dset_id, sel): 
rank = len(dims) dtype = self.getDtype(dset_json) - arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) + + # determine if we need to make a read request or not + if dset_id in self._new_objects: + fetch = False + else: + fetch = True + # check against pending updates + if "updates" in dset_json: + updates = dset_json["updates"] + for (update_sel, update_val) in updates: + if selections.contained(sel, update_sel): + fetch = False + break + + # send a reader request unless an update already covers the sel area + if fetch: + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) + else: + arr = np.zeros(sel.shape, dtype=dtype) if "updates" in dset_json: # apply any non-flushed changes that intersect the current selection diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 1a051383..ec4ac649 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -116,8 +116,8 @@ def select(obj, args): return sel -def intersect(s1, s2): - """ Return the intersection of two selections """ +def _check_bool_args(s1, s2): + """ verify argument for boolean operations """ # TBD: this is currently only working for simple selections with stride 1 valid_select_types = (H5S_SELECT_HYPERSLABS, H5S_SELECT_ALL) if not isinstance(s1, Selection): @@ -131,15 +131,18 @@ def intersect(s1, s2): if s1.shape != s2.shape: raise ValueError("selections have incompatible shapes") + +def intersect(s1, s2): + """ Return the intersection of two selections """ + # TBD: this is currently only working for simple selections with stride 1 + _check_bool_args(s1, s2) + slices = [] rank = len(s1.shape) for dim in range(rank): start = max(s1.start[dim], s2.start[dim]) stop = min(s1.start[dim] + s1.count[dim], s2.start[dim] + s2.count[dim]) - msg = "stepped slices not currently supported" - if s1.step[dim] > 1: - raise ValueError(msg) - if s2.step[dim] > 1: + if s1.step[dim] > 1 or s2.step[dim] > 1: raise ValueError("stepped slices not currently supported") if start > 
stop: stop = start @@ -149,6 +152,27 @@ def intersect(s1, s2): return select(s1.shape, slices) +def contained(s1, s2): + """ return True if s1 is contained in s2, otherwise False """ + _check_bool_args(s1, s2) + + is_contained = True + rank = len(s1.shape) + for dim in range(rank): + if s1.step[dim] > 1 or s2.step[dim] > 1: + # TBD: do the right thing for stepped selections + # for now just return False + is_contained = False + break + if s1.start[dim] < s2.start[dim]: + is_contained = False + break + if s1.start[dim] + s1.count[dim] > s2.start[dim] + s2.count[dim]: + is_contained = False + break + return is_contained + + class Selection(object): """ diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index ca3c2579..af12fa32 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -223,7 +223,7 @@ def testSimple(self): db.close() def testReaderWriter(self): - # try reading and writer to an HSDS domain + # try reading and writing to an HSDS domain # create a random string so we don't try to open an existing file filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) domain_path = "/home/test_user1/test/" + filename + ".h5" @@ -234,14 +234,7 @@ def testReaderWriter(self): self.assertTrue(root_id) db.reader = HSDSReader(domain_path, app_logger=self.log) db.close() - """ - db.writer = HSDSWriter(domain, **kwargs) - root_id = db.open() - db.close() - # now set the reader - db.reader = HSDSReader(domain, **kwargs) - db.open() - """ + root_id2 = db.open() self.assertEqual(root_id, root_id2) root_json = db.getObjectById(root_id) @@ -250,6 +243,18 @@ def testReaderWriter(self): self.assertTrue(root_json["created"] > 0) self.assertTrue(db.writer.lastModified is None) # no flush yet + # create a scalar dataset + dset_id = db.createDataset(shape=(), dtype=np.int32) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + sel_all = selections.select((), ...) 
+ db.setDatasetValues(dset_id, sel_all, arr) + + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr[()], 42) + + db.close() + def testH5PyToHS(self): # test reading from HDF5 file and writing to HSDS From 556176749c41b519fa8965486ccc931c1034109c Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Sep 2025 16:35:15 +0100 Subject: [PATCH 077/129] fix for created and lastModified keys --- src/h5json/dset_util.py | 1 - src/h5json/hdf5db.py | 11 ++++++----- test/unit/hsds_writer_test.py | 8 ++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index e1a44a59..37d67f1e 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -40,7 +40,6 @@ def resize_dataset(dset_json, shape): raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}") shape_json["dims"] = list(shape) - dset_json["modified"] = time.time() def getDims(dset_json): diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index b0c069d0..1dbaf43b 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -129,9 +129,7 @@ def deleted_objects(self): def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ - if self.is_new(obj_id): - # object hasn't been initially written yet, just return - return + if obj_id not in self.db: self.log.error("make dirty called on deleted object") raise KeyError(f"obj_id: {obj_id} not found") @@ -140,7 +138,9 @@ def make_dirty(self, obj_id): return obj_json = self.db[obj_id] obj_json["lastModified"] = time.time() - self._dirty_objects.add(obj_id) + if not self.is_new(obj_id): + # object hasn't been initially written yet, add to dirt_object set + self._dirty_objects.add(obj_id) def flush(self): """ write out any changes """ @@ -646,7 +646,7 @@ def resizeDataset(self, dset_id, shape): dset_json = self.getObjectById(dset_id) # will throw exception if not found if resize_dataset(dset_json, shape): - 
self._dirty_objects.add(dset_id) + self._make_dirty(dset_id) def deleteObject(self, obj_id): """ Delete the given object """ @@ -817,6 +817,7 @@ def createDataset( dset_json["creationProperties"] = cpl else: dset_json["creationProperties"] = {} + dset_json["created"] = time.time() dset_id = createObjId("datasets", root_id=self.root_id) self.db[dset_id] = dset_json diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index af12fa32..d731836a 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -245,10 +245,18 @@ def testReaderWriter(self): # create a scalar dataset dset_id = db.createDataset(shape=(), dtype=np.int32) + dset_json = db.getObjectById(dset_id) + self.assertTrue("created" in dset_json) + dset_create_time = dset_json["created"] + self.assertTrue(dset_create_time > 0) + arr = np.zeros((), dtype=np.int32) arr[()] = 42 sel_all = selections.select((), ...) db.setDatasetValues(dset_id, sel_all, arr) + dset_json = db.getObjectById(dset_id) + self.assertTrue("lastModified" in dset_json) + self.assertTrue(dset_json["lastModified"] > dset_create_time) arr = db.getDatasetValues(dset_id, sel_all) self.assertEqual(arr[()], 42) From 924ee00100e8e19f00160f543adee5004827e2d5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Sep 2025 18:23:24 +0100 Subject: [PATCH 078/129] fix for scalar datasets --- src/h5json/hdf5db.py | 11 ++++++++++- test/unit/hsds_writer_test.py | 29 ++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 1dbaf43b..933c1ce4 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -603,7 +603,11 @@ def getDatasetValues(self, dset_id, sel): stop = start + sel_inter.count[dim] slices.append(slice(start, stop, 1)) slices = tuple(slices) - arr[slices] = update_val + # TBD: needs updating to work in the general case! 
+ if slices == (): + arr[slices] = update_val[slices] + else: + arr[slices] = update_val return arr @@ -620,6 +624,11 @@ def setDatasetValues(self, dset_id, sel, arr): raise ValueError("Only hyperslab selections are currently supported") if not isinstance(arr, np.ndarray): raise TypeError("Expected ndarray for data value") + tgt_dt = self.getDtype(dset_json) + src_dt = arr.dtype + if src_dt != tgt_dt: + raise TypeError("arr.dtype doesn't match dataset dtype") + if shape_json["class"] == "H5S_NULL": raise ValueError("writing to null space dataset not supported") if shape_json["class"] == "H5S_SCALAR": diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py index d731836a..ecdedf02 100644 --- a/test/unit/hsds_writer_test.py +++ b/test/unit/hsds_writer_test.py @@ -244,23 +244,42 @@ def testReaderWriter(self): self.assertTrue(db.writer.lastModified is None) # no flush yet # create a scalar dataset - dset_id = db.createDataset(shape=(), dtype=np.int32) - dset_json = db.getObjectById(dset_id) + dsetA_id = db.createDataset(shape=(), dtype=np.int32) + dset_json = db.getObjectById(dsetA_id) self.assertTrue("created" in dset_json) dset_create_time = dset_json["created"] self.assertTrue(dset_create_time > 0) + db.createHardLink(root_id, "dset_a", dsetA_id) + arr = np.zeros((), dtype=np.int32) arr[()] = 42 sel_all = selections.select((), ...) 
- db.setDatasetValues(dset_id, sel_all, arr) - dset_json = db.getObjectById(dset_id) + db.setDatasetValues(dsetA_id, sel_all, arr) + + dset_json = db.getObjectById(dsetA_id) self.assertTrue("lastModified" in dset_json) self.assertTrue(dset_json["lastModified"] > dset_create_time) - arr = db.getDatasetValues(dset_id, sel_all) + arr = db.getDatasetValues(dsetA_id, sel_all) self.assertEqual(arr[()], 42) + # create a scalar dataset with string + dt_str = special_dtype(vlen=str) + dsetB_id = db.createDataset(shape=(), dtype=dt_str) + dset_json = db.getObjectById(dsetB_id) + db.createHardLink(root_id, "dset_b", dsetB_id) + + arr = np.zeros((), dtype=dt_str) + arr[()] = "hello world" + db.setDatasetValues(dsetB_id, sel_all, arr) + + arr = db.getDatasetValues(dsetB_id, sel_all) + + e = arr[()] + self.assertEqual(e, "hello world") + self.assertTrue(isinstance(e, str)) + db.close() def testH5PyToHS(self): From 1f90429438b59ac92a4a97c99d10bbd6fce295e5 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 12 Sep 2025 17:05:24 +0100 Subject: [PATCH 079/129] move hsds plugins to h5pyd --- pyproject.toml | 1 - src/h5json/hdf5db.py | 2 +- src/h5json/hsdsstore/hsds_reader.py | 322 ----------- src/h5json/hsdsstore/hsds_writer.py | 631 ---------------------- src/h5json/hsdsstore/httpconn.py | 804 ---------------------------- src/h5json/openid.py | 437 --------------- testall.py | 2 - 7 files changed, 1 insertion(+), 2198 deletions(-) delete mode 100644 src/h5json/hsdsstore/hsds_reader.py delete mode 100644 src/h5json/hsdsstore/hsds_writer.py delete mode 100644 src/h5json/hsdsstore/httpconn.py delete mode 100644 src/h5json/openid.py diff --git a/pyproject.toml b/pyproject.toml index d911700a..11302438 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,6 @@ packages = [ "h5json", "h5json.jsonstore", "h5json.h5pystore", - "h5json.hsdsstore", "h5json.schema", "h5json.apps", ] diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 933c1ce4..02753ec5 100644 --- 
a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -277,11 +277,11 @@ def _checkWriter(self): def getObjectById(self, obj_id, refresh=False): """ return object with given id """ - self.log.debug(f"getObjectById {obj_id}") self._checkReader() tag = getHashTagForId(obj_id) if tag not in self.db or refresh: # load the obj from the reader + self.log.debug(f"getObjectById - fetching {obj_id} from reader") obj_json = self.reader.getObjectById(obj_id) self.db[tag] = obj_json obj_json = self.db[tag] diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py deleted file mode 100644 index e7dfa26d..00000000 --- a/src/h5json/hsdsstore/hsds_reader.py +++ /dev/null @@ -1,322 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -import logging - -from ..objid import getCollectionForId, getUuidFromId - -from ..hdf5dtype import createDataType -from ..array_util import jsonToArray, bytesToArray -from .. 
import selections -from ..h5reader import H5Reader -from .httpconn import HttpConn - - -class HSDSReader(H5Reader): - """ - This class can be used by HDF5DB to read content from an hdf5-json file - """ - - def __init__( - self, - domain_path, - app_logger=None, - endpoint=None, - username=None, - password=None, - bucket=None, - api_key=None, - use_session=True, - expire_time=0, - max_objects=0, - max_age=0, - retries=3, - timeout=30.0, - ): - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - - self.log.debug("HSDSReader init(") - - kwargs = {} - self.log.debug(f" domain_path: {domain_path}") - if endpoint: - self.log.debug(f" endpoint: {endpoint}") - kwargs["endpoint"] = endpoint - if username: - self.log.debug(f" username: {username}") - kwargs["username"] = username - if password: - self.log.debug(f" password: {'*' * len(password)}") - kwargs["password"] = password - if bucket: - self.log.debug(f" bucket: {bucket}") - kwargs["bucket"] = bucket - if api_key: - self.log.debug(f" apI_key: {'*' * len(api_key)}") - kwargs["api_key"] = api_key - if use_session: - self.log.debug(f" use_session: {use_session}") - kwargs["user_session"] = use_session - - if expire_time: - self.log.debug(f" expire_time: {expire_time}") - kwargs["expire_time"] = expire_time - if max_objects: - self.log.debug(f" max_objects: {max_objects}") - kwargs["max_objects"] = max_objects - if max_age: - self.log.debug(f" max_age: {max_age}") - kwargs["max_age"] = max_age - if retries: - self.log.debug(f" retries: {retries}") - kwargs["retries"] = retries - if timeout: - self.log.debug(f" timeout: {timeout}") - kwargs["timeout"] = timeout - # save these for when we create the connection - self._http_kwargs = kwargs - self._http_conn = None - self._stats = {"created": 0, "lastModified": 0, "owner": ""} - - super().__init__(domain_path, app_logger=app_logger) - - def open(self): - if self._http_conn and not self._http_conn.isClosed(): - return self._root_id # open already 
called - - if self._http_conn: - http_conn = self._http_conn - else: - kwargs = self._http_kwargs - http_conn = HttpConn(self.filepath, **kwargs) - - http_conn.open() - - hsds_info = http_conn.serverInfo() - self.log.debug(f"got hsds info: {hsds_info}") - - # try to do a GET from the domain - req = "/" - params = {} - """ - if max_objects is None or max_objects > 0: - # get object meta objects - # TBD: have hsds support a max limit of objects to return - params["getobjs"] = 1 - params["include_attrs"] = 1 - params["include_links"] = 1 - """ - - rsp = http_conn.GET(req, params=params) - - if rsp.status_code != 200: - # file must exist - http_conn.close() - raise IOError(rsp.status_code, rsp.reason) - - domain_json = rsp.json() - self.log.debug(f"got domain_json: {domain_json}") - - # update stats - for key in ("created", "lastModified", "owner", "limits", "version", "compressors"): - if key in domain_json: - self._stats[key] = domain_json[key] - - if "root" not in domain_json: - http_conn.close() - raise IOError(404, "Location is a folder, not a file") - - root_id = domain_json["root"] - self._root_id = root_id - - """ - if "domain_objs" in root_json: - domain_objs = root_json["domain_objs"] - objdb.load(domain_objs) - """ - - self._http_conn = http_conn - - return self._root_id - - @property - def http_conn(self): - return self._http_conn - - def close(self): - if self._http_conn: - self._http_conn.close() - - def isClosed(self): - if not self._http_conn: - return True - else: - return self._http_conn.isClosed() - - def get_root_id(self): - """ Return root id """ - return self._root_id - - def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_values=False): - """ return object with given id """ - - collection = getCollectionForId(obj_id) - - req = f"/{collection}/{obj_id}" - self.log.debug("sending req: {req}") - - params = {} - if include_attrs: - params["include_attrs"] = 1 - if include_links: - params["include_links"] = 1 - - rsp = 
self.http_conn.GET(req, params=params) - - if rsp.status_code != 200: - raise IOError(rsp.status_code, rsp.reason) - - obj_json = rsp.json() - # remove any unneeded keys - redundant_keys = ("hrefs", "root", "domain", "bucket", "linkCount", "attributeCount") - for key in redundant_keys: - if key in obj_json: - del obj_json[key] - - self.log.debug(f"got json for id: {obj_id}: {obj_json}") - return obj_json - - def getAttribute(self, obj_id, name, includeData=True): - """ - Get attribute given an object id and name - returns: JSON object - """ - self.log.debug(f"getAttribute({obj_id}), [{name}], include_data={includeData})") - collection = getCollectionForId(obj_id) - req = f"/{collection}/{obj_id}/attributes/{name}" - - params = {} - params["IncludeData"] = 1 if includeData else 0 - - rsp = self.http_conn.GET(req, params=params) - - if rsp.status_code in (404, 410): - self.log.warning(f"attribute {name} not found") - return None - - if rsp.status_code != 200: - self.log.error(f"GET {req} failed with status_code: {rsp.status_code}") - raise IOError(rsp.status_code, rsp.reason) - attr_json = rsp.json() - - if "hrefs" in attr_json: - del attr_json["hrefs"] - - return attr_json - - def getDtype(self, obj_json): - """ Return the dtype for the type given by obj_json """ - if "type" not in obj_json: - raise KeyError("no type item found") - type_item = obj_json["type"] - if isinstance(type_item, str) and type_item.startswith("datatypes/"): - # this is a reference to a committed type - ctype_id = "t-" + getUuidFromId(type_item) - ctype_json = self.getObjectById(ctype_id) - if "type" not in ctype_json: - raise KeyError(f"Unexpected datatype: {ctype_json}") - # Use the ctype's item json - type_item = ctype_json["type"] - dtype = createDataType(type_item) - return dtype - - def getDatasetValues(self, dset_id, sel=None, dtype=None): - """ - Get values from dataset identified by obj_id. 
- If a slices list or tuple is provided, it should have the same - number of elements as the rank of the dataset. - """ - - self.log.debug(f"getDatasetValues({dset_id}), sel={sel}") - collection = getCollectionForId(dset_id) - if collection != "datasets": - msg = f"unexpected id: {dset_id} for getDatasetValues" - self.log.warning(msg) - return ValueError(msg) - - if sel is None or sel.select_type == selections.H5S_SELECT_ALL: - query_param = None # just return the entire array - elif isinstance(sel, (selections.SimpleSelection, selections.FancySelection)): - query_param = sel.getQueryParam() - else: - raise NotImplementedError(f"selection type: {type(sel)} not supported") - - mtype = dtype # TBD - support read time dtype - mshape = sel.mshape - - req = f"/{collection}/{dset_id}/value" - params = {} - - if query_param: - params["select"] = query_param - - if mtype.names != dtype.names: - params["fields"] = ":".join(mtype.names) - - MAX_SELECT_QUERY_LEN = 100 - if query_param and len(query_param) > MAX_SELECT_QUERY_LEN: - # use a post method to avoid possible long query strings - try: - rsp = self.http_conn.POST(req, body=params, format="binary") - except IOError as ioe: - self.log.info(f"got IOError: {ioe.errno}") - raise IOError(f"Error retrieving data: {ioe.errno}") - else: - # make a http GET - try: - rsp = self.http_conn.GET(req, params=params, format="binary") - except IOError as ioe: - self.log.info(f"got IOError: {ioe.errno}") - raise IOError(ioe.errno, "Error retrieving data") - - if rsp.status_code != 200: - self.log.info(f"got http error: {rsp.status_code}") - raise IOError(rsp.status_code, "Error retrieving data") - - if rsp.is_binary: - # got binary response - self.log.info(f"binary response, {len(rsp.text)} bytes") - arr = bytesToArray(rsp.text, mtype, mshape) - else: - # got JSON response - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. 
- self.log.info("json response") - - data = rsp.json()["value"] - # self.log.debug(data) - - arr = jsonToArray(mshape, mtype, data) - self.log.debug(f"jsonToArray returned: {arr}") - - return arr - - def getStats(self): - """ return a dictionary object with at minimum the following keys: - 'created': creation time - 'lastModified': modificationTime - 'owner': owner name - """ - return self._stats diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py deleted file mode 100644 index 9166937a..00000000 --- a/src/h5json/hsdsstore/hsds_writer.py +++ /dev/null @@ -1,631 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -import logging -import time - -from ..objid import getCollectionForId - -from ..hdf5dtype import isVlen -from ..array_util import arrayToBytes, bytesArrayToList -from ..dset_util import getNumElements, getDims -from .. 
import selections -from ..h5writer import H5Writer -from .httpconn import HttpConn - - -class HSDSWriter(H5Writer): - """ - This class can be used by HDF5DB to read content from an hdf5-json file - """ - - def __init__( - self, - domain_path, - append=False, - no_data=False, - app_logger=None, - endpoint=None, - username=None, - password=None, - bucket=None, - api_key=None, - use_session=True, - expire_time=0, - max_objects=0, - max_age=0, - retries=3, - timeout=30.0, - track_order=False, - owner=None, - linked_domain=None - - ): - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - - if append: - self._init = False - else: - self._init = True - - if no_data: - self._no_data = True - else: - self._no_data = False - - self.log.debug("HSDSWriter init") - - kwargs = {} - self.log.debug(f" domain_path: {domain_path}") - self.log.debug(f" append: {append}") - if endpoint: - self.log.debug(f" endpoint: {endpoint}") - kwargs["endpoint"] = endpoint - if username: - self.log.debug(f" username: {username}") - kwargs["username"] = username - if password: - self.log.debug(f" password: {'*' * len(password)}") - kwargs["password"] = password - if bucket: - self.log.debug(f" bucket: {bucket}") - kwargs["bucket"] = bucket - if api_key: - self.log.debug(f" apI_key: {'*' * len(api_key)}") - kwargs["api_key"] = api_key - if use_session: - self.log.debug(f" use_session: {use_session}") - kwargs["user_session"] = use_session - if expire_time: - self.log.debug(f" expire_time: {expire_time}") - kwargs["expire_time"] = expire_time - if max_objects: - self.log.debug(f" max_objects: {max_objects}") - kwargs["max_objects"] = max_objects - if max_age: - self.log.debug(f" max_age: {max_age}") - kwargs["max_age"] = max_age - if retries: - self.log.debug(f" retries: {retries}") - kwargs["retries"] = retries - if timeout: - self.log.debug(f" timeout: {timeout}") - kwargs["timeout"] = timeout - self._http_kwargs = kwargs # save for when we create the connection - - 
super().__init__(domain_path, app_logger=app_logger) - - self._http_conn = None - self._root_id = None - self._append = append - self._track_order = track_order - self._owner = owner - self._linked_domain = linked_domain - self._last_flush_time = 0 - self._stats = {"created": 0, "lastModified": 0, "owner": ""} - - def open(self): - """ setup domain for writing """ - if not self._db_ref: - # no db set yet - raise IOError("DB not set") - - if self._http_conn and not self._http_conn.isClosed(): - return self._root_id - - if not self._http_conn: - kwargs = self._http_kwargs - kwargs["retries"] = 1 # tbd: test setting - http_conn = HttpConn(self.filepath, **kwargs) - if self._append: - http_conn._mode = "a" - self.log.debug("hsdswriter - set http_conn mode to a") - self._http_conn = http_conn - - http_conn = self._http_conn - self.log.debug("hsdswriter - open http conn") - http_conn.open() - - hsds_info = self._http_conn.serverInfo() - self.log.debug(f"got hsds info: {hsds_info}") - - # fetch the domain json - - # try to do a GET from the domain - req = "/" - params = {} - """ - if max_objects is None or max_objects > 0: - # get object meta objects - # TBD: have hsds support a max limit of objects to return - params["getobjs"] = 1 - params["include_attrs"] = 1 - params["include_links"] = 1 - """ - - domain_json = None - rsp = http_conn.GET(req, params=params) - self.log.debug(f"hsdswriter initial get status_code: {rsp.status_code}") - - if rsp.status_code not in (200, 404, 410): - msg = f"Got status code: {rsp.status_code} on initial domain get" - self.log.warning(msg) - raise IOError(msg) - - if rsp.status_code == 200: - if self._append: - # domain exists already - domain_json = rsp.json() - if "root" not in domain_json: - # this a folder not a domain - self.log.warning(f"folder: {self.filepath} has no root property") - http_conn.close() - raise IOError(404, "Location is a folder, not a file") - else: - # not append - delete existing domain - self.log.info("hsds_writer 
- delete domain") - self.log.info(f"sending delete request for {self.filepath}") - delete_rsp = http_conn.DELETE(req, params=params) - if delete_rsp.status_code not in (200, 410): - # failed to delete - http_conn.close() - raise IOError(rsp.status_code, rsp.reason) - - if not domain_json: - # domain doesn't exist, create it - self.log.debug("hsds_writer create domain") - body = {} - if self.db.root_id: - # initialize domain using the db's root_id - body["root_id"] = self.db.root_id - if self._owner: - body["owner"] = self._owner - if self._linked_domain: - body["linked_domain"] = self._linked_domain - if self._track_order: - create_props = {"CreateOrder": 1} - group_body = {"creationProperties": create_props} - body["group"] = group_body - rsp = http_conn.PUT(req, params=params, body=body) - if rsp.status_code != 201: - http_conn.close() - raise IOError(rsp.status_code, rsp.reason) - domain_json = rsp.json() - self.log.info(f"got rsp on PUT domain: {domain_json}") - if "root" not in domain_json: - http_conn.close() - raise IOError(404, "Unexpected error") - - self.log.debug(f"got domain_json: {domain_json}") - - if "root" not in domain_json: - http_conn.close() - raise IOError(404, "Location is a folder, not a file") - - root_id = domain_json["root"] - self.log.debug(f"hsds_writer got root_id: {root_id}") - - self._root_id = root_id - - # update stats - for key in ("created", "lastModified", "owner", "limits", "version", "compressors"): - if key in domain_json: - self._stats[key] = domain_json[key] - - return self._root_id - - @property - def http_conn(self): - return self._http_conn - - def getDatasetSize(self, dset_id): - """ Return the size of the given dataset """ - - dset_json = self.db.getObjectById(dset_id) - num_elements = getNumElements(dset_json) - dtype = self.db.getDtype(dset_json) - if isVlen(dtype): - item_size = 1024 # random guess at size of variable length types - else: - item_size = dtype.itemsize - return num_elements * item_size - - def 
createObjects(self, obj_ids): - """ create the objects referenced in obj_ids """ - - MAX_INIT_SIZE = 4096 # max size to include init values in dataset creation - - def multiPost(items): - self.log.debug(f"hsds_writer> POST request {collection} for {len(items)} objects") - for item in items: - self.log.debug(f"hsds_writer> POST item: {item}") - post_rsp = self.http_conn.POST("/" + collection, items) - self.log.debug(f"hsds_writer> POST post_rsp.status_code: {post_rsp.status_code}") - items.clear() - - self.log.debug(f"hsds_writer> createObjects, {len(obj_ids)} objects") - MAX_OBJECTS_PER_REQUEST = 300 - collections = ("groups", "datasets", "datatypes") - col_items = {} - dset_value_update_ids = set() - for collection in collections: - col_items[collection] = [] - - for obj_id in obj_ids: - if obj_id == self._root_id: - continue # this was created when the domain was - collection = getCollectionForId(obj_id) - obj_json = self.db.getObjectById(obj_id) - item = {"id": obj_id} - self.log.debug(f"create id: {obj_id}") - for key in obj_json: # ("links", "attributes"): - if key == "updates": - # not part of the obj json - continue - if key == "attributes": - # will update attribute later - continue - if key == "links": - # links will also be updated later - continue - if key == "shape": - # just send the dims, not the shape json - shape_json = obj_json["shape"] - if shape_json["class"] == "H5S_SIMPLE": - dims = shape_json["dims"] - item[key] = dims - else: - # just copy the key value directly - item[key] = obj_json[key] - - # initialize dataset values if provided and not too large - if collection == "datasets": - dset_dims = getDims(obj_json) # will be None for null space datasets - dset_size = self.getDatasetSize(obj_id) # number of bytes defined by the shape - init_arr = None # data to be passed to post create method - updates = obj_json.get("updates") - if updates and len(updates) == 1 and dset_size < MAX_INIT_SIZE: - sel, arr = updates[0] - if sel.select_type == 
selections.H5S_SELECT_ALL: - init_arr = arr - updates.clear() # reset the update list - if self._init and init_arr is None and dset_dims is not None: - # get all values from dataset if small enough - if dset_size < MAX_INIT_SIZE: - sel_all = selections.select(dset_dims, ...) - init_arr = self.db.getDatasetValues(obj_id, sel_all) - if init_arr is not None: - value = bytesArrayToList(init_arr) - item["value"] = value - elif updates or self._init: - dset_value_update_ids.add(obj_id) # will set dataset value below - - # add to the list of new items for the given collection - items = col_items[collection] - items.append(item) - - if len(items) == MAX_OBJECTS_PER_REQUEST: - multiPost(items) - - # handle any remainder items - for collection in collections: - items = col_items[collection] - if items: - multiPost(items) - - # write any initial dataset values - if dset_value_update_ids: - self.updateValues(dset_value_update_ids) - - def deleteObjects(self, obj_ids): - """ remove the given obj ids from the HSDS store """ - - # no multi-delete operation yet, so delete one by one - for obj_id in obj_ids: - collection = getCollectionForId(obj_id) - req = f"/{collection}/{obj_id}" - http_rsp = self.http_conn.DELETE(req) - if http_rsp.status_code not in (200, 410): - self.log.error(f"got {http_rsp.status_code} for DELETE {req}") - - def updateLinks(self, grp_ids): - """ update any modified links of the given objects """ - - self.log.debug("hsds_writer> updateLinks") - items = {} # dict which will hold a map of grp ids to links to create - removals = {} # map of grp_ids to link titles to be deleted - count = 0 - - for grp_id in grp_ids: - if getCollectionForId(grp_id) != "groups": - continue # ignore datasets and datatypes - grp_json = self.db.getObjectById(grp_id) - grp_links = grp_json["links"] - link_titles = list(grp_links.keys()) - for link_title in link_titles: - link_json = grp_links[link_title] - if "created" not in link_json: - self.log.error(f"hsds_writer> expected 
created timestamp in link: {link_json}") - created = link_json["created"] - if "DELETED" in link_json: - if created > self._last_flush_time: - # link hasn't been created yet - msg = f"hsds_writer> {grp_id}: link: {link_title} deleted before flush" - self.log.debug(msg) - else: - # link has been persisted, remove - if grp_id not in removals: - removals[grp_id] = set() - removals[grp_id].add(link_title) - elif created > self._last_flush_time: - self.log.debug(f"hsds_writer> {grp_id}: new link: {link_title}") - count += 1 - # new link, add to our list - if grp_id not in items: - items[grp_id] = {"links": {}} - links = items[grp_id]["links"] - link_class = link_json["class"] - new_link = {"class": link_class, "created": created} - # convert to hsds representation - if link_class == "H5L_TYPE_HARD": - new_link["id"] = link_json["id"] - elif link_class == "H5L_TYPE_SOFT": - new_link["h5path"] = link_json["h5path"] - elif link_class == "H5L_TYPE_EXTERNAL": - new_link["h5path"] = link_json["h5path"] - new_link["h5domain"] = link_json["file"] # use h5domain for file key - elif link_class == "H5L_TYPE_USER_DEFINED": - self.log.warning(f"ignoring user-defined link: {link_title}") - continue - else: - raise IOError(f"unexpected link class: {link_class}") - links[link_title] = new_link - self.log.debug(f"setting link {link_title} to {new_link}") - else: - self.log.debug(f"link {link_title} has already been persisted") - - if removals: - # TBD: hsds doesn't have a multiple object link deletion operation yet - # so make one request per object id - for grp_id in removals: - titles = removals[grp_id] - params = {"titles": "/".join(titles)} - del_rsp = self.http_conn.DELETE("/groups/" + grp_id + links, params=params) - if del_rsp.status_code != 200: - self.log.error("failed to delete links for grp: {grp_id} titles: {titles}") - raise IOError("hsds_writer failed to delete links") - else: - self.log.debug(f"hsds_writer> {grp_id} deleted {len(titles)} links") - self._lastModified = 
time.time() - # remove links from link_json in db - grp_json = self.db.getObjectById(grp_id) - grp_links = grp_json["links"] - for title in titles: - del grp_links[title] - - if items: - body = {"grp_ids": items} - put_rsp = self.http_conn.PUT("/groups/" + self._root_id + "/links", body=body) - if put_rsp.status_code not in (200, 201): - self.log.error(f"failed to update links for request: {body}") - raise IOError("hsds_writer unable to update links") - else: - self.log.debug(f"hsds_writer> {grp_id} {count} links updated") - self._lastModified = time.time() - - def _deleteAttribute(self, obj_id, attr_name): - # delete the given attribute - - col_name = getCollectionForId(obj_id) - req = f"/{col_name}/{obj_id}/attributes/{attr_name}" - http_rsp = self.http_conn.DELETE(req) - if http_rsp.status_code != 200: - self.log.error("failed to delete attribute for obj: {obj_id} name: {attr_name}") - raise IOError("hsds_writer failed to delete attribute") - - def updateAttributes(self, obj_ids): - """ update any modified links of the given objects """ - - self.log.debug("hsds_writer> updateAttributes") - items = {} # dict which will hold a map of objects ids to attributes to create - removals = {} # map of obj_ids to attributes to be deleted - separator = '|' # use this character to join attribute names for deletion - - count = 0 - - for obj_id in obj_ids: - obj_json = self.db.getObjectById(obj_id) - obj_attrs = obj_json["attributes"] - for attr_name in obj_attrs: - attr_json = obj_attrs[attr_name] - - if "created" not in attr_json: - self.log.error(f"hsds_writer> expected created timestamp in attr: {attr_json}") - created = attr_json["created"] - if "DELETED" in attr_json: - if created > self._last_flush_time: - # attribute hasn't been created yet - msg = f"hsds_writer> {obj_id}: attr: {attr_name} deleted before flush" - self.log.debug(msg) - else: - # attribute has been persisted, remove - if attr_name.find(separator) != -1: - # need to delete individually - 
self._deleteAttribute(obj_id, attr_name) - else: - # can delete in a batch - if obj_id not in removals: - removals[obj_id] = set() - removals[obj_id].add(attr_name) - elif created > self._last_flush_time: - self.log.debug(f"hsds_writer> {obj_id} attribute {attr_name} created") - count += 1 - # new attribute, add to our list - if obj_id not in items: - items[obj_id] = {"attributes": {}} - attrs = items[obj_id]["attributes"] - attrs[attr_name] = attr_json - else: - self.log.debug(f"hsds_writer> {obj_id}: attr: {attr_name} has already been deleted") - - if removals: - # TBD: hsds doesn't have a multiple object attribute deletion operation yet - # so make one request per object id - # Delete with custom separator - - for obj_id in removals: - attr_names = removals[obj_id] - params = {"attr_names": separator.join(attr_names)} - params["separator"] = separator - collection = getCollectionForId(obj_id) - req = f"/{collection}/{obj_id}/attributes" - rsp = self.http_conn.DELETE(req, params=params) - if rsp.status_code != 200: - self.log.error("failed to delete attribute for obj: {obj_id}") - raise IOError("hsds_writer failed to delete attributes") - - if items: - body = {"obj_ids": items} - req = f"/groups/{self._root_id}/attributes" - put_rsp = self.http_conn.PUT(req, body=body) - if put_rsp.status_code not in (200, 201): - self.log.error(f"hsds_writer> put {req} failed, status: {put_rsp.status_code}") - else: - self.log.debug(f"hsds_writer> {count} attributes updated") - self._lastModified = time.time() - - def updateValue(self, dset_id, sel, arr): - """ update the given dataset using selection and array """ - self.log.debug("hsds_writer> updateValue") - params = {} - data = arrayToBytes(arr) - self.log.debug(f"writing binary data, {len(data)} bytes") - - if sel.select_type != selections.H5S_SELECT_ALL: - select_param = sel.getQueryParam() - self.log.debug(f"got select query param: {select_param}") - params["select"] = select_param - - req = f"/datasets/{dset_id}/value" - 
rsp = self.http_conn.PUT(req, body=data, params=params, format="binary") - if rsp.status_code != 200: - self.log.error(f"PUT {req} returned error: {rsp.status_code}") - else: - self.log.debug(f"PUT {len(data)} bytes successful") - self._lastModified = time.time() - - def updateValues(self, dset_ids): - """ write any pending dataset values """ - - self.log.debug("hsds_writer> updateValues") - for dset_id in dset_ids: - if getCollectionForId(dset_id) != "datasets": - continue # ignore groups and datatypes - dset_json = self.db.getObjectById(dset_id) - dset_dims = getDims(dset_json) - if dset_dims is None: - # no data to update - continue - if self._init: - # get all data for the dataset - # TBD: do this by chunks - sel_all = selections.select(dset_dims, ...) - arr = self.db.getDatasetValues(dset_id, sel_all) - if arr is not None: - self.updateValue(dset_id, sel_all, arr) - else: - if "updates" not in dset_json: - continue - updates = dset_json["updates"] - if updates: - self.log.debug(f"hsds_writer> {dset_id} update count: {len(updates)}") - for (sel, arr) in updates: - self.updateValue(dset_id, sel, arr) - updates.clear() - - def flush(self): - """ Write dirty items """ - if self.closed: - # no db set yet - self.log.warning("hsds_writer> flush called but no db") - return IOError("writer is closed") - if not self._http_conn: - self.log.warning("hsds_writer no http connection") - raise IOError("no http connection") - self.log.info("hsds_writer.flush()") - self.log.debug(f" new object count: {len(self.db.new_objects)}") - self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") - self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") - root_id = self._root_id - dirty_ids = self.db.dirty_objects.copy() - if self._init: - # initialize objects - self.log.debug(f"hsds_writer> flush -- init is True self.db: {len(self.db.db)} objects") - self.db.readAll() - self.log.debug(f"hsds_writer>flush, init after readAll, {len(self.db.db)} objects") - 
obj_ids = set(self.db.db.keys()) - obj_ids.remove(root_id) # root group created when domain was - self.log.debug(f"init createObjects: {obj_ids}") - self.createObjects(obj_ids) - dirty_ids.update(obj_ids) - dirty_ids.add(root_id) # add back root for attribute and link creation - if not self._no_data: - # initialize dataset values - pass - # self.updateValues(obj_ids) - self._init = False - elif self.db.new_objects: - self.log.debug(f"hsds_writer> {len(self.db.new_objects)} objects to create") - for obj_id in self.db.new_objects: - self.log.debug(f"hsds_writer> new obj id: {obj_id}") - self.createObjects(self.db.new_objects) - dirty_ids.update(self.db.new_objects) - else: - self.log.debug("no new objects to persist") - - if dirty_ids: - self.log.debug(f"hsds_writer> dirty ids: {dirty_ids}") - self.updateLinks(dirty_ids) - self.updateAttributes(dirty_ids) - if not self._no_data: - self.updateValues(dirty_ids) - - if self.db.deleted_objects: - self.log.debug(f"deleted ids: {self.db.deleted_objects}") - self.deleteObjects(self.db.deleted_objects) - - self._last_flush_time = time.time() - self.log.debug("hsds_writer> flush successful") - # all objects written successfully - return True - - def close(self): - # over-ride of H5Writer method - self.flush() - - def isClosed(self): - """ return closed status """ - return False if self._http_conn else True - - def get_root_id(self): - """ Return root id """ - return self._root_id - - def getStats(self): - """ return a dictionary object with at minimum the following keys: - 'created': creation time - 'lastModified': modificationTime - 'owner': owner name - """ - return self._stats diff --git a/src/h5json/hsdsstore/httpconn.py b/src/h5json/hsdsstore/httpconn.py deleted file mode 100644 index dc2ff9b1..00000000 --- a/src/h5json/hsdsstore/httpconn.py +++ /dev/null @@ -1,804 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. 
# -# # -# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## - -from __future__ import absolute_import - -import os -import sys -import time -import base64 - -import requests -import requests_unixsocket -from requests import ConnectionError -from requests.adapters import HTTPAdapter, Retry -import json -import logging - -from .. import openid -from .. import config - - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - -DEFAULT_TIMEOUT = ( - 10, - 1000, -) # #20 # 180 # seconds - allow time for hsds service to bounce - -""" -def verifyCert(self): - # default to validate CERT for https requests, unless - # the H5PYD_VERIFY_CERT environment variable is set and True - # - # TBD: set default to True once the signing authority of data.hdfgroup.org is - # recognized - if "H5PYD_VERIFY_CERT" in os.environ: - verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() - if verify_cert.startswith('F'): - return False - return True -""" - - -def getAzureApiKey(): - """construct API key for Active Directory if configured""" - # TBD: GoogleID? - - api_key = None - - # if Azure AD ids are set, pass them to HttpConn via api_key dict - cfg = config.get_config() # pulls in state from a .hscfg file (if found). 
- - ad_app_id = None # Azure AD HSDS Server id - if "HS_AD_APP_ID" in os.environ: - ad_app_id = os.environ["HS_AD_APP_ID"] - elif "hs_ad_app_id" in cfg: - ad_app_id = cfg["hs_ad_app_id"] - ad_tenant_id = None # Azure AD tenant id - if "HS_AD_TENANT_ID" in os.environ: - ad_tenant_id = os.environ["HS_AD_TENANT_ID"] - elif "hs_ad_tenant_id" in cfg: - ad_tenant_id = cfg["hs_ad_tenant_id"] - - ad_resource_id = None # Azure AD resource id - if "HS_AD_RESOURCE_ID" in os.environ: - ad_resource_id = os.environ["HS_AD_RESOURCE_ID"] - elif "hs_ad_resource_id" in cfg: - ad_resource_id = cfg["hs_ad_resource_id"] - - ad_client_secret = None # Azure client secret - if "HS_AD_CLIENT_SECRET" in os.environ: - ad_client_secret = os.environ["HS_AD_CLIENT_SECRET"] - elif "hs_ad_client_secret" in cfg: - ad_client_secret = cfg["hs_ad_client_secret"] - - if ad_app_id and ad_tenant_id and ad_resource_id: - # contruct dict to pass to HttpConn - api_key = { - "AD_APP_ID": ad_app_id, - "AD_TENANT_ID": ad_tenant_id, - "AD_RESOURCE_ID": ad_resource_id, - "openid_provider": "azure", - } - # optional config - if ad_client_secret: - api_key["AD_CLIENT_SECRET"] = ad_client_secret - return api_key # None if AAD not configured - - -def getKeycloakApiKey(): - # check for keycloak next - cfg = config.get_config() # pulls in state from a .hscfg file (if found). 
- api_key = None - # check to see if we are configured for keycloak authentication - if "HS_KEYCLOAK_URI" in os.environ: - keycloak_uri = os.environ["HS_KEYCLOAK_URI"] - elif "hs_keycloak_uri" in cfg: - keycloak_uri = cfg["hs_keycloak_uri"] - else: - keycloak_uri = None - if "HS_KEYCLOAK_CLIENT_ID" in os.environ: - keycloak_client_id = os.environ["HS_KEYCLOAK_CLIENT_ID"] - elif "hs_keycloak_client_id" in cfg: - keycloak_client_id = cfg["hs_keycloak_client_id"] - else: - keycloak_client_id = None - if "HS_KEYCLOAK_REALM" in os.environ: - keycloak_realm = cfg["HS_KEYCLOAK_REALM"] - elif "hs_keycloak_realm" in cfg: - keycloak_realm = cfg["hs_keycloak_realm"] - else: - keycloak_realm = None - - if keycloak_uri and keycloak_client_id and keycloak_uri: - api_key = { - "keycloak_uri": keycloak_uri, - "keycloak_client_id": keycloak_client_id, - "keycloak_realm": keycloak_realm, - "openid_provider": "keycloak", - } - return api_key - - -class HttpResponse: - """ wrapper for http request responses """ - def __init__(self, rsp, logger=None): - self._rsp = rsp - self._logger = logger - if logger is None: - self.log = logging - else: - self.log = logging.getLogger(logger) - self._text = None - - @property - def status_code(self): - """ return response status code """ - return self._rsp.status_code - - @property - def reason(self): - """ return response reason """ - return self._rsp.reason - - @property - def content_type(self): - """ return content type """ - rsp = self._rsp - if 'Content-Type' in rsp.headers: - content_type = rsp.headers['Content-Type'] - else: - content_type = "" - return content_type - - @property - def content_length(self): - """ Return length of response if available """ - if 'Content-Length' in self._rsp.headers: - content_length = self._rsp.headers['Content-Length'] - else: - content_length = None - return content_length - - @property - def is_binary(self): - """ return True if the response indicates binary data """ - - if self.content_type == 
"application/octet-stream": - return True - else: - return False - - @property - def is_json(self): - """ return true if response indicates json """ - - if self.content_type.startswith("application/json"): - return True - else: - return False - - @property - def text(self): - """ getresponse content as bytes """ - - if not self._text: - rsp = self._rsp - if not self.is_binary: - # hex encoded response? - # this is returned by API Gateway for lambda responses - self._text = bytes.fromhex(rsp.text) - else: - if self.content_length: - self.log.debug(f"got binary response, {self.content_length} bytes") - else: - self.log.debug("got binary response, content_length unknown") - - HTTP_CHUNK_SIZE = 4096 - http_chunks = [] - downloaded_bytes = 0 - for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE): - if http_chunk: # filter out keep alive chunks - self.log.debug(f"got http_chunk - {len(http_chunk)} bytes") - downloaded_bytes += len(http_chunk) - http_chunks.append(http_chunk) - if len(http_chunks) == 0: - raise IOError("no data returned") - if len(http_chunks) == 1: - # can return first and only chunk as response - self._text = http_chunks[0] - else: - msg = f"retrieved {len(http_chunks)} http_chunks " - msg += f" {downloaded_bytes} total bytes" - self.log.info(msg) - self._text = bytearray(downloaded_bytes) - index = 0 - for http_chunk in http_chunks: - self._text[index:(index + len(http_chunk))] = http_chunk - index += len(http_chunk) - - return self._text - - def json(self): - """ Return json from response""" - - rsp = self._rsp - - if not self.is_json: - raise IOError("response is not json") - - rsp_json = json.loads(rsp.text) - self.log.debug(f"rsp_json - {len(rsp.text)} bytes") - return rsp_json - - -class HttpConn: - """ - Some utility methods based on equivalents in base class. 
- """ - - def __init__( - self, - domain_name, - endpoint=None, - username=None, - password=None, - bucket=None, - api_key=None, - mode="a", - expire_time=1.0, - max_objects=None, - max_age=1.0, - logger=None, - retries=3, - timeout=DEFAULT_TIMEOUT, - **kwds, - ): - self._domain = domain_name - self._mode = mode - self._domain_json = None - self._retries = retries - self._timeout = timeout - self._api_key = api_key - self._s = None # Sessions - self._server_info = None - self._external_refs = [] - - self._logger = logger - if logger is None: - self.log = logging - else: - self.log = logging.getLogger(logger) - msg = f"HttpConn.init(domain: {domain_name}" - msg += f"expire_time: {expire_time:6.2f} sec retries: {retries}" - self.log.debug(msg) - - if self._timeout != DEFAULT_TIMEOUT: - self.log.info(f"HttpConn.init - timeout = {self._timeout}") - if not endpoint: - if "HS_ENDPOINT" in os.environ: - endpoint = os.environ["HS_ENDPOINT"] - - if not endpoint: - msg = "no endpoint set" - raise ValueError(msg) - - self._endpoint = endpoint - - if not username: - if "HS_USERNAME" in os.environ: - username = os.environ["HS_USERNAME"] - if isinstance(username, str) and (not username or username.upper() == "NONE"): - username = None - self._username = username - - if not password: - if "HS_PASSWORD" in os.environ: - password = os.environ["HS_PASSWORD"] - if isinstance(password, str) and (not password or password.upper() == "NONE"): - password = None - self._password = password - - if not bucket: - if "HS_BUCKET" in os.environ: - bucket = os.environ["HS_BUCKET"] - if isinstance(bucket, str) and (not bucket or bucket.upper() == "NONE"): - bucket = None - self._bucket = bucket - - if api_key is None and "HS_API_KEY" in os.environ: - api_key = os.environ["HS_API_KEY"] - if isinstance(api_key, str) and (not api_key or api_key.upper() == "NONE"): - api_key = None - if not api_key: - api_key = getAzureApiKey() - if not api_key: - api_key = getKeycloakApiKey() - - # Convert api_key to 
OpenIDHandler - if isinstance(api_key, dict): - # Maintain Azure-defualt backwards compatibility, but allow - # both environment variable and kwarg override. - provider = api_key.get("openid_provider", "azure") - if provider == "azure": - self.log.debug("creating OpenIDHandler for Azure") - self._api_key = openid.AzureOpenID(endpoint, api_key) - elif provider == "google": - self.log.debug("creating OpenIDHandler for Google") - - config = api_key.get("client_secret", None) - scopes = api_key.get("scopes", None) - self._api_key = openid.GoogleOpenID( - endpoint, config=config, scopes=scopes - ) - elif provider == "keycloak": - self.log.debug("creating OpenIDHandler for Keycloak") - - # for Keycloak, pass in username and password - self._api_key = openid.KeycloakOpenID( - endpoint, config=api_key, username=username, password=password - ) - else: - self.log.error(f"Unknown openid provider: {provider}") - - def getHeaders(self, username=None, password=None, headers=None): - - if headers is None: - headers = {} - - # This should be the default - but explicitly set anyway - if "Accept-Encoding" not in headers: - headers['Accept-Encoding'] = "deflate, gzip" - - elif "Authorization" in headers: - return headers # already have auth key - if username is None: - username = self._username - if password is None: - password = self._password - - if self._api_key: - self.log.debug("using api key") - # use OpenId handler to get a bearer token - token = "" - - # Get a token, possibly refreshing if needed. - if isinstance(self._api_key, openid.OpenIDHandler): - token = self._api_key.token - - # Token was provided as a string. 
- elif isinstance(self._api_key, str): - token = self._api_key - - if token: - auth_string = b"Bearer " + token.encode("ascii") - headers["Authorization"] = auth_string - elif username is not None and password is not None: - self.log.debug(f"use basic auth with username: {username}") - auth_string = username + ":" + password - auth_string = auth_string.encode("utf-8") - auth_string = base64.b64encode(auth_string) - auth_string = b"Basic " + auth_string - headers["Authorization"] = auth_string - else: - self.log.debug("no auth header") - # no auth header - pass - - return headers - - def serverInfo(self): - if self._server_info: - return self._server_info - - if self._endpoint is None: - raise IOError("object not initialized") - - # make an about request - rsp = self.GET("/about") - if rsp.status_code != 200: - raise IOError(rsp.status_code, rsp.reason) - server_info = rsp.json() - if server_info: - self._server_info = server_info - return server_info - - def server_version(self): - server_info = self.serverInfo() - if "hsds_version" in server_info: - server_version = server_info["hsds_version"] - else: - # no standard way to get version for other implements... 
- server_version = None - return server_version - - def verifyCert(self): - # default to validate CERT for https requests, unless - # the H5PYD_VERIFY_CERT environment variable is set and True - # - # TBD: set default to True once the signing authority of data.hdfgroup.org is - # recognized - if "H5PYD_VERIFY_CERT" in os.environ: - verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() - if verify_cert.startswith("F"): - return False - return True - - def GET(self, req, format="json", params=None, headers=None): - if self._endpoint is None: - raise IOError("object not initialized") - if not self._s: - raise IOError("http session is closed") - # check that domain is defined (except for some specific requests) - if req not in ("/domains", "/about", "/info", "/") and self._domain is None: - raise IOError(f"no domain defined: req: {req}") - - rsp = None - - headers = self.getHeaders(headers=headers) - - if params is None: - params = {} - if "domain" not in params: - params["domain"] = self._domain - if "bucket" not in params and self._bucket: - params["bucket"] = self._bucket - if self._api_key and not isinstance(self._api_key, dict): - params["api_key"] = self._api_key - domain = params["domain"] - self.log.debug(f"GET: {req} [{domain}] bucket: {self._bucket}") - - if format == "binary": - headers["accept"] = "application/octet-stream" - - self.log.info(f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}") - - for k in params: - if k != "domain": - v = params[k] - self.log.debug(f"GET params {k}:{v}") - - try: - s = self._s - stream = True # tbd - config for no streaming? 
- ts = time.time() - rsp = s.get( - self._endpoint + req, - params=params, - headers=headers, - stream=stream, - timeout=self._timeout, - verify=self.verifyCert(), - ) - elapsed = time.time() - ts - self.log.info(f"status: GET {rsp.status_code}, elapsed: {elapsed:.4f}") - except ConnectionError as ce: - self.log.error(f"connection error: {ce}") - raise IOError("Connection Error") - except Exception as e: - self.log.error(f"got {type(e)} exception: {e}") - raise IOError("Unexpected exception") - - if rsp.status_code != 200: - self.log.warning(f"GET {req} returned status: {rsp.status_code}") - - return HttpResponse(rsp) - - def PUT(self, req, body=None, format="json", params=None, headers=None): - if self._endpoint is None: - raise IOError("object not initialized") - if self._domain is None: - raise IOError("no domain defined") - if not self._s: - raise IOError("http session is closed") - - if params: - self.log.info(f"PUT params: {params}") - else: - params = {} - - if "domain" not in params: - params["domain"] = self._domain - if "bucket" not in params and self._bucket: - params["bucket"] = self._bucket - if self._api_key: - params["api_key"] = self._api_key - - # verify the file was open for modification - if self._mode == "r": - raise IOError("Unable to create group (No write intent on file)") - - # try to do a PUT to the domain - - headers = self.getHeaders(headers=headers) - - if format == "binary": - headers["Content-Type"] = "application/octet-stream" - # binary write - data = body - else: - headers["Content-Type"] = "application/json" - data = json.dumps(body) - - self.log.info(f"PUT: {req} format: {format} [{len(data)} bytes]") - - try: - s = self._s - ts = time.time() - rsp = s.put( - self._endpoint + req, - data=data, - headers=headers, - params=params, - verify=self.verifyCert(), - ) - elapsed = time.time() - ts - self.log.info(f"status: PUT {rsp.status_code}, elapsed: {elapsed:.4f}") - except ConnectionError as ce: - self.log.error(f"connection error: 
{ce}") - raise IOError("Connection Error") - - if rsp.status_code == 201 and req == "/": - self.log.info("clearing domain_json cache") - self._domain_json = None - if rsp.status_code not in (200, 201): - self.log.warning(f"got status code: {rsp.status_code} for PUT {req}") - self.log.info(f"PUT returning: {rsp}") - - return HttpResponse(rsp) - - def POST(self, req, body=None, format="json", params=None, headers=None): - if self._endpoint is None: - raise IOError("object not initialized") - if self._domain is None: - raise IOError("no domain defined") - if not self._s: - raise IOError("http session is closed") - - if params is None: - params = {} - if "domain" not in params: - params["domain"] = self._domain - if "bucket" not in params and self._bucket: - params["bucket"] = self._bucket - if self._api_key: - params["api_key"] = self._api_key - - # verify we have write intent (unless this is a dataset point selection) - if req.startswith("/datasets/") and req.endswith("/value"): - point_sel = True - else: - point_sel = False - if self._mode == "r" and not point_sel: - raise IOError("Unable perform request (No write intent on file)") - - # try to do a POST to the domain - - headers = self.getHeaders(headers=headers) - - if isinstance(body, bytes): - headers["Content-Type"] = "application/octet-stream" - data = body - else: - # assume json - try: - data = json.dumps(body) - except TypeError: - msg = f"Unable to convert {body} to json" - self.log.error(msg) - raise IOError("JSON encoding error") - if format == "binary": - # receive data as binary - headers["accept"] = "application/octet-stream" - - self.log.info("POST: " + req) - - try: - s = self._s - ts = time.time() - rsp = s.post( - self._endpoint + req, - data=data, - headers=headers, - params=params, - verify=self.verifyCert(), - ) - elapsed = time.time() - ts - self.log.info(f"status: POST {rsp.status_code}, elapsed: {elapsed:.4f}") - except ConnectionError as ce: - self.log.warning(f"connection error: {ce}") - 
raise IOError(str(ce)) - - if rsp.status_code not in (200, 201): - self.log.error(f"got status_code: {rsp.status_code} for DELETE: {req}") - - return HttpResponse(rsp) - - def DELETE(self, req, params=None, headers=None): - if self._endpoint is None: - raise IOError("object not initialized") - if not self._s: - raise IOError("http session is closed") - - if req not in ("/domains", "/") and self._domain is None: - raise IOError("no domain defined") - if params is None: - params = {} - if "domain" not in params: - params["domain"] = self._domain - if "bucket" not in params and self._bucket: - params["bucket"] = self._bucket - if self._api_key: - params["api_key"] = self._api_key - - # verify we have write intent - if self._mode == "r": - raise IOError("Unable perform request (No write intent on file)") - - # try to do a DELETE of the resource - headers = self.getHeaders(headers=headers) - - self.log.info("DEL: " + req) - try: - ts = time.time() - rsp = self._s.delete( - self._endpoint + req, - headers=headers, - params=params, - verify=self.verifyCert(), - ) - self.log.info(f"status: {rsp.status_code}") - elapsed = time.time() - ts - self.log.info(f"status: DELETE {rsp.status_code}, elapsed: {elapsed:.4f}") - except ConnectionError as ce: - self.log.error(f"connection error: {ce}") - raise IOError("Connection Error") - - if rsp.status_code == 200 and req == "/": - self.log.info("clearing domain_json cache") - self._domain_json = None - - if rsp.status_code != 200: - self.log.warning(f"got status_code: {rsp.status_code} for DELETE {req}") - - return HttpResponse(rsp) - - def add_external_ref(self, fid): - # this is used by the group class to keep references to external links open - if fid.__class__.__name__ != "FileID": - raise TypeError("add_external_ref, expected FileID type") - self._external_refs.append(fid) - - def open(self): - self.log.debug("http_conn.open") - if self._s: - return # already open - - retries = self._retries - backoff_factor = 1 - 
status_forcelist = (500, 502, 503, 504) - if self._endpoint.startswith("http+unix://"): - self.log.debug(f"create unixsocket session: {self._endpoint}") - s = requests_unixsocket.Session() - else: - # regular request session - s = requests.Session() - - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - kwargs = {"max_retries": retry, "pool_connections": 16, "pool_maxsize": 16} - s.mount("http://", HTTPAdapter(**kwargs)) - s.mount("https://", HTTPAdapter(**kwargs)) - self.log.debug("Httpconn set self._s") - self._s = s - - def close(self): - if self._s: - self.log.debug("http_conn.close") - self._s.close() - self._s = None - - def isClosed(self): - if self._s is None: - return True - else: - return False - - @property - def domain(self): - return self._domain - - @property - def username(self): - return self._username - - @property - def endpoint(self): - return self._endpoint - - @property - def password(self): - return self._password - - @property - def mode(self): - return self._mode - - @property - def domain_json(self): - if self._domain_json is None: - rsp = self.GET("/") - if rsp.status_code != 200: - raise IOError(rsp.reason) - # assume JSON - self._domain_json = rsp.json() - return self._domain_json - - @property - def root_uuid(self): - domain_json = self.domain_json - if "root" not in domain_json: - raise IOError("Unexpected response") - root_uuid = domain_json["root"] - return root_uuid - - @property - def compressors(self): - compressors = [] - if "compressors" in self.domain_json: - compressors = self.domain_json["compressors"] - if not compressors: - compressors = [ - "gzip", - ] - return compressors - - @property - def modified(self): - """Last modified time of the domain as a datetime object.""" - domain_json = self.domain_json - if "lastModified" not in domain_json: - raise IOError("Unexpected response") - last_modified = domain_json["lastModified"] - 
return last_modified - - @property - def created(self): - """Creation time of the domain""" - domain_json = self.domain_json - if "created" not in domain_json: - raise IOError("Unexpected response") - created = domain_json["created"] - return created - - @property - def owner(self): - """username of creator of domain""" - domain_json = self.domain_json - username = None - if "owner" in domain_json: - # currently this is only available for HSDS - username = domain_json["owner"] - return username - - @property - def logging(self): - """return name of logging handler""" - return self.log diff --git a/src/h5json/openid.py b/src/h5json/openid.py deleted file mode 100644 index af38d94a..00000000 --- a/src/h5json/openid.py +++ /dev/null @@ -1,437 +0,0 @@ -import os -import sys -import json -import requests -import time -from abc import ABC, abstractmethod -from datetime import datetime - -from . import config as hsconfig - - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - -# Azure -try: - import adal -except ModuleNotFoundError: - pass # change this to the eprint below to see the import error - # eprint()"Unable to import azure auth packages") - -# Google -try: - from google_auth_oauthlib.flow import InstalledAppFlow as GoogleInstalledAppFlow - from google.auth.transport.requests import Request as GoogleRequest - from google.oauth2.credentials import Credentials as GoogleCredentials - from google.oauth2 import id_token as GoogleIDToken -except ModuleNotFoundError: - pass # change this to the eprint below to see the import error - # eprint("Unable to import google auth packages") - - -class OpenIDHandler(ABC): - - def __init__(self, endpoint, use_token_cache=True, username=None, password=None): - """Initialize the token.""" - - # Location of the token cache. 
- self._token_cache_file = os.path.expanduser('~/.hstokencfg') - self._endpoint = endpoint - self._username = username - self._password = password - - # The _token attribute should be a dict with at least the following keys: - # - # accessToken - The OpenID token to send. - # refreshToken - The refresh token (optional). - # expiresOn - The unix timestamp when the token expires (optional). - - if not use_token_cache or not os.path.isfile(self._token_cache_file): - self._token = None - else: - if username: - file_key = username + '@' + endpoint - else: - file_key = endpoint - with open(self._token_cache_file, 'r') as token_file: - self._token = json.load(token_file).get(file_key, None) - - @abstractmethod - def acquire(self): - """Acquire a new token from the provider.""" - pass - - @abstractmethod - def refresh(self): - """Refresh an existing token with the provider.""" - pass - - @property - def username(self): - """ Return username if known """ - return self._username - - @property - def expired(self): - """Return if the token is expired.""" - t = self._token - # add some buffer to account for clock skew - return t is not None and 'expiresOn' in t and time.time() + 10.0 >= t['expiresOn'] - - @property - def token(self): - """Return the token if valid, otherwise get a new one.""" - - if self.expired: - self.refresh() - if self._token: - self.write_token_cache() - - if self._token is None: - self.acquire() - self.write_token_cache() - - return self._token['accessToken'] - - def write_token_cache(self): - """Write the token to a file cache.""" - - cache_exists = os.path.isfile(self._token_cache_file) - - if self._username: - file_key = self._username + '@' + self._endpoint - else: - file_key = self._endpoint - - # Create a new cache file. - if not cache_exists and self._token is not None: - with open(self._token_cache_file, 'w') as token_file: - json.dump({file_key: self._token}, token_file) - - # Update an exisiting cache file. 
- elif cache_exists: - with open(self._token_cache_file, 'r+') as token_file: - cache = json.loads(token_file.read()) - - # Store valid tokens. - if self._token is not None: - cache[file_key] = self._token - - # Delete invalid tokens. - elif file_key in cache: - del cache[file_key] - - token_file.seek(0) - token_file.truncate(0) - json.dump(cache, token_file) - - -class AzureOpenID(OpenIDHandler): - - AUTHORITY_URI = 'https://login.microsoftonline.com' # login endpoint for AD auth - - def __init__(self, endpoint, config=None): - """Store configuration.""" - - # Configuration manager - hs_config = hsconfig.get_config() - - # Config is a dictionary. - if isinstance(config, dict): - self.config = config - - # Maybe client_secrets are in environment variables? - else: - - self.config = { - 'AD_APP_ID': hs_config.get("hs_ad_app_id", None), - 'AD_TENANT_ID': hs_config.get("hs_ad_tenant_id", None), - 'AD_RESOURCE_ID': hs_config.get("hs_ad_resource_id", None), - 'AD_CLIENT_SECRET': hs_config.get("hs_ad_client_secret", None) - } - - if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: - use_token_cache = False - else: - use_token_cache = True - - super().__init__(endpoint, use_token_cache=use_token_cache) - - def write_token_cache(self): - if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: - pass # don't use token cache for unattended authentication - else: - super().write_token_cache() - - def acquire(self): - """Acquire a new Azure token.""" - - if "adal" not in sys.modules: - msg = "adal module not found, run: pip install -e . '.[azure]'" - raise ModuleNotFoundError(msg) - - app_id = self.config["AD_APP_ID"] - resource_id = self.config["AD_RESOURCE_ID"] - tenant_id = self.config["AD_TENANT_ID"] - client_secret = self.config.get("AD_CLIENT_SECRET", None) - authority_uri = self.AUTHORITY_URI + '/' + tenant_id - - # Try to get a token using different oauth flows. 
- context = adal.AuthenticationContext(authority_uri, enable_pii=True, api_version=None) - - try: - if client_secret is not None: - code = context.acquire_token_with_client_credentials(resource_id, app_id, client_secret) - else: - code = context.acquire_user_code(resource_id, app_id) - - except Exception as e: - eprint(f"unable to process AD token: {e}") - self._token = None - self.write_token_cache() - raise - - if "message" in code: - eprint(code["message"]) - mgmt_token = context.acquire_token_with_device_code(resource_id, code, app_id) - - elif "accessToken" in code: - mgmt_token = code - - else: - eprint("Could not authenticate with AD") - - # Only store some fields. - self._token = { - 'accessToken': mgmt_token['accessToken'], - 'refreshToken': mgmt_token.get('refreshToken', None), - 'tenantId': mgmt_token.get('tenantId', tenant_id), - 'clientId': mgmt_token.get('_clientId', app_id), - 'resource': mgmt_token.get('resource', resource_id) - } - - # Parse time to timestamp. - if 'expiresOn' in mgmt_token: - expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') - self._token['expiresOn'] = expire_dt.timestamp() - - def refresh(self): - """Try to renew an Azure token.""" - - try: - - # This will work for device code flow, but not with client - # credentials. If we have the secret, we can just request a new - # token anyways. - - authority_uri = self.AUTHORITY_URI + '/' + self._token['tenantId'] - context = adal.AuthenticationContext(authority_uri, api_version=None) - mgmt_token = context.acquire_token_with_refresh_token(self._token['refreshToken'], - self._token['clientId'], - self._token['resource'], - None) - - # New token does not have all the metadata. - self._token['accessToken'] = mgmt_token['accessToken'] - self._token['refreshToken'] = mgmt_token['refreshToken'] - - # Parse time to timestamp. 
- if 'expiresOn' in mgmt_token: - expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') - self._token['expiresOn'] = expire_dt.timestamp() - - except Exception: - self._token = None - - -class GoogleOpenID(OpenIDHandler): - - def __init__(self, endpoint, config=None, scopes=None): - """Store configuration.""" - - if "google.oauth2" not in sys.modules: - msg = "google.oauth2 module not found, run: pip install -e . '.[google]'" - raise ModuleNotFoundError(msg) - - # Configuration manager - hs_config = hsconfig.get_config() - - if scopes is None: - scopes = hs_config.get('hs_google_scopes', 'openid').split() - self.scopes = scopes - - # Config is a client_secrets dictionary. - if isinstance(config, dict): - self.config = config - - # Config points to a client_secrets.json file. - elif isinstance(config, str) and os.path.isfile(config): - with open(config, 'r') as f: - self.config = json.loads(f.read()) - - # Maybe client_secrets are in environment variables? - else: - self.config = { - 'installed': { - 'project_id': hs_config.get('hs_google_project_id', None), - 'client_id': hs_config.get('hs_google_client_id', None), - 'client_secret': hs_config.get('hs_google_client_secret', None), - 'auth_uri': 'https://accounts.google.com/o/oauth2/auth', - 'token_uri': 'https://oauth2.googleapis.com/token', - 'auth_provider_x509_cert_url': 'https://www.googleapis.com/oauth2/v1/certs', - 'redirect_uris': ['urn:ietf:wg:oauth:2.0:oob', 'http://localhost'] - } - } - - super().__init__(endpoint) - - def _parse(self, creds): - """Parse credentials.""" - - # NOTE: In Google OpenID, if a client is set up for InstalledAppFlow - # then the client_secret is not actually treated as a secret. Acquire - # will ALWAYS prompt for user input before granting a token. 
- - token = { - 'accessToken': creds.id_token, - 'refreshToken': creds.refresh_token, - 'tokenUri': creds.token_uri, - 'clientId': creds.client_id, - 'clientSecret': creds.client_secret, - 'scopes': creds.scopes - } - - # The expiry field that is in creds is for the OAuth token, not the - # OpenID token. We need to validate the OpenID tokenn to get the exp. - idinfo = GoogleIDToken.verify_oauth2_token(creds.id_token, GoogleRequest()) - if 'exp' in idinfo: - token['expiresOn'] = idinfo['exp'] - - return token - - def acquire(self): - """Acquire a new Google token.""" - - flow = GoogleInstalledAppFlow.from_client_config(self.config, - scopes=self.scopes) - creds = flow.run_console() - self._token = self._parse(creds) - - def refresh(self): - """Try to renew a token.""" - - try: - - token = self._token - creds = GoogleCredentials(token=None, - refresh_token=token['refreshToken'], - scopes=token['scopes'], - token_uri=token['tokenUri'], - client_id=token['clientId'], - client_secret=token['clientSecret']) - - creds.refresh(GoogleRequest()) - self._token = self._parse(creds) - - except Exception: - self._token = None - - -class KeycloakOpenID(OpenIDHandler): - - def __init__(self, endpoint, config=None, scopes=None, username=None, password=None): - """Store configuration.""" - - # Configuration manager - hs_config = hsconfig.get_config() - - if scopes is None: - scopes = hs_config.get('hs_keycloak_scopes', 'openid').split() - self.scopes = scopes - - # Config is a client_secrets dictionary. - if isinstance(config, dict): - self.config = config - - # Config points to a client_secrets.json file. - elif isinstance(config, str) and os.path.isfile(config): - with open(config, 'r') as f: - self.config = json.loads(f.read()) - - # Maybe configs are in environment variables? 
- else: - self.config = { - 'keycloak_client_id': hs_config.get('hs_keycloak_client_id', None), - 'keycloak_client_secret': hs_config.get('hs_keycloak_client_secret', None), - 'keycloak_realm': hs_config.get('hs_keycloak_realm', None), - 'keycloak_uri': hs_config.get('hs_keycloak_uri', None) - } - - super().__init__(endpoint, username=username, password=password) - - def _getKeycloakUrl(self): - if not self.config['keycloak_uri']: - raise KeyError("keycloak_uri not set") - if not self.config['keycloak_realm']: - raise KeyError("Keycloak realm not set") - if not self.config['keycloak_client_id']: - raise KeyError("keycloak client_id not set") - - url = self.config['keycloak_uri'] - url += "/realms/" - url += self.config['keycloak_realm'] - url += "/protocol/openid-connect/token" - - return url - - def _parse(self, creds): - """Parse credentials.""" - - # validate json returned by keycloak - if "token_type" not in creds: - raise IOError("Unexpected Keycloak JWT, no token_type") - if creds["token_type"].lower() != "bearer": - raise IOError("Unexpected Keycloak JWT, expected Bearer token") - - token = {} - if "access_token" not in creds: - raise IOError("Unexpected Keycloak JWT, no access_token") - token["accessToken"] = creds["access_token"] - if "refesh_token" in creds: - token["refreshToken"] = creds["refresh_token"] - if "expires_in" in creds: - now = time.time() - token['expiresOn'] = now + creds["expires_in"] - - # TBD: client_secret - # TBD: scopes - # TBD: client_id - - return token - - def acquire(self): - """Acquire a new Keycloak token.""" - keycloak_url = self._getKeycloakUrl() - - headers = {"Content-Type": "application/x-www-form-urlencoded"} - body = {} - body["username"] = self._username - body["password"] = self._password - body["grant_type"] = "password" - body["client_id"] = self.config.get("keycloak_client_id") - rsp = requests.post(keycloak_url, data=body, headers=headers) - - if rsp.status_code not in (200, 201): - print(f"POST error: 
{rsp.status_code}") - raise IOError(f"Keycloak response: {rsp.status_code}") - - creds = rsp.json() # TBD: catch json format errors? - self._token = self._parse(creds) - - def refresh(self): - """Try to renew a token.""" - # TBD - # unclear if refresh is supported without a client secret - self._token = None diff --git a/testall.py b/testall.py index a33cb327..1cb36136 100755 --- a/testall.py +++ b/testall.py @@ -24,8 +24,6 @@ "h5json_writer_test", "h5py_reader_test", "h5py_writer_test", - "hsds_reader_test", - "hsds_writer_test", ] use_hsds = True From c60e1c9c3e6b2c386564149311168f74e9586c94 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 12 Sep 2025 18:14:40 +0100 Subject: [PATCH 080/129] moved hsds reader/writer tests to h5pyd --- test/unit/hsds_reader_test.py | 145 ------------- test/unit/hsds_writer_test.py | 370 ---------------------------------- 2 files changed, 515 deletions(-) delete mode 100644 test/unit/hsds_reader_test.py delete mode 100644 test/unit/hsds_writer_test.py diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py deleted file mode 100644 index ce75d540..00000000 --- a/test/unit/hsds_reader_test.py +++ /dev/null @@ -1,145 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. 
# -############################################################################## -import unittest -import logging -import random -import string -import numpy as np -from h5json import Hdf5db -from h5json.hsdsstore.hsds_reader import HSDSReader -from h5json import selections - - -class HSDSReaderTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(HSDSReaderTest, self).__init__(*args, **kwargs) - # main - - self.log = logging.getLogger() - if len(self.log.handlers) > 0: - lhStdout = self.log.handlers[0] # stdout is the only handler initially - else: - lhStdout = None - - self.log.setLevel(logging.DEBUG) - handler = logging.FileHandler("./hsds_reader_test.log") - # add handler to logger - self.log.addHandler(handler) - - if lhStdout is not None: - self.log.removeHandler(lhStdout) - - def testSimple(self): - filepath = "/home/test_user1/test/tall.h5" - kwargs = {"app_logger": self.log} - db = Hdf5db(**kwargs) - hsds_reader = HSDSReader(filepath, **kwargs) - db.reader = hsds_reader - root_id = db.open() - - # check domain stats - stats = db.reader.getStats() - self.assertTrue(stats["created"] > 0) - self.assertTrue(stats["lastModified"] > 0) - self.assertTrue(stats["owner"]) - self.assertTrue("compressors" in stats) - self.assertTrue(len(stats["compressors"]) > 0) - self.assertTrue("limits" in stats) - self.assertTrue(len(stats["limits"]) > 0) - - db.close() - self.assertTrue(db.closed) - obj_id = db.open() - self.assertEqual(obj_id, root_id) - - root_json = db.getObjectById(root_id) - self.assertTrue("id" in root_json) - - root_attrs = root_json["attributes"] - self.assertEqual(len(root_attrs), 2) - self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) - - root_links = root_json["links"] - self.assertEqual(len(root_links), 2) - self.assertEqual(list(root_links.keys()), ["g1", "g2"]) - g1_link = root_links["g1"] - self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") - g1_id = g1_link["id"] - self.assertEqual(g1_id, 
db.getObjectIdByPath("/g1/")) - - dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") - dset_json = db.getObjectById(dset111_id) - dset_type = dset_json["type"] - self.assertEqual(dset_type["class"], "H5T_INTEGER") - self.assertEqual(dset_type["base"], "H5T_STD_I32BE") - - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 2) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) - dset_shape = dset_json["shape"] - self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10, 10]) - - # got the 5th row of the dataset - sel_row = selections.select((10, 10), (5, slice(0, 10))) - row = db.getDatasetValues(dset111_id, sel_row) - self.assertTrue(isinstance(row, np.ndarray)) - self.assertEqual(row.shape, (10,)) - for i in range(10): - v = row[i] - self.assertEqual(v, i * 5) - - sel_all = selections.select((10, 10), ...) - arr = db.getDatasetValues(dset111_id, sel_all) - self.assertTrue(isinstance(arr, np.ndarray)) - self.assertEqual(arr.shape, (10, 10)) - for i in range(10): - for j in range(10): - v = arr[i, j] - self.assertEqual(v, i * j) - - # try adding an attribute - db.createAttribute(dset111_id, "attr3", value=42) - dset_json = db.getObjectById(dset111_id) - dset_attrs = dset_json["attributes"] - self.assertEqual(len(dset_attrs), 3) - self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) - attr3_json = dset_attrs["attr3"] - attr3_shape = attr3_json["shape"] - self.assertEqual(attr3_shape["class"], "H5S_SCALAR") - attr3_type = attr3_json["type"] - self.assertEqual(attr3_type["class"], "H5T_INTEGER") - self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") - attr3_value = attr3_json["value"] - self.assertEqual(attr3_value, 42) - - db.close() - - def testNoFile(self): - # create a random string so we don't try to open an existing file - filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) - filepath = "/home/test_user1/test/" + filename - kwargs = 
{"app_logger": self.log} - db = Hdf5db(**kwargs) - hsds_reader = HSDSReader(filepath, **kwargs) - db.reader = hsds_reader - try: - db.open() - self.assertTrue(False) - except IOError as ioe: - self.assertEqual(ioe.errno, 404) - - -if __name__ == "__main__": - # setup test files - - unittest.main() diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py deleted file mode 100644 index ecdedf02..00000000 --- a/test/unit/hsds_writer_test.py +++ /dev/null @@ -1,370 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. 
# -############################################################################## -import unittest -import logging -import random -import string -import requests -import numpy as np -from h5json import Hdf5db -from h5json.hsdsstore.httpconn import HttpConn -from h5json.hsdsstore.hsds_writer import HSDSWriter -from h5json.hsdsstore.hsds_reader import HSDSReader -from h5json.h5pystore.h5py_reader import H5pyReader -from h5json.hdf5dtype import special_dtype, Reference -from h5json import selections - - -class HSDSWriterTest(unittest.TestCase): - def __init__(self, *args, **kwargs): - super(HSDSWriterTest, self).__init__(*args, **kwargs) - # main - self.session = requests.Session() - - # create logger - logfname = "hsds_writer_test.log" - loglevel = logging.DEBUG - logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) - self.log = logging.getLogger() - self.log.info("init!") - - def testSimple(self): - - domain_path = "hdf5://home/test_user1/test/writer_test.h5" - - db = Hdf5db(app_logger=self.log) - db.writer = HSDSWriter(domain_path, app_logger=self.log) - root_id = db.open() - - stats = db.writer.getStats() - for k in ("created", "lastModified", "owner"): - self.assertTrue(k in stats) - http_conn = HttpConn(domain_path, mode='r', retries=1) - http_conn.open() - - db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) - db.createAttribute(root_id, "attr2", 42) - - g1_id = db.createGroup() - db.createHardLink(root_id, "g1", g1_id) - db.createAttribute(g1_id, "a1", "hello") - g2_id = db.createGroup() - db.createHardLink(root_id, "g2", g2_id) - - # validate - get the root group and check counts - http_rsp = http_conn.GET(f"/groups/{root_id}") - self.assertEqual(http_rsp.status_code, 200) - root_json = http_rsp.json() - # attribute count should still be zero (hasn't been flushed yet) - self.assertEqual(root_json["attributeCount"], 0) - # same for link count - self.assertEqual(root_json["linkCount"], 0) - 
self.assertTrue(db.writer.lastModified is None) # no write yet - db.flush() - self.assertTrue(db.writer.lastModified > 0) # timestamp should be updated - - # validate - get the root group again and see if counts are updated - http_rsp = http_conn.GET(f"/groups/{root_id}") - self.assertEqual(http_rsp.status_code, 200) - root_json = http_rsp.json() - # attribute count should still be zero (hasn't been flushed yet) - self.assertEqual(root_json["attributeCount"], 2) - # same for link count - self.assertEqual(root_json["linkCount"], 2) - - g1_1_id = db.createGroup() - db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) - arr = np.zeros((10, 10), dtype=np.int32) - for i in range(10): - for j in range(10): - arr[i, j] = i * j - sel_all = selections.select((10, 10), ...) - db.setDatasetValues(dset_111_id, sel_all, arr) - db.flush() - - # validate - get the dataset and check values - http_rsp = http_conn.GET(f"/datasets/{dset_111_id}/value") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - self.assertTrue("value" in rsp_json) - rsp_value = rsp_json["value"] - self.assertEqual(len(rsp_value), 10) - for i in range(10): - row = rsp_value[i] - self.assertEqual(len(row), 10) - for j in range(10): - self.assertEqual(row[j], i * j) - - db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) - db.createSoftLink(g2_id, "slink", "somewhere") - db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") - db.createCustomLink(g2_id, "cust", {"foo": "bar"}) - db.flush() - - # create a link, then delete before flushing - tmp_grp_id = db.createGroup("tmp_group") - db.createHardLink(g1_1_id, "tmp_group", tmp_grp_id) - db.deleteLink(g1_1_id, "tmp_group") - db.flush() - - # validate - check that links got updated - http_rsp = http_conn.GET(f"/groups/{g2_id}/links") - self.assertEqual(http_rsp.status_code, 200) - g2links_json = http_rsp.json() - self.assertTrue("links" in g2links_json) - g2links = 
g2links_json["links"] - self.assertTrue(len(g2links), 2) # custom link will be ignored - - db.createAttribute(g1_id, "a1", "hello") - db.createAttribute(g1_id, "a2", "bye-bye") - db.flush() - - # validate - check that attributes got created - http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - attrs_json = rsp_json["attributes"] - self.assertEqual(len(attrs_json), 2) - - # delete an attribute - db.deleteAttribute(g1_id, "a1") - db.flush() - - # validate - check that the attribute got deleted - http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - attrs_json = rsp_json["attributes"] - self.assertEqual(len(attrs_json), 1) - - # create an attribute that happens to use the separator character - db.createAttribute(g1_id, "a|z", "goofy") - db.flush() - - # validate - check that attributes got created - http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - attrs_json = rsp_json["attributes"] - self.assertEqual(len(attrs_json), 2) - - # delete an attribute - db.deleteAttribute(g1_id, "a|z") - db.flush() - - # validate - check that the attribute got deleted - http_rsp = http_conn.GET(f"/groups/{g1_id}/attributes") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - attrs_json = rsp_json["attributes"] - self.assertEqual(len(attrs_json), 1) - - g21 = db.createGroup() - db.createHardLink(g2_id, "g2.1", g21) - db.flush() - - # update one element of the dataset - sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) - arr = np.zeros((), dtype=np.int32) - arr[()] = 42 - db.setDatasetValues(dset_111_id, sel, arr) - db.flush() - - # validate - check that just the one element is modified - http_rsp = http_conn.GET(f"/datasets/{dset_111_id}/value") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = 
http_rsp.json() - self.assertTrue("value" in rsp_json) - rsp_value = rsp_json["value"] - self.assertEqual(len(rsp_value), 10) - for i in range(10): - row = rsp_value[i] - self.assertEqual(len(row), 10) - for j in range(10): - if i == 4 and j == 4: - expected = 42 - else: - expected = i * j - self.assertEqual(row[j], expected) - - # create a scalar dataset - dset_112_id = db.createDataset(shape=(), dtype=np.int32) - arr = np.zeros((), dtype=np.int32) - arr[()] = 42 - sel_all = selections.select((), ...) - db.setDatasetValues(dset_112_id, sel_all, arr) - db.createHardLink(g1_id, "dset1.1.2", dset_112_id) - db.flush() - - # validate - get the scalar dataset value - http_rsp = http_conn.GET(f"/datasets/{dset_112_id}/value") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - self.assertTrue("value" in rsp_json) - rsp_value = rsp_json["value"] - self.assertEqual(rsp_value, 42) - - # create a dataset and try to read from it - dset_222_id = db.createDataset(shape=(10, 10), dtype=np.int32) - sel_all = selections.select((10, 10), ...) 
- arr = db.getDatasetValues(dset_222_id, sel_all) - self.assertTrue((arr == 0).all()) - - db.close() - - def testReaderWriter(self): - # try reading and writing to an HSDS domain - # create a random string so we don't try to open an existing file - filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) - domain_path = "/home/test_user1/test/" + filename + ".h5" - db = Hdf5db(app_logger=self.log) - db.writer = HSDSWriter(domain_path, app_logger=self.log) - self.assertEqual(db.writer.filepath, domain_path) - root_id = db.open() - self.assertTrue(root_id) - db.reader = HSDSReader(domain_path, app_logger=self.log) - db.close() - - root_id2 = db.open() - self.assertEqual(root_id, root_id2) - root_json = db.getObjectById(root_id) - self.assertTrue("id" not in root_json) - self.assertTrue("created" in root_json) - self.assertTrue(root_json["created"] > 0) - self.assertTrue(db.writer.lastModified is None) # no flush yet - - # create a scalar dataset - dsetA_id = db.createDataset(shape=(), dtype=np.int32) - dset_json = db.getObjectById(dsetA_id) - self.assertTrue("created" in dset_json) - dset_create_time = dset_json["created"] - self.assertTrue(dset_create_time > 0) - - db.createHardLink(root_id, "dset_a", dsetA_id) - - arr = np.zeros((), dtype=np.int32) - arr[()] = 42 - sel_all = selections.select((), ...) 
- db.setDatasetValues(dsetA_id, sel_all, arr) - - dset_json = db.getObjectById(dsetA_id) - self.assertTrue("lastModified" in dset_json) - self.assertTrue(dset_json["lastModified"] > dset_create_time) - - arr = db.getDatasetValues(dsetA_id, sel_all) - self.assertEqual(arr[()], 42) - - # create a scalar dataset with string - dt_str = special_dtype(vlen=str) - dsetB_id = db.createDataset(shape=(), dtype=dt_str) - dset_json = db.getObjectById(dsetB_id) - db.createHardLink(root_id, "dset_b", dsetB_id) - - arr = np.zeros((), dtype=dt_str) - arr[()] = "hello world" - db.setDatasetValues(dsetB_id, sel_all, arr) - - arr = db.getDatasetValues(dsetB_id, sel_all) - - e = arr[()] - self.assertEqual(e, "hello world") - self.assertTrue(isinstance(e, str)) - - db.close() - - def testH5PyToHS(self): - # test reading from HDF5 file and writing to HSDS - - file_path = "data/hdf5/tall.h5" - domain_path = "hdf5://home/test_user1/test/hsds_writer_test_tall.h5" - - db = Hdf5db(app_logger=self.log) - db.reader = H5pyReader(file_path) - db.writer = HSDSWriter(domain_path) - root_id = db.open() - root_json = db.getObjectById(root_id) - db.flush() - - # validate - get the root group and see if counts are correct - http_conn = HttpConn(domain_path, mode='r', retries=1) - http_conn.open() - http_rsp = http_conn.GET(f"/groups/{root_id}") - self.assertEqual(http_rsp.status_code, 200) - root_json = http_rsp.json() - self.assertEqual(root_json["id"], root_id) - # attribute count should still be zero (hasn't been flushed yet) - self.assertEqual(root_json["attributeCount"], 2) - # same for link count - self.assertEqual(root_json["linkCount"], 2) - - # get the g1 hard link - http_rsp = http_conn.GET(f"/groups/{root_id}/links/g1") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - g1_link = rsp_json["link"] - g1_id = g1_link["id"] - - # get the g1 group json - http_rsp = http_conn.GET(f"/groups/{g1_id}") - self.assertEqual(http_rsp.status_code, 200) - g1_json = 
http_rsp.json() - self.assertEqual(g1_json["attributeCount"], 0) - self.assertEqual(g1_json["linkCount"], 2) - - # get the g1.1 link - http_rsp = http_conn.GET(f"/groups/{g1_id}/links/g1.1") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - g1_1_link = rsp_json["link"] - g1_1_id = g1_1_link["id"] - - # Get the g1.1 json - http_rsp = http_conn.GET(f"/groups/{g1_1_id}") - self.assertEqual(http_rsp.status_code, 200) - g1_json = http_rsp.json() - self.assertEqual(g1_json["attributeCount"], 0) - self.assertEqual(g1_json["linkCount"], 2) - - # get the dset1.1.1 link - http_rsp = http_conn.GET(f"/groups/{g1_1_id}/links/dset1.1.1") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - dset1_1_1_link = rsp_json["link"] - dset1_1_1_id = dset1_1_1_link["id"] - - # get the dset1.1.1 json - http_rsp = http_conn.GET(f"/datasets/{dset1_1_1_id}") - self.assertEqual(http_rsp.status_code, 200) - dset1_1_1_json = http_rsp.json() - dset1_1_1_shape = dset1_1_1_json["shape"] - self.assertEqual(dset1_1_1_shape["class"], "H5S_SIMPLE") - - # get the dset1_1_1 data - http_rsp = http_conn.GET(f"/datasets/{dset1_1_1_id}/value") - self.assertEqual(http_rsp.status_code, 200) - rsp_json = http_rsp.json() - dset1_1_1_value = rsp_json["value"] - self.assertEqual(len(dset1_1_1_value), 10) - for i in range(10): - row = dset1_1_1_value[i] - self.assertEqual(len(row), 10) - for j in range(10): - self.assertEqual(row[j], i * j) - - db.close() - - -if __name__ == "__main__": - # setup test files - - unittest.main() From 29ae2370f6de0a66891edf13378c70aa01aa717e Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 16 Sep 2025 18:13:44 +0100 Subject: [PATCH 081/129] fix for getDatasetValues --- src/h5json/hdf5db.py | 2 +- test/unit/h5py_reader_test.py | 21 ++++++++++++++++++++- test/unit/hdf5db_test.py | 27 +++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 
02753ec5..c8442aff 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -587,7 +587,7 @@ def getDatasetValues(self, dset_id, sel): if fetch: arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) else: - arr = np.zeros(sel.shape, dtype=dtype) + arr = np.zeros(sel.mshape, dtype=dtype) if "updates" in dset_json: # apply any non-flushed changes that intersect the current selection diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index 8f76543c..e4cc9c7d 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -13,8 +13,11 @@ import logging import time +import numpy as np + from h5json import Hdf5db from h5json.h5pystore.h5py_reader import H5pyReader +from h5json import selections class H5pyReaderTest(unittest.TestCase): @@ -70,7 +73,23 @@ def testSimple(self): self.assertTrue(k in attr1_json) dset_shape = dset_json["shape"] self.assertEqual(dset_shape["class"], "H5S_SIMPLE") - self.assertEqual(dset_shape["dims"], [10, 10]) + dims = dset_shape["dims"] + self.assertEqual(dims, [10, 10]) + dims = tuple(dims) + + # read one element from a dataset + sel = selections.select(dims, (slice(4, 5), slice(5, 6))) + arr = db.getDatasetValues(dset111_id, sel) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (1, 1)) + self.assertEqual(arr[0, 0], 20) + + # read one row + sel = selections.select(dims, (slice(4, 5), slice(0, 10))) + arr = db.getDatasetValues(dset111_id, sel) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (1, 10)) + self.assertEqual(list(arr[0]), list(range(0, 40, 4))) # try adding an attribute db.createAttribute(dset111_id, "attr3", value=42) diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 1eca8e2a..04df3156 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -448,6 +448,33 @@ def testSimpleDataset(self): db.close() + def testBoolDataset(self): + shape = (10,) + dtype = np.dtype(bool) + + db = 
Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + sel_first = selections.select(shape, slice(0, 1)) + arr = db.getDatasetValues(dset_id, sel_first) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, (1,)) + self.assertEqual(arr[0], False) + + # update one element + sel_second = selections.select(shape, slice(1, 2)) + db.setDatasetValues(dset_id, sel_second, np.array([True,], dtype=dtype)) + + # read back three elements + sel_three = selections.select(shape, slice(0, 3)) + arr = db.getDatasetValues(dset_id, sel_three) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, (3,)) + self.assertEqual(list(arr[...]), [False, True, False]) + + db.close() + def testScalarDataset(self): dtype = np.int32 From b904ea51733f5901be77d07dfb9f946183014a4a Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 16 Sep 2025 19:02:42 +0100 Subject: [PATCH 082/129] fix for datasets with fillvalue --- src/h5json/hdf5db.py | 13 ++++++++++++- test/unit/hdf5db_test.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index c8442aff..5f9714a4 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -570,6 +570,11 @@ def getDatasetValues(self, dset_id, sel): dtype = self.getDtype(dset_json) + if "creationProperties" in dset_json: + cpl = dset_json["creationProperties"] + else: + cpl = {} + # determine if we need to make a read request or not if dset_id in self._new_objects: fetch = False @@ -587,7 +592,13 @@ def getDatasetValues(self, dset_id, sel): if fetch: arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) else: - arr = np.zeros(sel.mshape, dtype=dtype) + if "fillValue" in cpl: + fillValue = cpl["fillValue"] + # TBD: fix for compound types + arr = np.zeros(sel.mshape, dtype=dtype) + arr[...] 
= fillValue + else: + arr = np.zeros(sel.mshape, dtype=dtype) if "updates" in dset_json: # apply any non-flushed changes that intersect the current selection diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 04df3156..63030ef2 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -526,6 +526,24 @@ def testResizableDataset(self): db.close() + def testFillValueDataset(self): + dtype = np.uint32 + db = Hdf5db(app_logger=self.log) + root_id = db.open() + cpl = {"fillValue": 0xdeadbeef} + dset_id = db.createDataset((), dtype=dtype, cpl=cpl) + db.createHardLink(root_id, "dset", dset_id) + dset_json = db.getObjectById(dset_id) + self.assertTrue("creationProperties" in dset_json) + cpl = dset_json["creationProperties"] + self.assertTrue("fillValue" in cpl) + self.assertEqual(cpl["fillValue"], 0xdeadbeef) + sel_all = selections.select((), ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr[()], 0xdeadbeef) + if __name__ == "__main__": # setup test files From 65b94c12af0a7cdb1726a60432c7ac4114b48c73 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 17 Sep 2025 17:32:20 +0100 Subject: [PATCH 083/129] added dset_util functions --- src/h5json/dset_util.py | 629 ++++++++++++++++++++++++++-- src/h5json/filters.py | 246 ++++++++++- src/h5json/h5pystore/h5py_reader.py | 14 +- testall.py | 9 - 4 files changed, 849 insertions(+), 49 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 37d67f1e..34d5d0d2 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -10,36 +10,27 @@ # request a copy from help@hdfgroup.org. # ############################################################################## -import time +import math import numpy as np +from .hdf5dtype import getItemSize +from .objid import isValidUuid +from . 
import config +CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) +CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) +DEFAULT_TYPE_SIZE = 128 # Type size case when it is variable -def resize_dataset(dset_json, shape): - shape_json = dset_json["shape"] - shape_class = shape_json["class"] - if shape_class != "H5S_SIMPLE": - raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") - if len(shape_json["dims"]) != len(shape): - raise ValueError("Resize shape parameter doesn't match dataset's rank") - if "maxdims" not in shape_json: - raise ValueError("Dataset is not resizable") - dims = shape_json["dims"] - maxdims = shape_json["maxdims"] - if shape_json["dims"] == list(shape): - # no change, just return - return - for i in range(len(dims)): - extent = shape[i] - if extent < 0: - raise ValueError("dimensions can't be negative") - if maxdims[i] == "H5S_UNLIMITED": - # any positive extent is ok - continue - if extent > maxdims[i]: - raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}") +def getShapeClass(data_shape): + """ Return shape class of the given data shape """ - shape_json["dims"] = list(shape) + if not isinstance(data_shape, dict): + raise TypeError("expected dict object") + + if "class" not in data_shape: + raise KeyError("expected 'class' key for data shape")\ + + return data_shape["class"] def getDims(dset_json): @@ -65,6 +56,50 @@ def getNumElements(dset_json): return int(np.prod(getDims(dset_json))) +def getRank(data_shape): + """ Return rank of given data shape_json """ + + shape_class = getShapeClass(data_shape) + + if shape_class == "H5S_NULL": + return 0 + elif shape_class == "H5S_SCALAR": + return 0 + elif shape_class == "H5S_SIMPLE": + if "dims" not in data_shape: + raise KeyError("expected dims key for H5S_SIMPLE data shape") + return len(data_shape["dims"]) + else: + raise ValueError(f"unexpected data shape class: {shape_class}") + + +def getDsetRank(dset_json): + """Get rank returning 0 for scalar or NULL 
data shapes""" + data_shape = dset_json["shape"] + return getRank(data_shape) + + +def isNullSpace(dset_json): + """Return true if this dataset is a null data space""" + shape_class = getShapeClass(dset_json["shape"]) + if shape_class == "H5S_NULL": + return True + else: + return False + + +def isScalarSpace(dset_json): + """ return true if this is a scalar dataset """ + + data_shape = dset_json["shape"] + shape_class = getShapeClass(data_shape) + if shape_class == "H5S_NULL": + return False + + rank = getRank(data_shape) + return True if rank == 0 else False + + def getDatasetLayout(dset_json): """ Return layout json from creation property list or layout json """ layout = None @@ -89,3 +124,547 @@ def getDatasetLayoutClass(dset_json): else: layout_class = None return layout_class + + +CHUNK_LAYOUT_CLASSES = ( + "H5D_CHUNKED", + "H5D_CHUNKED_REF", + "H5D_CHUNKED_REF_INDIRECT", + "H5D_CONTIGUOUS_REF", +) + + +def get_dset_size(shape_json, typesize): + """Return the size of the dataspace. For + any unlimited dimensions, assume a value of 1. 
+ (so the return size will be the absolute minimum) + """ + if shape_json is None or shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return typesize # just return size for one item + if typesize == "H5T_VARIABLE": + typesize = DEFAULT_TYPE_SIZE # just take a guess at the item size + dset_size = typesize + shape = shape_json["dims"] + rank = len(shape) + + for n in range(rank): + if shape[n] == 0: + # extendable extent with value of 0 + continue # assume this is one + dset_size *= shape[n] + return dset_size + + +def resize_dataset(dset_json, shape): + shape_json = dset_json["shape"] + shape_class = shape_json["class"] + if shape_class != "H5S_SIMPLE": + raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") + if len(shape_json["dims"]) != len(shape): + raise ValueError("Resize shape parameter doesn't match dataset's rank") + if "maxdims" not in shape_json: + raise ValueError("Dataset is not resizable") + dims = shape_json["dims"] + maxdims = shape_json["maxdims"] + + if shape_json["dims"] == list(shape): + # no change, just return + return + for i in range(len(dims)): + extent = shape[i] + if extent < 0: + raise ValueError("dimensions can't be negative") + if maxdims[i] == "H5S_UNLIMITED": + # any positive extent is ok + continue + if extent > maxdims[i]: + raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}") + + shape_json["dims"] = list(shape) + + +def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): + """ + create a chunk layout for datasets use contiguous storage. 
+ """ + if not isinstance(item_size, int): + msg = "ContiguousLayout can only be used with fixed-length types" + raise ValueError(msg) + + if chunk_min is None: + msg = "chunk_min not set" + raise ValueError(msg) + if chunk_max is None: + msg = "chunk_max not set" + raise ValueError(msg) + + if chunk_max < chunk_min: + raise ValueError("chunk_max cannot be less than chunk_min") + + if shape_json is None or shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return (1,) # just enough to store one item + dims = shape_json["dims"] + rank = len(dims) + if rank == 0: + raise ValueError("rank must be positive for Contiguous Layout") + for dim in dims: + if dim < 0: + raise ValueError("extents must be positive for Contiguous Layout") + if dim == 0: + # data shape with no elements, just return dims as layout + return dims + + nsize = item_size + layout = [1,] * rank + + for i in range(rank): + dim = rank - i - 1 + extent = dims[dim] + if extent * nsize < chunk_max: + # just use the full extent as layout + layout[dim] = extent + nsize *= extent + else: + n = extent + while n > 1: + n = -(-n // 2) # use negatives so we round up on odds + if n * nsize < chunk_max: + break + layout[dim] = n + break # just use 1's for the rest of the layout + + return layout + + +def getChunkSize(layout, type_size): + """Return chunk size given layout. + i.e. just the product of the values in the list. + """ + if type_size == "H5T_VARIABLE": + type_size = DEFAULT_TYPE_SIZE + + chunk_size = type_size + for n in layout: + if n <= 0: + raise ValueError("Invalid chunk layout") + chunk_size *= n + return chunk_size + + +def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): + """ + Use chunk layout given in the creationPropertiesList (if defined and + layout is valid). 
+ Return chunk_layout_json + """ + + rank = 0 + space_dims = None + chunk_dims = None + max_dims = None + + if "dims" in shape_json: + space_dims = shape_json["dims"] + rank = len(space_dims) + + if "maxdims" in shape_json: + max_dims = shape_json["maxdims"] + if "dims" in layout: + chunk_dims = layout["dims"] + + if chunk_dims: + # validate that the chunk_dims are valid and correlates with the + # dataset shape + if isinstance(chunk_dims, int): + chunk_dims = [ + chunk_dims, + ] # promote to array + if len(chunk_dims) != rank: + msg = "Layout rank does not match shape rank" + raise ValueError(msg) + for i in range(rank): + dim_extent = space_dims[i] + chunk_extent = chunk_dims[i] + if not isinstance(chunk_extent, int): + msg = "Layout dims must be integer or integer array" + raise ValueError(msg) + if chunk_extent <= 0: + msg = "Invalid layout value" + raise ValueError(msg) + if max_dims is None: + if chunk_extent > dim_extent: + msg = "Invalid layout value" + raise ValueError(reason=msg) + elif max_dims[i] != 0: + if chunk_extent > max_dims[i]: + msg = "Invalid layout value for extensible dimension" + raise ValueError(msg) + else: + pass # allow any positive value for unlimited dimensions + + if "class" not in layout: + msg = "class key not found in layout for creation property list" + raise ValueError(msg) + + layout_class = layout["class"] + + if layout_class == "H5D_CONTIGUOUS_REF": + # reference to a dataset in a traditional HDF5 files with + # contiguous storage + if item_size == "H5T_VARIABLE": + # can't be used with variable types... 
+ msg = "Datasets with variable types cannot be used with " + msg += "reference layouts" + raise ValueError(msg) + if "file_uri" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'file_uri' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + raise ValueError(msg) + if "offset" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'offset' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + raise ValueError(msg) + if "size" not in layout: + # needed for H5D_CONTIGUOUS_REF + msg = "'size' key must be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + raise ValueError(msg) + if "dims" in layout: + # used defined chunk layout not allowed for H5D_CONTIGUOUS_REF + msg = "'dims' key can not be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + raise ValueError(msg) + elif layout_class == "H5D_CHUNKED_REF": + # reference to a dataset in a traditional HDF5 files with + # chunked storage + if item_size == "H5T_VARIABLE": + # can't be used with variable types.. + msg = "Datasets with variable types cannot be used with " + msg += "reference layouts" + raise ValueError(msg) + if "file_uri" not in layout: + # needed for H5D_CHUNKED_REF + msg = "'file_uri' key must be provided for " + msg += "H5D_CHUNKED_REF layout" + raise ValueError(msg) + if "dims" not in layout: + # needed for H5D_CHUNKED_REF + msg = "'dimns' key must be provided for " + msg += "H5D_CHUNKED_REF layout" + raise ValueError(msg) + if "chunks" not in layout: + msg = "'chunks' key must be provided for " + msg += "H5D_CHUNKED_REF layout" + raise ValueError(msg) + elif layout_class == "H5D_CHUNKED_REF_INDIRECT": + # reference to a dataset in a traditional HDF5 files with chunked + # storage using an auxiliary dataset + if item_size == "H5T_VARIABLE": + # can't be used with variable types.. 
+ msg = "Datasets with variable types cannot be used with " + msg += "reference layouts" + raise ValueError(msg) + if "dims" not in layout: + # needed for H5D_CHUNKED_REF_INDIRECT + msg = "'dims' key must be provided for " + msg += "H5D_CHUNKED_REF_INDIRECT layout" + raise ValueError(msg) + if "chunk_table" not in layout: + msg = "'chunk_table' key must be provided for " + msg += "H5D_CHUNKED_REF_INDIRECT layout" + raise ValueError(msg) + chunk_table_id = layout["chunk_table"] + if not isValidUuid(chunk_table_id, "Dataset"): + msg = f"Invalid chunk table id: {chunk_table_id}" + raise ValueError(msg) + + elif layout_class == "H5D_CHUNKED": + if "dims" not in layout: + msg = "dims key not found in layout for creation property list" + raise ValueError(msg) + if shape_json["class"] != "H5S_SIMPLE": + msg = "Bad Request: chunked layout not valid with shape class: " + msg += f"{shape_json['class']}" + raise ValueError(msg) + elif layout_class == "H5D_CONTIGUOUS": + if "dims" in layout: + msg = "dims key found in layout for creation property list " + msg += "for H5D_CONTIGUOUS storage class" + raise ValueError(msg) + elif layout_class == "H5D_COMPACT": + if "dims" in layout: + msg = "dims key found in layout for creation property list " + msg += "for H5D_COMPACT storage class" + raise ValueError(msg) + else: + msg = f"Unexpected layout: {layout_class}" + raise ValueError(msg) + + +def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED"): + """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" + if shape_json is None or shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return (1,) # just enough to store one item + + layout = list(layout) + dims = shape_json["dims"] + rank = len(dims) + extendable_dims = 0 # number of dimensions that are extenable + maxdims = None + if "maxdims" in shape_json: + maxdims = shape_json["maxdims"] + for n in range(rank): + if 
maxdims[n] == 0 or maxdims[n] > dims[n]: + extendable_dims += 1 + + dset_size = get_dset_size(shape_json, typesize) + if dset_size <= chunk_min and extendable_dims == 0: + # just use the entire dataspace shape as one big chunk + return tuple(dims) + + chunk_size = getChunkSize(layout, typesize) + if chunk_size >= chunk_min: + return tuple(layout) # good already + while chunk_size < chunk_min: + # just adjust along extendable dimensions first + old_chunk_size = chunk_size + for n in range(rank): + dim = rank - n - 1 # start from last dim + + if extendable_dims > 0: + if maxdims[dim] == 0: + # infinitely extendable dimensions + layout[dim] *= 2 + chunk_size = getChunkSize(layout, typesize) + if chunk_size > chunk_min: + break + elif maxdims[dim] > layout[dim]: + # can only be extended so much + layout[dim] *= 2 + if layout[dim] >= dims[dim]: + layout[dim] = maxdims[dim] # trim back + extendable_dims -= 1 # one less extenable dimension + + chunk_size = getChunkSize(layout, typesize) + if chunk_size > chunk_min: + break + else: + pass # ignore non-extensible for now + else: + # no extendable dimensions + if dims[dim] > layout[dim]: + # can expand chunk along this dimension + layout[dim] *= 2 + if layout[dim] > dims[dim]: + layout[dim] = dims[dim] # trim back + chunk_size = getChunkSize(layout, typesize) + if chunk_size > chunk_min: + break + else: + pass # can't extend chunk along this dimension + if chunk_size <= old_chunk_size: + # stop iteration if we haven't increased the chunk size + break + elif chunk_size > chunk_min: + break # we're good + else: + pass # do another round + return tuple(layout) + + +def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"): + """Compute a reduced chunk shape with a size in bytes less than chunk_max.""" + layout = list(layout) + chunk_size = getChunkSize(layout, typesize) + if chunk_size <= chunk_max: + return tuple(layout) # good already + rank = len(layout) + + while chunk_size > chunk_max: + # just 
adjust along extendable dimensions first + old_chunk_size = chunk_size + for dim in range(rank): + if layout[dim] > 1: + # tricky way to do x // 2 with ceil + layout[dim] = -(-layout[dim] // 2) + chunk_size = getChunkSize(layout, typesize) + if chunk_size <= chunk_max: + break + else: + pass # can't shrink chunk along this dimension + if chunk_size >= old_chunk_size: + # reality check to see if we'll ever break out of the while loop + break + elif chunk_size <= chunk_max: + break # we're good + else: + pass # do another round + return tuple(layout) + + +def guessChunk(shape_json, typesize): + """Guess an appropriate chunk layout for a dataset, given its shape and + the size of each element in bytes. Will allocate chunks only as large + as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of + each axis, slightly favoring bigger values for the last index. + + Undocumented and subject to change without warning. + """ + if shape_json is None or shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return (1,) # just enough to store one item + + if "maxdims" in shape_json: + shape = shape_json["maxdims"] + else: + shape = shape_json["dims"] + + if typesize == "H5T_VARIABLE": + typesize = 128 # just take a guess at the item size + + # For unlimited dimensions we have to guess. use 1024 + shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) + + return shape + + +def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, chunk_max=None): + """ Get the layout json given by creation_props. 
+ Raise bad request error if invalid """ + + min_chunk_size = CHUNK_MIN # int(config.get("min_chunk_size")) + max_chunk_size = CHUNK_MAX # int(config.get("max_chunk_size")) + + item_size = getItemSize(type_json) + if chunk_min is None: + chunk_min = 1000 * 1000 + if chunk_max is None: + chunk_max = 4 * 1000 * 1000 + + if chunk_min > chunk_max: + msg = "chunk_max must be larger than chunk_min" + raise ValueError(msg) + + layout = None + if "layout" in creation_props: + layout_props = creation_props["layout"] + else: + layout_props = None + + if layout_props: + if "class" not in layout_props: + msg = "expected class key in layout props" + raise KeyError(msg) + layout_class = layout_props["class"] + if layout_class == "H5D_CONTIGUOUS": + # treat contiguous as chunked + layout_class = "H5D_CHUNKED" + else: + layout_class = layout_props["class"] + elif shape["class"] != "H5S_NULL": + layout_class = "H5D_CHUNKED" + else: + layout_class = None + + if layout_class == "H5D_COMPACT": + layout = {"class": "H5D_COMPACT"} + elif layout_class: + # initialize to H5D_CHUNKED + layout = {"class": "H5D_CHUNKED"} + else: + # null space - no layout + layout = None + + if layout_props and "dims" in layout_props: + chunk_dims = layout_props["dims"] + else: + chunk_dims = None + + if layout_class == "H5D_CONTIGUOUS_REF": + kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + chunk_dims = getContiguousLayout(shape, item_size, **kwargs) + layout["dims"] = chunk_dims + + if layout_class == "H5D_CHUNKED" and chunk_dims is None: + # do auto-chunking + chunk_dims = guessChunk(shape, item_size) + + if layout_class == "H5D_CHUNKED": + chunk_size = getChunkSize(chunk_dims, item_size) + + # adjust the chunk shape if chunk size is too small or too big + adjusted_chunk_dims = None + if chunk_size < min_chunk_size: + kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} + adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape, **kwargs) + elif chunk_size > 
max_chunk_size: + kwargs = {"chunk_max": max_chunk_size} + adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) + if adjusted_chunk_dims: + layout["dims"] = adjusted_chunk_dims + else: + layout["dims"] = chunk_dims # don't need to adjust chunk size + + # set partition_count if needed: + max_chunks_per_folder = int(config.get("max_chunks_per_folder")) + set_partition = False + if max_chunks_per_folder > 0: + if "dims" in shape and "dims" in layout: + set_partition = True + + if set_partition: + chunk_dims = layout["dims"] + shape_dims = shape["dims"] + if "maxdims" in shape: + max_dims = shape["maxdims"] + else: + max_dims = None + num_chunks = 1 + rank = len(shape_dims) + unlimited_count = 0 + if max_dims: + for i in range(rank): + if max_dims[i] == 0: + unlimited_count += 1 + for i in range(rank): + max_dim = 1 + if max_dims: + max_dim = max_dims[i] + if max_dim == 0: + # don't really know what the ultimate extent + # could be, but assume 10^6 for total number of + # elements and square-shaped array... 
+ MAX_ELEMENT_GUESS = 10.0 ** 6 + exp = 1 / unlimited_count + max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) + else: + max_dim = shape_dims[i] + num_chunks *= math.ceil(max_dim / chunk_dims[i]) + + if num_chunks > max_chunks_per_folder: + partition_count = math.ceil(num_chunks / max_chunks_per_folder) + msg = f"set partition count to: {partition_count}, " + msg += f"num_chunks: {num_chunks}" + layout["partition_count"] = partition_count + else: + pass # partition not needed + + if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): + chunk_size = getChunkSize(chunk_dims, item_size) + + # nothing to do about inefficiently small chunks, but large chunks + # can be subdivided + if chunk_size < min_chunk_size: + pass # too small + elif chunk_size > max_chunk_size: + pass # too large + layout["dims"] = chunk_dims diff --git a/src/h5json/filters.py b/src/h5json/filters.py index cda38178..4e985b3f 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -12,6 +12,238 @@ import h5py +from .hdf5dtype import isVlen + +# List of registered filters. Not all are supported by every reader and writer. 
+# +# +# tuple of filter key, filter id, and options, +FILTER_DEFS = ( + ("H5Z_FILTER_NONE", 0, "none", ()), + ("H5Z_FILTER_DEFLATE", 1, "gzip", ("level",)), # aka as "zlib" for blosc + ("H5Z_FILTER_SHUFFLE", 2, "shuffle", ()), + ("H5Z_FILTER_FLETCHER32", 3, "fletcher32", ()), + ("H5Z_FILTER_SZIP", 4, "szip", ("bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine")), + ("H5Z_FILTER_NBIT", 5, "nbit", ()), + ("H5Z_FILTER_SCALEOFFSET", 6, "scaleoffset", ("scaleType", "scaleOffset")), + ("H5Z_FILTER_LZF", 32000, "lzf", ()), + ("H5Z_FILTER_BLOSC", 32001, "blosclz", ()), + ("H5Z_FILTER_SNAPPY", 32003, "snappy", ()), + ("H5Z_FILTER_LZ4", 32004, "lz4", ()), + ("H5Z_FILTER_LZ4HC", 32005, "lz4hc", ()), + ("H5Z_FILTER_BITSHUFFLE", 32008, "bitshuffle", ()), + ("H5Z_FILTER_ZSTD", 32015, "zstd", ()), +) + +HDF_FILTER_OPTION_ENUMS = { + "coding": { + h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", + h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", + }, + "scaleType": { + h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", + h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", + h5py.h5z.SO_INT: "H5Z_SO_INT", + }, +} + +COMPRESSION_FILTER_IDS = ( + "H5Z_FILTER_DEFLATE", + "H5Z_FILTER_SZIP", + "H5Z_FILTER_SCALEOFFSET", + "H5Z_FILTER_LZF", + "H5Z_FILTER_BLOSC", + "H5Z_FILTER_SNAPPY", + "H5Z_FILTER_LZ4", + "H5Z_FILTER_LZ4HC", + "H5Z_FILTER_ZSTD", +) + +COMPRESSION_FILTER_NAMES = ( + "gzip", + "szip", + "lzf", + "blosclz", + "snappy", + "lz4", + "lz4hc", + "zstd", +) + + +def getFilterItem(key): + """ + Return filter code, id, and name, based on an id, a name or a code. 
+ """ + + if key == "deflate": + key = "gzip" # use gzip as equivalent + for item in FILTER_DEFS: + # check for a match by key, id, or alias (the first three elements) + for i in range(3): + if key == item[i]: + return {"class": item[0], "id": item[1], "name": item[2], "options": item[3]} + return None # not found + + +def getFiltersJson(create_props, supported_filters=None): + """ return standardized filter representation from creation properties + raise bad request if invalid """ + + # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ + # filters.html#grammar-token-filter_list + + if "filters" not in create_props: + return {} # null set + + f_in = create_props["filters"] + + if not isinstance(f_in, list): + msg = "Expected filters in creation_props to be a list" + raise TypeError(msg) + + f_out = [] + for filter in f_in: + if isinstance(filter, int) or isinstance(filter, str): + item = getFilterItem(filter) + if not item: + msg = f"filter {filter} not recognized" + raise ValueError(msg) + + if item["name"] not in supported_filters: + msg = f"filter {filter} is not supported" + raise ValueError(msg) + f_out.append(item) + elif isinstance(filter, dict): + if "class" not in filter: + msg = "expected 'class' key for filter property" + raise KeyError(msg) + if filter["class"] != "H5Z_FILTER_USER": + item = getFilterItem(filter["class"]) + elif "id" in filter: + item = getFilterItem(filter["id"]) + elif "name" in filter: + item = getFilterItem(filter["name"]) + else: + item = None + if not item: + msg = f"filter {filter['class']} not recognized" + raise ValueError(msg) + if "id" not in filter: + filter["id"] = item["id"] + elif item["id"] != filter["id"]: + msg = f"Expected {filter['class']} to have id: " + msg += f"{item['id']} but got {filter['id']}" + raise ValueError(msg) + if "name" not in filter: + filter["name"] = item["name"] + if filter["name"] not in supported_filters: + msg = f"filter {filter} is not supported" + raise KeyError(msg) + + 
f_out.append(filter) + else: + msg = f"Unexpected type for filter: {filter}" + raise ValueError(msg) + + # return standardized filter representation + return f_out + + +def getFilters(dset_json): + """Return list of filters, or empty list""" + if "creationProperties" not in dset_json: + return [] + creationProperties = dset_json["creationProperties"] + if "filters" not in creationProperties: + return [] + filters = creationProperties["filters"] + return filters + + +def getCompressionFilter(filters): + """Return compression filter from filters, or None""" + for filter in filters: + if "class" not in filter: + # expected class key - malformed filter def + continue + filter_class = filter["class"] + if filter_class in COMPRESSION_FILTER_IDS: + return filter + if all( + ( + filter_class == "H5Z_FILTER_USER", + "name" in filter, + filter["name"] in COMPRESSION_FILTER_NAMES, + ) + ): + return filter + return None + + +def getShuffleFilter(filters): + """Return shuffle filter, or None""" + FILTER_CLASSES = ("H5Z_FILTER_SHUFFLE", "H5Z_FILTER_BITSHUFFLE") + for filter in filters: + if "class" not in filter: + # invalid filter def? 
+ continue + filter_class = filter["class"] + if filter_class in FILTER_CLASSES: + return filter + + return None + + +def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): + """Get list of filter operations to be used for this dataset""" + filter_map = app["filter_map"] + + if dset_id in filter_map: + return filter_map[dset_id] + + compressionFilter = getCompressionFilter(filters) + + filter_ops = {} + + shuffleFilter = getShuffleFilter(filters) + + if shuffleFilter and not isVlen(dtype): + shuffle_name = shuffleFilter["name"] + if shuffle_name == "shuffle": + filter_ops["shuffle"] = 1 # use regular shuffle + elif shuffle_name == "bitshuffle": + filter_ops["shuffle"] = 2 # use bitshuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + + if compressionFilter: + if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": + filter_ops["compressor"] = "zlib" # blosc compressor + else: + if "name" in compressionFilter: + filter_ops["compressor"] = compressionFilter["name"] + else: + filter_ops["compressor"] = "lz4" # default to lz4 + if "level" not in compressionFilter: + filter_ops["level"] = 5 # medium level + else: + filter_ops["level"] = int(compressionFilter["level"]) + + if filter_ops: + # save the chunk shape and dtype + filter_ops["chunk_shape"] = chunk_shape + filter_ops["dtype"] = dtype + filter_map[dset_id] = filter_ops # save + + return filter_ops + else: + return None + + +""" _HDF_FILTERS = { 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, @@ -30,17 +262,7 @@ 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, } -_HDF_FILTER_OPTION_ENUMS = { - "coding": { - h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", - h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", - }, - "scaleType": { - h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", - h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", - h5py.h5z.SO_INT: 
"H5Z_SO_INT", - }, -} + # h5py supported filters _H5PY_FILTERS = { @@ -53,3 +275,5 @@ } _H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") + +""" diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index bb32a6e9..28b82ddf 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -392,29 +392,35 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): self.log.warning(f"Unknown layout value: {nLayout}") num_filters = plist.get_nfilters() + print("num_filters:", num_filters) filter_props = [] if num_filters: for n in range(num_filters): filter_info = plist.get_filter(n) + print("filter_info:", filter_info) opt_values = filter_info[2] filter_prop = {} filter_id = filter_info[0] filter_prop["id"] = filter_id if filter_info[3]: filter_prop["name"] = bytesArrayToList(filter_info[3]) - if filter_id in filters._HDF_FILTERS: - hdf_filter = filters._HDF_FILTERS[filter_id] + hdf_filter = filters.getFilterItem(filter_id) + if hdf_filter: + print("got hdf filter:", hdf_filter) + filter_prop["class"] = hdf_filter["class"] if "options" in hdf_filter: filter_opts = hdf_filter["options"] + print("got filter_opts:", filter_opts) for i in range(len(filter_opts)): if len(opt_values) <= i: break # end of option values opt_value = opt_values[i] opt_value_enum = None option_name = filter_opts[i] - if option_name in filters._HDF_FILTER_OPTION_ENUMS: - option_enums = filters._HDF_FILTER_OPTION_ENUMS[option_name] + print(f"option_name: {option_name} opt_value: {opt_value}") + if option_name in filters.HDF_FILTER_OPTION_ENUMS: + option_enums = filters.HDF_FILTER_OPTION_ENUMS[option_name] if opt_value in option_enums: opt_value_enum = option_enums[opt_value] if opt_value_enum: diff --git a/testall.py b/testall.py index 1cb36136..5911277e 100755 --- a/testall.py +++ b/testall.py @@ -26,15 +26,6 @@ "h5py_writer_test", ] -use_hsds = True -for key in ("HS_ENDPOINT", "HS_USERNAME", "HS_PASSWORD"): - if key not in 
os.environ: - use_hsds = False - print(f"not including HSDS tests, no {key} environment set") - break - -if use_hsds: - unit_tests.append("hsds_reader_test") unit_tests = tuple(unit_tests) integ_tests = ("h5tojson_test", "jsontoh5_test") From eb138bc31203c4855030941ed3a68e5535bbcda3 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 18 Sep 2025 13:23:55 +0100 Subject: [PATCH 084/129] added filter functions --- src/h5json/filters.py | 36 --------------------------- src/h5json/h5pystore/h5py_reader.py | 6 ----- src/h5json/h5pystore/h5py_writer.py | 14 +++++++++++ src/h5json/h5writer.py | 10 ++++++++ src/h5json/hdf5db.py | 9 +++++++ src/h5json/jsonstore/h5json_writer.py | 5 ++++ 6 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 4e985b3f..8268985f 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -241,39 +241,3 @@ def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): return filter_ops else: return None - - -""" -_HDF_FILTERS = { - 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, - 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, - 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, - 4: { - "class": "H5Z_FILTER_SZIP", - "alias": "szip", - "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], - }, - 5: {"class": "H5Z_FILTER_NBIT"}, - 6: { - "class": "H5Z_FILTER_SCALEOFFSET", - "alias": "scaleoffset", - "options": ["scaleType", "scaleOffset"], - }, - 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, -} - - - -# h5py supported filters -_H5PY_FILTERS = { - "gzip": 1, - "shuffle": 2, - "fletcher32": 3, - "szip": 4, - "scaleoffset": 6, - "lzf": 32000, -} - -_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") - -""" diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 28b82ddf..b4b4c184 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ 
b/src/h5json/h5pystore/h5py_reader.py @@ -392,12 +392,10 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): self.log.warning(f"Unknown layout value: {nLayout}") num_filters = plist.get_nfilters() - print("num_filters:", num_filters) filter_props = [] if num_filters: for n in range(num_filters): filter_info = plist.get_filter(n) - print("filter_info:", filter_info) opt_values = filter_info[2] filter_prop = {} filter_id = filter_info[0] @@ -406,19 +404,15 @@ def _getHDF5DatasetCreationProperties(self, dset, type_class): filter_prop["name"] = bytesArrayToList(filter_info[3]) hdf_filter = filters.getFilterItem(filter_id) if hdf_filter: - print("got hdf filter:", hdf_filter) - filter_prop["class"] = hdf_filter["class"] if "options" in hdf_filter: filter_opts = hdf_filter["options"] - print("got filter_opts:", filter_opts) for i in range(len(filter_opts)): if len(opt_values) <= i: break # end of option values opt_value = opt_values[i] opt_value_enum = None option_name = filter_opts[i] - print(f"option_name: {option_name} opt_value: {opt_value}") if option_name in filters.HDF_FILTER_OPTION_ENUMS: option_enums = filters.HDF_FILTER_OPTION_ENUMS[option_name] if opt_value in option_enums: diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 2cb42c0b..dc62ed72 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -491,3 +491,17 @@ def getStats(self): stats["lastModified"] = stat_info.st_mtime stats['owner'] = stat_info.st_uid # TBD: convert to username? 
return stats + + def getFilters(self, compressors_only=False): + """ return list of filters supported by h5py """ + + h5py_filters = ["H5Z_FILTER_DEFLATE",] + + if not compressors_only: + h5py_filters.append("H5Z_FILTER_SHUFFLE") + h5py_filters.append("H5Z_FILTER_FLETCHER32") + h5py_filters.append("H5Z_FILTER_SZIP") + h5py_filters.append("H5Z_FILTER_NBIT") + h5py_filters.append("H5Z_FILTER_SCALEOFFSET") + + return tuple(h5py_filters) diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index a4b9a522..422a0450 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -99,6 +99,11 @@ def getStats(self): """ pass + @abstractmethod + def getFilters(self, compressors_only=False): + """ returns a list of filters supported by the writer """ + pass + class H5NullWriter(H5Writer): """ @@ -170,3 +175,8 @@ def getStats(self): stats["lastModified"] = 0 stats['owner'] = "" return stats + + def getFilters(self, compressors_only=False): + """ return empty list of filters """ + + return () diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 5f9714a4..18e4f3e0 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -15,6 +15,7 @@ from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype from .array_util import jsonToArray, bytesArrayToList from .dset_util import resize_dataset +from .filters import getFiltersJson from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . 
import selections from .apiversion import _apiver @@ -834,6 +835,14 @@ def createDataset( dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} if cpl: + if "filters" in cpl: + if self.writer: + supported_filters = self.writer.getSupportedFilters() + else: + supported_filters = () + # validate and normalize supplied filter property list + filters_json = getFiltersJson(cpl, supported_filters=supported_filters) + cpl["filters"] = filters_json dset_json["creationProperties"] = cpl else: dset_json["creationProperties"] = {} diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index 343c045f..f37ac415 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -308,3 +308,8 @@ def getStats(self): stats["lastModified"] = stat_info.st_mtime stats['owner'] = stat_info.st_uid # TBD: convert to username? return stats + + def getFilters(self, compressors_only=False): + """ return empty list of filters """ + + return () From fec0a43ed71ca6e529be4e60a4f0836f8fff2e9b Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 18 Sep 2025 17:17:08 +0100 Subject: [PATCH 085/129] added more dset utility functions --- src/h5json/dset_util.py | 207 ++++++++++++++++++------- src/h5json/filters.py | 48 ------ test/unit/dset_util_test.py | 299 ++++++++++++++++++++++++++++++++++++ testall.py | 1 + 4 files changed, 453 insertions(+), 102 deletions(-) create mode 100755 test/unit/dset_util_test.py diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 34d5d0d2..c3a24e87 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -33,19 +33,47 @@ def getShapeClass(data_shape): return data_shape["class"] -def getDims(dset_json): - """ return extents of the dataset shape as a tuple """ - shape_json = dset_json["shape"] - shape_class = shape_json["class"] - if shape_class == "H5S_NULL": +def getShapeDims(shape): + """ + Get dims from a given shape json. 
Return [1,] for Scalar datasets, + None for null dataspaces + """ + dims = None + if isinstance(shape, int): + dims = [shape, ] + elif isinstance(shape, list) or isinstance(shape, tuple): + dims = shape # can use as is + elif isinstance(shape, str): + # only valid string value is H5S_NULL + if shape != "H5S_NULL": + raise ValueError("Invalid value for shape") dims = None - elif shape_class == "H5S_SCALAR": - dims = () - elif shape_class == "H5S_SIMPLE": - dims = tuple(shape_json["dims"]) + elif isinstance(shape, dict): + if shape.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): + # this is a shape_json obj + shape_json = shape + elif "shape" in shape: + # dataset or attribute json + shape_json = shape["shape"] + else: + raise ValueError(f"Unknown shape: {shape}") + + if "class" not in shape_json: + raise ValueError("'class' key not found in shape") + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + dims = None + elif shape_class == "H5S_SCALAR": + dims = [] + elif shape_class == "H5S_SIMPLE": + if "dims" not in shape_json: + raise ValueError("'dims' key expected for shape") + dims = shape_json["dims"] + else: + raise ValueError(f"Unknown shape: {shape_json}") else: - raise ValueError(f"Unexpected shape class: {shape_class}") - return dims + raise ValueError(f"Unexpected shape class: {type(shape)}") + return tuple(dims) def getNumElements(dset_json): @@ -53,51 +81,37 @@ def getNumElements(dset_json): returns None for null shape, 1 for scalar shape, and product of extents otherwise """ - return int(np.prod(getDims(dset_json))) + return int(np.prod(getShapeDims(dset_json))) def getRank(data_shape): - """ Return rank of given data shape_json """ - - shape_class = getShapeClass(data_shape) + """ Return rank of given data shape """ - if shape_class == "H5S_NULL": + dims = getShapeDims(data_shape) + if dims is None: return 0 - elif shape_class == "H5S_SCALAR": - return 0 - elif shape_class == "H5S_SIMPLE": - if "dims" not in data_shape: - raise 
KeyError("expected dims key for H5S_SIMPLE data shape") - return len(data_shape["dims"]) else: - raise ValueError(f"unexpected data shape class: {shape_class}") - + return len(dims) -def getDsetRank(dset_json): - """Get rank returning 0 for scalar or NULL data shapes""" - data_shape = dset_json["shape"] - return getRank(data_shape) - -def isNullSpace(dset_json): +def isNullSpace(shape): """Return true if this dataset is a null data space""" - shape_class = getShapeClass(dset_json["shape"]) - if shape_class == "H5S_NULL": + + dims = getShapeDims(shape) + if dims is None: return True else: return False -def isScalarSpace(dset_json): +def isScalar(shape): """ return true if this is a scalar dataset """ - data_shape = dset_json["shape"] - shape_class = getShapeClass(data_shape) - if shape_class == "H5S_NULL": + dims = getShapeDims(shape) + if dims is None or len(dims) > 0: return False - - rank = getRank(data_shape) - return True if rank == 0 else False + else: + return True def getDatasetLayout(dset_json): @@ -134,26 +148,31 @@ def getDatasetLayoutClass(dset_json): ) -def get_dset_size(shape_json, typesize): +def getDatasetSize(shape_json, typesize): """Return the size of the dataspace. For any unlimited dimensions, assume a value of 1. 
(so the return size will be the absolute minimum) """ - if shape_json is None or shape_json["class"] == "H5S_NULL": + + if isNullSpace(shape_json): return None - if shape_json["class"] == "H5S_SCALAR": - return typesize # just return size for one item + if typesize == "H5T_VARIABLE": - typesize = DEFAULT_TYPE_SIZE # just take a guess at the item size - dset_size = typesize - shape = shape_json["dims"] - rank = len(shape) + dset_size = DEFAULT_TYPE_SIZE # just take a guess at the item size + else: + dset_size = typesize + + if isScalar(shape_json): + return dset_size # just return size for one item + + dims = getShapeDims(shape_json) + rank = len(dims) for n in range(rank): - if shape[n] == 0: + if dims[n] == 0: # extendable extent with value of 0 continue # assume this is one - dset_size *= shape[n] + dset_size *= dims[n] return dset_size @@ -218,21 +237,21 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): # data shape with no elements, just return dims as layout return dims - nsize = item_size + n_size = item_size layout = [1,] * rank for i in range(rank): dim = rank - i - 1 extent = dims[dim] - if extent * nsize < chunk_max: + if extent * n_size < chunk_max: # just use the full extent as layout layout[dim] = extent - nsize *= extent + n_size *= extent else: n = extent while n > 1: n = -(-n // 2) # use negatives so we round up on odds - if n * nsize < chunk_max: + if n * n_size < chunk_max: break layout[dim] = n break # just use 1's for the rest of the layout @@ -255,6 +274,86 @@ def getChunkSize(layout, type_size): return chunk_size +def isExtensible(dims, maxdims): + """ + Determine if the dataset can be extended + """ + if maxdims is None or len(dims) == 0: + return False + rank = len(dims) + if len(maxdims) != rank: + raise ValueError("rank of maxdims does not match dataset") + for n in range(rank): + # TBD - shouldn't have H5S_UNLIMITED in any new files. 
+ # Remove check once this is confirmed + if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: + return True + return False + + +def getDsetMaxDims(dset_json): + """ + Get maxdims from a given shape. Return [1,] for Scalar datasets + + Use with H5S_NULL datasets will throw a ValueError + """ + if "shape" not in dset_json: + msg = "No shape found in dset_json" + raise KeyError(msg) + shape_json = dset_json["shape"] + shape_class = getShapeClass(shape_json) + maxdims = None + if shape_class == "H5S_NULL": + msg = "Expected shape class other than H5S_NULL" + raise ValueError(msg) + elif shape_class == "H5S_SCALAR": + maxdims = [1,] + elif shape_class == "H5S_SIMPLE": + if "maxdims" in shape_json: + maxdims = shape_json["maxdims"] + else: + msg = f"Unexpected shape class: {shape_class}" + raise ValueError(msg) + return maxdims + + +def getChunkDims(dset_json): + """ get chunk shape for given dset_json """ + + layout = getDatasetLayout(dset_json) + if layout and "dims" in layout: + return layout["dims"] + else: + # H5D_COMPACT and H5D_CONTIGUOUS will not have a dims key + # Check the layout dict in dset_json to see if it's + # defined there + if "layout" in dset_json: + layout = dset_json["layout"] + if "dims" in layout: + return layout["dims"] + return None + + +def getChunkLayout(dset_json): + """Get chunk layout. 
Throw 500 if used with non-H5D_CHUNKED layout""" + if "layout" not in dset_json: + msg = "No layout found in dset_json" + raise KeyError(msg) + layout_json = dset_json["layout"] + if "class" not in layout_json: + msg = f"Expected class key for layout: {layout_json}" + raise KeyError(msg) + layout_class = layout_json["class"] + if layout_class not in CHUNK_LAYOUT_CLASSES: + msg = f"Unexpected shape layout: {layout_class}" + raise ValueError(msg) + if "dims" not in layout_json: + msg = f"Expected dims key in layout: {layout_json}" + raise KeyError(msg) + layout = layout_json["dims"] + return layout + + def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): """ Use chunk layout given in the creationPropertiesList (if defined and @@ -425,7 +524,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class= if maxdims[n] == 0 or maxdims[n] > dims[n]: extendable_dims += 1 - dset_size = get_dset_size(shape_json, typesize) + dset_size = getDatasetSize(shape_json, typesize) if dset_size <= chunk_min and extendable_dims == 0: # just use the entire dataspace shape as one big chunk return tuple(dims) diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 8268985f..178a82bf 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -193,51 +193,3 @@ def getShuffleFilter(filters): return filter return None - - -def getFilterOps(app, dset_id, filters, dtype=None, chunk_shape=None): - """Get list of filter operations to be used for this dataset""" - filter_map = app["filter_map"] - - if dset_id in filter_map: - return filter_map[dset_id] - - compressionFilter = getCompressionFilter(filters) - - filter_ops = {} - - shuffleFilter = getShuffleFilter(filters) - - if shuffleFilter and not isVlen(dtype): - shuffle_name = shuffleFilter["name"] - if shuffle_name == "shuffle": - filter_ops["shuffle"] = 1 # use regular shuffle - elif shuffle_name == "bitshuffle": - filter_ops["shuffle"] = 2 # use bitshuffle - else: - 
filter_ops["shuffle"] = 0 # no shuffle - else: - filter_ops["shuffle"] = 0 # no shuffle - - if compressionFilter: - if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": - filter_ops["compressor"] = "zlib" # blosc compressor - else: - if "name" in compressionFilter: - filter_ops["compressor"] = compressionFilter["name"] - else: - filter_ops["compressor"] = "lz4" # default to lz4 - if "level" not in compressionFilter: - filter_ops["level"] = 5 # medium level - else: - filter_ops["level"] = int(compressionFilter["level"]) - - if filter_ops: - # save the chunk shape and dtype - filter_ops["chunk_shape"] = chunk_shape - filter_ops["dtype"] = dtype - filter_map[dset_id] = filter_ops # save - - return filter_ops - else: - return None diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py new file mode 100755 index 00000000..7c4556ea --- /dev/null +++ b/test/unit/dset_util_test.py @@ -0,0 +1,299 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import logging + +from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, getContiguousLayout, expandChunk + + +class DsetUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(DsetUtilTest, self).__init__(*args, **kwargs) + # main + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testGuessChunk(self): + + typesize = "H5T_VARIABLE" + logging.debug("hello") + + shape = {"class": "H5S_NULL"} + layout = guessChunk(shape, typesize) + self.assertTrue(layout is None) + + shape = {"class": "H5S_SCALAR"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (1,)) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + typesize = 8 + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + shape = {"class": "H5S_SIMPLE", "dims": [5]} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (5,)) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 3) + for i in range(3): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 100) + + shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} + layout = guessChunk(shape, typesize) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= 1024) + + shape = {"class": "H5S_SCALAR"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, (1,)) + + shape = {"class": "H5S_NULL"} + layout = guessChunk(shape, typesize) + self.assertEqual(layout, None) + + def testShrinkChunk(self): + CHUNK_MIN = 500 + 
CHUNK_MAX = 5000 + typesize = 1 + layout = (1, 2, 3) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + self.assertEqual(shrunk, layout) + + layout = (100, 200, 300) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + rank = len(layout) + for i in range(rank): + self.assertTrue(shrunk[i] >= 1) + self.assertTrue(shrunk[i] <= 1000 * (i + 1)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + layout = (300, 200, 100) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + rank = len(layout) + for i in range(rank): + self.assertTrue(shrunk[i] >= 1) + self.assertTrue(shrunk[i] <= 1000 * (3 - i)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + CHUNK_MIN = 1 * 1024 * 1024 + CHUNK_MAX = 4 * 1024 * 1024 + typesize = 4 + layout = (117, 201, 189, 1) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes > CHUNK_MAX) + shrunk = shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX) + self.assertEqual(shrunk, (59, 101, 95, 1)) + num_bytes = getChunkSize(shrunk, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + def testExpandChunk(self): + CHUNK_MIN = 5000 + CHUNK_MAX = 50000 + + typesize = 20 + shape = {"class": "H5S_SIMPLE", "dims": [12, ], "maxdims": [20, ]} + layout = (20,) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + # chunk layout can't be larger than dataspace + self.assertTrue(num_bytes < CHUNK_MIN) + self.assertEqual(expanded, (20,)) + + typesize = 1 + shape = {"class": "H5S_SIMPLE", "dims": 
[10, 10, 10]} + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + # chunk layout can't be larger than dataspace + self.assertTrue(num_bytes < CHUNK_MIN) + self.assertEqual(expanded, (10, 10, 10)) + + shape = {"class": "H5S_SIMPLE", "dims": [1000, 2000, 3000]} + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = {"class": "H5S_SIMPLE", "dims": [1000,]} + layout = (10,) + num_bytes = getChunkSize(layout, "H5T_VARIABLE") + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, "H5T_VARIABLE") + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": [1000, 10, 1000], + "maxdims": [1000, 100, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": [1000, 0, 1000], + "maxdims": [1000, 100, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + shape = { + "class": "H5S_SIMPLE", + "dims": 
[1000, 10, 1000], + "maxdims": [1000, 0, 1000], + } + layout = (10, 10, 10) + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + def testGetContiguousLayout(self): + typesize = 4 + chunk_min = 400 + chunk_max = 800 + + kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} + + def get_num_bytes(dims): + num_bytes = typesize + for n in dims: + num_bytes *= n + return num_bytes + + try: + shape = {"class": "H5S_SIMPLE", "dims": [100, 100]} + layout = getContiguousLayout(shape, "H5T_VARIABLE", **kwargs) + self.assertTrue(False) + except ValueError: + pass # expected + + shape = {"class": "H5S_NULL"} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertTrue(layout is None) + + shape = {"class": "H5S_SCALAR"} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertEqual(layout, (1,)) + + for extent in (1, 100, 10000): + dims = [ + extent, + ] + shape = {"class": "H5S_SIMPLE", "dims": dims} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertTrue(len(layout), 1) + chunk_bytes = get_num_bytes(layout) + space_bytes = get_num_bytes(dims) + if space_bytes > chunk_min: + self.assertTrue(chunk_bytes >= chunk_min) + + self.assertTrue(chunk_bytes <= chunk_max) + + for extent in (1, 9, 90): + dims = [extent, extent] + shape = {"class": "H5S_SIMPLE", "dims": dims} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= extent) + self.assertEqual(layout[1], extent) + chunk_bytes = get_num_bytes(layout) + space_bytes = get_num_bytes(dims) + + if space_bytes > chunk_min: + self.assertTrue(chunk_bytes >= chunk_min) + self.assertTrue(chunk_bytes <= chunk_max) + + for extent in 
(1, 10, 100): + dims = [extent, extent, 50] + shape = {"class": "H5S_SIMPLE", "dims": dims} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertTrue(len(layout), 3) + for i in range(3): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= dims[i]) + + chunk_bytes = get_num_bytes(layout) + space_bytes = get_num_bytes(dims) + + if space_bytes > chunk_min: + self.assertTrue(chunk_bytes >= chunk_min) + self.assertTrue(chunk_bytes <= chunk_max) + + for extent in (1, 100, 1000): + dims = [extent, 4] + shape = {"class": "H5S_SIMPLE", "dims": dims} + layout = getContiguousLayout(shape, typesize, **kwargs) + self.assertTrue(len(layout), 2) + for i in range(2): + self.assertTrue(layout[i] >= 1) + self.assertTrue(layout[i] <= dims[i]) + + chunk_bytes = get_num_bytes(layout) + space_bytes = get_num_bytes(dims) + + if space_bytes > chunk_min: + self.assertTrue(chunk_bytes >= chunk_min) + self.assertTrue(chunk_bytes <= chunk_max) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/testall.py b/testall.py index 5911277e..04aa4798 100755 --- a/testall.py +++ b/testall.py @@ -19,6 +19,7 @@ "array_util_test", "objid_test", "hdf5dtype_test", + "dset_util_test", "hdf5db_test", "h5json_reader_test", "h5json_writer_test", From 5ab9b6580577aea840f428d3a910b6dfeeb835b1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 3 Oct 2025 15:56:51 +0200 Subject: [PATCH 086/129] added shape_util.py --- src/h5json/config.py | 213 ---------------------------------- src/h5json/dset_util.py | 219 +++++++---------------------------- src/h5json/filters.py | 37 ++++++ src/h5json/shape_util.py | 141 ++++++++++++++++++++++ test/unit/dset_util_test.py | 9 -- test/unit/shape_util_test.py | 121 +++++++++++++++++++ testall.py | 1 + 7 files changed, 342 insertions(+), 399 deletions(-) delete mode 100755 src/h5json/config.py create mode 100644 src/h5json/shape_util.py create mode 100755 test/unit/shape_util_test.py diff --git 
a/src/h5json/config.py b/src/h5json/config.py deleted file mode 100755 index b7602ffd..00000000 --- a/src/h5json/config.py +++ /dev/null @@ -1,213 +0,0 @@ -############################################################################## -# Copyright by The HDF Group. # -# All rights reserved. # -# # -# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # -# Utilities. The full HSDS copyright notice, including # -# terms governing use, modification, and redistribution, is contained in # -# the file COPYING, which can be found at the root of the source code # -# distribution tree. If you do not have access to this file, you may # -# request a copy from help@hdfgroup.org. # -############################################################################## -import os -import json - - -class Config: - """ - User Config state - """ - _cfg = {} # global state - - def __init__(self, config_file=None, **kwargs): - if Config._cfg: - return # already initialized - if config_file: - self._config_file = config_file - elif os.path.isfile(".hscfg"): - self._config_file = ".hscfg" - else: - self._config_file = os.path.expanduser("~/.hscfg") - # process config file if found - if os.path.isfile(self._config_file): - line_number = 0 - with open(self._config_file) as f: - for line in f: - line_number += 1 - s = line.strip() - if not s: - continue - if s[0] == '#': - # comment line - continue - fields = s.split('=') - if len(fields) < 2: - print(f"config file: {self._config_file} line: {line_number} is not valid") - continue - k = fields[0].strip() - v = fields[1].strip() - if k == "complex_names": - self.complex_names = v - elif k == "bool_names": - self.bool_names = v - elif k == "track_order": - self.track_order = v - else: - Config._cfg[k] = v - - # add standard keys if not already picked up - for k in ("hs_endpoint", "hs_username", "hs_password", "hs_api_key"): - if k not in Config._cfg: - Config._cfg[k] = "" - - # override any config values with environment variable 
if found - for k in Config._cfg.keys(): - if k.upper() in os.environ: - Config._cfg[k] = os.environ[k.upper()] - - # update any values that are passed in to the constructor - for k in kwargs.keys(): - Config._cfg[k] = kwargs[k] - - # finally, set defaults for any expected keys that are not already set - for k in ("hs_endpoint", "hs_username", "hs_endpoint"): - if k not in Config._cfg: - Config._cfg[k] = None - if "bool_names" not in Config._cfg: - Config._cfg["bool_names"] = (b"FALSE", b"TRUE") - if "complex_names" not in Config._cfg: - Config._cfg["complex_names"] = ("r", "i") - if "track_order" not in Config._cfg: - Config._cfg["track_order"] = False - - def __getitem__(self, name): - """ Get a config item """ - if name not in Config._cfg: - if name.upper() in os.environ: - Config._cfg[name] = os.environ[name.upper()] - else: - return None - return Config._cfg[name] - - def get(self, name, default): - """ return config value for name or default if None """ - val = self.__getitem__(name) - if val is None: - return default - else: - return default - - def __setitem__(self, name, obj): - """ set config item """ - Config._cfg[name] = obj - - def __delitem__(self, name): - """ Delete option. 
""" - del Config._cfg[name] - - def __len__(self): - return len(Config._cfg) - - def __iter__(self): - """ Iterate over config names """ - keys = Config._cfg.keys() - for key in keys: - yield key - - def __contains__(self, name): - return name in Config._cfg - - def __repr__(self): - return json.dumps(Config._cfg) - - def keys(self): - return Config._cfg.keys() - - @property - def hs_endpoint(self): - return Config._cfg.get("hs_endpoint") - - @property - def hs_username(self): - return Config._cfg.get("hs_username") - - @property - def hs_password(self): - return Config._cfg.get("hs_password") - - @property - def hs_api_key(self): - return Config._cfg.get("hs_api_key") - - @property - def bool_names(self): - if "bool_names" in Config._cfg: - names = Config._cfg["bool_names"] - else: - names = (b"FALSE", b"TRUE") - return names - - @bool_names.setter - def bool_names(self, value): - if isinstance(value, str): - names = value.split(()) - if len(names) < 2: - raise ValueError("bool_names must have two items") - elif len(names) == 2: - pass - else: - names = names[:2] # just use the first two items - elif len(value) != 2: - raise ValueError("expected two-element list for bool_names") - else: - names = value - Config._cfg["bool_names"] = tuple(names) - - @property - def complex_names(self): - if "complex_names" in Config._cfg: - names = Config._cfg["complex_names"] - else: - names = ("r", "i") - return names - - @complex_names.setter - def complex_names(self, value): - if isinstance(value, str): - names = value.split() - if len(names) < 2: - raise ValueError("complex_names must have two items") - elif len(names) == 2: - pass - else: - names = names[:2] # just use the first two items - elif len(value) != 2: - raise ValueError("complex_names must have two values") - else: - names = value - - Config._cfg["complex_names"] = tuple(names) - - @property - def track_order(self): - if "track_order" in Config._cfg: - track = Config._cfg["track_order"] - else: - track = False - 
return track - - @track_order.setter - def track_order(self, value): - if isinstance(value, str): - tokens = value.split() - if len(tokens) == 0: - track = False - else: - track = bool(tokens[0]) # strip any comments - else: - track = bool(value) - Config._cfg["track_order"] = track - - -def get_config(config_file=None, **kwargs): - return Config(config_file=config_file, **kwargs) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index c3a24e87..4327a044 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -11,107 +11,19 @@ ############################################################################## import math -import numpy as np from .hdf5dtype import getItemSize +from .shape_util import getDataSize from .objid import isValidUuid -from . import config CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) -DEFAULT_TYPE_SIZE = 128 # Type size case when it is variable - -def getShapeClass(data_shape): - """ Return shape class of the given data shape """ - - if not isinstance(data_shape, dict): - raise TypeError("expected dict object") - - if "class" not in data_shape: - raise KeyError("expected 'class' key for data shape")\ - - return data_shape["class"] - - -def getShapeDims(shape): - """ - Get dims from a given shape json. 
Return [1,] for Scalar datasets, - None for null dataspaces - """ - dims = None - if isinstance(shape, int): - dims = [shape, ] - elif isinstance(shape, list) or isinstance(shape, tuple): - dims = shape # can use as is - elif isinstance(shape, str): - # only valid string value is H5S_NULL - if shape != "H5S_NULL": - raise ValueError("Invalid value for shape") - dims = None - elif isinstance(shape, dict): - if shape.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): - # this is a shape_json obj - shape_json = shape - elif "shape" in shape: - # dataset or attribute json - shape_json = shape["shape"] - else: - raise ValueError(f"Unknown shape: {shape}") - - if "class" not in shape_json: - raise ValueError("'class' key not found in shape") - shape_class = shape_json["class"] - if shape_class == "H5S_NULL": - dims = None - elif shape_class == "H5S_SCALAR": - dims = [] - elif shape_class == "H5S_SIMPLE": - if "dims" not in shape_json: - raise ValueError("'dims' key expected for shape") - dims = shape_json["dims"] - else: - raise ValueError(f"Unknown shape: {shape_json}") - else: - raise ValueError(f"Unexpected shape class: {type(shape)}") - return tuple(dims) - - -def getNumElements(dset_json): - """ return the number of elements defined by the dataset's shape - returns None for null shape, 1 for scalar shape, and product of - extents otherwise """ - - return int(np.prod(getShapeDims(dset_json))) - - -def getRank(data_shape): - """ Return rank of given data shape """ - - dims = getShapeDims(data_shape) - if dims is None: - return 0 - else: - return len(dims) - - -def isNullSpace(shape): - """Return true if this dataset is a null data space""" - - dims = getShapeDims(shape) - if dims is None: - return True - else: - return False - - -def isScalar(shape): - """ return true if this is a scalar dataset """ - - dims = getShapeDims(shape) - if dims is None or len(dims) > 0: - return False - else: - return True +CHUNK_LAYOUT_CLASSES = ( + "H5D_CHUNKED", + 
"H5D_CHUNKED_REF", + "H5D_CHUNKED_REF_INDIRECT", + "H5D_CONTIGUOUS_REF", +) def getDatasetLayout(dset_json): @@ -122,11 +34,7 @@ def getDatasetLayout(dset_json): cp = dset_json["creationProperties"] if "layout" in cp: layout = cp["layout"] - if not layout and "layout" in dset_json: - layout = dset_json["layout"] - if not layout: - # no layout for {dset_json - return None + return layout @@ -140,43 +48,8 @@ def getDatasetLayoutClass(dset_json): return layout_class -CHUNK_LAYOUT_CLASSES = ( - "H5D_CHUNKED", - "H5D_CHUNKED_REF", - "H5D_CHUNKED_REF_INDIRECT", - "H5D_CONTIGUOUS_REF", -) - - -def getDatasetSize(shape_json, typesize): - """Return the size of the dataspace. For - any unlimited dimensions, assume a value of 1. - (so the return size will be the absolute minimum) - """ - - if isNullSpace(shape_json): - return None - - if typesize == "H5T_VARIABLE": - dset_size = DEFAULT_TYPE_SIZE # just take a guess at the item size - else: - dset_size = typesize - - if isScalar(shape_json): - return dset_size # just return size for one item - - dims = getShapeDims(shape_json) - rank = len(dims) - - for n in range(rank): - if dims[n] == 0: - # extendable extent with value of 0 - continue # assume this is one - dset_size *= dims[n] - return dset_size - - def resize_dataset(dset_json, shape): + """ Update shape dims to the given shape provided new shape is valid for maxdims """ shape_json = dset_json["shape"] shape_class = shape_json["class"] if shape_class != "H5S_SIMPLE": @@ -259,12 +132,10 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): return layout -def getChunkSize(layout, type_size): +def getChunkSize(layout, type_size: int = 1): """Return chunk size given layout. i.e. just the product of the values in the list. 
""" - if type_size == "H5T_VARIABLE": - type_size = DEFAULT_TYPE_SIZE chunk_size = type_size for n in layout: @@ -284,8 +155,6 @@ def isExtensible(dims, maxdims): if len(maxdims) != rank: raise ValueError("rank of maxdims does not match dataset") for n in range(rank): - # TBD - shouldn't have H5S_UNLIMITED in any new files. - # Remove check once this is confirmed if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: return True return False @@ -301,7 +170,7 @@ def getDsetMaxDims(dset_json): msg = "No shape found in dset_json" raise KeyError(msg) shape_json = dset_json["shape"] - shape_class = getShapeClass(shape_json) + shape_class = shape_json["class"] maxdims = None if shape_class == "H5S_NULL": msg = "Expected shape class other than H5S_NULL" @@ -311,6 +180,8 @@ def getDsetMaxDims(dset_json): elif shape_class == "H5S_SIMPLE": if "maxdims" in shape_json: maxdims = shape_json["maxdims"] + else: + maxdims = shape_json["dims"] else: msg = f"Unexpected shape class: {shape_class}" raise ValueError(msg) @@ -335,18 +206,16 @@ def getChunkDims(dset_json): def getChunkLayout(dset_json): - """Get chunk layout. Throw 500 if used with non-H5D_CHUNKED layout""" - if "layout" not in dset_json: - msg = "No layout found in dset_json" - raise KeyError(msg) - layout_json = dset_json["layout"] - if "class" not in layout_json: - msg = f"Expected class key for layout: {layout_json}" - raise KeyError(msg) - layout_class = layout_json["class"] + """Get chunk layout. 
Return None for non-chunked layout""" + + layout_class = getDatasetLayoutClass(dset_json) + if not layout_class: + return None + if layout_class not in CHUNK_LAYOUT_CLASSES: - msg = f"Unexpected shape layout: {layout_class}" - raise ValueError(msg) + return None + + layout_json = getDatasetLayout(dset_json) if "dims" not in layout_json: msg = f"Expected dims key in layout: {layout_json}" raise KeyError(msg) @@ -379,9 +248,7 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): # validate that the chunk_dims are valid and correlates with the # dataset shape if isinstance(chunk_dims, int): - chunk_dims = [ - chunk_dims, - ] # promote to array + chunk_dims = [chunk_dims, ] # promote to array if len(chunk_dims) != rank: msg = "Layout rank does not match shape rank" raise ValueError(msg) @@ -516,7 +383,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class= layout = list(layout) dims = shape_json["dims"] rank = len(dims) - extendable_dims = 0 # number of dimensions that are extenable + extendable_dims = 0 # number of dimensions that are extendable maxdims = None if "maxdims" in shape_json: maxdims = shape_json["maxdims"] @@ -524,7 +391,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class= if maxdims[n] == 0 or maxdims[n] > dims[n]: extendable_dims += 1 - dset_size = getDatasetSize(shape_json, typesize) + dset_size = getDataSize(shape_json, typesize) if dset_size <= chunk_min and extendable_dims == 0: # just use the entire dataspace shape as one big chunk return tuple(dims) @@ -550,7 +417,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class= layout[dim] *= 2 if layout[dim] >= dims[dim]: layout[dim] = maxdims[dim] # trim back - extendable_dims -= 1 # one less extenable dimension + extendable_dims -= 1 # one less extendable dimension chunk_size = getChunkSize(layout, typesize) if chunk_size > chunk_min: @@ -579,8 +446,9 @@ def expandChunk(layout, typesize, 
shape_json, chunk_min=CHUNK_MIN, layout_class= return tuple(layout) -def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX, layout_class="H5D_CHUNKED"): +def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX): """Compute a reduced chunk shape with a size in bytes less than chunk_max.""" + layout = list(layout) chunk_size = getChunkSize(layout, typesize) if chunk_size <= chunk_max: @@ -636,18 +504,16 @@ def guessChunk(shape_json, typesize): return shape -def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, chunk_max=None): +def getLayoutJson(creation_props, + shape=None, + type_json=None, + chunk_min=CHUNK_MIN, + chunk_max=CHUNK_MAX, + max_chunks_per_folder=0): """ Get the layout json given by creation_props. - Raise bad request error if invalid """ - - min_chunk_size = CHUNK_MIN # int(config.get("min_chunk_size")) - max_chunk_size = CHUNK_MAX # int(config.get("max_chunk_size")) + Raise value error if invalid """ item_size = getItemSize(type_json) - if chunk_min is None: - chunk_min = 1000 * 1000 - if chunk_max is None: - chunk_max = 4 * 1000 * 1000 if chunk_min > chunk_max: msg = "chunk_max must be larger than chunk_min" @@ -689,7 +555,7 @@ def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, ch chunk_dims = None if layout_class == "H5D_CONTIGUOUS_REF": - kwargs = {"chunk_min": min_chunk_size, "chunk_max": max_chunk_size} + kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} chunk_dims = getContiguousLayout(shape, item_size, **kwargs) layout["dims"] = chunk_dims @@ -702,11 +568,11 @@ def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, ch # adjust the chunk shape if chunk size is too small or too big adjusted_chunk_dims = None - if chunk_size < min_chunk_size: - kwargs = {"chunk_min": min_chunk_size, "layout_class": layout_class} + if chunk_size < chunk_min: + kwargs = {"chunk_min": chunk_min, "layout_class": layout_class} adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape, 
**kwargs) - elif chunk_size > max_chunk_size: - kwargs = {"chunk_max": max_chunk_size} + elif chunk_size > chunk_max: + kwargs = {"chunk_max": chunk_max} adjusted_chunk_dims = shrinkChunk(chunk_dims, item_size, **kwargs) if adjusted_chunk_dims: layout["dims"] = adjusted_chunk_dims @@ -714,7 +580,6 @@ def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, ch layout["dims"] = chunk_dims # don't need to adjust chunk size # set partition_count if needed: - max_chunks_per_folder = int(config.get("max_chunks_per_folder")) set_partition = False if max_chunks_per_folder > 0: if "dims" in shape and "dims" in layout: @@ -762,8 +627,8 @@ def getLayoutJson(creation_props, shape=None, type_json=None, chunk_min=None, ch # nothing to do about inefficiently small chunks, but large chunks # can be subdivided - if chunk_size < min_chunk_size: + if chunk_size < chunk_min: pass # too small - elif chunk_size > max_chunk_size: + elif chunk_size > chunk_max: pass # too large layout["dims"] = chunk_dims diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 178a82bf..9164f1e8 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -193,3 +193,40 @@ def getShuffleFilter(filters): return filter return None + + +def getFilterOps(filters, dtype=None): + """Get list of filter operations to be used for this dataset""" + + compressionFilter = getCompressionFilter(filters) + + filter_ops = {} + + shuffleFilter = getShuffleFilter(filters) + + if shuffleFilter and not isVlen(dtype): + shuffle_name = shuffleFilter["name"] + if shuffle_name == "shuffle": + filter_ops["shuffle"] = 1 # use regular shuffle + elif shuffle_name == "bitshuffle": + filter_ops["shuffle"] = 2 # use bitshuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + else: + filter_ops["shuffle"] = 0 # no shuffle + + """ return list of filter operations for this dataset """ + if compressionFilter: + if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": + filter_ops["compressor"] = "zlib" # 
blosc compressor + else: + if "name" in compressionFilter: + filter_ops["compressor"] = compressionFilter["name"] + else: + filter_ops["compressor"] = "lz4" # default to lz4 + if "level" not in compressionFilter: + filter_ops["level"] = 5 # medium level + else: + filter_ops["level"] = int(compressionFilter["level"]) + + return filter_ops diff --git a/src/h5json/shape_util.py b/src/h5json/shape_util.py new file mode 100644 index 00000000..a3531cde --- /dev/null +++ b/src/h5json/shape_util.py @@ -0,0 +1,141 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import numpy as np + + +def getShapeClass(shape): + """ Return shape class of the given data shape """ + + if not isinstance(shape, dict): + raise TypeError("expected dict object") + + if shape.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): + # this is a shape_json obj + shape_json = shape + elif "shape" in shape: + # dataset or attribute json + shape_json = shape["shape"] + else: + raise ValueError(f"Unknown shape: {shape}") + + if "class" not in shape_json: + raise KeyError("expected 'class' key for data shape")\ + + return shape_json["class"] + + +def getShapeDims(shape): + """ + Get dims from a given shape json. 
Return [1,] for Scalar datasets, + None for null data spaces + """ + dims = None + if isinstance(shape, int): + dims = (shape, ) + elif isinstance(shape, list): + dims = tuple(shape) + elif isinstance(shape, tuple): + dims = shape # can use as is + elif isinstance(shape, str): + # only valid string value is H5S_NULL + if shape != "H5S_NULL": + raise ValueError("Invalid value for shape") + dims = None + elif isinstance(shape, dict): + if shape.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): + # this is a shape_json obj + shape_json = shape + elif "shape" in shape: + # dataset or attribute json + shape_json = shape["shape"] + else: + raise ValueError(f"Unknown shape: {shape}") + + if "class" not in shape_json: + raise ValueError("'class' key not found in shape") + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + dims = None + elif shape_class == "H5S_SCALAR": + dims = () + elif shape_class == "H5S_SIMPLE": + if "dims" not in shape_json: + raise ValueError("'dims' key expected for shape") + dims = tuple(shape_json["dims"]) + else: + raise ValueError(f"Unknown shape: {shape_json}") + else: + raise ValueError(f"Unexpected shape class: {type(shape)}") + return dims + + +def getNumElements(obj_json): + """ return the number of elements defined by the dataset's shape + returns None for null shape, 1 for scalar shape, and product of + extents otherwise """ + + dims = getShapeDims(obj_json) + if dims is None: + return 0 + else: + return int(np.prod(dims)) + + +def getRank(shape): + """ Return rank of given data shape """ + + dims = getShapeDims(shape) + if dims is None: + return 0 + else: + return len(dims) + + +def isNullSpace(shape): + """Return true if this dataset is a null data space""" + + shape_class = getShapeClass(shape) + if shape_class == "H5S_NULL": + return True + else: + return False + + +def isScalar(shape): + """ return true if this is a scalar dataset """ + + shape_class = getShapeClass(shape) + if shape_class == "H5S_SCALAR": + 
return True + else: + return False + + +def getDataSize(shape, type_size: int = 1): + """Return the size of the dataspace. For + any unlimited dimensions, assume a value of 1. + (so the return size will be the absolute minimum) + """ + + if isinstance(shape, dict) and isNullSpace(shape): + return 0 + + if isinstance(shape, dict) and isScalar(shape): + return type_size # just return size for one item + + dims = getShapeDims(shape) + + if dims is None: + return 0 + else: + return type_size * int(np.prod(dims)) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 7c4556ea..c8b949ec 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -154,15 +154,6 @@ def testExpandChunk(self): self.assertTrue(num_bytes > CHUNK_MIN) self.assertTrue(num_bytes < CHUNK_MAX) - shape = {"class": "H5S_SIMPLE", "dims": [1000,]} - layout = (10,) - num_bytes = getChunkSize(layout, "H5T_VARIABLE") - self.assertTrue(num_bytes < CHUNK_MIN) - expanded = expandChunk(layout, "H5T_VARIABLE", shape, chunk_min=CHUNK_MIN) - num_bytes = getChunkSize(expanded, "H5T_VARIABLE") - self.assertTrue(num_bytes > CHUNK_MIN) - self.assertTrue(num_bytes < CHUNK_MAX) - shape = { "class": "H5S_SIMPLE", "dims": [1000, 10, 1000], diff --git a/test/unit/shape_util_test.py b/test/unit/shape_util_test.py new file mode 100755 index 00000000..23c41edf --- /dev/null +++ b/test/unit/shape_util_test.py @@ -0,0 +1,121 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. 
# +############################################################################## +import unittest +import logging + +from h5json.shape_util import getShapeClass, getShapeDims, getNumElements, getRank +from h5json.shape_util import isNullSpace, isScalar, getDataSize + + +class ShapeUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(ShapeUtilTest, self).__init__(*args, **kwargs) + # main + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testSimple(self): + + type_json = { + "base": "H5T_STD_I32BE", + "class": "H5T_INTEGER" + } + vstr_json = { + "charSet": "H5T_CSET_ASCII", + "class": "H5T_STRING", + "length": "H5T_VARIABLE", + "strPad": "H5T_STR_NULLTERM" + } + null_shape_json = {"class": "H5S_NULL"} + null_shape_obj = {"type": type_json, "shape": null_shape_json} + scalar_shape_json = {"class": "H5S_SCALAR"} + scalar_shape_obj = {"type": type_json, "shape": scalar_shape_json} + vstr_scalar_shape_obj = {"type": vstr_json, "shape": scalar_shape_json} + + simple_shape_json = {"class": "H5S_SIMPLE", "dims": [5, 7]} + simple_shape_obj = {"type": type_json, "shape": simple_shape_json} + vstr_simple_shape_obj = {"type": vstr_json, "shape": simple_shape_json} + + self.assertEqual(getShapeClass(null_shape_json), "H5S_NULL") + self.assertEqual(getShapeClass(null_shape_obj), "H5S_NULL") + self.assertEqual(getShapeClass(scalar_shape_json), "H5S_SCALAR") + self.assertEqual(getShapeClass(scalar_shape_obj), "H5S_SCALAR") + self.assertEqual(getShapeClass(vstr_scalar_shape_obj), "H5S_SCALAR") + self.assertEqual(getShapeClass(simple_shape_json), "H5S_SIMPLE") + self.assertEqual(getShapeClass(simple_shape_obj), "H5S_SIMPLE") + self.assertEqual(getShapeClass(vstr_simple_shape_obj), "H5S_SIMPLE") + + self.assertEqual(getShapeDims(null_shape_json), None) + self.assertEqual(getShapeDims(null_shape_obj), None) + self.assertEqual(getShapeDims(scalar_shape_json), ()) + self.assertEqual(getShapeDims(scalar_shape_obj), ()) + 
self.assertEqual(getShapeDims(vstr_scalar_shape_obj), ()) + self.assertEqual(getShapeDims(simple_shape_json), (5, 7)) + self.assertEqual(getShapeDims(simple_shape_obj), (5, 7)) + self.assertEqual(getShapeDims(vstr_simple_shape_obj), (5, 7)) + self.assertEqual(getShapeDims(12), (12,)) + + self.assertEqual(getRank(null_shape_json), 0) + self.assertEqual(getRank(null_shape_obj), 0) + self.assertEqual(getRank(scalar_shape_json), 0) + self.assertEqual(getRank(scalar_shape_obj), 0) + self.assertEqual(getRank(vstr_scalar_shape_obj), 0) + self.assertEqual(getRank(simple_shape_json), 2) + self.assertEqual(getRank(simple_shape_obj), 2) + self.assertEqual(getRank(vstr_simple_shape_obj), 2) + self.assertEqual(getRank((1, 2, 3)), 3) + + self.assertEqual(getNumElements(null_shape_json), 0) + self.assertEqual(getNumElements(null_shape_obj), 0) + self.assertEqual(getNumElements(scalar_shape_json), 1) + self.assertEqual(getNumElements(scalar_shape_obj), 1) + self.assertEqual(getNumElements(vstr_scalar_shape_obj), 1) + self.assertEqual(getNumElements(simple_shape_json), 35) + self.assertEqual(getNumElements(simple_shape_obj), 35) + self.assertEqual(getNumElements(vstr_simple_shape_obj), 35) + self.assertEqual(getNumElements(()), 1) + self.assertEqual(getNumElements([1, 2, 3]), 6) + + self.assertEqual(isNullSpace(null_shape_json), True) + self.assertEqual(isNullSpace(null_shape_obj), True) + self.assertEqual(isNullSpace(scalar_shape_json), False) + self.assertEqual(isNullSpace(scalar_shape_obj), False) + self.assertEqual(isNullSpace(vstr_scalar_shape_obj), False) + self.assertEqual(isNullSpace(simple_shape_json), False) + self.assertEqual(isNullSpace(simple_shape_obj), False) + self.assertEqual(isNullSpace(vstr_simple_shape_obj), False) + + self.assertEqual(isScalar(null_shape_json), False) + self.assertEqual(isScalar(null_shape_obj), False) + self.assertEqual(isScalar(scalar_shape_json), True) + self.assertEqual(isScalar(scalar_shape_obj), True) + 
self.assertEqual(isScalar(vstr_scalar_shape_obj), True) + self.assertEqual(isScalar(simple_shape_json), False) + self.assertEqual(isScalar(simple_shape_obj), False) + self.assertEqual(isScalar(vstr_simple_shape_obj), False) + + self.assertEqual(getDataSize(null_shape_json, 4), 0) + self.assertEqual(getDataSize(null_shape_obj, 4), 0) + self.assertEqual(getDataSize(scalar_shape_json, 4), 4) + self.assertEqual(getDataSize(scalar_shape_obj, 4), 4) + self.assertEqual(getDataSize(vstr_scalar_shape_obj, 4), 4) + self.assertEqual(getDataSize(simple_shape_json, 4), 140) + self.assertEqual(getDataSize(simple_shape_obj, 4), 140) + self.assertEqual(getDataSize(vstr_simple_shape_obj, 4), 140) + self.assertEqual(getDataSize((), 4), 4) + self.assertEqual(getDataSize([1, 2, 3], 4), 24) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/testall.py b/testall.py index 04aa4798..34b1efd7 100755 --- a/testall.py +++ b/testall.py @@ -19,6 +19,7 @@ "array_util_test", "objid_test", "hdf5dtype_test", + "shape_util_test", "dset_util_test", "hdf5db_test", "h5json_reader_test", From dcaf2fb3d3b26e23975d494ec2db9d27da57dc9d Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 17 Oct 2025 11:18:18 +0100 Subject: [PATCH 087/129] consolidate duplicate dsetutil funcs --- src/h5json/dset_util.py | 17 ----------------- test/unit/dset_util_test.py | 38 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 4327a044..eb627443 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -189,23 +189,6 @@ def getDsetMaxDims(dset_json): def getChunkDims(dset_json): - """ get chunk shape for given dset_json """ - - layout = getDatasetLayout(dset_json) - if layout and "dims" in layout: - return layout["dims"] - else: - # H5D_COMPACT and H5D_CONTIGUOUS will not have a dims key - # Check the layout dict in dset_json to see if it's - # defined there - if "layout" 
in dset_json: - layout = dset_json["layout"] - if "dims" in layout: - return layout["dims"] - return None - - -def getChunkLayout(dset_json): """Get chunk layout. Return None for non-chunked layout""" layout_class = getDatasetLayoutClass(dset_json) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index c8b949ec..3dafeedf 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -12,7 +12,8 @@ import unittest import logging -from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, getContiguousLayout, expandChunk +from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk +from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getDatasetLayout, getChunkDims class DsetUtilTest(unittest.TestCase): @@ -22,6 +23,41 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.WARNING) + def testGetLayout(self): + contiguous_layout = {'class': 'H5D_CONTIGUOUS'} + dset_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', + 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', + 'created': 1760613930.3584619, + 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, + 'shape': {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]}, + 'lastModified': 1760613930.3584619, + 'attributeCount': 0, + 'creationProperties': {'fillValue': 3.12, 'layout': contiguous_layout}} + + layout = getDatasetLayout(dset_json) + self.assertTrue("class" in layout) + layout_class = getDatasetLayoutClass(dset_json) + self.assertEqual(layout_class, "H5D_CONTIGUOUS") + chunk_dims = getChunkDims(dset_json) + self.assertEqual(chunk_dims, None) + + chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} + dset_chunked_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', + 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', + 'created': 1760613930.3584619, + 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, + 'shape': {'class': 'H5S_SIMPLE', 'dims': [10], 
'maxdims': [20]}, + 'lastModified': 1760613930.3584619, + 'attributeCount': 0, + 'creationProperties': {'fillValue': 3.12, 'layout': chunked_layout}} + + layout = getDatasetLayout(dset_chunked_json) + self.assertTrue("class" in layout) + layout_class = getDatasetLayoutClass(dset_chunked_json) + self.assertEqual(layout_class, "H5D_CHUNKED") + chunk_dims = getChunkDims(dset_chunked_json) + self.assertEqual(chunk_dims, [2, ]) + def testGuessChunk(self): typesize = "H5T_VARIABLE" From e6357ff4bd18b993a6abe6e100940cc9b35a777f Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 17 Oct 2025 11:40:03 +0100 Subject: [PATCH 088/129] for non chunked datasets return chunk dims as dset shape --- src/h5json/dset_util.py | 22 ++++++++++++++-------- test/unit/dset_util_test.py | 4 ++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index eb627443..00220235 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -132,13 +132,13 @@ def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): return layout -def getChunkSize(layout, type_size: int = 1): +def getChunkSize(chunk_dims, type_size: int = 1): """Return chunk size given layout. i.e. just the product of the values in the list. """ chunk_size = type_size - for n in layout: + for n in chunk_dims: if n <= 0: raise ValueError("Invalid chunk layout") chunk_size *= n @@ -185,25 +185,31 @@ def getDsetMaxDims(dset_json): else: msg = f"Unexpected shape class: {shape_class}" raise ValueError(msg) - return maxdims + return tuple(maxdims) def getChunkDims(dset_json): - """Get chunk layout. Return None for non-chunked layout""" + """Get chunk layout. 
Return shape dims for non-chunked layout""" + shape_json = dset_json["shape"] + if shape_json["class"] == "H5S_NULL": + return None + if shape_json["class"] == "H5S_SCALAR": + return (1, ) + shape_dims = shape_json["dims"] layout_class = getDatasetLayoutClass(dset_json) if not layout_class: - return None + return tuple(shape_dims) if layout_class not in CHUNK_LAYOUT_CLASSES: - return None + return tuple(shape_dims) layout_json = getDatasetLayout(dset_json) if "dims" not in layout_json: msg = f"Expected dims key in layout: {layout_json}" raise KeyError(msg) - layout = layout_json["dims"] - return layout + chunk_dims = tuple(layout_json["dims"]) + return chunk_dims def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 3dafeedf..498276ce 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -39,7 +39,7 @@ def testGetLayout(self): layout_class = getDatasetLayoutClass(dset_json) self.assertEqual(layout_class, "H5D_CONTIGUOUS") chunk_dims = getChunkDims(dset_json) - self.assertEqual(chunk_dims, None) + self.assertEqual(chunk_dims, (10, )) chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} dset_chunked_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', @@ -56,7 +56,7 @@ def testGetLayout(self): layout_class = getDatasetLayoutClass(dset_chunked_json) self.assertEqual(layout_class, "H5D_CHUNKED") chunk_dims = getChunkDims(dset_chunked_json) - self.assertEqual(chunk_dims, [2, ]) + self.assertEqual(chunk_dims, (2, )) def testGuessChunk(self): From b8c474f5ad94e767e7a2731cc6b399a1f8641e2f Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 17 Oct 2025 12:14:33 +0100 Subject: [PATCH 089/129] add more tests for dset_util --- test/unit/dset_util_test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 498276ce..8a30527b 100755 --- a/test/unit/dset_util_test.py +++ 
b/test/unit/dset_util_test.py @@ -41,6 +41,23 @@ def testGetLayout(self): chunk_dims = getChunkDims(dset_json) self.assertEqual(chunk_dims, (10, )) + compact_layout = {'class': 'H5D_COMPACT'} + dset_compact_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', + 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', + 'created': 1760613930.3584619, + 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, + 'shape': {'class': 'H5S_SCALAR'}, + 'lastModified': 1760613930.3584619, + 'attributeCount': 0, + 'creationProperties': {'fillValue': 3.12, 'layout': compact_layout}} + + layout = getDatasetLayout(dset_compact_json) + self.assertTrue("class" in layout) + layout_class = getDatasetLayoutClass(dset_json) + self.assertEqual(layout_class, "H5D_CONTIGUOUS") + chunk_dims = getChunkDims(dset_compact_json) + self.assertEqual(chunk_dims, (1, )) + chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} dset_chunked_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', From 0be82f28cdc857289399d303385e7b3e5786517b Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 23 Oct 2025 18:00:48 +0100 Subject: [PATCH 090/129] add min/max param for guessChunk --- src/h5json/dset_util.py | 10 ++++++++-- test/unit/dset_util_test.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 00220235..6d28ab76 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -362,7 +362,7 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): raise ValueError(msg) -def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN, layout_class="H5D_CHUNKED"): +def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN): """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" if shape_json is None or shape_json["class"] == "H5S_NULL": return None @@ -466,7 +466,7 @@ def shrinkChunk(layout, typesize, 
chunk_max=CHUNK_MAX): return tuple(layout) -def guessChunk(shape_json, typesize): +def guessChunk(shape_json, typesize, chunk_min=None, chunk_max=None): """Guess an appropriate chunk layout for a dataset, given its shape and the size of each element in bytes. Will allocate chunks only as large as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of @@ -490,6 +490,12 @@ def guessChunk(shape_json, typesize): # For unlimited dimensions we have to guess. use 1024 shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) + chunk_size = getChunkSize(shape, typesize) + if chunk_min and chunk_size < chunk_min: + shape = expandChunk(shape, typesize, shape_json, chunk_min=chunk_min) + elif chunk_max and chunk_size > chunk_max: + shape = shrinkChunk(shape, typesize, chunk_max=chunk_max) + return shape diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 8a30527b..adeb9f4a 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -107,11 +107,14 @@ def testGuessChunk(self): self.assertEqual(layout, (5,)) shape = {"class": "H5S_SIMPLE", "dims": [100, 100, 100]} - layout = guessChunk(shape, typesize) + chunk_max = 400 + layout = guessChunk(shape, typesize, chunk_max=chunk_max) self.assertTrue(len(layout), 3) for i in range(3): self.assertTrue(layout[i] >= 1) - self.assertTrue(layout[i] <= 100) + self.assertTrue(layout[i] < 100) + chunk_size = getChunkSize(layout, typesize) + self.assertTrue(chunk_size <= chunk_max) shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} layout = guessChunk(shape, typesize) From fdb9ffa233f2201ac6f3c901bb122dfdc78f5c88 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 29 Oct 2025 11:56:51 +0000 Subject: [PATCH 091/129] added constant for valid layout classes --- src/h5json/dset_util.py | 12 ++++++++--- test/unit/dset_util_test.py | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/h5json/dset_util.py 
b/src/h5json/dset_util.py index 6d28ab76..2fce42ea 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -18,11 +18,14 @@ CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) -CHUNK_LAYOUT_CLASSES = ( + +LAYOUT_CLASSES = ( + "H5D_COMPACT", + "H5D_CONTIGUOUS", + "H5D_CONTIGUOUS_REF", "H5D_CHUNKED", "H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT", - "H5D_CONTIGUOUS_REF", ) @@ -201,7 +204,8 @@ def getChunkDims(dset_json): if not layout_class: return tuple(shape_dims) - if layout_class not in CHUNK_LAYOUT_CLASSES: + if not layout_class.startswith("H5D_CHUNKED"): + # for non-chunked layouts, just return the shape as the chunk dim return tuple(shape_dims) layout_json = getDatasetLayout(dset_json) @@ -495,6 +499,8 @@ def guessChunk(shape_json, typesize, chunk_min=None, chunk_max=None): shape = expandChunk(shape, typesize, shape_json, chunk_min=chunk_min) elif chunk_max and chunk_size > chunk_max: shape = shrinkChunk(shape, typesize, chunk_max=chunk_max) + else: + pass # good already return shape diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index adeb9f4a..f7e4aa91 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -123,6 +123,18 @@ def testGuessChunk(self): self.assertTrue(layout[i] >= 1) self.assertTrue(layout[i] <= 1024) + dims = [50000, 80000] + shape = {'class': 'H5S_SIMPLE', 'dims': dims} + chunk_min = 1048576 + chunk_max = 4194304 + layout = guessChunk(shape, typesize, chunk_min=chunk_min, chunk_max=chunk_max) + self.assertTrue(len(layout), 2) + self.assertTrue(layout[0] < dims[0]) + self.assertTrue(layout[1] < dims[1]) + chunk_size = layout[0] * layout[1] * typesize + self.assertTrue(chunk_size >= chunk_min) + self.assertTrue(chunk_size <= chunk_max) + shape = {"class": "H5S_SCALAR"} layout = guessChunk(shape, typesize) self.assertEqual(layout, (1,)) @@ -175,6 +187,18 @@ def testShrinkChunk(self): self.assertTrue(num_bytes > CHUNK_MIN) 
self.assertTrue(num_bytes < CHUNK_MAX) + shape = { + "class": "H5S_SIMPLE", + "dims": [50000, 80000], + } + layout = [782, 125] + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + def testExpandChunk(self): CHUNK_MIN = 5000 CHUNK_MAX = 50000 @@ -242,6 +266,22 @@ def testExpandChunk(self): "maxdims": [1000, 0, 1000], } layout = (10, 10, 10) + typesize = 4 + num_bytes = getChunkSize(layout, typesize) + self.assertTrue(num_bytes < CHUNK_MIN) + expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) + num_bytes = getChunkSize(expanded, typesize) + self.assertTrue(num_bytes > CHUNK_MIN) + self.assertTrue(num_bytes < CHUNK_MAX) + + CHUNK_MIN = 1024 * 1024 + CHUNK_MAX = 4 * CHUNK_MIN + shape = { + "class": "H5S_SIMPLE", + "dims": [50000, 80000], + } + layout = [100, 100] + typesize = 4 num_bytes = getChunkSize(layout, typesize) self.assertTrue(num_bytes < CHUNK_MIN) expanded = expandChunk(layout, typesize, shape, chunk_min=CHUNK_MIN) From e84a072eb70463c83dc11369e59ab99365304ce2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Dec 2025 12:15:24 +0800 Subject: [PATCH 092/129] update for create time --- src/h5json/h5pystore/h5py_reader.py | 11 ++++++----- src/h5json/h5pystore/h5py_writer.py | 1 + src/h5json/hdf5db.py | 25 +++++++++++++------------ test/unit/h5py_reader_test.py | 7 ++++--- test/unit/h5py_writer_test.py | 2 +- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index b4b4c184..6725c783 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -153,7 +153,7 @@ def __init__( else: self.log = logging.getLogger() if not h5py.is_hdf5(filepath): - self.log.warn(f"File: {filepath} is not an HDF5 
file") + self.log.warning(f"File: {filepath} is not an HDF5 file") raise IOError("not an HDF5 file") super().__init__(filepath, app_logger=app_logger) self._f = None @@ -265,8 +265,8 @@ def getAttribute(self, obj_id, name, include_data=True): item["value"] = value else: pass # no data - - item['created'] = time.time() # TBD: get attribute creation time from h5py? + stats = self.getStats() + item['created'] = stats["lastModified"] # use file modification time as attr creation time return item def getAttributes(self, obj_id, include_data=True): @@ -312,7 +312,8 @@ def _getLink(self, parent, link_name): else: item["id"] = self._addr_map[addr] - item['created'] = time.time() # TBD: get the link creation time from h5py? + stats = self.getStats() + item['created'] = stats["lastModified"] # use file modification time as attr creation time return item @@ -567,7 +568,7 @@ def getStats(self): """ stat_info = os_stat(self.filepath) stats = {} - stats['created'] = stat_info.st_ctime + stats['created'] = stat_info.st_birthtime stats["lastModified"] = stat_info.st_mtime stats['owner'] = stat_info.st_uid # TBD: convert to username? return stats diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index dc62ed72..5e1e20d7 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -401,6 +401,7 @@ def updateAttributes(self, obj_id, obj): del obj.attrs[name] else: pass # already deleted or never added + continue if "created" in attr_json and attr_json["created"] < self._flush_time: # attribute should be saved already continue diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 18e4f3e0..be84be92 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -9,7 +9,7 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. 
# ############################################################################## -import time + import numpy as np import logging from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype @@ -18,6 +18,7 @@ from .filters import getFiltersJson from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . import selections +from .time_util import getNow from .apiversion import _apiver from .h5reader import H5Reader, H5NullReader from .h5writer import H5Writer, H5NullWriter @@ -138,7 +139,7 @@ def make_dirty(self, obj_id): # object deleted, just return return obj_json = self.db[obj_id] - obj_json["lastModified"] = time.time() + obj_json["lastModified"] = getNow() if not self.is_new(obj_id): # object hasn't been initially written yet, add to dirt_object set self._dirty_objects.add(obj_id) @@ -520,7 +521,7 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): type_json = getTypeItem(dtype) # finally put it all together... 
attr_json = {"shape": shape_json, "type": type_json, "value": value_json} - attr_json["created"] = time.time() + attr_json["created"] = getNow() # slot into the obj_json["attrs"] attrs_json[name] = attr_json @@ -535,7 +536,7 @@ def deleteAttribute(self, obj_id, name): if name not in attrs_json: raise KeyError(f"attribute [{name}] not found in {obj_id}") attr_json = attrs_json[name] - attr_json["DELETED"] = time.time() # mark key for deletion + attr_json["DELETED"] = getNow() # mark key for deletion self.make_dirty(obj_id) @@ -726,26 +727,26 @@ def _addLink(self, grp_id, name, link_json): def createHardLink(self, grp_id, name, tgt_id): """ Create a new hardlink """ link_json = {"class": "H5L_TYPE_HARD", "id": tgt_id} - link_json["created"] = time.time() + link_json["created"] = getNow() self._addLink(grp_id, name, link_json) def createSoftLink(self, grp_id, name, h5path): """ Create a soft link """ link_json = {"class": "H5L_TYPE_SOFT", "h5path": h5path} - link_json["created"] = time.time() + link_json["created"] = getNow() self._addLink(grp_id, name, link_json) def createCustomLink(self, grp_id, name, link_json): """ create a custom link """ if link_json.get("class") != "H5L_TYPE_USER_DEFINED": link_json["class"] = "H5L_TYPE_USER_DEFINED" - link_json["created"] = time.time() + link_json["created"] = getNow() self._addLink(grp_id, name, link_json) def createExternalLink(self, grp_id, name, h5path, filepath): """ Create a external link link """ link_json = {"class": "H5L_TYPE_EXTERNAL", "h5path": h5path, "file": filepath} - link_json["created"] = time.time() + link_json["created"] = getNow() self._addLink(grp_id, name, link_json) def deleteLink(self, grp_id, name): @@ -757,7 +758,7 @@ def deleteLink(self, grp_id, name): if name not in links: raise KeyError(f"Link [{name}] not found in {grp_id}") link_json = links[name] - link_json["DELETED"] = time.time() # mark for deletion + link_json["DELETED"] = getNow() # mark for deletion self.make_dirty(grp_id) grp_json = 
self.getObjectById(grp_id) links = grp_json["links"] @@ -772,7 +773,7 @@ def createGroup(self, cpl=None): group_json["creationProperties"] = cpl else: group_json["creationProperties"] = {} - group_json["created"] = time.time() + group_json["created"] = getNow() self.db[grp_id] = group_json self._new_objects.add(grp_id) return grp_id @@ -797,7 +798,7 @@ def createCommittedType(self, datatype, cpl=None): type_json = getTypeItem(dt) # get canonical json description of datatype ctype_json = {"type": type_json, "attributes": {}, "creationProperties": cpl} - ctype_json["created"] = time.time() + ctype_json["created"] = getNow() self.db[ctype_id] = ctype_json self._new_objects.add(ctype_id) return ctype_id @@ -846,7 +847,7 @@ def createDataset( dset_json["creationProperties"] = cpl else: dset_json["creationProperties"] = {} - dset_json["created"] = time.time() + dset_json["created"] = getNow() dset_id = createObjId("datasets", root_id=self.root_id) self.db[dset_id] = dset_json diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index e4cc9c7d..74108313 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -12,12 +12,12 @@ import unittest import logging -import time import numpy as np from h5json import Hdf5db from h5json.h5pystore.h5py_reader import H5pyReader from h5json import selections +from h5json.time_util import getNow class H5pyReaderTest(unittest.TestCase): @@ -55,8 +55,9 @@ def testSimple(self): self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") self.assertTrue("created" in g1_link) g1_created = g1_link["created"] - now = time.time() - self.assertTrue(g1_created < now) + now = getNow() + self.assertTrue(g1_created < int(now)) + g1_id = g1_link["id"] self.assertTrue(g1_id) self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index aa481dfd..5426310d 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -38,7 
+38,7 @@ def __init__(self, *args, **kwargs): self.log.setLevel(logging.DEBUG) # create logger - handler = logging.FileHandler("./h5pywriterbtest.log") + handler = logging.FileHandler("./h5pywritertest.log") # add handler to logger self.log.addHandler(handler) From 1f53fe012806c093bcd1471fa04f744f0d25dfe1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Dec 2025 12:22:06 +0800 Subject: [PATCH 093/129] add time_util.py --- src/h5json/time_util.py | 86 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 src/h5json/time_util.py diff --git a/src/h5json/time_util.py b/src/h5json/time_util.py new file mode 100644 index 00000000..7cfcad69 --- /dev/null +++ b/src/h5json/time_util.py @@ -0,0 +1,86 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +from datetime import datetime +import time +import os +import pytz + +def unixTimeToUTC(timestamp): + """Convert unix timestamp (seconds since Jan 1, 1970, to ISO-8601 + compatible UTC time string. 
+ + """ + utc = pytz.utc + dtTime = datetime.fromtimestamp(timestamp, utc) + iso_str = dtTime.isoformat() + # isoformat returns a string like this: + # '2014-10-30T04:25:21+00:00' + # strip off the '+00:00' and replace + # with 'Z' (both are ISO-8601 compatible) + npos = iso_str.rfind("+") + iso_z = iso_str[:npos] + "Z" + return iso_z + + +def elapsedTime(timestamp): + """Get Elapsed time from given timestamp""" + delta = int(time.time()) - timestamp + if delta < 0: + return "Invalid timestamp!" + day_length = 24 * 60 * 60 + days = 0 + hour_length = 60 * 60 + hours = 0 + minute_length = 60 + minutes = 0 + ret_str = "" + + if delta > day_length: + days = delta // day_length + delta = delta % day_length + ret_str += f"{days} days " + if delta > hour_length or days > 0: + hours = delta // hour_length + delta = delta % hour_length + ret_str += f"{hours} hours " + if delta > minute_length or days > 0 or hours > 0: + minutes = delta // minute_length + delta = delta % minute_length + ret_str += f"{minutes} minutes " + ret_str += f"{delta} seconds" + return ret_str + + +def getNow(app=None): + """ + Get current time in unix timestamp + + Returns a precise timestamp even on platforms where + time.time() has low resolution (e.g. 
Windows) + """ + system = os.name + current_time = 0 + + if system == "nt": + # Windows + if app is None or "start_time_relative" not in app or "start_time" not in app: + current_time = time.time() # just use lower precision time.time() + else: + current_time = (time.perf_counter() - app["start_time_relative"]) + app["start_time"] + elif system == "posix": + # Unix + current_time = time.time() + else: + raise ValueError(f"Unsupported OS: {system}") + + return current_time From 6a6f38511e8b2f026013406f9d5fc0f01d6b4999 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Dec 2025 12:25:32 +0800 Subject: [PATCH 094/129] fix flake8 errors --- src/h5json/time_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/h5json/time_util.py b/src/h5json/time_util.py index 7cfcad69..24f6a835 100644 --- a/src/h5json/time_util.py +++ b/src/h5json/time_util.py @@ -15,6 +15,7 @@ import os import pytz + def unixTimeToUTC(timestamp): """Convert unix timestamp (seconds since Jan 1, 1970, to ISO-8601 compatible UTC time string. From 632260b103372f588ee3ab4ed02fac2e6d0dd880 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Dec 2025 12:34:19 +0800 Subject: [PATCH 095/129] revert getStats change --- src/h5json/h5pystore/h5py_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index 6725c783..fddfedb4 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -568,7 +568,7 @@ def getStats(self): """ stat_info = os_stat(self.filepath) stats = {} - stats['created'] = stat_info.st_birthtime + stats['created'] = stat_info.st_ctime stats["lastModified"] = stat_info.st_mtime stats['owner'] = stat_info.st_uid # TBD: convert to username? 
return stats From d95766949bf9a1eceb9b907f6ded91963486c79e Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Dec 2025 17:22:39 +0800 Subject: [PATCH 096/129] check for chunked for resiable dsets --- src/h5json/dset_util.py | 14 +++++++- test/unit/dset_util_test.py | 68 ++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 2fce42ea..23e54aba 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -270,7 +270,6 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): raise ValueError(msg) layout_class = layout["class"] - if layout_class == "H5D_CONTIGUOUS_REF": # reference to a dataset in a traditional HDF5 files with # contiguous storage @@ -299,6 +298,11 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): msg = "'dims' key can not be provided for " msg += "H5D_CONTIGUOUS_REF layout" raise ValueError(msg) + if "maxdims" in shape_json: + # maxdims not allowed for H5D_CONTIGUOUS_REF + msg = "'maxdims' key can not be provided for " + msg += "H5D_CONTIGUOUS_REF layout" + raise ValueError(msg) elif layout_class == "H5D_CHUNKED_REF": # reference to a dataset in a traditional HDF5 files with # chunked storage @@ -356,11 +360,19 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): msg = "dims key found in layout for creation property list " msg += "for H5D_CONTIGUOUS storage class" raise ValueError(msg) + if "maxdims" in shape_json: + msg = "maxdims found in shape for creation property list " + msg += "for H5D_CONTIGUOUS storage class" + raise ValueError(msg) elif layout_class == "H5D_COMPACT": if "dims" in layout: msg = "dims key found in layout for creation property list " msg += "for H5D_COMPACT storage class" raise ValueError(msg) + if "maxdims" in shape_json: + msg = "maxdims found in shape for creation property list " + msg += "for H5D_COMPACT storage class" + raise 
ValueError(msg) else: msg = f"Unexpected layout: {layout_class}" raise ValueError(msg) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index f7e4aa91..52fb49f9 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -13,7 +13,8 @@ import logging from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk -from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getDatasetLayout, getChunkDims +from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getChunkDims +from h5json.dset_util import validateChunkLayout, getDatasetLayout class DsetUtilTest(unittest.TestCase): @@ -25,54 +26,53 @@ def __init__(self, *args, **kwargs): def testGetLayout(self): contiguous_layout = {'class': 'H5D_CONTIGUOUS'} + fixed_1d_shape_json = {'class': 'H5S_SIMPLE', 'dims': [10]} + resizable_shape_json = {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]} + base_type = 'H5T_IEEE_F32LE' + item_size = 4 # bytes + type_json = {'class': 'H5T_FLOAT', 'base': base_type} + chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} + cpl = {'fillValue': 3.12, 'layout': contiguous_layout} + dset_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', 'created': 1760613930.3584619, - 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, - 'shape': {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]}, + 'type': type_json, + 'shape': resizable_shape_json, 'lastModified': 1760613930.3584619, - 'attributeCount': 0, - 'creationProperties': {'fillValue': 3.12, 'layout': contiguous_layout}} + 'creationProperties': cpl} layout = getDatasetLayout(dset_json) self.assertTrue("class" in layout) layout_class = getDatasetLayoutClass(dset_json) self.assertEqual(layout_class, "H5D_CONTIGUOUS") - chunk_dims = getChunkDims(dset_json) - self.assertEqual(chunk_dims, (10, )) - - compact_layout = {'class': 'H5D_COMPACT'} - dset_compact_json = {'id': 
'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', - 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', - 'created': 1760613930.3584619, - 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, - 'shape': {'class': 'H5S_SCALAR'}, - 'lastModified': 1760613930.3584619, - 'attributeCount': 0, - 'creationProperties': {'fillValue': 3.12, 'layout': compact_layout}} - - layout = getDatasetLayout(dset_compact_json) + + # contigous layout with resizable shape should raise exception + try: + validateChunkLayout(dset_json["shape"], item_size, layout) + self.assertTrue(False) # should not reach here + except ValueError: + pass # should raise exception + + dset_json["shape"] = fixed_1d_shape_json + layout = getDatasetLayout(dset_json) self.assertTrue("class" in layout) layout_class = getDatasetLayoutClass(dset_json) self.assertEqual(layout_class, "H5D_CONTIGUOUS") - chunk_dims = getChunkDims(dset_compact_json) - self.assertEqual(chunk_dims, (1, )) - chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} - dset_chunked_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', - 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', - 'created': 1760613930.3584619, - 'type': {'class': 'H5T_FLOAT', 'base': 'H5T_IEEE_F32LE'}, - 'shape': {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]}, - 'lastModified': 1760613930.3584619, - 'attributeCount': 0, - 'creationProperties': {'fillValue': 3.12, 'layout': chunked_layout}} - - layout = getDatasetLayout(dset_chunked_json) + dset_json["shape"] = resizable_shape_json + cpl["layout"] = chunked_layout + layout = getDatasetLayout(dset_json) self.assertTrue("class" in layout) - layout_class = getDatasetLayoutClass(dset_chunked_json) + layout_class = getDatasetLayoutClass(dset_json) self.assertEqual(layout_class, "H5D_CHUNKED") - chunk_dims = getChunkDims(dset_chunked_json) + + try: + validateChunkLayout(dset_json["shape"], item_size, layout) + except ValueError: + self.assertTrue(False) # should raise exception + + chunk_dims = getChunkDims(dset_json) 
self.assertEqual(chunk_dims, (2, )) def testGuessChunk(self): From 978a54849234bae57d37bdf14738a22e62ccb7ef Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Dec 2025 18:48:22 +0800 Subject: [PATCH 097/129] added validateDatasetCreationProps --- src/h5json/dset_util.py | 58 +++++++++++++++++++++++++++++++++++-- test/unit/dset_util_test.py | 5 ++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 23e54aba..57d983b5 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -11,8 +11,10 @@ ############################################################################## import math -from .hdf5dtype import getItemSize +from .hdf5dtype import getItemSize, createDataType from .shape_util import getDataSize +from .array_util import getNumpyValue +from .filters import getFiltersJson from .objid import isValidUuid CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) @@ -216,7 +218,7 @@ def getChunkDims(dset_json): return chunk_dims -def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): +def validateChunkLayout(shape_json, type_json, layout): """ Use chunk layout given in the creationPropertiesList (if defined and layout is valid). 
@@ -227,6 +229,7 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): space_dims = None chunk_dims = None max_dims = None + item_size = getItemSize(type_json) if "dims" in shape_json: space_dims = shape_json["dims"] @@ -378,6 +381,57 @@ def validateChunkLayout(shape_json, item_size, layout, chunk_table=None): raise ValueError(msg) +def validateDatasetCreationProps(creation_props, type_json=None, shape=None): + """ validate creation props """ + + if not type_json or not shape: + msg = "validateDatasetCreationProps - shape and type must be set" + raise ValueError(msg) + + if "fillValue" in creation_props: + # validate fill value compatible with type + dt = createDataType(type_json) + fill_value = creation_props["fillValue"] + if "fillValue_encoding" in creation_props: + fill_value_encoding = creation_props["fillValue_encoding"] + if fill_value_encoding not in ("None", "base64"): + msg = f"unexpected value for fill_value_encoding: {fill_value_encoding}" + raise ValueError(msg) + else: + # should see a string in this case + if not isinstance(fill_value, str): + msg = f"unexpected fill value: {fill_value} " + msg += f"for encoding: {fill_value_encoding}" + raise ValueError(msg) + else: + fill_value_encoding = None + + try: + getNumpyValue(fill_value, dt=dt, encoding=fill_value_encoding) + except ValueError: + msg = f"invalid fill value: {fill_value}" + raise ValueError(msg) + + layout_class = None + if "layout" in creation_props: + layout_json = creation_props["layout"] + validateChunkLayout(shape, type_json, layout_json) + layout_class = layout_json["class"] + + if "filters" in creation_props: + try: + filters_out = getFiltersJson(creation_props) + except (KeyError, ValueError): + # raise bad request exception if not valid + msg = "invalid filter provided" + raise ValueError(msg) + if filters_out: + # check that a chunked layout is used + if layout_class is None or layout_class.startswith("H5D_CHUNKED") is False: + msg = "filters can only be 
used with chunked layout" + raise ValueError(msg) + + def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN): """Compute an increased chunk shape with a size in bytes greater than chunk_min.""" if shape_json is None or shape_json["class"] == "H5S_NULL": diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 52fb49f9..3b2e35cd 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -29,7 +29,6 @@ def testGetLayout(self): fixed_1d_shape_json = {'class': 'H5S_SIMPLE', 'dims': [10]} resizable_shape_json = {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]} base_type = 'H5T_IEEE_F32LE' - item_size = 4 # bytes type_json = {'class': 'H5T_FLOAT', 'base': base_type} chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [2, ]} cpl = {'fillValue': 3.12, 'layout': contiguous_layout} @@ -49,7 +48,7 @@ def testGetLayout(self): # contigous layout with resizable shape should raise exception try: - validateChunkLayout(dset_json["shape"], item_size, layout) + validateChunkLayout(dset_json["shape"], type_json, layout) self.assertTrue(False) # should not reach here except ValueError: pass # should raise exception @@ -68,7 +67,7 @@ def testGetLayout(self): self.assertEqual(layout_class, "H5D_CHUNKED") try: - validateChunkLayout(dset_json["shape"], item_size, layout) + validateChunkLayout(dset_json["shape"], type_json, layout) except ValueError: self.assertTrue(False) # should raise exception From 6cb136e61a2de1298dfbf56a1ce515b167107103 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 11 Dec 2025 18:55:46 +0800 Subject: [PATCH 098/129] updated dset_util_test --- test/unit/dset_util_test.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 3b2e35cd..85c260f2 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -14,7 +14,7 @@ from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk from 
h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getChunkDims -from h5json.dset_util import validateChunkLayout, getDatasetLayout +from h5json.dset_util import validateChunkLayout, validateDatasetCreationProps, getDatasetLayout class DsetUtilTest(unittest.TestCase): @@ -69,11 +69,16 @@ def testGetLayout(self): try: validateChunkLayout(dset_json["shape"], type_json, layout) except ValueError: - self.assertTrue(False) # should raise exception + self.assertTrue(False) # shouldn't raise exception chunk_dims = getChunkDims(dset_json) self.assertEqual(chunk_dims, (2, )) + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + def testGuessChunk(self): typesize = "H5T_VARIABLE" From 678025bfb2f80499b52abb726a456a056e41f2ef Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 15 Dec 2025 15:21:13 +0800 Subject: [PATCH 099/129] added filter validation --- src/h5json/dset_util.py | 2 +- src/h5json/filters.py | 107 ++++++++++++++++++++++++++++++------ test/unit/dset_util_test.py | 55 ++++++++++++++++++ 3 files changed, 145 insertions(+), 19 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 57d983b5..872e3160 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -421,7 +421,7 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): if "filters" in creation_props: try: filters_out = getFiltersJson(creation_props) - except (KeyError, ValueError): + except (KeyError, TypeError, ValueError): # raise bad request exception if not valid msg = "invalid filter provided" raise ValueError(msg) diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 9164f1e8..d4b256d7 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -71,6 +71,20 @@ ) +def getAllFilterNames(): + """ Return list of all recognized filter names """ + + names = set() + for item in FILTER_DEFS: + filter_id = item[1] + 
filter_name = item[2] + if filter_id > 0 and filter_name: + names.add(filter_name) + names = list(names) + names.sort() + return tuple(names) + + def getFilterItem(key): """ Return filter code, id, and name, based on an id, a name or a code. @@ -102,6 +116,9 @@ def getFiltersJson(create_props, supported_filters=None): msg = "Expected filters in creation_props to be a list" raise TypeError(msg) + if not supported_filters: + supported_filters = getAllFilterNames() + f_out = [] for filter in f_in: if isinstance(filter, int) or isinstance(filter, str): @@ -115,11 +132,12 @@ def getFiltersJson(create_props, supported_filters=None): raise ValueError(msg) f_out.append(item) elif isinstance(filter, dict): - if "class" not in filter: - msg = "expected 'class' key for filter property" - raise KeyError(msg) - if filter["class"] != "H5Z_FILTER_USER": - item = getFilterItem(filter["class"]) + if filter.get("class") == "H5Z_FILTER_USER": + # user filter - must have either id or name + if "id" not in filter and "name" not in filter: + msg = "user filter must have either 'id' or 'name' key" + raise KeyError(msg) + item = filter elif "id" in filter: item = getFilterItem(filter["id"]) elif "name" in filter: @@ -127,21 +145,74 @@ def getFiltersJson(create_props, supported_filters=None): else: item = None if not item: - msg = f"filter {filter['class']} not recognized" - raise ValueError(msg) - if "id" not in filter: - filter["id"] = item["id"] - elif item["id"] != filter["id"]: - msg = f"Expected {filter['class']} to have id: " - msg += f"{item['id']} but got {filter['id']}" + msg = f"filter {filter} not recognized" raise ValueError(msg) - if "name" not in filter: - filter["name"] = item["name"] - if filter["name"] not in supported_filters: - msg = f"filter {filter} is not supported" - raise KeyError(msg) - f_out.append(filter) + # copy any filter specified options + filter_class = item["class"] + if filter_class == "H5Z_FILTER_DEFLATE": + if "level" in filter: + level_val = 
filter["level"] + if not isinstance(level_val, int): + msg = "Expected integer level for deflate filter" + raise TypeError(msg) + if level_val < 0 or level_val > 9: + msg = "Deflate filter level must be between 0 and 9" + raise ValueError(msg) + item["level"] = level_val + elif filter_class == "H5Z_FILTER_SHUFFLE": + pass # no options + elif filter_class == "H5Z_FILTER_FLETCHER32": + pass # no options + elif filter_class == "H5Z_FILTER_SZIP": + for key in ("bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"): + if key in filter: + val = filter[key] + if key == "coding": + if val not in HDF_FILTER_OPTION_ENUMS["coding"].values(): + msg = f"Invalid coding option for szip filter: {val}" + raise ValueError(msg) + else: + # other options need to be positivie integers + if not isinstance(val, int) or val <= 0: + msg = f"Expected positive integer for szip filter option {key}" + raise ValueError(msg) + item[key] = val + elif filter_class == "H5Z_FILTER_NBIT": + pass # no options + elif filter_class == "H5Z_FILTER_SCALEOFFSET": + if "scaleType" in filter: + val = filter["scaleType"] + if val not in HDF_FILTER_OPTION_ENUMS["scaleType"].values(): + msg = f"Invalid scaleType option for scaleoffset filter: {val}" + raise ValueError(msg) + else: + item["scaleType"] = val + if "scaleOffset" in filter: + val = filter["scaleOffset"] + if not isinstance(val, int) or val < 0: + msg = "Expected non-negative integer for scaleOffset option" + raise ValueError(msg) + else: + item["scaleOffset"] = val + elif filter_class == "H5Z_FILTER_LZF": + pass # no options + elif filter_class == "H5Z_FILTER_BLOSC": + pass # no options + elif filter_class == "H5Z_FILTER_SNAPPY": + pass # no options + elif filter_class == "H5Z_FILTER_LZ4": + pass # no options + elif filter_class == "H5Z_FILTER_LZ4HC": + pass # no options + elif filter_class == "H5Z_FILTER_BITSHUFFLE": + pass # no options + elif filter_class == "H5Z_FILTER_ZSTD": + pass # no options + else: + msg = f"filter class 
{filter_class} is not supported" + raise KeyError(msg) + f_out.append(item) else: msg = f"Unexpected type for filter: {filter}" raise ValueError(msg) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 85c260f2..82466013 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -79,6 +79,61 @@ def testGetLayout(self): except ValueError: self.assertTrue(False) # shouldn't raise exception + def testFilterValidation(self): + + shape_json = {'class': 'H5S_SIMPLE', 'dims': [500]} + base_type = 'H5T_IEEE_F32LE' + type_json = {'class': 'H5T_FLOAT', 'base': base_type} + contiguous_layout = {'class': 'H5D_CONTIGUOUS'} + chunked_layout = {'class': 'H5D_CHUNKED', 'dims': [100, ]} + deflate_filter = {'class': 'H5Z_FILTER_DEFLATE', 'id': 1, 'name': 'deflate'} + filters = [deflate_filter, ] + cpl = {'fillValue': 3.12, 'layout': contiguous_layout, "filters": filters} + + dset_json = {'id': 'd-f4a9f95e-c8962a53-f6c8-f18440-78d051', + 'root': 'g-f4a9f95e-c8962a53-7c21-71d640-1ea2db', + 'created': 1760613930.3584619, + 'type': type_json, + 'shape': shape_json, + 'lastModified': 1760613930.3584619, + 'creationProperties': cpl} + + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + self.assertTrue(False) # should not reach here + except ValueError: + pass # filters are invalid with contiguous layout + cpl["layout"] = chunked_layout + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + # add an invlaid level option for deflate + deflate_filter["level"] = 20 + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + self.assertTrue(False) # should not reach here + except ValueError: + pass # invalid deflate level + deflate_filter["level"] = 5 + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + # try with just a 
filter name + cpl["filters"] = ["gzip", ] + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + # try with an invalid filter name + cpl["filters"] = ["invalid_filter_name", ] + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + self.assertTrue(False) # should not reach here + except ValueError: + pass # invalid filter name + def testGuessChunk(self): typesize = "H5T_VARIABLE" From b68e9679f3ba6b5a2b977b0d22a06016da19a6ef Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 17 Dec 2025 17:50:04 +0800 Subject: [PATCH 100/129] fix for getFilters --- src/h5json/filters.py | 3 +++ src/h5json/hdf5db.py | 2 +- test/unit/dset_util_test.py | 9 +++++++ test/unit/h5py_writer_test.py | 45 +++++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/h5json/filters.py b/src/h5json/filters.py index d4b256d7..724ac929 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -148,6 +148,9 @@ def getFiltersJson(create_props, supported_filters=None): msg = f"filter {filter} not recognized" raise ValueError(msg) + # will replace options list with specified options + del item["options"] + # copy any filter specified options filter_class = item["class"] if filter_class == "H5Z_FILTER_DEFLATE": diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index be84be92..08af1c16 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -838,7 +838,7 @@ def createDataset( if cpl: if "filters" in cpl: if self.writer: - supported_filters = self.writer.getSupportedFilters() + supported_filters = self.writer.getFilters() else: supported_filters = () # validate and normalize supplied filter property list diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 82466013..c029fd01 100755 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -134,6 +134,15 @@ def testFilterValidation(self): 
except ValueError: pass # invalid filter name + deflate_filter = {'class': 'H5Z_FILTER_DEFLATE', 'id': 1, 'level': 9, 'name': 'deflate'} + fletcher_filter = {'class': 'H5Z_FILTER_FLETCHER32', 'id': 3, 'name': 'fletcher32'} + filters = [fletcher_filter, deflate_filter] + cpl["filters"] = filters + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + def testGuessChunk(self): typesize = "H5T_VARIABLE" diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 5426310d..5b2ff629 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -635,6 +635,51 @@ def testReaderWithUpdate(self): else: self.assertEqual(data[i, j], 0) + def testCompression(self): + + filepath = "test/unit/out/h5py_writer_test_testCompression.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + self.assertEqual(db.getObjectIdByPath("/"), root_id) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + + layout = {"class": "H5D_CHUNKED", "dims": (10, 1)} + gzip_filter = { + "class": "H5Z_FILTER_DEFLATE", + "id": 1, + "level": 9, + "name": "deflate", + } + cpl = {"layout": layout, "filters": [gzip_filter, ]} + dset_id = db.createDataset(shape=(10, 10), dtype=np.int32, cpl=cpl) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) 
+ db.setDatasetValues(dset_id, sel_all, arr) + db.createHardLink(g1_id, "dset1.1.1", dset_id) + db.close() + + # open file with h5py and verify changes + with h5py.File(filepath) as f: + + self.assertTrue("g1" in f) + + g1 = f["g1"] + self.assertEqual(len(g1), 1) + self.assertTrue("dset1.1.1" in g1) + dset = g1["dset1.1.1"] + self.assertEqual(dset.shape, (10, 10)) + for i in range(10): + for j in range(10): + self.assertEqual(dset[i, j], i * j) + if __name__ == "__main__": # setup test files From 7ad35b7bcb8629099bf0a06460eb08e6555ad852 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 22 Dec 2025 15:44:44 +0800 Subject: [PATCH 101/129] updates for dataset reads/writes --- pyproject.toml | 6 +- src/h5json/dset_util.py | 4 + src/h5json/h5pystore/h5py_reader.py | 1 - src/h5json/h5reader.py | 6 +- src/h5json/h5writer.py | 3 +- src/h5json/hdf5db.py | 182 +++++++++++++++++----------- src/h5json/selections.py | 23 ++++ test/unit/h5py_writer_test.py | 1 + test/unit/hdf5db_test.py | 72 +++++++++++ 9 files changed, 220 insertions(+), 78 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 11302438..a299a9e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,8 @@ classifiers = [ authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] requires-python = ">=3.9" +version = "1.0.0" + dependencies = [ "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", @@ -24,7 +26,7 @@ dependencies = [ "tomli; python_version<'3.11'", ] -dynamic = ["version"] +#dynamic = ["version"] [project.urls] Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" @@ -44,7 +46,7 @@ dev = ["check-manifest"] test = ["coverage"] [build-system] -requires = ["setuptools", "setuptools_scm", "wheel"] +requires = ["setuptools >= 61"] build-backend = "setuptools.build_meta" [tool.setuptools] diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 872e3160..b9b57563 
100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -40,6 +40,10 @@ def getDatasetLayout(dset_json): if "layout" in cp: layout = cp["layout"] + if layout is None and "layout" in dset_json: + # previous HSDS versions stored layout here + layout = dset_json["layout"] + return layout diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index fddfedb4..e0d5d825 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -13,7 +13,6 @@ import numpy as np import logging from os import stat as os_stat -import time from ..objid import createObjId, getCollectionForId from ..hdf5dtype import getTypeItem, isOpaqueDtype diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py index a4127097..08df2adb 100644 --- a/src/h5json/h5reader.py +++ b/src/h5json/h5reader.py @@ -14,7 +14,6 @@ import logging import time -import numpy as np from .objid import createObjId @@ -158,10 +157,9 @@ def getDatasetValues(self, obj_id, sel=None, dtype=None): number of elements as the rank of the dataset. 
""" - # just return a zero array - arr = np.zeros(sel.shape, dtype=dtype) + # just return None - return arr + return None def open(self): """ Open data source for reading """ diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py index 422a0450..fc368bfe 100644 --- a/src/h5json/h5writer.py +++ b/src/h5json/h5writer.py @@ -78,7 +78,8 @@ def open(self): @abstractmethod def flush(self): """ Write dirty items """ - pass + # return False since we can't actually persist anything + return False @abstractmethod def close(self): diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 08af1c16..9468c9fe 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -15,6 +15,7 @@ from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype from .array_util import jsonToArray, bytesArrayToList from .dset_util import resize_dataset +from .shape_util import getShapeClass, getShapeDims from .filters import getFiltersJson from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . import selections @@ -24,6 +25,14 @@ from .h5writer import H5Writer, H5NullWriter +def _getDatasetUpdates(dset_json): + """ return a list of value updates for the datset. + initalize one if not already present. 
""" + if "updates" not in dset_json: + dset_json["updates"] = [] + return dset_json["updates"] + + class Hdf5db: """ This class is used to manage id lookup tables for primary HDF objects (Groups, Datasets, @@ -109,10 +118,12 @@ def root_id(self): def is_new(self, obj_id): """ return true if this is a new object (has not been persisted) """ + obj_id = getHashTagForId(obj_id) return obj_id in self._new_objects def is_dirty(self, obj_id): """ return true if this object has been modified """ + obj_id = getHashTagForId(obj_id) if self.is_new(obj_id): return True return obj_id in self._dirty_objects @@ -131,7 +142,7 @@ def deleted_objects(self): def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ - + obj_id = getHashTagForId(obj_id) if obj_id not in self.db: self.log.error("make dirty called on deleted object") raise KeyError(f"obj_id: {obj_id} not found") @@ -236,8 +247,8 @@ def close(self): """ close reader and writer handles """ self.log.info("Hdf5db __close") - self.flush() - if self.writer: + if self.writer and not isinstance(self.writer, H5NullWriter): + self.flush() self.writer.close() if self.reader: self.reader.close() @@ -280,13 +291,13 @@ def _checkWriter(self): def getObjectById(self, obj_id, refresh=False): """ return object with given id """ self._checkReader() - tag = getHashTagForId(obj_id) - if tag not in self.db or refresh: + obj_id = getHashTagForId(obj_id) + if obj_id not in self.db or refresh: # load the obj from the reader self.log.debug(f"getObjectById - fetching {obj_id} from reader") obj_json = self.reader.getObjectById(obj_id) - self.db[tag] = obj_json - obj_json = self.db[tag] + self.db[obj_id] = obj_json + obj_json = self.db[obj_id] return obj_json @@ -299,6 +310,9 @@ def getObjectIdByPath(self, h5path, parent_id=None): if parent_id is None: parent_id = self.root_id + else: + parent_id = getHashTagForId(parent_id) + self.log.debug(f"getObjectIdDByPath(h5path: {h5path} parent_id: {parent_id}") 
obj_json = self.getObjectById(parent_id) @@ -359,7 +373,7 @@ def getObjectByPath(self, path): return obj_json def getDtype(self, obj_json): - """ Return numpy data type for given object id + """ Return numpy data type for given dataset, datatype, or attribute """ if "type" not in obj_json: @@ -546,30 +560,26 @@ def getDatasetValues(self, dset_id, sel): If a slices list or tuple is provided, it should have the same number of elements as the rank of the dataset. """ + + def init_arr(dtype, cpl): + """ create an ndarray with the give shape, dtype and fill_value + (if the latter is found in the creation properties list) """ + arr_shape = sel.count if isinstance(sel.count, tuple) else (sel.count, ) + arr = np.zeros(arr_shape, dtype=dtype) + if "fillValue" in cpl: + fillValue = cpl["fillValue"] + # TBD: fix for compound types + arr[...] = fillValue + return arr + + dset_id = getHashTagForId(dset_id) self.log.info(f"getDatasetValues dset_id: {dset_id}, sel: {sel}") - self._checkReader() dset_json = self.getObjectById(dset_id) shape_json = dset_json["shape"] if not isinstance(sel, selections.Selection): raise TypeError("Expected Selection class") - if shape_json["class"] == "H5S_NULL": - return None - - if shape_json["class"] == "H5S_SCALAR": - if sel.select_type != selections.H5S_SELECT_ALL: - # TBD: support other selection types - raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") - if sel.shape != (): - raise ValueError("Selection shape does not match dataset shape") - rank = 0 - else: - dims = tuple(shape_json["dims"]) - if sel.shape != dims: - raise ValueError("Selection shape does not match dataset shape") - rank = len(dims) - dtype = self.getDtype(dset_json) if "creationProperties" in dset_json: @@ -577,50 +587,72 @@ def getDatasetValues(self, dset_id, sel): else: cpl = {} - # determine if we need to make a read request or not - if dset_id in self._new_objects: + updates = _getDatasetUpdates(dset_json) + + shape_class = 
getShapeClass(shape_json) + + if shape_class == "H5S_NULL": + # return None for selections on null space + return None + + if sel.shape != getShapeDims(shape_json): + raise ValueError("Selection shape does not match dataset shape") + + if shape_class == "H5S_SCALAR": + if sel.select_type != selections.H5S_SELECT_ALL: + # TBD: support other selection types + raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") + if sel.shape != (): + raise ValueError("Selection shape does not match dataset shape") + if updates: + # for scalars the update has to be the requested value + (update_sel, arr) = updates[-1] + elif dset_id in self._new_objects: + arr = init_arr(dtype, cpl) + else: + # fetch from the server + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) + if arr is None: + raise KeyError(f"Data for dataset {dset_id} not returned") + # done with NULL and SCALAR cases + return arr + + # simple daaset + arr = None + fetch = True + + # determine if we need to get data from the reader + if isinstance(self._reader, H5NullReader) or dset_id in self._new_objects: fetch = False else: - fetch = True - # check against pending updates - if "updates" in dset_json: - updates = dset_json["updates"] - for (update_sel, update_val) in updates: - if selections.contained(sel, update_sel): - fetch = False - break - - # send a reader request unless an update already covers the sel area - if fetch: - arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) - else: - if "fillValue" in cpl: - fillValue = cpl["fillValue"] - # TBD: fix for compound types - arr = np.zeros(sel.mshape, dtype=dtype) - arr[...] 
= fillValue - else: - arr = np.zeros(sel.mshape, dtype=dtype) - - if "updates" in dset_json: - # apply any non-flushed changes that intersect the current selection - updates = dset_json["updates"] for (update_sel, update_val) in updates: sel_inter = selections.intersect(sel, update_sel) if sel_inter.nselect == 0: continue - # update portion of arr, that intersects update_val - slices = [] - for dim in range(rank): - start = sel_inter.start[dim] - sel.start[dim] - stop = start + sel_inter.count[dim] - slices.append(slice(start, stop, 1)) - slices = tuple(slices) - # TBD: needs updating to work in the general case! - if slices == (): - arr[slices] = update_val[slices] - else: - arr[slices] = update_val + if selections.contained(sel, update_sel): + # desired selection is wholly contained in this update + # TBD: determine if multiple updates would contain all the + # required elements + fetch = False + break + if fetch: + # get last saved version of the data from the reader + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) + else: + # initialize an array with fill value if given + arr = init_arr(dtype, cpl) + + # apply any updates that impact this selection + for (update_sel, update_val) in updates: + # get the part of the update that is in common with the requested selection + x_sel = selections.intersect(sel, update_sel) + if x_sel.nselect == 0: + # this update doesn't effect the selection, so ignore + continue + # apply the update to the array to be returned + src_sel = selections.translate(update_sel, x_sel) + tgt_sel = selections.translate(sel, x_sel) + arr[tgt_sel.slices] = update_val[src_sel.slices] return arr @@ -641,22 +673,32 @@ def setDatasetValues(self, dset_id, sel, arr): src_dt = arr.dtype if src_dt != tgt_dt: raise TypeError("arr.dtype doesn't match dataset dtype") - - if shape_json["class"] == "H5S_NULL": + shape_class = getShapeClass(shape_json) + if shape_class == "H5S_NULL": raise ValueError("writing to null space dataset not 
supported") - if shape_json["class"] == "H5S_SCALAR": + if shape_class == "H5S_SCALAR": if sel.shape != (): raise ValueError("Selection shape does not match dataset shape") if len(arr.shape) > 0: raise TypeError("Expected scalar ndarray for scalar dataset") else: - dims = tuple(shape_json["dims"]) + dims = getShapeDims(shape_json) if sel.shape != dims: raise ValueError("Selection shape does not match dataset shape") - if "updates" not in dset_json or sel.select_type == selections.H5S_SELECT_ALL: + updates = _getDatasetUpdates(dset_json) + if sel.select_type == selections.H5S_SELECT_ALL: # for select all, throw out any existing updates since this will overwrite them - dset_json["updates"] = [] - updates = dset_json["updates"] + updates.clear() + arr = arr.copy() # make a copy in case the client updates it later + rank = len(sel.shape) + if len(arr.shape) < rank: + # reshape to keep compatiblity with dataset rank + if sel.select_type == selections.H5S_SELECT_ALL: + # this should not result in a dimension reduction + raise ValueError("unexpected selection shape") + if sel.select_type != selections.H5S_SELECT_HYPERSLABS: + raise ValueError("tbd") + arr = arr.reshape(sel.mshape) updates.append((sel, arr.copy())) self.make_dirty(dset_id) diff --git a/src/h5json/selections.py b/src/h5json/selections.py index ec4ac649..93dd8bcb 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -158,6 +158,8 @@ def contained(s1, s2): is_contained = True rank = len(s1.shape) + if len(s2.shape) != rank: + raise ValueError("contained can be used in selections of different ranks") for dim in range(rank): if s1.step[dim] > 1 or s2.step[dim] > 1: # TBD: do the right thing for stepped selections @@ -173,6 +175,27 @@ def contained(s1, s2): return is_contained +def translate(s1, s2): + """ Given two selections, s1 and s2, return a new selection + definied by s2 relative to s1's stat and count. 
+ s2 must be contained in s1 """ + + _check_bool_args(s1, s2) + sel_inter = intersect(s1, s2) + if sel_inter.nselect == 0: + raise ValueError("translate - selections not overlapping") + + rank = len(s1.shape) + + slices = [] + for dim in range(rank): + start = s2.start[dim] - s1.start[dim] + count = s2.count[dim] + slices.append(slice(start, start + count, 1)) + slices = tuple(slices) + return select(s1.shape, slices) + + class Selection(object): """ diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 5b2ff629..f0091a39 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -58,6 +58,7 @@ def testOpen(self): self.assertEqual(db.getObjectIdByPath("/"), root_id) db.close() self.assertTrue(db.closed) + self.assertTrue(db.writer.isClosed()) obj_id = db.open() self.assertEqual(obj_id, root_id) db.close() diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 63030ef2..11bdd30b 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -432,20 +432,85 @@ def testSimpleDataset(self): db.createAttribute(dset_id, "a1", "Hello, world") sel_all = selections.select(shape, ...) 
arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) self.assertEqual(arr.shape, shape) self.assertEqual(arr.min(), 0) self.assertEqual(arr.max(), 0) row = np.zeros((ncols,), dtype=dtype) + + # set values row by row for i in range(nrows): row[:] = list(range(i * 10, (i + 1) * 10)) row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) db.setDatasetValues(dset_id, row_sel, row) + + # read entire dataset arr = db.getDatasetValues(dset_id, sel_all) for i in range(nrows): row = np.array(list(range(i * 10, (i + 1) * 10)), dtype=dtype) np.testing.assert_array_equal(arr[i, :], row) + # read row by row + for i in range(nrows): + sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) + row = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(row, np.ndarray)) + self.assertEqual(row.shape, (1, ncols)) + for j in range(ncols): + self.assertEqual(row[0, j], i * 10 + j) + + # read col by col + for j in range(ncols): + sel = selections.select(shape, (slice(0, ncols), slice(j, j + 1))) + col = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(col, np.ndarray)) + self.assertEqual(col.shape, (nrows, 1)) + for i in range(nrows): + self.assertEqual(col[i, 0], i * 10 + j) + + # read element by element + for i in range(nrows): + for j in range(ncols): + sel = selections.select(shape, (slice(i, i + 1), slice(j, j + 1))) + val = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(val, np.ndarray)) + self.assertEqual(val.shape, (1, 1)) + self.assertEqual(val[0, 0], i * 10 + j) + + db.close() + + def testStringDataset(self): + nrows = 6 + ncols = 3 + shape = (nrows, ncols) + dtype = np.dtype("S1") + data = [[b'a', b'b', b'c'], + [b'd', b'e', b'f'], + [b'g', b'h', b'i'], + [b'j', b'k', b'l'], + [b'm', b'n', b'o'], + [b'x', b'y', b'z']] + init_arr = np.array(data, dtype=dtype) + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + 
db.createHardLink(root_id, "dset", dset_id) + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + + db.setDatasetValues(dset_id, sel_all, init_arr) + + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue(np.array_equal(arr, init_arr)) + sel_one = selections.select(shape, (slice(5, 6), slice(2, 3))) + arr = db.getDatasetValues(dset_id, sel_one) + self.assertEqual(arr.shape, (1, 1)) + self.assertEqual(arr[0, 0], b'z') + db.close() def testBoolDataset(self): @@ -473,6 +538,13 @@ def testBoolDataset(self): self.assertEqual(arr.shape, (3,)) self.assertEqual(list(arr[...]), [False, True, False]) + # read back three elements + sel_three = selections.select(shape, slice(1, 4)) + arr = db.getDatasetValues(dset_id, sel_three) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, (3,)) + self.assertEqual(list(arr[...]), [True, False, False]) + db.close() def testScalarDataset(self): From 8bb734e6b347426e0e9eb3ce0861f2b7ea870c85 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 24 Dec 2025 20:47:58 +0800 Subject: [PATCH 102/129] update filter methods --- src/h5json/dset_util.py | 345 ++++++++++++++++---------------- src/h5json/filters.py | 377 +++++++++++++++++------------------ src/h5json/hdf5db.py | 5 +- src/h5json/shape_util.py | 104 +++++++++- test/unit/dset_util_test.py | 4 +- test/unit/filter_test.py | 98 +++++++++ test/unit/hdf5db_test.py | 0 test/unit/hdf5dtype_test.py | 0 test/unit/objid_test.py | 0 test/unit/shape_util_test.py | 41 +++- 10 files changed, 593 insertions(+), 381 deletions(-) mode change 100755 => 100644 test/unit/dset_util_test.py create mode 100644 test/unit/filter_test.py mode change 100755 => 100644 test/unit/hdf5db_test.py mode change 100755 => 100644 test/unit/hdf5dtype_test.py mode change 100755 => 100644 test/unit/objid_test.py mode change 100755 => 100644 test/unit/shape_util_test.py diff --git 
a/src/h5json/dset_util.py b/src/h5json/dset_util.py index b9b57563..ffcf0147 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -12,13 +12,14 @@ import math from .hdf5dtype import getItemSize, createDataType -from .shape_util import getDataSize +from .shape_util import getDataSize, getShapeClass, getNumElements, getShapeDims +from .shape_util import isExtensible, getMaxDims, getRank from .array_util import getNumpyValue -from .filters import getFiltersJson +from .filters import validateFilters from .objid import isValidUuid CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) -CHUNK_MAX = 2048 * 1024 # Hard upper limit (2M) +CHUNK_MAX = 8096 * 1024 # Hard upper limit (2M) LAYOUT_CLASSES = ( @@ -57,6 +58,33 @@ def getDatasetLayoutClass(dset_json): return layout_class +def estimateDatasetSize(shape_json, item_size, chunk_min=CHUNK_MIN): + """ Get the dataset size in bytes. Make a reasonable guess + for extensible datasets """ + + shape_class = getShapeClass(shape_json) + if shape_class == "H5S_NULL": + return 0 + if shape_class == "H5S_SCALAR": + return item_size + if "maxdims" not in shape_json: + # can just multiple item_size by the number of elements + return item_size * getNumElements(shape_json) + max_dims = getMaxDims(shape_json) + rank = getRank(shape_json) + nsize = item_size + for dim in range(rank): + extent = max_dims[dim] + if extent not in (0, "H5S_UNLIMITED"): + nsize *= extent + # if the current size is less than min_chunk size, + # return something just larger than min_chunk_size + if chunk_min and nsize < chunk_min: + nsize = chunk_min + nsize = -(-nsize // item_size) * item_size # round up to be divisible by item_size + return nsize + + def resize_dataset(dset_json, shape): """ Update shape dims to the given shape provided new shape is valid for maxdims """ shape_json = dset_json["shape"] @@ -88,7 +116,7 @@ def resize_dataset(dset_json, shape): def getContiguousLayout(shape_json, item_size, chunk_min=None, chunk_max=None): """ - 
create a chunk layout for datasets use contiguous storage. + create a chunk layout for datasets using contiguous storage. """ if not isinstance(item_size, int): msg = "ContiguousLayout can only be used with fixed-length types" @@ -154,49 +182,6 @@ def getChunkSize(chunk_dims, type_size: int = 1): return chunk_size -def isExtensible(dims, maxdims): - """ - Determine if the dataset can be extended - """ - if maxdims is None or len(dims) == 0: - return False - rank = len(dims) - if len(maxdims) != rank: - raise ValueError("rank of maxdims does not match dataset") - for n in range(rank): - if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: - return True - return False - - -def getDsetMaxDims(dset_json): - """ - Get maxdims from a given shape. Return [1,] for Scalar datasets - - Use with H5S_NULL datasets will throw a ValueError - """ - if "shape" not in dset_json: - msg = "No shape found in dset_json" - raise KeyError(msg) - shape_json = dset_json["shape"] - shape_class = shape_json["class"] - maxdims = None - if shape_class == "H5S_NULL": - msg = "Expected shape class other than H5S_NULL" - raise ValueError(msg) - elif shape_class == "H5S_SCALAR": - maxdims = [1,] - elif shape_class == "H5S_SIMPLE": - if "maxdims" in shape_json: - maxdims = shape_json["maxdims"] - else: - maxdims = shape_json["dims"] - else: - msg = f"Unexpected shape class: {shape_class}" - raise ValueError(msg) - return tuple(maxdims) - - def getChunkDims(dset_json): """Get chunk layout. 
Return shape dims for non-chunked layout""" @@ -423,17 +408,17 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): layout_class = layout_json["class"] if "filters" in creation_props: + filters = creation_props["filters"] try: - filters_out = getFiltersJson(creation_props) - except (KeyError, TypeError, ValueError): + validateFilters(filters) + except (KeyError, TypeError, ValueError) as e: # raise bad request exception if not valid - msg = "invalid filter provided" + msg = f"invalid filter provided: {str(e)}" + raise ValueError(msg) + # check that a chunked layout is used + if layout_class and layout_class.startswith("H5D_CHUNKED") is False: + msg = "filters can only be used with chunked layout" raise ValueError(msg) - if filters_out: - # check that a chunked layout is used - if layout_class is None or layout_class.startswith("H5D_CHUNKED") is False: - msg = "filters can only be used with chunked layout" - raise ValueError(msg) def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN): @@ -540,7 +525,7 @@ def shrinkChunk(layout, typesize, chunk_max=CHUNK_MAX): return tuple(layout) -def guessChunk(shape_json, typesize, chunk_min=None, chunk_max=None): +def guessChunk(shape, typesize, chunk_min=None, chunk_max=None): """Guess an appropriate chunk layout for a dataset, given its shape and the size of each element in bytes. Will allocate chunks only as large as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of @@ -548,11 +533,17 @@ def guessChunk(shape_json, typesize, chunk_min=None, chunk_max=None): Undocumented and subject to change without warning. 
""" - if shape_json is None or shape_json["class"] == "H5S_NULL": + if shape is None or isinstance(shape, dict) and shape.get("class") == "H5S_NULL": return None - if shape_json["class"] == "H5S_SCALAR": + if isinstance(shape, dict) and shape.get("class") == "H5S_SCALAR": return (1,) # just enough to store one item + # if we are passed shape as a tuple, create an shape json using H5S_SIMPLE + if isinstance(shape, (list, tuple)): + shape_json = {"class": "H5S_SIMPLE", "dims": shape} + else: + shape_json = shape + if "maxdims" in shape_json: shape = shape_json["maxdims"] else: @@ -575,131 +566,133 @@ def guessChunk(shape_json, typesize, chunk_min=None, chunk_max=None): return shape -def getLayoutJson(creation_props, - shape=None, - type_json=None, - chunk_min=CHUNK_MIN, - chunk_max=CHUNK_MAX, - max_chunks_per_folder=0): - """ Get the layout json given by creation_props. - Raise value error if invalid """ +def generateLayout( + shape_json, + item_size=0, + has_filter=False, + chunks=None, + chunk_min=CHUNK_MIN, + chunk_max=CHUNK_MAX, + max_chunks_per_folder=0 +): - item_size = getItemSize(type_json) + """ Create a dataset layout based on type and shape properties """ + + if item_size < 0: + raise ValueError("item_size is invalid") + + shape_class = getShapeClass(shape_json) + if shape_class == "H5S_NULL": + if chunks or has_filter: + raise ValueError("Null space datasets do not support chunking") + return {} + + if shape_class == "H5S_SCALAR": + if chunks or has_filter: + raise ValueError("Scalar datasets do not support chunking") + return {"class": "H5D_CONIGUOUS"} if chunk_min > chunk_max: msg = "chunk_max must be larger than chunk_min" raise ValueError(msg) - layout = None - if "layout" in creation_props: - layout_props = creation_props["layout"] - else: - layout_props = None - - if layout_props: - if "class" not in layout_props: - msg = "expected class key in layout props" - raise KeyError(msg) - layout_class = layout_props["class"] - if layout_class == 
"H5D_CONTIGUOUS": - # treat contiguous as chunked - layout_class = "H5D_CHUNKED" - else: - layout_class = layout_props["class"] - elif shape["class"] != "H5S_NULL": - layout_class = "H5D_CHUNKED" - else: - layout_class = None - - if layout_class == "H5D_COMPACT": - layout = {"class": "H5D_COMPACT"} - elif layout_class: - # initialize to H5D_CHUNKED - layout = {"class": "H5D_CHUNKED"} - else: - # null space - no layout - layout = None + dset_size = estimateDatasetSize(shape_json, item_size, chunk_min=chunk_min) + shape_dims = getShapeDims(shape_json) + rank = len(shape_dims) + max_dims = getMaxDims(shape_json) + extensible = isExtensible(shape_dims, max_dims) - if layout_props and "dims" in layout_props: - chunk_dims = layout_props["dims"] - else: - chunk_dims = None + if dset_size < chunk_min and not extensible and not has_filter and not chunks: + # can just return a contiguous layout + return {"class": "H5D_CONTIGUOUS"} - if layout_class == "H5D_CONTIGUOUS_REF": + layout = {"class": "H5D_CHUNKED"} # otherwise use chunked layout + chunk_dims = None + if chunks: + if isinstance(chunks, (tuple, list)): + chunk_dims = chunks + if len(chunk_dims) != rank: + raise ValueError("given chunk dims do not agree with dataset rank") + if not chunk_dims: kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} - chunk_dims = getContiguousLayout(shape, item_size, **kwargs) - layout["dims"] = chunk_dims - - if layout_class == "H5D_CHUNKED" and chunk_dims is None: - # do auto-chunking - chunk_dims = guessChunk(shape, item_size) - - if layout_class == "H5D_CHUNKED": - chunk_size = getChunkSize(chunk_dims, item_size) - - # adjust the chunk shape if chunk size is too small or too big - adjusted_chunk_dims = None - if chunk_size < chunk_min: - kwargs = {"chunk_min": chunk_min, "layout_class": layout_class} - adjusted_chunk_dims = expandChunk(chunk_dims, item_size, shape, **kwargs) - elif chunk_size > chunk_max: - kwargs = {"chunk_max": chunk_max} - adjusted_chunk_dims = 
shrinkChunk(chunk_dims, item_size, **kwargs) - if adjusted_chunk_dims: - layout["dims"] = adjusted_chunk_dims - else: - layout["dims"] = chunk_dims # don't need to adjust chunk size - - # set partition_count if needed: - set_partition = False - if max_chunks_per_folder > 0: - if "dims" in shape and "dims" in layout: - set_partition = True - - if set_partition: - chunk_dims = layout["dims"] - shape_dims = shape["dims"] - if "maxdims" in shape: - max_dims = shape["maxdims"] - else: - max_dims = None - num_chunks = 1 - rank = len(shape_dims) - unlimited_count = 0 - if max_dims: - for i in range(rank): - if max_dims[i] == 0: - unlimited_count += 1 - for i in range(rank): - max_dim = 1 - if max_dims: - max_dim = max_dims[i] - if max_dim == 0: - # don't really know what the ultimate extent - # could be, but assume 10^6 for total number of - # elements and square-shaped array... - MAX_ELEMENT_GUESS = 10.0 ** 6 - exp = 1 / unlimited_count - max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) - else: - max_dim = shape_dims[i] - num_chunks *= math.ceil(max_dim / chunk_dims[i]) - - if num_chunks > max_chunks_per_folder: - partition_count = math.ceil(num_chunks / max_chunks_per_folder) - msg = f"set partition count to: {partition_count}, " - msg += f"num_chunks: {num_chunks}" - layout["partition_count"] = partition_count + chunk_dims = getChunkDims(shape_json, item_size, **kwargs) + layout["dims"] = chunk_dims + + # set partition_count if needed: + if max_chunks_per_folder > 0: + num_chunks = 1 + rank = len(shape_dims) + unlimited_count = 0 + for dim in range(rank): + if max_dims[dim] in (0, "H5S_UNLIMITED"): + unlimited_count += 1 + for dim in range(rank): + max_dim = 1 + max_dim = max_dims[dim] + if max_dim in (0, "H5S_UNLIMITED"): + # don't really know what the ultimate extent + # could be, but assume 10^6 for total number of + # elements and square-shaped array... 
+ MAX_ELEMENT_GUESS = 10.0 ** 6 + exp = 1 / unlimited_count + max_dim = int(math.pow(MAX_ELEMENT_GUESS, exp)) else: - pass # partition not needed - - if layout_class in ("H5D_CHUNKED_REF", "H5D_CHUNKED_REF_INDIRECT"): - chunk_size = getChunkSize(chunk_dims, item_size) - - # nothing to do about inefficiently small chunks, but large chunks - # can be subdivided - if chunk_size < chunk_min: - pass # too small - elif chunk_size > chunk_max: - pass # too large - layout["dims"] = chunk_dims + max_dim = shape_dims[dim] + num_chunks *= math.ceil(max_dim / chunk_dims[dim]) + + if num_chunks > max_chunks_per_folder: + partition_count = math.ceil(num_chunks / max_chunks_per_folder) + layout["partition_count"] = partition_count + else: + pass # partition not needed + return layout + + +def generate_dcpl( + shape_json, + dtype, + chunks=None, + filters=[], + chunk_min=CHUNK_MIN, + chunk_max=CHUNK_MAX, + max_chunks_per_folder=None, + initializer=None, + initializer_opts=None +): + """Generate a dataset creation property list. 
+ + """ + + plist = {} + + shape_class = getShapeClass(shape_json) + + if shape_class != "H5S_SIMPLE": + if chunks or filters: + raise TypeError(f"{shape_class} datasets don't support chunk/filter options") + + return plist # return empty property list for non-simple datasets + + validateFilters(filters) # check filter params if any + + # End argument validation + + kwargs = {"item_size": dtype.itemsize, "has_filter": filters} + kwargs["chunks"] = chunks + kwargs["chunk_min"] = chunk_min + kwargs["chunk_max"] = chunk_max + kwargs["max_chunks_per_folder"] = max_chunks_per_folder + plist["layout"] = generateLayout(shape_json, **kwargs) + + if len(filters) > 0: + plist["filters"] = filters + + if initializer: + # TBD: this needs to be documented in the json spec + # pass in initializer options + initializer = [initializer,] + if initializer_opts: + initializer.extend(initializer_opts) + plist["initializer"] = initializer + + return plist diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 724ac929..3ddfe3f5 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -14,13 +14,17 @@ from .hdf5dtype import isVlen +DEFAULT_GZIP = 4 +DEFAULT_SZIP = 4 +SO_INT_MINBITS_DEFAULT = 0 + # List of registered filters. Not all are supported by every reader and writer. # # # tuple of filter key, filter id, and options, FILTER_DEFS = ( ("H5Z_FILTER_NONE", 0, "none", ()), - ("H5Z_FILTER_DEFLATE", 1, "gzip", ("level",)), # aka as "zlib" for blosc + ("H5Z_FILTER_DEFLATE", 1, "gzip", ("level",)), # aka as "default" or "zlib" for blosc ("H5Z_FILTER_SHUFFLE", 2, "shuffle", ()), ("H5Z_FILTER_FLETCHER32", 3, "fletcher32", ()), ("H5Z_FILTER_SZIP", 4, "szip", ("bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine")), @@ -85,143 +89,193 @@ def getAllFilterNames(): return tuple(names) -def getFilterItem(key): +def getFilterItem(name, options={}): """ Return filter code, id, and name, based on an id, a name or a code. 
""" - - if key == "deflate": - key = "gzip" # use gzip as equivalent + # is key is dict, just verify it's a valid filter and return + filter_json = None + + if isinstance(name, dict): + filter_json = name + base_keys = ("class", "id", "name") + for key in base_keys: + if key not in filter_json: + raise KeyError(f"Expected {key} for filter") + # use class key to look up options + name = filter_json["class"] + elif name in ("deflate", "zlib"): + name = "gzip" # use gzip as equivalent + + option_set = None for item in FILTER_DEFS: # check for a match by key, id, or alias (the first three elements) for i in range(3): - if key == item[i]: - return {"class": item[0], "id": item[1], "name": item[2], "options": item[3]} - return None # not found - + if name == item[i]: + if filter_json is None: + filter_json = {"class": item[0], "id": item[1], "name": item[2]} + option_set = set(item[3]) + break + + if not filter_json and isinstance(name, int) and name > 32000: + filter_json = {"class": "H5Z_FILTER_USER", "id": name, "name": f"user filter {name}"} + + if not filter_json: + raise KeyError(f"filter {name} is unknown") + + filter_class = filter_json["class"] + if filter_class == "H5Z_FILTER_USER": + option_set = set() + option_set.add("parameters") + + # check that any option supplied is supported by the filter + for key in options: + if key not in option_set: + msg = f"Option {key} is not supported by the {filter_class} filter" + raise KeyError(msg) + + # for any supplied options verify they are correct type and range + # (raise Type or Value error if not). If option is not given, use + # the default value if not. 
Finally add options to the filter_json + + if filter_class == "H5Z_FILTER_DEFLATE": + if "level" in options: + level_val = options["level"] + if not isinstance(level_val, int): + msg = "Expected integer level for deflate filter" + raise TypeError(msg) + if level_val < 0 or level_val > 9: + msg = "Deflate filter level must be between 0 and 9" + raise ValueError(msg) + filter_json["level"] = level_val + else: + filter_json["level"] = DEFAULT_GZIP + + elif filter_class == "H5Z_FILTER_SHUFFLE": + pass # no options + elif filter_class == "H5Z_FILTER_FLETCHER32": + pass # no options + elif filter_class == "H5Z_FILTER_SZIP": + for key in option_set: # option set("bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"): + if key in options: + val = options[key] + if key == "coding": + if val not in HDF_FILTER_OPTION_ENUMS["coding"].values(): + msg = f"Invalid coding option for szip filter: {val}" + raise ValueError(msg) + else: + # other options need to be positivie integers + if not isinstance(val, int) or val <= 0: + msg = f"Expected positive integer for szip filter option {key}" + raise ValueError(msg) + filter_json[key] = val + else: + pass # no defaults for szip + elif filter_class == "H5Z_FILTER_NBIT": + pass # no options + elif filter_class == "H5Z_FILTER_SCALEOFFSET": + if "scaleType" in options: + val = options["scaleType"] + if val not in HDF_FILTER_OPTION_ENUMS["scaleType"].values(): + msg = f"Invalid scaleType option for scaleoffset filter: {val}" + raise ValueError(msg) -def getFiltersJson(create_props, supported_filters=None): - """ return standardized filter representation from creation properties - raise bad request if invalid """ + filter_json["scaleType"] = val + if "scaleOffset" in options: + val = options["scaleOffset"] + if not isinstance(val, int) or val < 0: + msg = "Expected non-negative integer for scaleOffset option" + raise ValueError(msg) + filter_json["scaleOffset"] = val + elif filter_class == "H5Z_FILTER_LZF": + pass # no options + 
elif filter_class == "H5Z_FILTER_BLOSC": + pass # no options + elif filter_class == "H5Z_FILTER_SNAPPY": + pass # no options + elif filter_class == "H5Z_FILTER_LZ4": + pass # no options + elif filter_class == "H5Z_FILTER_LZ4HC": + pass # no options + elif filter_class == "H5Z_FILTER_BITSHUFFLE": + pass # no options + elif filter_class == "H5Z_FILTER_ZSTD": + pass # no options + elif filter_class == "H5Z_FILTER_NONE": + pass # no options + elif filter_class == "H5Z_FILTER_USER": + if "parameters" in options: + parameters = options["parameters"] + # expecting a positive integer array + if not isinstance(parameters, (list, tuple)): + raise TypeError(f"filter {filter_class} parameters option should be a list") + vals = [] + for val in parameters: + if not isinstance(val, int): + raise TypeError(f"filter {filter_class} parameters expected integer value") + if val <= 0: + raise TypeError(f"filter {filter_class} parameters option should be a positive int") + vals.append(val) + filter_json["parameters"] = val + else: + msg = f"filter class {filter_class} is not supported" + raise KeyError(msg) + + return filter_json + + +def validateFilter(filter_json, supported_filters=None): + """ Check the given the given filter for create format, + required options set. Raise TypeError, KeyError or ValueError if not. + If supported_filters is supplied, raise KeyError if a non-supported + filter is supplied. 
""" + + if not isinstance(filter_json, dict): + raise TypeError(f"Expected dict for filter but got {type(filter_json)}") + base_keys = ("class", "id", "name") + for key in base_keys: + if key not in filter_json: + raise KeyError(f"Expected {key} for filter") + filter_class = filter_json["class"] + filter_id = filter_json["id"] + # check that the filter_class agrees with the id in FILTER_DEFS + options = None + for filter_def in FILTER_DEFS: + if filter_def[0] == filter_class: + if filter_id != filter_def[1]: + msg = f"Incorrect filter_id: {filter_id} for filter: {filter_class}" + raise ValueError(msg) + # collect any filter options to check later + options = {} + for key in filter_json: + if key in base_keys: + continue + options[key] = filter_json[key] + break - # refer to https://hdf5-json.readthedocs.io/en/latest/bnf/\ - # filters.html#grammar-token-filter_list + if options is None and filter_class == "H5Z_FILTER_USER": + # custom filter, id should be > 32000 + if filter_id <= 32000: + raise ValueError(f"Unexpected filter id: {filter_id} for user filter") + options = {} + for key in filter_json: + if key in base_keys: + continue + options[key] = filter_json[key] - if "filters" not in create_props: - return {} # null set + if options is None: + raise KeyError(f"Unknown filter: {filter_class}") - f_in = create_props["filters"] + # will raise error if any option is invalid + getFilterItem(filter_json, options) - if not isinstance(f_in, list): - msg = "Expected filters in creation_props to be a list" - raise TypeError(msg) - if not supported_filters: - supported_filters = getAllFilterNames() +def validateFilters(filters, supported_filters=None): + """ validate each filter in the filter list """ - f_out = [] - for filter in f_in: - if isinstance(filter, int) or isinstance(filter, str): - item = getFilterItem(filter) - if not item: - msg = f"filter {filter} not recognized" - raise ValueError(msg) - - if item["name"] not in supported_filters: - msg = f"filter {filter} 
is not supported" - raise ValueError(msg) - f_out.append(item) - elif isinstance(filter, dict): - if filter.get("class") == "H5Z_FILTER_USER": - # user filter - must have either id or name - if "id" not in filter and "name" not in filter: - msg = "user filter must have either 'id' or 'name' key" - raise KeyError(msg) - item = filter - elif "id" in filter: - item = getFilterItem(filter["id"]) - elif "name" in filter: - item = getFilterItem(filter["name"]) - else: - item = None - if not item: - msg = f"filter {filter} not recognized" - raise ValueError(msg) - - # will replace options list with specified options - del item["options"] - - # copy any filter specified options - filter_class = item["class"] - if filter_class == "H5Z_FILTER_DEFLATE": - if "level" in filter: - level_val = filter["level"] - if not isinstance(level_val, int): - msg = "Expected integer level for deflate filter" - raise TypeError(msg) - if level_val < 0 or level_val > 9: - msg = "Deflate filter level must be between 0 and 9" - raise ValueError(msg) - item["level"] = level_val - elif filter_class == "H5Z_FILTER_SHUFFLE": - pass # no options - elif filter_class == "H5Z_FILTER_FLETCHER32": - pass # no options - elif filter_class == "H5Z_FILTER_SZIP": - for key in ("bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"): - if key in filter: - val = filter[key] - if key == "coding": - if val not in HDF_FILTER_OPTION_ENUMS["coding"].values(): - msg = f"Invalid coding option for szip filter: {val}" - raise ValueError(msg) - else: - # other options need to be positivie integers - if not isinstance(val, int) or val <= 0: - msg = f"Expected positive integer for szip filter option {key}" - raise ValueError(msg) - item[key] = val - elif filter_class == "H5Z_FILTER_NBIT": - pass # no options - elif filter_class == "H5Z_FILTER_SCALEOFFSET": - if "scaleType" in filter: - val = filter["scaleType"] - if val not in HDF_FILTER_OPTION_ENUMS["scaleType"].values(): - msg = f"Invalid scaleType option for 
scaleoffset filter: {val}" - raise ValueError(msg) - else: - item["scaleType"] = val - if "scaleOffset" in filter: - val = filter["scaleOffset"] - if not isinstance(val, int) or val < 0: - msg = "Expected non-negative integer for scaleOffset option" - raise ValueError(msg) - else: - item["scaleOffset"] = val - elif filter_class == "H5Z_FILTER_LZF": - pass # no options - elif filter_class == "H5Z_FILTER_BLOSC": - pass # no options - elif filter_class == "H5Z_FILTER_SNAPPY": - pass # no options - elif filter_class == "H5Z_FILTER_LZ4": - pass # no options - elif filter_class == "H5Z_FILTER_LZ4HC": - pass # no options - elif filter_class == "H5Z_FILTER_BITSHUFFLE": - pass # no options - elif filter_class == "H5Z_FILTER_ZSTD": - pass # no options - else: - msg = f"filter class {filter_class} is not supported" - raise KeyError(msg) - f_out.append(item) - else: - msg = f"Unexpected type for filter: {filter}" - raise ValueError(msg) - - # return standardized filter representation - return f_out + # TBD: check given order of filters is supported + for filter_json in filters: + validateFilter(filter_json, supported_filters=supported_filters) def getFilters(dset_json): @@ -235,72 +289,11 @@ def getFilters(dset_json): return filters -def getCompressionFilter(filters): - """Return compression filter from filters, or None""" - for filter in filters: - if "class" not in filter: - # expected class key - malformed filter def - continue - filter_class = filter["class"] - if filter_class in COMPRESSION_FILTER_IDS: - return filter - if all( - ( - filter_class == "H5Z_FILTER_USER", - "name" in filter, - filter["name"] in COMPRESSION_FILTER_NAMES, - ) - ): - return filter - return None - - -def getShuffleFilter(filters): - """Return shuffle filter, or None""" - FILTER_CLASSES = ("H5Z_FILTER_SHUFFLE", "H5Z_FILTER_BITSHUFFLE") - for filter in filters: - if "class" not in filter: - # invalid filter def? 
- continue - filter_class = filter["class"] - if filter_class in FILTER_CLASSES: - return filter - - return None - - -def getFilterOps(filters, dtype=None): - """Get list of filter operations to be used for this dataset""" - - compressionFilter = getCompressionFilter(filters) - - filter_ops = {} - - shuffleFilter = getShuffleFilter(filters) - - if shuffleFilter and not isVlen(dtype): - shuffle_name = shuffleFilter["name"] - if shuffle_name == "shuffle": - filter_ops["shuffle"] = 1 # use regular shuffle - elif shuffle_name == "bitshuffle": - filter_ops["shuffle"] = 2 # use bitshuffle - else: - filter_ops["shuffle"] = 0 # no shuffle - else: - filter_ops["shuffle"] = 0 # no shuffle +def isCompressionFilter(filter): + filter_json = getFilterItem(filter) + return filter_json["class"] in COMPRESSION_FILTER_IDS - """ return list of filter operations for this dataset """ - if compressionFilter: - if compressionFilter["class"] == "H5Z_FILTER_DEFLATE": - filter_ops["compressor"] = "zlib" # blosc compressor - else: - if "name" in compressionFilter: - filter_ops["compressor"] = compressionFilter["name"] - else: - filter_ops["compressor"] = "lz4" # default to lz4 - if "level" not in compressionFilter: - filter_ops["level"] = 5 # medium level - else: - filter_ops["level"] = int(compressionFilter["level"]) - return filter_ops +def getCompressionFilter(filters): + """Return compression filter ids from filters, or None""" + return COMPRESSION_FILTER_IDS diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 9468c9fe..7982a926 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -16,7 +16,7 @@ from .array_util import jsonToArray, bytesArrayToList from .dset_util import resize_dataset from .shape_util import getShapeClass, getShapeDims -from .filters import getFiltersJson +from .filters import validateFilters from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . 
import selections from .time_util import getNow @@ -884,8 +884,7 @@ def createDataset( else: supported_filters = () # validate and normalize supplied filter property list - filters_json = getFiltersJson(cpl, supported_filters=supported_filters) - cpl["filters"] = filters_json + validateFilters(cpl["filters"], supported_filters=supported_filters) dset_json["creationProperties"] = cpl else: dset_json["creationProperties"] = {} diff --git a/src/h5json/shape_util.py b/src/h5json/shape_util.py index a3531cde..cbc6a8fe 100644 --- a/src/h5json/shape_util.py +++ b/src/h5json/shape_util.py @@ -13,20 +13,20 @@ import numpy as np -def getShapeClass(shape): +def getShapeClass(obj_json): """ Return shape class of the given data shape """ - if not isinstance(shape, dict): + if not isinstance(obj_json, dict): raise TypeError("expected dict object") - if shape.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): + if obj_json.get("class") in ("H5S_NULL", "H5S_SCALAR", "H5S_SIMPLE"): # this is a shape_json obj - shape_json = shape - elif "shape" in shape: + shape_json = obj_json + elif "shape" in obj_json: # dataset or attribute json - shape_json = shape["shape"] + shape_json = obj_json["shape"] else: - raise ValueError(f"Unknown shape: {shape}") + raise ValueError(f"Unknown shape: {obj_json}") if "class" not in shape_json: raise KeyError("expected 'class' key for data shape")\ @@ -34,6 +34,33 @@ def getShapeClass(shape): return shape_json["class"] +def getShapeJson(dims, maxdims=None): + """ create a new shape_json based on dims and + optionally maxdims (the later only applies to + datasets) """ + if isinstance(dims, int): + dims = (dims, ) + if isinstance(maxdims, int): + maxdims = (maxdims, ) + if dims is None: + shape_class = "H5S_NULL" + elif len(dims) == 0: + shape_class = "H5S_SCALAR" + else: + shape_class = "H5S_SIMPLE" + if maxdims is not None: + if shape_class != "H5S_SIMPLE": + raise ValueError(f"maxdims can not be used with shape class: {shape_class}") + if 
len(maxdims) != len(dims): + raise ValueError("maxdims must match dataspace rank") + shape_json = {"class": shape_class} + if shape_class == "H5S_SIMPLE": + shape_json["dims"] = dims + if maxdims is not None: + shape_json["maxdims"] = maxdims + return shape_json + + def getShapeDims(shape): """ Get dims from a given shape json. Return [1,] for Scalar datasets, @@ -139,3 +166,66 @@ def getDataSize(shape, type_size: int = 1): return 0 else: return type_size * int(np.prod(dims)) + + +def isExtensible(obj_json): + """ + Determine if the dataset can be extended + """ + + if "shape" in obj_json: + # assume dataset or attribute json + shape_json = obj_json["shape"] + else: + shape_json = obj_json + shape_class = getShapeClass(shape_json) + if shape_class != "H5S_SIMPLE": + return False + + if "maxdims" not in shape_json: + return False + + dims = shape_json["dims"] + maxdims = shape_json["maxdims"] + rank = len(dims) + if len(maxdims) != rank: + raise ValueError("rank of maxdims does not match dataset") + for n in range(rank): + if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: + return True + return False + + +def getMaxDims(obj_json): + """ + Get maxdims from a given shape. 
Return [1,] for Scalar datasets + + Use with H5S_NULL datasets will throw a ValueError + """ + + if not isinstance(obj_json, dict): + raise TypeError("expected a dict argument") + + if "shape" in obj_json: + shape_json = obj_json["shape"] + else: + shape_json = obj_json + + if "class" not in shape_json: + # should have at least this + raise KeyError(f"unexpected shape json: {shape_json}") + shape_class = shape_json["class"] + maxdims = None + if shape_class == "H5S_NULL": + return None + elif shape_class == "H5S_SCALAR": + maxdims = () + elif shape_class == "H5S_SIMPLE": + if "maxdims" in shape_json: + maxdims = shape_json["maxdims"] + else: + maxdims = shape_json["dims"] + else: + msg = f"Unexpected shape class: {shape_class}" + raise ValueError(msg) + return tuple(maxdims) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py old mode 100755 new mode 100644 index c029fd01..b24594d2 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -12,6 +12,7 @@ import unittest import logging +from h5json.filters import getFilterItem from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getChunkDims from h5json.dset_util import validateChunkLayout, validateDatasetCreationProps, getDatasetLayout @@ -121,7 +122,8 @@ def testFilterValidation(self): except ValueError: self.assertTrue(False) # shouldn't raise exception # try with just a filter name - cpl["filters"] = ["gzip", ] + gzip_filter = getFilterItem("gzip") + cpl["filters"] = [gzip_filter, ] try: validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) except ValueError: diff --git a/test/unit/filter_test.py b/test/unit/filter_test.py new file mode 100644 index 00000000..0b37c54f --- /dev/null +++ b/test/unit/filter_test.py @@ -0,0 +1,98 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. 
# +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import logging + +from h5json.filters import FILTER_DEFS +from h5json.filters import getFilterItem, validateFilter, isCompressionFilter + + +class FiltersTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(FiltersTest, self).__init__(*args, **kwargs) + # main + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testStandardFilters(self): + + # check standard filters with no options + + self.assertEqual(len(FILTER_DEFS), 14) + for item in FILTER_DEFS: + filter_class = item[0] + filter_id = item[1] + filter_name = item[2] + for value in (filter_class, filter_id, filter_name): + filter_json = getFilterItem(value) + validateFilter(filter_json) + + # check alternate names work + for name in ("deflate", "gzip"): + filter_json = getFilterItem(name) + validateFilter(filter_json) + self.assertTrue(isCompressionFilter(filter_json)) + + # check random name raises exception + try: + getFilterItem("goofy") + self.assertTrue(False) + except KeyError: + pass # expected + + # check invalid filter id fails + try: + getFilterItem(1234) + self.assertTrue(False) + except KeyError: + pass # expected + + def testCustomFilters(self): + + # check custom filter usage + custom_filter = {"class": "H5Z_FILTER_USER", "name": "myspecialfilter"} + # id should be over 32000 + custom_filter["id"] = 32000 + try: + validateFilter(custom_filter) + self.assertTrue(False) # shouldn't get here + except ValueError: + pass # expected + + custom_filter["id"] = 
32099 + validateFilter(custom_filter) + + custom_filter["unknown_option"] = 42 + try: + validateFilter(custom_filter) + self.assertTrue(False) # shouldn't get here + except KeyError: + pass # expected + + del custom_filter["unknown_option"] + good_params = (1, 2, 3) + bad_params = (2, -1) # needs to be positive + custom_filter["parameters"] = good_params + validateFilter(custom_filter) + + custom_filter["parameters"] = bad_params + try: + validateFilter(custom_filter) + self.assertTrue(False) # shouldn't get here + except TypeError: + pass # expected + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py old mode 100755 new mode 100644 diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py old mode 100755 new mode 100644 diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py old mode 100755 new mode 100644 diff --git a/test/unit/shape_util_test.py b/test/unit/shape_util_test.py old mode 100755 new mode 100644 index 23c41edf..98812692 --- a/test/unit/shape_util_test.py +++ b/test/unit/shape_util_test.py @@ -13,7 +13,7 @@ import logging from h5json.shape_util import getShapeClass, getShapeDims, getNumElements, getRank -from h5json.shape_util import isNullSpace, isScalar, getDataSize +from h5json.shape_util import isNullSpace, isScalar, getDataSize, isExtensible, getMaxDims class ShapeUtilTest(unittest.TestCase): @@ -44,6 +44,7 @@ def testSimple(self): simple_shape_json = {"class": "H5S_SIMPLE", "dims": [5, 7]} simple_shape_obj = {"type": type_json, "shape": simple_shape_json} vstr_simple_shape_obj = {"type": vstr_json, "shape": simple_shape_json} + resizable_shape_obj = {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]} self.assertEqual(getShapeClass(null_shape_json), "H5S_NULL") self.assertEqual(getShapeClass(null_shape_obj), "H5S_NULL") @@ -53,6 +54,7 @@ def testSimple(self): self.assertEqual(getShapeClass(simple_shape_json), "H5S_SIMPLE") 
self.assertEqual(getShapeClass(simple_shape_obj), "H5S_SIMPLE") self.assertEqual(getShapeClass(vstr_simple_shape_obj), "H5S_SIMPLE") + self.assertEqual(getShapeClass(resizable_shape_obj), "H5S_SIMPLE") self.assertEqual(getShapeDims(null_shape_json), None) self.assertEqual(getShapeDims(null_shape_obj), None) @@ -63,6 +65,17 @@ def testSimple(self): self.assertEqual(getShapeDims(simple_shape_obj), (5, 7)) self.assertEqual(getShapeDims(vstr_simple_shape_obj), (5, 7)) self.assertEqual(getShapeDims(12), (12,)) + self.assertEqual(getShapeDims(resizable_shape_obj), (10,)) + + self.assertEqual(getMaxDims(null_shape_json), None) + self.assertEqual(getMaxDims(null_shape_obj), None) + self.assertEqual(getMaxDims(scalar_shape_json), ()) + self.assertEqual(getMaxDims(scalar_shape_obj), ()) + self.assertEqual(getMaxDims(vstr_scalar_shape_obj), ()) + self.assertEqual(getMaxDims(simple_shape_json), (5, 7)) + self.assertEqual(getMaxDims(simple_shape_obj), (5, 7)) + self.assertEqual(getMaxDims(vstr_simple_shape_obj), (5, 7)) + self.assertEqual(getMaxDims(resizable_shape_obj), (20,)) self.assertEqual(getRank(null_shape_json), 0) self.assertEqual(getRank(null_shape_obj), 0) @@ -72,7 +85,7 @@ def testSimple(self): self.assertEqual(getRank(simple_shape_json), 2) self.assertEqual(getRank(simple_shape_obj), 2) self.assertEqual(getRank(vstr_simple_shape_obj), 2) - self.assertEqual(getRank((1, 2, 3)), 3) + self.assertEqual(getRank(resizable_shape_obj), 1) self.assertEqual(getNumElements(null_shape_json), 0) self.assertEqual(getNumElements(null_shape_obj), 0) @@ -82,6 +95,7 @@ def testSimple(self): self.assertEqual(getNumElements(simple_shape_json), 35) self.assertEqual(getNumElements(simple_shape_obj), 35) self.assertEqual(getNumElements(vstr_simple_shape_obj), 35) + self.assertEqual(getNumElements(resizable_shape_obj), 10) self.assertEqual(getNumElements(()), 1) self.assertEqual(getNumElements([1, 2, 3]), 6) @@ -93,6 +107,7 @@ def testSimple(self): 
self.assertEqual(isNullSpace(simple_shape_json), False) self.assertEqual(isNullSpace(simple_shape_obj), False) self.assertEqual(isNullSpace(vstr_simple_shape_obj), False) + self.assertEqual(isNullSpace(resizable_shape_obj), False) self.assertEqual(isScalar(null_shape_json), False) self.assertEqual(isScalar(null_shape_obj), False) @@ -102,6 +117,7 @@ def testSimple(self): self.assertEqual(isScalar(simple_shape_json), False) self.assertEqual(isScalar(simple_shape_obj), False) self.assertEqual(isScalar(vstr_simple_shape_obj), False) + self.assertEqual(isScalar(resizable_shape_obj), False) self.assertEqual(getDataSize(null_shape_json, 4), 0) self.assertEqual(getDataSize(null_shape_obj, 4), 0) @@ -111,9 +127,30 @@ def testSimple(self): self.assertEqual(getDataSize(simple_shape_json, 4), 140) self.assertEqual(getDataSize(simple_shape_obj, 4), 140) self.assertEqual(getDataSize(vstr_simple_shape_obj, 4), 140) + self.assertEqual(getDataSize(resizable_shape_obj, 4), 40) self.assertEqual(getDataSize((), 4), 4) self.assertEqual(getDataSize([1, 2, 3], 4), 24) + self.assertEqual(isScalar(null_shape_json), False) + self.assertEqual(isScalar(null_shape_obj), False) + self.assertEqual(isScalar(scalar_shape_json), True) + self.assertEqual(isScalar(scalar_shape_obj), True) + self.assertEqual(isScalar(vstr_scalar_shape_obj), True) + self.assertEqual(isScalar(simple_shape_json), False) + self.assertEqual(isScalar(simple_shape_obj), False) + self.assertEqual(isScalar(vstr_simple_shape_obj), False) + self.assertEqual(isScalar(resizable_shape_obj), False) + + self.assertEqual(isExtensible(null_shape_json), False) + self.assertEqual(isExtensible(null_shape_obj), False) + self.assertEqual(isExtensible(scalar_shape_json), False) + self.assertEqual(isExtensible(scalar_shape_obj), False) + self.assertEqual(isExtensible(vstr_scalar_shape_obj), False) + self.assertEqual(isExtensible(simple_shape_json), False) + self.assertEqual(isExtensible(simple_shape_obj), False) + 
self.assertEqual(isExtensible(vstr_simple_shape_obj), False) + self.assertEqual(isExtensible(resizable_shape_obj), True) + if __name__ == "__main__": # setup test files From e4aafaf7bbdedd89c43da5deb2510cbcb1f1b97d Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 26 Dec 2025 13:57:41 +0800 Subject: [PATCH 103/129] fix for lz4 filter opts --- src/h5json/dset_util.py | 17 ++++++++------- src/h5json/filters.py | 28 ++++++++++++++++++++----- test/unit/dset_util_test.py | 42 ++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index ffcf0147..1946bd35 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -569,7 +569,6 @@ def guessChunk(shape, typesize, chunk_min=None, chunk_max=None): def generateLayout( shape_json, item_size=0, - has_filter=False, chunks=None, chunk_min=CHUNK_MIN, chunk_max=CHUNK_MAX, @@ -583,14 +582,14 @@ def generateLayout( shape_class = getShapeClass(shape_json) if shape_class == "H5S_NULL": - if chunks or has_filter: + if chunks: raise ValueError("Null space datasets do not support chunking") return {} if shape_class == "H5S_SCALAR": - if chunks or has_filter: + if chunks: raise ValueError("Scalar datasets do not support chunking") - return {"class": "H5D_CONIGUOUS"} + return {"class": "H5D_CONTIGUOUS"} if chunk_min > chunk_max: msg = "chunk_max must be larger than chunk_min" @@ -600,9 +599,9 @@ def generateLayout( shape_dims = getShapeDims(shape_json) rank = len(shape_dims) max_dims = getMaxDims(shape_json) - extensible = isExtensible(shape_dims, max_dims) + extensible = isExtensible(shape_json) - if dset_size < chunk_min and not extensible and not has_filter and not chunks: + if dset_size < chunk_min and not extensible and not chunks: # can just return a contiguous layout return {"class": "H5D_CONTIGUOUS"} @@ -613,10 +612,12 @@ def generateLayout( chunk_dims = chunks if len(chunk_dims) != rank: raise ValueError("given chunk dims 
do not agree with dataset rank") + else: + pass # otherwise we'll guess a chunk shape below if not chunk_dims: kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} - chunk_dims = getChunkDims(shape_json, item_size, **kwargs) - layout["dims"] = chunk_dims + chunk_dims = guessChunk(shape_json, item_size, **kwargs) + layout["dims"] = list(chunk_dims) # set partition_count if needed: if max_chunks_per_folder > 0: diff --git a/src/h5json/filters.py b/src/h5json/filters.py index 3ddfe3f5..c8435873 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -33,7 +33,7 @@ ("H5Z_FILTER_LZF", 32000, "lzf", ()), ("H5Z_FILTER_BLOSC", 32001, "blosclz", ()), ("H5Z_FILTER_SNAPPY", 32003, "snappy", ()), - ("H5Z_FILTER_LZ4", 32004, "lz4", ()), + ("H5Z_FILTER_LZ4", 32004, "lz4", ("level",)), ("H5Z_FILTER_LZ4HC", 32005, "lz4hc", ()), ("H5Z_FILTER_BITSHUFFLE", 32008, "bitshuffle", ()), ("H5Z_FILTER_ZSTD", 32015, "zstd", ()), @@ -194,7 +194,11 @@ def getFilterItem(name, options={}): elif filter_class == "H5Z_FILTER_SNAPPY": pass # no options elif filter_class == "H5Z_FILTER_LZ4": - pass # no options + if "level" in options: + level_val = options["level"] + if level_val < 0 or level_val > 9: + msg = "Deflate filter level must be between 0 and 9" + raise ValueError(msg) elif filter_class == "H5Z_FILTER_LZ4HC": pass # no options elif filter_class == "H5Z_FILTER_BITSHUFFLE": @@ -224,7 +228,7 @@ def getFilterItem(name, options={}): return filter_json -def validateFilter(filter_json, supported_filters=None): +def validateFilter(filter_json): """ Check the given the given filter for create format, required options set. Raise TypeError, KeyError or ValueError if not. 
If supported_filters is supplied, raise KeyError if a non-supported @@ -275,7 +279,10 @@ def validateFilters(filters, supported_filters=None): # TBD: check given order of filters is supported for filter_json in filters: - validateFilter(filter_json, supported_filters=supported_filters) + validateFilter(filter_json) + filter_class = filter_json["class"] + if supported_filters and filter_class not in supported_filters: + raise ValueError(f"filter: {filter_class} not supported") def getFilters(dset_json): @@ -296,4 +303,15 @@ def isCompressionFilter(filter): def getCompressionFilter(filters): """Return compression filter ids from filters, or None""" - return COMPRESSION_FILTER_IDS + for filter in filters: + if filter["class"] in COMPRESSION_FILTER_IDS: + return filter + return None + + +def getShuffleFilter(filters): + """Return shuffle filter if present or None""" + for filter in filters: + if filter["class"] == "H5Z_FILTER_SHUFFLE": + return filter + return None diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index b24594d2..1331cd97 100644 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -13,7 +13,7 @@ import logging from h5json.filters import getFilterItem -from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk +from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk, generateLayout from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getChunkDims from h5json.dset_util import validateChunkLayout, validateDatasetCreationProps, getDatasetLayout @@ -359,6 +359,46 @@ def testExpandChunk(self): self.assertTrue(num_bytes > CHUNK_MIN) self.assertTrue(num_bytes < CHUNK_MAX) + def testGenerateLayout(self): + typesize = 4 + chunk_min = 4000 + chunk_max = 8000 + shape = { + "class": "H5S_SIMPLE", + "dims": [40, 20], + } + kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} + layout = generateLayout(shape, typesize, **kwargs) + self.assertTrue("class" in 
layout) + self.assertEqual(layout["class"], "H5D_CONTIGUOUS") + self.assertFalse("dims" in layout) + + layout = generateLayout(shape, typesize, chunks=True, **kwargs) + self.assertTrue("class" in layout) + self.assertEqual(layout["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout) + self.assertEqual(layout["dims"], [40, 20]) + + layout = generateLayout(shape, typesize, chunks=(20, 10), **kwargs) + self.assertTrue("class" in layout) + self.assertEqual(layout["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout) + self.assertEqual(layout["dims"], [20, 10]) + + shape = { + "class": "H5S_SIMPLE", + "dims": [0, 20], + "maxdims": [0, 20] + } + layout = generateLayout(shape, typesize, **kwargs) + self.assertTrue("class" in layout) + self.assertEqual(layout["class"], "H5D_CHUNKED") + self.assertTrue("dims" in layout) + dims = layout["dims"] + self.assertEqual(len(dims), 2) + self.assertTrue(dims[0] > 0) + self.assertTrue(dims[1] > 0) + def testGetContiguousLayout(self): typesize = 4 chunk_min = 400 From 43133daf7ce4b49dcef7b17a5d1f421952a4d01e Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 26 Dec 2025 14:23:46 +0800 Subject: [PATCH 104/129] added filter test to testall --- testall.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testall.py b/testall.py index 34b1efd7..e4237ce6 100755 --- a/testall.py +++ b/testall.py @@ -17,10 +17,11 @@ unit_tests = [ "array_util_test", + "dset_util_test", + "filter_test", "objid_test", "hdf5dtype_test", "shape_util_test", - "dset_util_test", "hdf5db_test", "h5json_reader_test", "h5json_writer_test", From df8ce239a1a686aa1bae8efbfb37091a9c5753ba Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 30 Dec 2025 19:13:26 +0800 Subject: [PATCH 105/129] updates for resizable datasets --- src/h5json/array_util.py | 29 +++- src/h5json/dset_util.py | 82 +++++++---- src/h5json/filters.py | 3 +- src/h5json/h5pystore/h5py_writer.py | 213 +++++++++++++++------------- src/h5json/hdf5db.py | 145 
++++++++++++++----- src/h5json/shape_util.py | 22 +++ src/h5json/track_util.py | 26 ++++ test/unit/dset_util_test.py | 30 ++-- test/unit/h5py_writer_test.py | 46 ++++++ test/unit/hdf5db_test.py | 4 +- test/unit/shape_util_test.py | 12 ++ 11 files changed, 438 insertions(+), 174 deletions(-) create mode 100644 src/h5json/track_util.py diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index cb39cd55..e57a3892 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -15,7 +15,7 @@ import binascii import numpy as np -from .hdf5dtype import isVlen +from .hdf5dtype import isVlen, is_float16_dtype, guess_dtype MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million @@ -474,6 +474,33 @@ def arrayToBytes(arr, encoding=None): return data +def array_for_new_object(data, specified_dtype=None): + """Prepare an array from data used to create a new dataset or attribute""" + + # We mostly let HDF5 convert data as necessary when it's written. + # But if we are going to a float16 datatype, pre-convert in python + # to workaround a bug in the conversion. + # https://github.com/h5py/h5py/issues/819 + if is_float16_dtype(specified_dtype): + as_dtype = specified_dtype + elif not isinstance(data, np.ndarray) and (specified_dtype is not None): + # If we need to convert e.g. a list to an array, don't leave numpy + # to guess a dtype we already know. + as_dtype = specified_dtype + else: + as_dtype = guess_dtype(data) + + data = np.asarray(data, order="C", dtype=as_dtype) + + # In most cases, this does nothing. But if data was already an array, + # and as_dtype is a tagged h5py dtype (e.g. for an object array of strings), + # asarray() doesn't replace its dtype object. 
This gives it the tagged dtype: + if as_dtype is not None: + data = data.view(dtype=as_dtype) + + return data + + def bytesToArray(data, dt, shape, encoding=None): """ Create numpy array based on byte representation diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 1946bd35..50340438 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -19,7 +19,7 @@ from .objid import isValidUuid CHUNK_MIN = 512 * 1024 # Soft lower limit (512k) -CHUNK_MAX = 8096 * 1024 # Hard upper limit (2M) +CHUNK_MAX = 8096 * 1024 # Hard upper limit (8M) LAYOUT_CLASSES = ( @@ -87,30 +87,36 @@ def estimateDatasetSize(shape_json, item_size, chunk_min=CHUNK_MIN): def resize_dataset(dset_json, shape): """ Update shape dims to the given shape provided new shape is valid for maxdims """ - shape_json = dset_json["shape"] - shape_class = shape_json["class"] + + layout_class = getDatasetLayoutClass(dset_json) + if layout_class != "H5D_CHUNKED": + raise TypeError("Only chunked datasets can be resized") + shape_class = getShapeClass(dset_json) if shape_class != "H5S_SIMPLE": raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") - if len(shape_json["dims"]) != len(shape): + dims = getShapeDims(dset_json) + if len(dims) != len(shape): raise ValueError("Resize shape parameter doesn't match dataset's rank") - if "maxdims" not in shape_json: + if not isExtensible(dset_json): raise ValueError("Dataset is not resizable") - dims = shape_json["dims"] - maxdims = shape_json["maxdims"] + maxdims = getMaxDims(dset_json) - if shape_json["dims"] == list(shape): + if dims == tuple(shape): # no change, just return - return - for i in range(len(dims)): + return None + rank = getRank(dset_json) + for i in range(rank): extent = shape[i] if extent < 0: raise ValueError("dimensions can't be negative") - if maxdims[i] == "H5S_UNLIMITED": + if maxdims[i] in (0, "H5S_UNLIMITED"): # any positive extent is ok continue if extent > maxdims[i]: raise ValueError(f"extent for 
dimension {i} can't be larger than {maxdims[i]}") + # update the object json with the new dimensions + shape_json = dset_json["shape"] shape_json["dims"] = list(shape) @@ -185,12 +191,12 @@ def getChunkSize(chunk_dims, type_size: int = 1): def getChunkDims(dset_json): """Get chunk layout. Return shape dims for non-chunked layout""" - shape_json = dset_json["shape"] - if shape_json["class"] == "H5S_NULL": + shape_class = getShapeClass(dset_json) + if shape_class == "H5S_NULL": return None - if shape_json["class"] == "H5S_SCALAR": + if shape_class == "H5S_SCALAR": return (1, ) - shape_dims = shape_json["dims"] + shape_dims = getShapeDims(dset_json) layout_class = getDatasetLayoutClass(dset_json) if not layout_class: return tuple(shape_dims) @@ -207,7 +213,7 @@ def getChunkDims(dset_json): return chunk_dims -def validateChunkLayout(shape_json, type_json, layout): +def validateLayout(shape_json, type_json, layout): """ Use chunk layout given in the creationPropertiesList (if defined and layout is valid). 
@@ -218,6 +224,7 @@ def validateChunkLayout(shape_json, type_json, layout): space_dims = None chunk_dims = None max_dims = None + item_size = getItemSize(type_json) if "dims" in shape_json: @@ -250,7 +257,7 @@ def validateChunkLayout(shape_json, type_json, layout): if chunk_extent > dim_extent: msg = "Invalid layout value" raise ValueError(reason=msg) - elif max_dims[i] != 0: + elif max_dims[i] not in (0, "H5S_UNLIMITED"): if chunk_extent > max_dims[i]: msg = "Invalid layout value for extensible dimension" raise ValueError(msg) @@ -404,7 +411,7 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None): layout_class = None if "layout" in creation_props: layout_json = creation_props["layout"] - validateChunkLayout(shape, type_json, layout_json) + validateLayout(shape, type_json, layout_json) layout_class = layout_json["class"] if "filters" in creation_props: @@ -436,7 +443,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN): if "maxdims" in shape_json: maxdims = shape_json["maxdims"] for n in range(rank): - if maxdims[n] == 0 or maxdims[n] > dims[n]: + if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]: extendable_dims += 1 dset_size = getDataSize(shape_json, typesize) @@ -454,7 +461,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN): dim = rank - n - 1 # start from last dim if extendable_dims > 0: - if maxdims[dim] == 0: + if maxdims[dim] in (0, "H5S_UNLIMITED"): # infinitely extendable dimensions layout[dim] *= 2 chunk_size = getChunkSize(layout, typesize) @@ -553,7 +560,7 @@ def guessChunk(shape, typesize, chunk_min=None, chunk_max=None): typesize = 128 # just take a guess at the item size # For unlimited dimensions we have to guess. 
use 1024 - shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape)) + shape = tuple((x if x not in (0, "H5S_UNLIMITED") else 1024) for i, x in enumerate(shape)) chunk_size = getChunkSize(shape, typesize) if chunk_min and chunk_size < chunk_min: @@ -568,7 +575,7 @@ def guessChunk(shape, typesize, chunk_min=None, chunk_max=None): def generateLayout( shape_json, - item_size=0, + type_json, chunks=None, chunk_min=CHUNK_MIN, chunk_max=CHUNK_MAX, @@ -577,6 +584,9 @@ def generateLayout( """ Create a dataset layout based on type and shape properties """ + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE": + item_size = 128 # take a guess if item_size < 0: raise ValueError("item_size is invalid") @@ -612,6 +622,13 @@ def generateLayout( chunk_dims = chunks if len(chunk_dims) != rank: raise ValueError("given chunk dims do not agree with dataset rank") + for dim in range(rank): + if max_dims[dim] in (0, "H5S_UNLIMITED"): + pass # unlimited, so any chunk extent is ok + elif chunk_dims[dim] > max_dims[dim]: + msg = "Chunk shape must not be greater than data shape in any dimension. 
" + msg += f"{chunk_dims} is not compatible with {max_dims}" + raise ValueError() else: pass # otherwise we'll guess a chunk shape below if not chunk_dims: @@ -646,12 +663,14 @@ def generateLayout( layout["partition_count"] = partition_count else: pass # partition not needed + + validateLayout(shape_json, type_json, layout) return layout def generate_dcpl( shape_json, - dtype, + type_json, chunks=None, filters=[], chunk_min=CHUNK_MIN, @@ -678,12 +697,12 @@ def generate_dcpl( # End argument validation - kwargs = {"item_size": dtype.itemsize, "has_filter": filters} + kwargs = {"has_filter": filters} kwargs["chunks"] = chunks kwargs["chunk_min"] = chunk_min kwargs["chunk_max"] = chunk_max kwargs["max_chunks_per_folder"] = max_chunks_per_folder - plist["layout"] = generateLayout(shape_json, **kwargs) + plist["layout"] = generateLayout(shape_json, type_json, **kwargs) if len(filters) > 0: plist["filters"] = filters @@ -697,3 +716,16 @@ def generate_dcpl( plist["initializer"] = initializer return plist + + +def getFillValue(obj_json): + """ Return the fill value or None if not set """ + + if "creationProperties" in obj_json: + cpl = obj_json["creationProperties"] + else: + cpl = obj_json # assume we've been based a cpl + if "filLValue" in cpl: + return cpl["fillValue"] + else: + return None diff --git a/src/h5json/filters.py b/src/h5json/filters.py index c8435873..3642fe07 100644 --- a/src/h5json/filters.py +++ b/src/h5json/filters.py @@ -16,6 +16,7 @@ DEFAULT_GZIP = 4 DEFAULT_SZIP = 4 +DEFAULT_LZ4 = 1 SO_INT_MINBITS_DEFAULT = 0 # List of registered filters. Not all are supported by every reader and writer. 
@@ -97,7 +98,7 @@ def getFilterItem(name, options={}): filter_json = None if isinstance(name, dict): - filter_json = name + filter_json = name.copy() base_keys = ("class", "id", "name") for key in base_keys: if key not in filter_json: diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 5e1e20d7..b801af83 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -17,7 +17,11 @@ from ..objid import getCollectionForId, isValidUuid, createObjId from ..hdf5dtype import createDataType from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype +from ..shape_util import getShapeDims, getShapeClass, isExtensible, getMaxDims from ..array_util import jsonToArray +from ..track_util import getTrackTimes +from ..dset_util import getDatasetLayout, getFillValue +from ..filters import isCompressionFilter, getFilters, getFilterItem from .. import selections from .. import filters from ..h5writer import H5Writer @@ -153,106 +157,104 @@ def _createDataset(self, parent, dset_json, name=None): dtype = self.db.getDtype(dset_json) kwargs = {"dtype": dtype} - shape_json = dset_json["shape"] - shape_class = shape_json["class"] + shape_class = getShapeClass(dset_json) if shape_class == "H5S_NULL": # skip the shape keyword to create a null space dataset pass elif shape_class == "H5S_SCALAR": kwargs["shape"] = () else: - kwargs["shape"] = shape_json["dims"] - if "dcpl" in dset_json and shape_class != "H5S_NULL": - creation_props = dset_json["dcpl"] - if "fillValue" in creation_props: - fillvalue = creation_props["fillValue"] - if fillvalue and len(dtype) > 1 and type(fillvalue) in (list, tuple): - # for compound types, need to convert from list to dataset compatible element - - if len(dtype) != len(fillvalue): - msg = "fillvalue has incorrect number of elements" - self.log.warning(msg) - raise ValueError(msg) - - fillvalue = jsonToArray((), dtype, fillvalue) - - kwargs["fillvalue"] = 
fillvalue - - if "trackTimes" in creation_props: - kwargs["track_times"] = creation_props["trackTimes"] - if "layout" in creation_props: - layout = creation_props["layout"] - if "dims" in layout: - kwargs["chunks"] = tuple(layout["dims"]) - if "filters" in creation_props: - filter_props = creation_props["filters"] - for filter_prop in filter_props: - if "id" not in filter_prop: - self.log.warning("filter id not provided") - continue - filter_id = filter_prop["id"] - if filter_id not in filters._HDF_FILTERS: - self.log.warning(f"unknown filter id: {filter_id} ignoring") - continue - - hdf_filter = filters._HDF_FILTERS[filter_id] - - self.log.info(f"got filter: {filter_id}") - if "alias" not in hdf_filter: - self.log.warning(f"unsupported filter id: {filter_id} ignoring") - continue - - filter_alias = hdf_filter["alias"] - if not h5py.h5z.filter_avail(filter_id): - msg = "compression filter not available, filter: {filter_alias}, ignoring" - self.log.warning(msg) - continue - if filter_alias in filters._H5PY_COMPRESSION_FILTERS: - if kwargs.get("compression"): - msg = f"compression filter already set for {filter_alias}, ignoring" - self.log.info(msg) - continue - - kwargs["compression"] = filter_alias - self.log.info("setting compression filter to: {filter_alias}") - if filter_alias == "gzip": - # check for an optional compression value - if "level" in filter_prop: - kwargs["compression_opts"] = filter_prop["level"] - elif filter_alias == "szip": - bitsPerPixel = None - coding = "nn" + shape = getShapeDims(dset_json) + kwargs["shape"] = shape + if isExtensible(dset_json): + maxshape = list(getMaxDims(dset_json)) + # replace any 0, or H5S_UNLIMITED with None + for dim in range(len(maxshape)): + if maxshape[dim] in (0, "H5S_UNLIMITED"): + maxshape[dim] = None + kwargs["maxshape"] = tuple(maxshape) - if "bitsPerPixel" in filter_prop: - bitsPerPixel = filter_prop["bitsPerPixel"] - if "coding" in filter_prop: - if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": - coding 
= "ec" - elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": - coding = "nn" - else: - self.log.warning("invalid szip option: 'coding'") - # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, - # so these options will be ignored - if "pixelsPerBlock" in filter_props: - self.log.info("ignoring szip option: 'pixelsPerBlock'") - if "pixelsPerScanline" in filter_props: - self.log.info("ignoring szip option: 'pixelsPerScanline'") - if bitsPerPixel: - kwargs["compression_opts"] = (coding, bitsPerPixel) - else: - if filter_alias == "shuffle": - kwargs["shuffle"] = True - elif filter_alias == "fletcher32": - kwargs["fletcher32"] = True - elif filter_alias == "scaleoffset": - if "scaleOffset" not in filter_prop: - msg = "No scale_offset provided for scale offset filter, ignoring" - self.log(msg) - continue - kwargs["scaleoffset"] = filter_prop["scaleOffset"] + fillvalue = getFillValue(dset_json) + + if fillvalue and len(dtype) > 1 and type(fillvalue) in (list, tuple): + # for compound types, need to convert from list to dataset compatible element + + if len(dtype) != len(fillvalue): + msg = "fillvalue has incorrect number of elements" + raise ValueError(msg) + + fillvalue = jsonToArray((), dtype, fillvalue) + + kwargs["fillvalue"] = fillvalue + + track_times = getTrackTimes(dset_json) + if track_times is not None: + kwargs["track_times"] = track_times + + layout = getDatasetLayout(dset_json) + if layout and "dims" in layout: + kwargs["chunks"] = tuple(layout["dims"]) + + filter_props = getFilters(dset_json) + + for filter_prop in filter_props: + try: + getFilterItem(filter_prop) + except (KeyError, ValueError, TypeError): + self.log.warning(f"unknown filter: {filter_prop} ignoring") + continue + filter_class = filter_prop["class"] + filter_id = filter_prop["id"] + filter_name = filter_prop["name"] + + if not h5py.h5z.filter_avail(filter_id): + msg = f"filter not available, filter: {filter_class}, ignoring" + self.log.warning(msg) + continue + + if 
isCompressionFilter(filter_class): + if kwargs.get("compression"): + msg = f"compression filter already set for {filter_class}, ignoring" + self.log.info(msg) + continue + + kwargs["compression"] = filter_name + self.log.info(f"setting compression filter to: {filter_class}") + if filter_class == "H5Z_FILTER_DEFLATE": + kwargs["compression"] = "gzip" # h5py doesn't recognize 'deflate' name + # check for an optional compression value + if "level" in filter_prop: + kwargs["compression_opts"] = filter_prop["level"] + elif filter_class == "H5Z_FILTER_SZIP": + bitsPerPixel = None + coding = "nn" + + if "bitsPerPixel" in filter_prop: + bitsPerPixel = filter_prop["bitsPerPixel"] + if "coding" in filter_prop: + if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": + coding = "ec" + elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": + coding = "nn" else: - self.log.info(f"Unexpected filter name: {filter_alias}, ignoring") + self.log.warning("invalid szip option: 'coding'") + # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, + # so these options will be ignored + if "pixelsPerBlock" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerBlock'") + if "pixelsPerScanline" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerScanline'") + if bitsPerPixel: + kwargs["compression_opts"] = (coding, bitsPerPixel) + elif filter_class == "H5Z_FILTER_SHUFFLE": + kwargs["shuffle"] = True + elif filter_class == "H5Z_FILTER_FLETCHER32": + kwargs["fletcher32"] = True + elif filter_class == "H5Z_FILTER_SCALEOFFSET": + if "scaleOffset" in filter_prop: + kwargs["scaleoffset"] = filter_prop["scaleOffset"] + else: + self.log.warning(f"Ignoring filter: {filter_class}") dset = parent.create_dataset(name, **kwargs) return dset @@ -332,12 +334,18 @@ def _createObjects(self, parent, links_json, visited=set()): else: self.log.warning(f"unexpected link class: {link_class}") + def resizeDataset(self, dset_id, dset): + """ Update the datasets shape """ + + 
dset_json = self.db.getObjectById(dset_id) + new_dims = getShapeDims(dset_json) + dset.resize(new_dims) + def updateDatasetValues(self, dset_id, dset): """ write any pending dataset values """ - dset_json = self.db.getObjectById(dset_id) - if "updates" not in dset_json: - return - updates = dset_json["updates"] + + updates = self.db._getDatasetUpdates(dset_id) + for (sel, val) in updates: slices = [] for dim in range(len(sel.shape)): @@ -436,11 +444,14 @@ def flush(self): obj = self._f[h5path] self.updateAttributes(obj_id, obj) collection = getCollectionForId(obj_id) - if collection == "datasets" and not self.no_data: - if self._init: - self.initializeDatasetValues(obj_id, obj) - else: - self.updateDatasetValues(obj_id, obj) + if collection == "datasets": + if self.db.is_resized(obj_id): + self.resizeDataset(obj_id, obj) + if not self.no_data: + if self._init: + self.initializeDatasetValues(obj_id, obj) + else: + self.updateDatasetValues(obj_id, obj) # mark time write is complete # updates before this time will not need to be written # TBD: possible race condition with multithreading diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 7982a926..eadf0dd0 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -13,9 +13,10 @@ import numpy as np import logging from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype +from .hdf5dtype import numpy_integer_types, numpy_float_types from .array_util import jsonToArray, bytesArrayToList -from .dset_util import resize_dataset -from .shape_util import getShapeClass, getShapeDims +from .dset_util import resize_dataset, getDatasetLayoutClass +from .shape_util import getShapeClass, getShapeDims, getShapeJson from .filters import validateFilters from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId, getHashTagForId from . 
import selections @@ -25,12 +26,38 @@ from .h5writer import H5Writer, H5NullWriter -def _getDatasetUpdates(dset_json): - """ return a list of value updates for the datset. - initalize one if not already present. """ - if "updates" not in dset_json: - dset_json["updates"] = [] - return dset_json["updates"] +def _decode(item, encoding="ascii"): + """ + decode any byte items to python 3 strings + """ + ret_val = None + if type(item) is bytes: + ret_val = item.decode(encoding) + elif type(item) is list: + ret_val = [] + for x in item: + ret_val.append(_decode(x, encoding)) + elif type(item) is tuple: + ret_val = [] + for x in item: + ret_val.append(_decode(x, encoding)) + ret_val = tuple(ret_val) + elif type(item) is dict: + ret_val = {} + for k in dict: + ret_val[k] = _decode(item[k], encoding) + elif type(item) is np.ndarray: + x = item.tolist() + ret_val = [] + for x in item: + ret_val.append(_decode(x, encoding)) + elif type(item) in numpy_integer_types: + ret_val = int(item) + elif type(item) in numpy_float_types: + ret_val = float(item) + else: + ret_val = item + return ret_val class Hdf5db: @@ -59,9 +86,11 @@ def __init__( self._db = {} - self._new_objects = set() # set of for newly created objects - self._dirty_objects = set() # set of modified objects - self._deleted_objects = set() # set of deleted objects + self._new_objects = set() # set of for newly created objects + self._dirty_objects = set() # set of modified objects + self._deleted_objects = set() # set of deleted objects + self._resized_datasets = set() # set of dataset ids that have been resized + self._dataset_updates = {} # list of dataset values updates keyed by dset_id self._root_id = None @@ -126,8 +155,19 @@ def is_dirty(self, obj_id): obj_id = getHashTagForId(obj_id) if self.is_new(obj_id): return True + if obj_id in self._resized_datasets: + return True return obj_id in self._dirty_objects + def is_resized(self, dset_id): + """ return true if this dataset has been resized """ + dset_id = 
getHashTagForId(dset_id) + + if dset_id in self._resized_datasets: + return True + else: + return False + @property def new_objects(self): return self._new_objects @@ -140,6 +180,18 @@ def dirty_objects(self): def deleted_objects(self): return self._deleted_objects + @property + def resized_datsets(self): + return self._resized_datasets + + def _getDatasetUpdates(self, dset_id): + """ Get list of update tuples """ + if getCollectionForId(dset_id) != "datasets": + raise TypeError("expected dataset id") + if dset_id not in self._dataset_updates: + self._dataset_updates[dset_id] = [] + return self._dataset_updates[dset_id] + def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ obj_id = getHashTagForId(obj_id) @@ -152,7 +204,7 @@ def make_dirty(self, obj_id): obj_json = self.db[obj_id] obj_json["lastModified"] = getNow() if not self.is_new(obj_id): - # object hasn't been initially written yet, add to dirt_object set + # object hasn't been initially written yet, add to dirty_object set self._dirty_objects.add(obj_id) def flush(self): @@ -165,9 +217,11 @@ def flush(self): return False # reset new, dirty and deleted sets - self._new_objects = set() - self._dirty_objects = set() - self._deleted_objects = set() + self._new_objects.clear() + self._dirty_objects.clear() + self._deleted_objects.clear() + self._resized_datasets.clear() + self._dataset_updates.clear() return True def readAll(self): @@ -587,7 +641,7 @@ def init_arr(dtype, cpl): else: cpl = {} - updates = _getDatasetUpdates(dset_json) + updates = self._getDatasetUpdates(dset_id) shape_class = getShapeClass(shape_json) @@ -685,7 +739,7 @@ def setDatasetValues(self, dset_id, sel, arr): dims = getShapeDims(shape_json) if sel.shape != dims: raise ValueError("Selection shape does not match dataset shape") - updates = _getDatasetUpdates(dset_json) + updates = self._getDatasetUpdates(dset_id) if sel.select_type == selections.H5S_SELECT_ALL: # for select all, throw out any 
existing updates since this will overwrite them updates.clear() @@ -709,8 +763,22 @@ def resizeDataset(self, dset_id, shape): self.log.info(f"resizeDataset {dset_id}, {shape}") dset_json = self.getObjectById(dset_id) # will throw exception if not found - if resize_dataset(dset_json, shape): - self._make_dirty(dset_id) + old_dims = getShapeDims(dset_json) + resize_dataset(dset_json, shape) + + if dset_id not in self.new_objects: + self._resized_datasets.add(dset_id) + + # if the shape has shrunk in any dimension, do a flush now + new_dims = getShapeDims(dset_json) + do_flush = False + for i in range(len(new_dims)): + if new_dims[i] < old_dims[i]: + do_flush = True + break + + if do_flush: + self.flush() def deleteObject(self, obj_id): """ Delete the given object """ @@ -727,6 +795,9 @@ def deleteObject(self, obj_id): if obj_id in self._dirty_objects: self._dirty_objects.remove(obj_id) + if obj_id in self._resized_datasets: + self._resized_datasets.remove(obj_id) + self._deleted_objects.add(obj_id) def getLinks(self, grp_id): @@ -859,22 +930,7 @@ def createDataset( if self.closed: raise ValueError("db is closed") type_json = getTypeItem(dtype) - if shape is None: - raise ValueError("shape not set") - elif shape == "H5S_NULL": - shape_json = {"class": "H5S_NULL"} - elif shape == (): - shape_json = {"class": "H5S_SCALAR"} - else: - shape_json = {"class": "H5S_SIMPLE"} - shape_json["dims"] = list(shape) - - if maxdims: - if shape_json["class"] != "H5S_SIMPLE": - raise ValueError("only simple shapes can be resizable") - if len(shape) != len(maxdims): - raise ValueError("maxdims length not equal to shape rank") - shape_json["maxdims"] = ["H5S_UNLIMITED" if x is None else x for x in maxdims] + shape_json = getShapeJson(shape, maxdims=maxdims) dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} if cpl: @@ -885,9 +941,28 @@ def createDataset( supported_filters = () # validate and normalize supplied filter property list validateFilters(cpl["filters"], 
supported_filters=supported_filters) + if cpl.get("fillValue"): + fillvalue = cpl["fillValue"] + # is it compatible with the array type? + if hasattr(fillvalue, "tolist"): + # convert numpy object to list + fillvalue = fillvalue.tolist() + fillvalue = _decode(fillvalue) + if not isinstance(fillvalue, str) and hasattr(fillvalue, "__iter__"): + # fill value is a list, or similar: check that dtype is compound + if len(fillvalue) != len(dtype): + raise ValueError("Invalid fill value for non-compound type dataset") + fillvalue = list(fillvalue) + cpl["fillValue"] = fillvalue + else: + if type_json["class"] == "H5T_COMPOUND": + raise ValueError("Invalid fill value for compound type dataset") dset_json["creationProperties"] = cpl else: dset_json["creationProperties"] = {} + + if maxdims and getDatasetLayoutClass(dset_json) != "H5D_CHUNKED": + raise ValueError("Only datasets with 'H5D_CHUNKED' layout can be resizable") dset_json["created"] = getNow() dset_id = createObjId("datasets", root_id=self.root_id) diff --git a/src/h5json/shape_util.py b/src/h5json/shape_util.py index cbc6a8fe..753603cd 100644 --- a/src/h5json/shape_util.py +++ b/src/h5json/shape_util.py @@ -48,11 +48,33 @@ def getShapeJson(dims, maxdims=None): shape_class = "H5S_SCALAR" else: shape_class = "H5S_SIMPLE" + if dims: + for extent in dims: + if not isinstance(extent, int): + raise TypeError("expected an integer value for dimensions") + if extent < 0: + raise ValueError("negative extent values are not supported") + if maxdims is not None: if shape_class != "H5S_SIMPLE": raise ValueError(f"maxdims can not be used with shape class: {shape_class}") if len(maxdims) != len(dims): raise ValueError("maxdims must match dataspace rank") + # convert any 0 or None vlues to "H5S_UNLIMITED" + maxdims = list(tuple(maxdims)) + for i in range(len(maxdims)): + extent = maxdims[i] + if extent is None or extent == 0: + maxdims[i] = "H5S_UNLIMITED" + elif isinstance(extent, str): + if extent != "H5S_UNLIMITED": + raise 
ValueError(f"invalid maxdims extent: {extent}") + elif isinstance(extent, int): + if extent < 0: + raise ValueError("negative extent values are not supported") + else: + raise TypeError("expected an integer value for maxdims") + shape_json = {"class": shape_class} if shape_class == "H5S_SIMPLE": shape_json["dims"] = dims diff --git a/src/h5json/track_util.py b/src/h5json/track_util.py new file mode 100644 index 00000000..b59e2a08 --- /dev/null +++ b/src/h5json/track_util.py @@ -0,0 +1,26 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +def getTrackTimes(obj_json): + """ Return a boolean if trackTimes is set in the objects' creation Property list. + Otherwise return None. 
""" + + if "creationProperties" in obj_json: + cpl = obj_json["creationProperties"] + else: + cpl = obj_json # assume this is the cpl + if "trackTimes" in cpl: + track_times = bool(cpl["trackTimes"]) + else: + track_times = None + + return track_times diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 1331cd97..364d8929 100644 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -15,7 +15,7 @@ from h5json.filters import getFilterItem from h5json.dset_util import guessChunk, shrinkChunk, getChunkSize, expandChunk, generateLayout from h5json.dset_util import getDatasetLayoutClass, getContiguousLayout, getChunkDims -from h5json.dset_util import validateChunkLayout, validateDatasetCreationProps, getDatasetLayout +from h5json.dset_util import validateLayout, validateDatasetCreationProps, getDatasetLayout class DsetUtilTest(unittest.TestCase): @@ -49,7 +49,7 @@ def testGetLayout(self): # contigous layout with resizable shape should raise exception try: - validateChunkLayout(dset_json["shape"], type_json, layout) + validateLayout(dset_json["shape"], type_json, layout) self.assertTrue(False) # should not reach here except ValueError: pass # should raise exception @@ -68,7 +68,7 @@ def testGetLayout(self): self.assertEqual(layout_class, "H5D_CHUNKED") try: - validateChunkLayout(dset_json["shape"], type_json, layout) + validateLayout(dset_json["shape"], type_json, layout) except ValueError: self.assertTrue(False) # shouldn't raise exception @@ -186,7 +186,7 @@ def testGuessChunk(self): chunk_size = getChunkSize(layout, typesize) self.assertTrue(chunk_size <= chunk_max) - shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, 0]} + shape = {"class": "H5S_SIMPLE", "dims": [100, 0], "maxdims": [100, "H5S_UNLIMITED"]} layout = guessChunk(shape, typesize) self.assertTrue(len(layout), 2) for i in range(2): @@ -333,7 +333,7 @@ def testExpandChunk(self): shape = { "class": "H5S_SIMPLE", "dims": [1000, 10, 1000], - "maxdims": 
[1000, 0, 1000], + "maxdims": [1000, "H5S_UNLIMITED", 1000], } layout = (10, 10, 10) typesize = 4 @@ -360,37 +360,47 @@ def testExpandChunk(self): self.assertTrue(num_bytes < CHUNK_MAX) def testGenerateLayout(self): - typesize = 4 chunk_min = 4000 chunk_max = 8000 shape = { "class": "H5S_SIMPLE", "dims": [40, 20], } + base_type = 'H5T_IEEE_F32LE' + type_json = {'class': 'H5T_FLOAT', 'base': base_type} + kwargs = {"chunk_min": chunk_min, "chunk_max": chunk_max} - layout = generateLayout(shape, typesize, **kwargs) + layout = generateLayout(shape, type_json, **kwargs) self.assertTrue("class" in layout) self.assertEqual(layout["class"], "H5D_CONTIGUOUS") self.assertFalse("dims" in layout) - layout = generateLayout(shape, typesize, chunks=True, **kwargs) + layout = generateLayout(shape, type_json, chunks=True, **kwargs) self.assertTrue("class" in layout) self.assertEqual(layout["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout) self.assertEqual(layout["dims"], [40, 20]) - layout = generateLayout(shape, typesize, chunks=(20, 10), **kwargs) + layout = generateLayout(shape, type_json, chunks=(20, 10), **kwargs) self.assertTrue("class" in layout) self.assertEqual(layout["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout) self.assertEqual(layout["dims"], [20, 10]) + try: + # proposed chunk shape can't be larger than shape in + # any dimension + generateLayout(shape, type_json, chunks=(50, 10), **kwargs) + self.assertTrue(False) # shouldn't get here + except ValueError: + pass # expected + shape = { "class": "H5S_SIMPLE", "dims": [0, 20], "maxdims": [0, 20] } - layout = generateLayout(shape, typesize, **kwargs) + layout = generateLayout(shape, type_json, **kwargs) self.assertTrue("class" in layout) self.assertEqual(layout["class"], "H5D_CHUNKED") self.assertTrue("dims" in layout) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index f0091a39..6e208ada 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ 
-199,6 +199,52 @@ def testSimple(self): expected = i * j self.assertEqual(dset[i, j], expected) + def testResizableDataset(self): + filepath = "test/unit/out/h5py_writer_test_testResizableDataset.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + maxdims = (None, ncols * 2) + layout = {"class": "H5D_CHUNKED", "dims": (nrows, ncols)} + cpl = {"layout": layout} + + root_id = db.open() + dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype, cpl=cpl) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + + # resize limited dimension + db.resizeDataset(dset_id, (nrows, ncols * 2)) + + # try to go beyond max extent + try: + db.resizeDataset(dset_id, (nrows, ncols * 3)) + self.assertTrue(False) + except ValueError: + pass # expected + + db.close() + + with h5py.File(filepath) as f: + dset = f["dset"] + self.assertEqual(dset.shape, (nrows, ncols * 2)) + + db.open() + # resize unlimited dimension + db.resizeDataset(dset_id, (nrows * 10, ncols)) + + db.close() + + with h5py.File(filepath) as f: + dset = f["dset"] + self.assertEqual(dset.shape, (nrows * 10, ncols)) + def testNullSpaceAttribute(self): filepath = "test/unit/out/h5py_writer_test_testNullSpaceAttribute.h5" diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 11bdd30b..f7d27f76 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -575,11 +575,13 @@ def testResizableDataset(self): shape = (nrows, ncols) dtype = np.int32 maxdims = (None, ncols * 2) + layout = {"class": "H5D_CHUNKED", "dims": shape} + cpl = {"layout": layout} db = Hdf5db(app_logger=self.log) root_id = db.open() - dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) + dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype, cpl=cpl) 
db.createHardLink(root_id, "dset", dset_id) db.createAttribute(dset_id, "a1", "Hello, world") diff --git a/test/unit/shape_util_test.py b/test/unit/shape_util_test.py index 98812692..61c78e51 100644 --- a/test/unit/shape_util_test.py +++ b/test/unit/shape_util_test.py @@ -45,6 +45,7 @@ def testSimple(self): simple_shape_obj = {"type": type_json, "shape": simple_shape_json} vstr_simple_shape_obj = {"type": vstr_json, "shape": simple_shape_json} resizable_shape_obj = {'class': 'H5S_SIMPLE', 'dims': [10], 'maxdims': [20]} + unlimited_shape_obj = {'class': 'H5S_SIMPLE', 'dims': [0, 20], 'maxdims': ["H5S_UNLIMITED", 40]} self.assertEqual(getShapeClass(null_shape_json), "H5S_NULL") self.assertEqual(getShapeClass(null_shape_obj), "H5S_NULL") @@ -55,6 +56,7 @@ def testSimple(self): self.assertEqual(getShapeClass(simple_shape_obj), "H5S_SIMPLE") self.assertEqual(getShapeClass(vstr_simple_shape_obj), "H5S_SIMPLE") self.assertEqual(getShapeClass(resizable_shape_obj), "H5S_SIMPLE") + self.assertEqual(getShapeClass(unlimited_shape_obj), "H5S_SIMPLE") self.assertEqual(getShapeDims(null_shape_json), None) self.assertEqual(getShapeDims(null_shape_obj), None) @@ -66,6 +68,7 @@ def testSimple(self): self.assertEqual(getShapeDims(vstr_simple_shape_obj), (5, 7)) self.assertEqual(getShapeDims(12), (12,)) self.assertEqual(getShapeDims(resizable_shape_obj), (10,)) + self.assertEqual(getShapeDims(unlimited_shape_obj), (0, 20)) self.assertEqual(getMaxDims(null_shape_json), None) self.assertEqual(getMaxDims(null_shape_obj), None) @@ -76,6 +79,7 @@ def testSimple(self): self.assertEqual(getMaxDims(simple_shape_obj), (5, 7)) self.assertEqual(getMaxDims(vstr_simple_shape_obj), (5, 7)) self.assertEqual(getMaxDims(resizable_shape_obj), (20,)) + self.assertEqual(getMaxDims(unlimited_shape_obj), ("H5S_UNLIMITED", 40)) self.assertEqual(getRank(null_shape_json), 0) self.assertEqual(getRank(null_shape_obj), 0) @@ -86,6 +90,7 @@ def testSimple(self): self.assertEqual(getRank(simple_shape_obj), 2) 
self.assertEqual(getRank(vstr_simple_shape_obj), 2) self.assertEqual(getRank(resizable_shape_obj), 1) + self.assertEqual(getRank(unlimited_shape_obj), 2) self.assertEqual(getNumElements(null_shape_json), 0) self.assertEqual(getNumElements(null_shape_obj), 0) @@ -96,6 +101,7 @@ def testSimple(self): self.assertEqual(getNumElements(simple_shape_obj), 35) self.assertEqual(getNumElements(vstr_simple_shape_obj), 35) self.assertEqual(getNumElements(resizable_shape_obj), 10) + self.assertEqual(getNumElements(unlimited_shape_obj), 0) self.assertEqual(getNumElements(()), 1) self.assertEqual(getNumElements([1, 2, 3]), 6) @@ -108,6 +114,7 @@ def testSimple(self): self.assertEqual(isNullSpace(simple_shape_obj), False) self.assertEqual(isNullSpace(vstr_simple_shape_obj), False) self.assertEqual(isNullSpace(resizable_shape_obj), False) + self.assertEqual(isNullSpace(unlimited_shape_obj), False) self.assertEqual(isScalar(null_shape_json), False) self.assertEqual(isScalar(null_shape_obj), False) @@ -118,6 +125,7 @@ def testSimple(self): self.assertEqual(isScalar(simple_shape_obj), False) self.assertEqual(isScalar(vstr_simple_shape_obj), False) self.assertEqual(isScalar(resizable_shape_obj), False) + self.assertEqual(isScalar(unlimited_shape_obj), False) self.assertEqual(getDataSize(null_shape_json, 4), 0) self.assertEqual(getDataSize(null_shape_obj, 4), 0) @@ -128,6 +136,8 @@ def testSimple(self): self.assertEqual(getDataSize(simple_shape_obj, 4), 140) self.assertEqual(getDataSize(vstr_simple_shape_obj, 4), 140) self.assertEqual(getDataSize(resizable_shape_obj, 4), 40) + self.assertEqual(getDataSize(unlimited_shape_obj, 4), 0) + self.assertEqual(getDataSize((), 4), 4) self.assertEqual(getDataSize([1, 2, 3], 4), 24) @@ -140,6 +150,7 @@ def testSimple(self): self.assertEqual(isScalar(simple_shape_obj), False) self.assertEqual(isScalar(vstr_simple_shape_obj), False) self.assertEqual(isScalar(resizable_shape_obj), False) + self.assertEqual(isScalar(unlimited_shape_obj), False) 
self.assertEqual(isExtensible(null_shape_json), False) self.assertEqual(isExtensible(null_shape_obj), False) @@ -150,6 +161,7 @@ def testSimple(self): self.assertEqual(isExtensible(simple_shape_obj), False) self.assertEqual(isExtensible(vstr_simple_shape_obj), False) self.assertEqual(isExtensible(resizable_shape_obj), True) + self.assertEqual(isExtensible(unlimited_shape_obj), True) if __name__ == "__main__": From 6f94e0736a26f8819cb0fc651c9c52a48119581e Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 2 Jan 2026 10:09:04 +0800 Subject: [PATCH 106/129] adjust dataset updates for resize --- src/h5json/hdf5db.py | 20 ++++++++++-- src/h5json/selections.py | 1 - src/h5json/shape_util.py | 6 ++-- test/unit/shape_util_test.py | 60 +++++++++++++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 6 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index eadf0dd0..a901d0ba 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -181,7 +181,7 @@ def deleted_objects(self): return self._deleted_objects @property - def resized_datsets(self): + def resized_datasets(self): return self._resized_datasets def _getDatasetUpdates(self, dset_id): @@ -769,8 +769,24 @@ def resizeDataset(self, dset_id, shape): if dset_id not in self.new_objects: self._resized_datasets.add(dset_id) - # if the shape has shrunk in any dimension, do a flush now new_dims = getShapeDims(dset_json) + rank = len(new_dims) + + # adjust any selections in the update list + updates = self._getDatasetUpdates(dset_id) + for i in range(len(updates)): + (sel_update, arr) = updates[i] + if sel_update.select_type == selections.H5S_SELECT_HYPERSLABS: + slices = list(sel_update.slices) + for dim in range(rank): + s = slices[dim] + if s.stop > new_dims[dim]: + # selection outside new bounds of dataset + slices[dim] = slice(s.start, new_dims[dim], s.step) + sel_update = selections.select(new_dims, tuple(slices)) + updates[i] = (sel_update, arr) + + # if the shape has shrunk in any 
dimension, do a flush now do_flush = False for i in range(len(new_dims)): if new_dims[i] < old_dims[i]: diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 93dd8bcb..75b06913 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -422,7 +422,6 @@ def __getitem__(self, args): start, count, step, scalar = _handle_simple(self._shape, args) self._sel = (start, count, step, scalar) - # self._id.select_hyperslab(start, count, step) self._select_type = H5S_SELECT_HYPERSLABS self._mshape = tuple(x for x, y in zip(count, scalar) if not y) diff --git a/src/h5json/shape_util.py b/src/h5json/shape_util.py index 753603cd..5f96c392 100644 --- a/src/h5json/shape_util.py +++ b/src/h5json/shape_util.py @@ -40,8 +40,8 @@ def getShapeJson(dims, maxdims=None): datasets) """ if isinstance(dims, int): dims = (dims, ) - if isinstance(maxdims, int): - maxdims = (maxdims, ) + elif dims == "H5S_NULL": + dims = None if dims is None: shape_class = "H5S_NULL" elif len(dims) == 0: @@ -58,6 +58,8 @@ def getShapeJson(dims, maxdims=None): if maxdims is not None: if shape_class != "H5S_SIMPLE": raise ValueError(f"maxdims can not be used with shape class: {shape_class}") + if isinstance(maxdims, int): + maxdims = (maxdims, ) if len(maxdims) != len(dims): raise ValueError("maxdims must match dataspace rank") # convert any 0 or None vlues to "H5S_UNLIMITED" diff --git a/test/unit/shape_util_test.py b/test/unit/shape_util_test.py index 61c78e51..4266e2a9 100644 --- a/test/unit/shape_util_test.py +++ b/test/unit/shape_util_test.py @@ -12,7 +12,7 @@ import unittest import logging -from h5json.shape_util import getShapeClass, getShapeDims, getNumElements, getRank +from h5json.shape_util import getShapeClass, getShapeDims, getNumElements, getRank, getShapeJson from h5json.shape_util import isNullSpace, isScalar, getDataSize, isExtensible, getMaxDims @@ -23,6 +23,64 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() 
self.logger.setLevel(logging.WARNING) + def testGetShape(self): + + null_shape = getShapeJson("H5S_NULL") + self.assertTrue("class" in null_shape) + self.assertEqual(null_shape["class"], "H5S_NULL") + self.assertFalse("dims" in null_shape) + self.assertFalse("maxdims" in null_shape) + + null_shape = getShapeJson(None) + self.assertTrue("class" in null_shape) + self.assertEqual(null_shape["class"], "H5S_NULL") + self.assertFalse("dims" in null_shape) + self.assertFalse("maxdims" in null_shape) + + scalar_shape = getShapeJson(()) + self.assertTrue("class" in scalar_shape) + self.assertEqual(scalar_shape["class"], "H5S_SCALAR") + self.assertTrue("dims" not in scalar_shape) + self.assertFalse("maxdims" in scalar_shape) + + simple_shape = getShapeJson(42) + self.assertTrue("class" in simple_shape) + self.assertEqual(simple_shape["class"], "H5S_SIMPLE") + self.assertTrue("dims" in simple_shape) + self.assertEqual(simple_shape["dims"], (42, )) + self.assertFalse("maxdims" in simple_shape) + + simple_shape = getShapeJson((42, )) + self.assertTrue("class" in simple_shape) + self.assertEqual(simple_shape["class"], "H5S_SIMPLE") + self.assertTrue("dims" in simple_shape) + self.assertEqual(simple_shape["dims"], (42, )) + self.assertFalse("maxdims" in simple_shape) + + extendable_shape = getShapeJson((4, 5), maxdims=("H5S_UNLIMITED", 10)) + self.assertTrue("class" in extendable_shape) + self.assertEqual(extendable_shape["class"], "H5S_SIMPLE") + self.assertTrue("dims" in extendable_shape) + self.assertEqual(extendable_shape["dims"], (4, 5)) + self.assertTrue("maxdims" in extendable_shape) + self.assertTrue(extendable_shape["maxdims"], ("H5S_UNLIMITED", 10)) + + extendable_shape = getShapeJson((4, 5), maxdims=(None, 10)) + self.assertTrue("class" in extendable_shape) + self.assertEqual(extendable_shape["class"], "H5S_SIMPLE") + self.assertTrue("dims" in extendable_shape) + self.assertEqual(extendable_shape["dims"], (4, 5)) + self.assertTrue("maxdims" in extendable_shape) + 
self.assertTrue(extendable_shape["maxdims"], ("H5S_UNLIMITED", 10)) + + extendable_shape = getShapeJson((4, 5), maxdims=(0, 10)) + self.assertTrue("class" in extendable_shape) + self.assertEqual(extendable_shape["class"], "H5S_SIMPLE") + self.assertTrue("dims" in extendable_shape) + self.assertEqual(extendable_shape["dims"], (4, 5)) + self.assertTrue("maxdims" in extendable_shape) + self.assertTrue(extendable_shape["maxdims"], ("H5S_UNLIMITED", 10)) + def testSimple(self): type_json = { From b4485eb44675b32c38e565776b6d731f004f8a84 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 2 Jan 2026 11:20:43 +0800 Subject: [PATCH 107/129] test broadcasting --- src/h5json/hdf5db.py | 2 +- test/unit/h5py_writer_test.py | 30 ++++++++++++++++++++++++------ test/unit/hdf5db_test.py | 17 +++++++++++++++++ 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index a901d0ba..b4758ff0 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -753,7 +753,7 @@ def setDatasetValues(self, dset_id, sel, arr): if sel.select_type != selections.H5S_SELECT_HYPERSLABS: raise ValueError("tbd") arr = arr.reshape(sel.mshape) - updates.append((sel, arr.copy())) + updates.append((sel, arr)) self.make_dirty(dset_id) def resizeDataset(self, dset_id, shape): diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 6e208ada..567c1439 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -97,12 +97,13 @@ def testSimple(self): g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) - arr = np.zeros((10, 10), dtype=np.int32) - for i in range(10): - for j in range(10): - arr[i, j] = i * j + + # try setting dset values with broadcasting + arr_one_value = np.zeros((1, 1), dtype=np.int32) + arr_one_value[0, 0] = 42 sel_all = selections.select((10, 10), ...) 
- db.setDatasetValues(dset_111_id, sel_all, arr) + db.setDatasetValues(dset_111_id, sel_all, arr_one_value) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) db.createSoftLink(g2_id, "slink", "somewhere") db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") @@ -126,12 +127,29 @@ def testSimple(self): self.assertEqual(dset.shape, (10, 10)) for i in range(10): for j in range(10): - self.assertEqual(dset[i, j], i * j) + self.assertEqual(dset[i, j], 42) self.assertTrue("g2" in f) g2 = f["g2"] self.assertTrue("extlink" in g2) self.assertTrue("slink" in g2) + # write dataset values element by element + db.open() + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + db.close() + + # verify changes in h5py + with h5py.File(filepath) as f: + dset = f["/g1/g1.1/dset1.1.1"] + for i in range(10): + for j in range(10): + self.assertEqual(dset[i, j], i * j) + db.open() db.createAttribute(g1_id, "a1", "hello") db.createAttribute(g1_id, "a2", "bye-bye") diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index f7d27f76..b3b4891c 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -478,6 +478,23 @@ def testSimpleDataset(self): self.assertEqual(val.shape, (1, 1)) self.assertEqual(val[0, 0], i * 10 + j) + # test select all write + sel = selections.select(shape, ...) + print("got sel:", sel) + print(sel.select_type) + arr = np.zeros(shape, dtype=dtype) + arr[...] 
= 42 + db.setDatasetValues(dset_id, sel, arr) + arr = db.getDatasetValues(dset_id, sel) + for i in range(nrows): + for j in range(ncols): + self.assertEqual(arr[i, j], 42) + + # try with broadcasting + arr_one_value = np.zeros((1, 1), dtype=dtype) + arr_one_value[0, 0] = 7 + db.setDatasetValues(dset_id, sel, arr_one_value) + db.close() def testStringDataset(self): From bb4d148a213e5ae53fd6d7e1f81cb61faff62cb2 Mon Sep 17 00:00:00 2001 From: John Readey Date: Fri, 2 Jan 2026 18:55:35 +0800 Subject: [PATCH 108/129] added data limit option to json writer --- src/h5json/apps/h5tojson.py | 16 +++-- src/h5json/hdf5db.py | 2 +- src/h5json/jsonstore/h5json_writer.py | 33 +++++++-- test/unit/h5json_writer_test.py | 99 +++++++++++++++++++++++---- test/unit/hdf5db_test.py | 5 +- 5 files changed, 127 insertions(+), 28 deletions(-) diff --git a/src/h5json/apps/h5tojson.py b/src/h5json/apps/h5tojson.py index 284de84c..24b5716e 100755 --- a/src/h5json/apps/h5tojson.py +++ b/src/h5json/apps/h5tojson.py @@ -20,14 +20,22 @@ def main(): if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"): - print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + print(f"usage: {sys.argv[0]} [-h] [--nodata] [--data-limit n] ") sys.exit(0) - no_data = False + data_limit = None filename = None for i in range(1, len(sys.argv)): if sys.argv[i] == "--nodata": - no_data = True + data_limit = 0 + elif sys.argv[i] == "--data-limit": + i += 1 + if i >= len(sys.argv): + sys.exit("Error: --data-limit requires a numeric argument") + try: + data_limit = int(sys.argv[i]) + except ValueError: + sys.exit("Error: --data-limit requires a numeric argument") else: filename = sys.argv[i] @@ -45,7 +53,7 @@ def main(): db = Hdf5db(app_logger=log) db.reader = H5pyReader(filename, app_logger=log) - db.writer = H5JsonWriter(None, no_data=no_data, app_logger=log) + db.writer = H5JsonWriter(None, data_limit=data_limit, app_logger=log) db.open() # read HDF5 data into db db.close() # close will trigger write to json file diff --git 
a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index b4758ff0..0085b93e 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -215,7 +215,7 @@ def flush(self): # flush not successful, don't clear dirty set self.log.error("writer flush failed") return False - + self.log.debug("clearing new, dirty, deleted sets") # reset new, dirty and deleted sets self._new_objects.clear() self._dirty_objects.clear() diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index f37ac415..f97df007 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -17,6 +17,7 @@ from ..h5writer import H5Writer from ..objid import getUuidFromId, getCollectionForId, createObjId from ..array_util import bytesArrayToList +from ..hdf5dtype import getItemSize from .. import selections @@ -30,15 +31,20 @@ def __init__( self, filepath, append=False, - no_data=False, + data_limit=None, + indent=4, app_logger=None ): + no_data = True if data_limit == 0 else False super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) if append: raise ValueError("H5JsonWriter does not support append mode") self.alias_db = {} self.json = {} + self._data_limit = data_limit self._root_id = None + self._indent = indent + self._file_dumped = False def flush(self): """ Write dirty items """ @@ -49,7 +55,12 @@ def flush(self): raise IOError(msg) self.log.info("flush") - self.dumpFile() + if self._file_dumped: + self.log.info("flush: file already dumped, nothing to do") + else: + self.dumpFile() + self._file_dumped = True + return True def open(self): @@ -196,7 +207,8 @@ def dumpDataset(self, obj_id): alias = self.getAliasList(obj_id) response["alias"] = alias - response["type"] = item["type"] + type_item = item["type"] + response["type"] = type_item shapeItem = item["shape"] shape_rsp = {} num_elements = 1 @@ -229,8 +241,15 @@ def dumpDataset(self, obj_id): attributes = self.dumpAttributes(obj_id) if attributes: 
response["attributes"] = attributes - - if not self.no_data: + if self._data_limit is not None: + item_size = getItemSize(type_item) + if item_size == "H5T_VARIABLE": + item_size = 1024 # assume average size for variable length types + total_size = item_size * num_elements + + if total_size > self._data_limit: + self.log.info(f"skipping data dump for dataset {obj_id} with {num_elements} elements") + if self._data_limit is None or total_size <= self._data_limit: if num_elements > 0: sel_all = selections.select(dims, ...) arr = self.db.getDatasetValues(obj_id, sel_all) @@ -287,10 +306,10 @@ def dumpFile(self): self.dumpDatasets() self.dumpDatatypes() - indent = 4 + indent = self._indent ensure_ascii = True if self._filepath: - with open('data.json', 'w', encoding='utf-8') as f: + with open(self._filepath, 'w', encoding='utf-8') as f: json.dump(self.json, f, ensure_ascii=ensure_ascii, indent=indent) else: print(json.dumps(self.json, sort_keys=True, ensure_ascii=ensure_ascii, indent=indent)) diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py index ba2cbc19..bb1b8a5d 100644 --- a/test/unit/h5json_writer_test.py +++ b/test/unit/h5json_writer_test.py @@ -11,10 +11,12 @@ ############################################################################## import unittest import time +from os.path import getsize import logging import numpy as np from h5json import Hdf5db from h5json.jsonstore.h5json_writer import H5JsonWriter + from h5json.hdf5dtype import special_dtype, Reference from h5json import selections @@ -44,7 +46,7 @@ def __init__(self, *args, **kwargs): def testSimple(self): - filepath = "test/unit/out/h5json_writer_testSimple.h5" + filepath = "test/unit/out/h5json_writer_testSimple.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -71,12 +73,12 @@ def testSimple(self): db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") db.createCustomLink(g2_id, "cust", {"foo": "bar"}) 
self.assertTrue(db.writer.lastModified is None) # no update yet - db.flush() + db.close() self.assertTrue(db.writer.lastModified > 0) # timestamp should be updated def testNullSpaceAttribute(self): - filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.h5" + filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -90,9 +92,10 @@ def testNullSpaceAttribute(self): self.assertTrue(item["created"] > time.time() - 1.0) value = db.getAttributeValue(root_id, "A1") self.assertEqual(value, None) + db.close() def testScalarAttribute(self): - filepath = "test/unit/out/h5json_writer_testScalarAttribute.h5" + filepath = "test/unit/out/h5json_writer_testScalarAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -116,9 +119,10 @@ def testScalarAttribute(self): self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db.close() def testFixedStringAttribute(self): - filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.h5" + filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -138,9 +142,10 @@ def testFixedStringAttribute(self): self.assertTrue(item["created"] > now - 1) ret_value = db.getAttributeValue(root_id, "A1") self.assertEqual(ret_value, b'Hello, world!') + db.close() def testVlenAsciiAttribute(self): - filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.h5" + filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -163,9 +168,10 @@ def testVlenAsciiAttribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) + db.close() def testVlenUtf8Attribute(self): 
- filepath = "test/unit/out/h5json_writer_testVlenutf8Attribute.h5" + filepath = "test/unit/out/h5json_writer_testVlenutf8Attribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -188,9 +194,10 @@ def testVlenUtf8Attribute(self): self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) self.assertTrue(item["created"] > now - 1) + db.close() def testIntAttribute(self): - filepath = "test/unit/out/h5json_writer_testIntAttribute.h5" + filepath = "test/unit/out/h5json_writer_testIntAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -207,9 +214,10 @@ def testIntAttribute(self): item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db.close() def testCreateReferenceAttribute(self): - filepath = "test/unit/out/h5json_writer_testCreateReferenceAttribute.h5" + filepath = "test/unit/out/h5json_writer_testCreateReferenceAttribute.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -223,19 +231,22 @@ def testCreateReferenceAttribute(self): ds1_ref = "datasets/" + dset_id value = [ds1_ref,] db.createAttribute(root_id, "A1", value, dtype=dt) - item = db.getAttribute(root_id, "A1") attr = db.getAttribute(root_id, "A1") self.assertTrue("shape" in attr) + shape = attr["shape"] + self.assertEqual(shape["class"], "H5S_SIMPLE") + self.assertEqual(shape["dims"], [1,]) attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") - attr_value = item["value"] + attr_value = attr["value"] self.assertEqual(len(attr_value), 1) self.assertEqual(attr_value[0], ds1_ref) + db.close() def testCreateVlenReferenceAttribute(self): - filepath = "test/unit/out/h5json_writer_testVlenReferenceAttribute.h5" + filepath = "test/unit/out/h5json_writer_testVlenReferenceAttribute.json" db = 
Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -268,9 +279,10 @@ def testCreateVlenReferenceAttribute(self): item_shape = item["shape"] self.assertEqual(item_shape["class"], "H5S_SCALAR") + db.close() def testCommittedType(self): - filepath = "test/unit/out/h5json_writer_testCommittedType.h5" + filepath = "test/unit/out/h5json_writer_testCommittedType.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -299,9 +311,10 @@ def testCommittedType(self): self.assertEqual(attr_type["class"], "H5T_STRING") self.assertEqual(attr_type["length"], 15) self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + db.close() def testCommittedCompoundType(self): - filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.h5" + filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.json" db = Hdf5db(app_logger=self.log) db.writer = H5JsonWriter(filepath, app_logger=self.log) @@ -340,6 +353,64 @@ def testCommittedCompoundType(self): value = db.getAttributeValue(root_id, "A1") self.assertTrue(isinstance(value, np.ndarray)) + db.close() + + def testNoData(self): + + def init_db(db): + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + dset_0_id = db.createDataset(shape=(), dtype=np.int32) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + sel_all = selections.select((), ...) 
+ db.setDatasetValues(dset_0_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createHardLink(g1_1_id, "dset0", dset_0_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + + def save_json(filepath, data_limit=None): + db = Hdf5db(app_logger=self.log) + kwargs = {"indent": 2, "app_logger": self.log} + db.writer = H5JsonWriter(filepath, data_limit=data_limit, **kwargs) + db.open() + init_db(db) + db.close() + file_size = getsize(filepath) + return file_size + + file_prefix = "test/unit/out/h5json_writer_testNoData_" + + size_with_data = save_json(file_prefix + "withData.json", data_limit=None) + # should be close to 4640 + self.assertTrue(size_with_data > 4000) + + size_without_data = save_json(file_prefix + "withoutData.json", data_limit=0) + # should be close to 3038 + self.assertTrue(size_without_data > 3000) + self.assertTrue(size_without_data < 4000) + + size_with_smalldata = save_json(file_prefix + "withSmallData.json", data_limit=100) + # should be close to 3057 + self.assertTrue(size_with_smalldata > size_without_data) + self.assertTrue(size_with_smalldata < size_with_data) if __name__ == "__main__": diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index b3b4891c..e34dd3b3 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -145,6 +145,9 @@ def testGroup(self): self.assertEqual(len(db.getAttributes(g1_id)), 2) a1_attr = db.getAttribute(g1_id, "a1") self.assertEqual(a1_attr["value"], "hello") + self.assertTrue("shape" in a1_attr) + attr_shape = a1_attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") db.deleteAttribute(g1_id, "a1") self.assertEqual(len(db.getAttributes(g1_id)), 1) @@ -480,8 +483,6 @@ def testSimpleDataset(self): # test select all write sel = selections.select(shape, ...) 
- print("got sel:", sel) - print(sel.select_type) arr = np.zeros(shape, dtype=dtype) arr[...] = 42 db.setDatasetValues(dset_id, sel, arr) From db47efae7f4cbc2fc060e04ab8ee2477032b391c Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 6 Jan 2026 19:05:39 +0800 Subject: [PATCH 109/129] fix for H5S_UNLIMITED --- src/h5json/dset_util.py | 2 +- src/h5json/hdf5db.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py index 50340438..d3f8dfba 100644 --- a/src/h5json/dset_util.py +++ b/src/h5json/dset_util.py @@ -256,7 +256,7 @@ def validateLayout(shape_json, type_json, layout): if max_dims is None: if chunk_extent > dim_extent: msg = "Invalid layout value" - raise ValueError(reason=msg) + raise ValueError(msg) elif max_dims[i] not in (0, "H5S_UNLIMITED"): if chunk_extent > max_dims[i]: msg = "Invalid layout value for extensible dimension" diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 0085b93e..d3ceeb01 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -346,7 +346,7 @@ def getObjectById(self, obj_id, refresh=False): """ return object with given id """ self._checkReader() obj_id = getHashTagForId(obj_id) - if obj_id not in self.db or refresh: + if obj_id not in self.db or (refresh and not self.is_new(obj_id)): # load the obj from the reader self.log.debug(f"getObjectById - fetching {obj_id} from reader") obj_json = self.reader.getObjectById(obj_id) From 3a2e6b29fbddc51fb6ed130e8197b1b2deeb9542 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 8 Jan 2026 13:12:24 +0800 Subject: [PATCH 110/129] added link_util file --- src/h5json/link_util.py | 146 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 src/h5json/link_util.py diff --git a/src/h5json/link_util.py b/src/h5json/link_util.py new file mode 100644 index 00000000..5d659210 --- /dev/null +++ b/src/h5json/link_util.py @@ -0,0 +1,146 @@ 
+############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# link_util: +# link related functions +# +from h5json.objid import isValidUuid + + +def validateLinkName(name): + """ verify the link name is valid """ + if not isinstance(name, str): + msg = "Unexpected type for link name" + raise ValueError(msg) + if name.find("/") >= 0: + msg = "link name contains slash" + raise ValueError(msg) + + +def getLinkClass(link_json): + """ verify this is a valid link + returns the link class """ + if "class" in link_json: + link_class = link_json["class"] + else: + link_class = None + if "h5path" in link_json and "id" in link_json: + msg = "link tgt_id and h5path both set" + raise ValueError(msg) + if "id" in link_json: + tgt_id = link_json["id"] + if not isValidUuid(tgt_id): + msg = f"link with invalid id: {tgt_id}" + raise ValueError(msg) + if link_class: + if link_class != "H5L_TYPE_HARD": + msg = f"expected link class to be H5L_TYPE_HARD but got: {link_class}" + raise ValueError(msg) + else: + link_class = "H5L_TYPE_HARD" + elif link_json.get("h5path"): + if link_json.get("h5domain") or link_json.get("file"): + if link_class: + if link_class != "H5L_TYPE_EXTERNAL": + msg = f"expected link class to be H5L_TYPE_EXTERNAL but got: {link_class}" + raise ValueError(msg) + else: + link_class = "H5L_TYPE_EXTERNAL" + else: + if link_class: + if link_class != "H5L_TYPE_SOFT": + msg = f"expected link class to be H5L_TYPE_SOFT 
but got: {link_class}" + raise ValueError(msg) + else: + link_class = "H5L_TYPE_SOFT" + else: + msg = "link with no id or h5path" + raise ValueError(msg) + + return link_class + + +def getLinkId(link_json): + """ return id for hard links, otherwise raise type error """ + if getLinkClass(link_json) != "H5L_TYPE_HARD": + raise TypeError("expected hard link") + return link_json["id"] + + +def getLinkPath(link_json): + """ Returns h5path for soft or external link. Otherwise raise type error """ + + if getLinkClass(link_json) not in ("H5L_TYPE_SOFT", "H5L_TYPE_EXTERNAL"): + raise TypeError("expected soft or external link") + + return link_json["h5path"] + + +def getLinkFilePath(link_json): + """ return file path for an external link. Otherwise raise type error """ + if getLinkClass(link_json) != "H5L_TYPE_EXTERNAL": + raise TypeError("expected External Link") + if "file" in link_json: + link_file = link_json["file"] + elif "h5domain" in link_json: + # h5domain was the deprecated storage key + # check for backward compatibility + link_file = link_json["h5domain"] + else: + raise KeyError("unexpected link format") + return link_file + + +def isEqualLink(link1, link2): + """ Return True if the two links are the same """ + + for obj in (link1, link2): + if not isinstance(obj, dict): + raise TypeError(f"unexpected type: {type(obj)}") + if "class" not in obj: + raise TypeError("expected class key for link") + link_class = getLinkClass(link1) + if link_class != getLinkClass(link2): + return False # different link types + if link_class == "H5L_TYPE_HARD": + if getLinkId(link1) != getLinkId(link2): + return False + else: + return True + elif link_class == "H5L_TYPE_SOFT": + if getLinkPath(link1) != getLinkPath(link2): + return False + else: + return True + elif link_class == "H5L_TYPE_EXTERNAL": + if getLinkPath(link1) != getLinkPath(link2): + return False + if getLinkFilePath(link1) != getLinkFilePath(link2): + return False + return True + else: + raise TypeError(f"unexpected 
link class: {link_class}") + + +def h5Join(path, paths): + """ join the paths """ + + h5path = path + if not paths: + return h5path + if isinstance(paths, str): + paths = (paths,) + for s in paths: + if h5path[-1] != "/": + h5path += "/" + h5path += s + return h5path From c8f2aa35261f497f535bfc84f87301c1435c1a83 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 13 Jan 2026 18:24:51 -0800 Subject: [PATCH 111/129] fix circular import --- src/h5json/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index d4a7f781..d44ab67c 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -30,6 +30,3 @@ from .objid import isSchema2Id from .objid import isRootObjId from .hdf5db import Hdf5db -from . import _version - -__version__ = _version.__version__ From 40c47052bc294693b9cc94cca740d19f02d75fad Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 27 Jan 2026 22:33:58 -0800 Subject: [PATCH 112/129] fix for vlen types --- src/h5json/array_util.py | 76 ++++++++++++++++++++++------- src/h5json/h5pystore/h5py_writer.py | 29 ++++++++++- src/h5json/hdf5db.py | 6 ++- src/h5json/hdf5dtype.py | 23 ++++++--- test/unit/array_util_test.py | 66 +++++++++++++++---------- test/unit/hdf5db_test.py | 1 + 6 files changed, 149 insertions(+), 52 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index e57a3892..39966715 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -15,7 +15,7 @@ import binascii import numpy as np -from .hdf5dtype import isVlen, is_float16_dtype, guess_dtype +from .hdf5dtype import isVlen, is_float16_dtype, guess_dtype, vlenBaseType MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million @@ -108,8 +108,6 @@ def jsonToArray(data_shape, data_dtype, data_json): Return numpy array from the given json array. 
""" - # print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") - def get_array(data, rank, dtype): # helper function to create an array with encoding if needed try: @@ -120,28 +118,72 @@ def get_array(data, rank, dtype): arr = np.array(data, dtype=dtype) return arr - if data_json is None: - return np.array([]).astype(data_dtype) + def fillVlenArray(rank, data, arr, index): + for i in range(len(data)): + if rank > 1: + index = fillVlenArray(rank - 1, data[i], arr, index) + elif len(arr.dtype) > 0: + # deal with compound dtype + element_data = data[i] + arr_element = [] + for j in range(len(arr.dtype)): + compound_data = element_data[j] + compound_dtype = arr.dtype[j] + if isVlen(compound_dtype): + base_dt = vlenBaseType(compound_dtype) + if base_dt is str and isinstance(compound_data, bytes): + compound_data = compound_data.decode('utf8') + if base_dt in (str, bytes): + arr_element.append(compound_data) + else: + arr_element.append(np.array(compound_data, base_dt)) + else: + arr_element.append(compound_data) + arr[i] = tuple(arr_element) + index += 1 + else: + base_dt = vlenBaseType(arr.dtype) + element_data = data[i] + # If base dtype is str and data is bytes, decode it first + if base_dt is str and isinstance(element_data, bytes): + element_data = element_data.decode('utf8') + arr_element = np.array(element_data, base_dt) + arr[index] = arr_element + index += 1 + return index - if isinstance(data_json, (list, tuple)): - if None in data_json: - return np.array([]).astype(data_dtype) + if data_json is None: + return np.array(data_shape).astype(data_dtype) - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. 
- if len(data_dtype) > 0 and not isinstance(data_json, (list, tuple)): - raise TypeError("expected list data for compound data type") npoints = getNumElements(data_shape) np_shape_rank = len(data_shape) - if type(data_json) in (list, tuple): - data_json = toTuple(np_shape_rank, data_json) + was_list_input = type(data_json) in (list, tuple) + if was_list_input: + converted_data = [] + if npoints == 1 and len(data_json) == len(data_dtype): + converted_data.append(toTuple(0, data_json)) + else: + converted_data = toTuple(np_shape_rank, data_json) + data_json = converted_data + else: + if isinstance(data_json, str): + data_json = data_json.encode("utf8") + data_json = [data_json,] # listify if isVlen(data_dtype): - # for vlen data we need to initialize of zero numpy array to ensure the right shape + # For scalar vlen where input was a list with multiple items (e.g. ['ref1', 'ref2'] + # for vlen refs), the items represent vlen contents for the single scalar, not + # separate array elements. Wrap so fillVlenArray sees one element. + # Skip wrapping if already has 1 element (e.g. [('foo', 'bar')] is already correct). + if np_shape_rank == 0 and len(data_dtype) == 0 and was_list_input and len(data_json) > 1: + data_json = [data_json] + # for vlen data we need to initialize a zero numpy array to ensure the right shape + arr = np.zeros((npoints,), dtype=data_dtype) + fillVlenArray(np_shape_rank, data_json, arr, 0) + elif all(e is None for e in data_json): + # just create a zero array arr = np.zeros(data_shape, dtype=data_dtype) - arr[...] 
= data_json else: try: arr = get_array(data_json, np_shape_rank, data_dtype) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index b801af83..0bb7fc9d 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -15,7 +15,7 @@ import time from ..objid import getCollectionForId, isValidUuid, createObjId -from ..hdf5dtype import createDataType +from ..hdf5dtype import createDataType, isVlen, vlenBaseType from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype from ..shape_util import getShapeDims, getShapeClass, isExtensible, getMaxDims from ..array_util import jsonToArray @@ -141,6 +141,32 @@ def _copy_array(self, src_arr, fout=None): element = self._copy_element(e, src_arr.dtype, tgt_dt, fout=fout) tgt_arr_flat[i] = element tgt_arr = tgt_arr_flat.reshape(src_arr.shape) + elif len(src_arr.dtype) == 0 and isVlen(src_arr.dtype) and vlenBaseType(src_arr.dtype) in (bytes, str): + # vlen strings need elements converted to Python str for h5py + count = int(np.prod(src_arr.shape)) + tgt_dt = h5py.special_dtype(vlen=str) + tgt_arr = np.zeros(src_arr.shape, dtype=tgt_dt) + tgt_arr_flat = tgt_arr.reshape((count,)) + src_arr_flat = src_arr.reshape((count,)) + for i in range(count): + e = src_arr_flat[i] + if isinstance(e, str): + tgt_arr_flat[i] = e + elif isinstance(e, bytes): + tgt_arr_flat[i] = e.decode('utf-8') + elif isinstance(e, np.ndarray) and e.dtype.kind == 'S': + # numpy byte string array - convert to Python string + tgt_arr_flat[i] = e.item().decode('utf-8') + elif isinstance(e, np.ndarray) and e.dtype.kind == 'U': + # numpy unicode array - get Python string + tgt_arr_flat[i] = e.item() + elif isinstance(e, np.bytes_): + tgt_arr_flat[i] = e.decode('utf-8') + elif isinstance(e, np.str_): + tgt_arr_flat[i] = str(e) + else: + tgt_arr_flat[i] = e + tgt_arr = tgt_arr_flat.reshape(src_arr.shape) else: # can just copy the entire array tgt_arr[...] = src_arr[...] 
@@ -366,6 +392,7 @@ def initializeDatasetValues(self, dset_id, dset): sel_all = selections.select(dset.shape, ...) arr = self.db.getDatasetValues(dset_id, sel_all) if arr is not None: + arr = self._copy_array(arr, fout=dset.file) dset[...] = arr def createAttribute(self, obj, name, attr_json): diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index d3ceeb01..d19f7da5 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -574,7 +574,7 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): else: value_json = None - if shape is None: + if shape is None and value is not None: shape = value.shape if shape == "H5S_NULL": shape_json = {"class": "H5S_NULL"} @@ -588,7 +588,9 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): attrs_json = obj_json["attributes"] type_json = getTypeItem(dtype) # finally put it all together... - attr_json = {"shape": shape_json, "type": type_json, "value": value_json} + attr_json = {"shape": shape_json, "type": type_json} + if shape != "H5S_NULL": + attr_json["value"] = value_json attr_json["created"] = getNow() # slot into the obj_json["attrs"] diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index c0ed2884..defd09a2 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -235,7 +235,7 @@ class (either Reference or RegionReference). 
Returns None if the dtype name, dt = kwds.popitem() if name not in ("vlen", "enum", "ref"): - raise TypeError('Unknown special type "%s"' % name) + raise TypeError(f"Unknown special type {name}") try: return dt.metadata[name] @@ -341,11 +341,7 @@ def getTypeItem(dt, metadata=None): # vlen string or data # # check for h5py variable length extension - vlen_check = None - if metadata and "vlen" in metadata: - vlen_check = metadata["vlen"] - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) + vlen_check = vlenBaseType(dt) if metadata and "ref" in metadata: ref_check = metadata["ref"] @@ -509,6 +505,21 @@ def isVlen(dt): return is_vlen +def vlenBaseType(dt): + """ + Return the base dtype of a vlen, otherwise none + """ + if len(dt): + raise TypeError("BaseType can't be deterined for compound type") + if dt.base.metadata and "vlen" in dt.base.metadata: + base_dt = dt.base.metadata["vlen"] + if base_dt not in (bytes, str): + base_dt = np.dtype(base_dt) + else: + base_dt = None + return base_dt + + def isOpaqueDtype(dt): """ Return True if this is an opaque dtype diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 1ede343d..b3b7c266 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -234,7 +234,8 @@ def testJsonToArray(self): self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out[2], "three") + e = out[2] + self.assertEqual(e, "three".encode()) # test utf8 strings dt = np.dtype("S26") @@ -277,9 +278,13 @@ def testJsonToArray(self): # VLEN data shape = [] dt = special_dtype(vlen=np.dtype("S10")) - data = ["foo", "bar"] + data = [("foo", "bar")] out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertEqual(out[()][0], b'foo') + self.assertEqual(out[()][1], b'bar') dt = 
special_dtype(vlen=np.dtype("int32")) shape = [4, ] @@ -298,8 +303,11 @@ def testJsonToArray(self): self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) for i in range(4): e = out[i] # .tolist() - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, tuple(range(1, i + 2))) + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(e.shape, (i + 1,)) + self.assertEqual(e.dtype, np.dtype("int32")) + for j in range(i + 1): + self.assertEqual(e[j], j + 1) # VLEN 2D data dt = special_dtype(vlen=np.dtype("int32")) @@ -321,10 +329,18 @@ def testJsonToArray(self): self.assertEqual(out.shape, (2, 2)) self.assertEqual(out.dtype.kind, "O") self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) - for i in range(2): - for j in range(2): - e = out[i, j] # .tolist() - self.assertTrue(isinstance(e, tuple)) + e = out[0, 0] + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [0]) + e = out[0, 1] + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [1, 2]) + e = out[1, 0] + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [1]) + e = out[1, 1] + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [2, 3]) # create VLEN of obj ref's ref_type = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} @@ -352,14 +368,14 @@ def testJsonToArray(self): self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("S48")) e = out[0] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0,)) + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [id0,]) e = out[1] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1)) + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [id0, id1]) e = out[2] - self.assertTrue(isinstance(e, tuple)) - self.assertEqual(e, (id0, id1, id2)) + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(list(e), [id0, id1, id2]) # compound type dt = np.dtype([("a", "i4"), ("b", "S5")]) @@ 
-939,7 +955,6 @@ def array_equal(a, b): """ compare two values element by element.""" if type(a) in (list, tuple, np.void, np.ndarray): if len(a) != len(b): - print("number of elements doesn't match") return False nelements = len(a) for i in range(nelements): @@ -999,15 +1014,16 @@ def array_equal(a, b): data = [[42, "Hello"], [0, 0], [0, 0], [84, "Bye"]] arr = jsonToArray(shape, dt, data) self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(tuple(arr[0]), (42, 'Hello')) + self.assertEqual(tuple(arr[3]), (84, 'Bye')) buffer = arrayToBytes(arr) self.assertEqual(len(buffer), 40) expected = bytearray(40) - expected[0:8] = b"*\x00\x00\x00\x05\x00\x00\x00" - expected[8:19] = b"Hello\x00\x00\x00\x00\x00\x00" - expected[19:26] = b"\x00\x00\x00\x00\x00\x00\x00" - expected[26:40] = b"\x00\x00\x00T\x00\x00\x00\x03\x00\x00\x00Bye" - + expected[0:10] = b'*\x00\x00\x00\x05\x00\x00\x00He' + expected[10:20] = b'llo\x00\x00\x00\x00\x00\x00\x00' + expected[20:30] = b'\x00\x00\x00\x00\x00\x00\x00\x00\x00T' + expected[30:40] = b'\x00\x00\x00\x03\x00\x00\x00Bye' self.assertEqual(buffer, expected) # convert back to array @@ -1219,16 +1235,14 @@ def testGetNumpyValueBase64Encoded(self): def testJsonToArrayOnNoneArray(self): data_dtype = np.dtype("i4") - data_shape = [0, ] - data_json = [None] + data_shape = [3, ] + data_json = [None, None, None] arr = None - try: arr = jsonToArray(data_shape, data_dtype, data_json) except Exception as e: print(f"Exception while testing jsonToArray on array with None elements: {e}") - - self.assertTrue(len(arr) == 0) + self.assertEqual(arr.shape, (3, )) self.assertTrue(arr.dtype == data_dtype) def testGetBroadcastShape(self): @@ -1259,7 +1273,7 @@ def testJsonToArrayOnNoneCompoundArray(self): arr = jsonToArray(shape, dt, data) - self.assertEqual(len(arr), 0) + self.assertEqual(arr.shape, (1,)) self.assertEqual(arr.dtype, dt) diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index e34dd3b3..dcaf92fe 100644 --- 
a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -164,6 +164,7 @@ def testNullSpaceAttribute(self): shape_item = item["shape"] self.assertTrue("class" in shape_item) self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertFalse("value" in item) self.assertTrue(item["created"] > time.time() - 1.0) value = db.getAttributeValue(root_id, "A1") self.assertEqual(value, None) From 7295f6a4a0f978c7749a12ee03c94a0e366aff02 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 2 Feb 2026 17:45:07 -0800 Subject: [PATCH 113/129] fix for str encoding --- src/h5json/array_util.py | 7 ++++--- test/unit/array_util_test.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 39966715..3888c06f 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -144,11 +144,12 @@ def fillVlenArray(rank, data, arr, index): else: base_dt = vlenBaseType(arr.dtype) element_data = data[i] - # If base dtype is str and data is bytes, decode it first if base_dt is str and isinstance(element_data, bytes): element_data = element_data.decode('utf8') - arr_element = np.array(element_data, base_dt) - arr[index] = arr_element + if base_dt in (str, bytes): + arr[index] = element_data + else: + arr[index] = np.array(element_data, base_dt) index += 1 return index diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index b3b7c266..52d9f668 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -235,7 +235,7 @@ def testJsonToArray(self): self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") e = out[2] - self.assertEqual(e, "three".encode()) + self.assertEqual(e, "three") # test utf8 strings dt = np.dtype("S26") @@ -243,7 +243,7 @@ def testJsonToArray(self): data = "eight: \u516b" out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out[()], data.encode("utf8")) + 
self.assertEqual(out[()], data.encode()) dt = special_dtype(vlen=str) out = jsonToArray(shape, dt, data) From 3a9e57374b8d549f8f179d41123c24f25c518eb8 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 3 Feb 2026 13:11:30 -0800 Subject: [PATCH 114/129] update for vlen dsets --- src/h5json/array_util.py | 36 +++++++++++++++++++------------ test/unit/array_util_test.py | 23 +++++++++++--------- test/unit/h5py_writer_test.py | 40 +++++++++++++++++++++++++++++++++++ test/unit/hdf5db_test.py | 27 +++++++++++++++++++++++ 4 files changed, 102 insertions(+), 24 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 3888c06f..44f245ad 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -443,22 +443,22 @@ def readElement(buffer, offset, arr, index, dt): offset += 4 n = offset m = offset + count - if count > 0: - e_buffer = buffer[n:m] - offset += count - - if vlenBaseType is bytes: + if vlenBaseType is bytes or vlenBaseType is str: + if count > 0: + e_buffer = buffer[n:m] + offset += count arr[index] = bytes(e_buffer) - elif vlenBaseType is str: - s = e_buffer.decode("utf-8") - arr[index] = s else: - try: - e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) - except ValueError: - msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" - raise ValueError(msg) - arr[index] = e + arr[index] = b"" + elif count > 0: + e_buffer = buffer[n:m] + offset += count + try: + e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) + except ValueError: + msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" + raise ValueError(msg) + arr[index] = e return offset @@ -703,6 +703,14 @@ def ndarray_compare(arr1, arr2): # TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized if not isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): if not isinstance(arr1, np.void) and not isinstance(arr2, np.void): + if not arr1 and not arr2: + # treat 0, b"", and "" as equivalent 
(uninitialized vlen) + return True + # compare str and bytes by encoding/decoding + if isinstance(arr1, str) and isinstance(arr2, bytes): + return arr1.encode("utf-8") == arr2 + if isinstance(arr1, bytes) and isinstance(arr2, str): + return arr1 == arr2.encode("utf-8") return arr1 == arr2 if isinstance(arr1, np.void) and not isinstance(arr2, np.void): if arr1.size == 0 and not arr2: diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index 52d9f668..ba712d61 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -590,8 +590,14 @@ def testToBytes(self): self.assertEqual(buffer, expected) # convert back to array arr_copy = bytesToArray(buffer, dt, (5,)) - - self.assertTrue(ndarray_compare(arr, arr_copy)) + print("arr_copy[0]:", arr_copy[0]) + print("arr_copy[0] type:", type(arr_copy[0])) + + for i in range(4): + self.assertTrue(isinstance(arr_copy[i], bytes)) + self.assertEqual(arr_copy[i].decode(), arr[i]) + self.assertTrue(isinstance(arr_copy[4], bytes)) + self.assertEqual(arr_copy[4], b"") # VLEN of bytes dt = special_dtype(vlen=bytes) arr = np.zeros((5,), dtype=dt) @@ -684,10 +690,7 @@ def testToBytes(self): self.assertEqual(arr.dtype, arr_copy.dtype) self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) + self.assertTrue(ndarray_compare(arr, arr_copy)) # # VLEN ascii with array type # @@ -896,10 +899,7 @@ def testArrToBytesBase64(self): self.assertEqual(arr.dtype, arr_copy.dtype) self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) + self.assertTrue(ndarray_compare(arr, arr_copy)) # # VLEN ascii with array type # @@ -967,6 +967,9 @@ def array_equal(a, b): a = a.encode("utf8") if isinstance(b, str): b = b.encode("utf8") + # treat 0 and b"" as equivalent (uninitialized vlen) + if not a and not b: + return True if a != b: return False 
diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 567c1439..36c1dbd9 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -522,6 +522,46 @@ def testCreateVlenReferenceAttribute(self): ref_obj = f[a1[0]] self.assertEqual(ref_obj.name, "/DS1") + def testVlenStringDataset(self): + filepath = "test/unit/out/h5py_writer_test_testVlenStringDataset.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + nrows = 4 + shape = (nrows,) + dtype = special_dtype(vlen=str) + data = ["Hello", "HDF5", "REST", "API"] + init_arr = np.array(data, dtype=dtype) + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + + db.setDatasetValues(dset_id, sel_all, init_arr) + + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue(np.array_equal(arr, init_arr)) + sel_one = selections.select(shape, slice(2, 3)) + arr = db.getDatasetValues(dset_id, sel_one) + self.assertEqual(arr.shape, (1,)) + self.assertEqual(arr[0], 'REST') + + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("dset" in f) + dset = f["dset"] + self.assertEqual(dset.shape, (nrows,)) + self.assertEqual(dset.dtype, dtype) + for i in range(nrows): + self.assertEqual(dset[i], data[i].encode()) + def testCommittedType(self): filepath = "test/unit/out/h5py_writer_test_testCommittedType.h5" diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index dcaf92fe..9d89893c 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -566,6 +566,33 @@ def testBoolDataset(self): db.close() + def testVlenStringDataset(self): + nrows = 4 + shape = (nrows,) + dtype = special_dtype(vlen=str) + data = 
["Hello", "HDF5", "REST", "API"] + init_arr = np.array(data, dtype=dtype) + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + + db.setDatasetValues(dset_id, sel_all, init_arr) + + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue(np.array_equal(arr, init_arr)) + sel_one = selections.select(shape, slice(2, 3)) + arr = db.getDatasetValues(dset_id, sel_one) + self.assertEqual(arr.shape, (1,)) + self.assertEqual(arr[0], 'REST') + + db.close() + def testScalarDataset(self): dtype = np.int32 From 6201152884c6e706e32cca18cb6d36ccc049c1c1 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 5 Feb 2026 16:59:23 -0800 Subject: [PATCH 115/129] log warning on link replacement --- src/h5json/hdf5db.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index d19f7da5..82d1dd92 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -346,7 +346,7 @@ def getObjectById(self, obj_id, refresh=False): """ return object with given id """ self._checkReader() obj_id = getHashTagForId(obj_id) - if obj_id not in self.db or (refresh and not self.is_new(obj_id)): + if obj_id not in self.db or (refresh and not self.is_new(obj_id) and not self.is_dirty(obj_id)): # load the obj from the reader self.log.debug(f"getObjectById - fetching {obj_id} from reader") obj_json = self.reader.getObjectById(obj_id) @@ -852,6 +852,8 @@ def getLink(self, grp_id, name): def _addLink(self, grp_id, name, link_json): obj_json = self.getObjectById(grp_id) links = obj_json["links"] + if name in links: + self.log.warning(f"Link [{name}] already exists in {grp_id}") links[name] = link_json self.make_dirty(grp_id) From 4aecf515af7b312c9d77811b2fcfb68e592e7819 Mon Sep 17 
00:00:00 2001 From: John Readey Date: Fri, 13 Mar 2026 14:54:22 +0100 Subject: [PATCH 116/129] fix filtertest --- test/unit/dset_util_test.py | 15 +++++++++++++++ test/unit/hdf5db_test.py | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/test/unit/dset_util_test.py b/test/unit/dset_util_test.py index 364d8929..fae31451 100644 --- a/test/unit/dset_util_test.py +++ b/test/unit/dset_util_test.py @@ -104,11 +104,13 @@ def testFilterValidation(self): self.assertTrue(False) # should not reach here except ValueError: pass # filters are invalid with contiguous layout + cpl["layout"] = chunked_layout try: validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) except ValueError: self.assertTrue(False) # shouldn't raise exception + # add an invlaid level option for deflate deflate_filter["level"] = 20 try: @@ -116,11 +118,13 @@ def testFilterValidation(self): self.assertTrue(False) # should not reach here except ValueError: pass # invalid deflate level + deflate_filter["level"] = 5 try: validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) except ValueError: self.assertTrue(False) # shouldn't raise exception + # try with just a filter name gzip_filter = getFilterItem("gzip") cpl["filters"] = [gzip_filter, ] @@ -128,6 +132,7 @@ def testFilterValidation(self): validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) except ValueError: self.assertTrue(False) # shouldn't raise exception + # try with an invalid filter name cpl["filters"] = ["invalid_filter_name", ] try: @@ -145,6 +150,16 @@ def testFilterValidation(self): except ValueError: self.assertTrue(False) # shouldn't raise exception + sc_filter = {'class': 'H5Z_FILTER_SCALEOFFSET', 'id': 6, 'name': 'scaleoffset'} + sc_filter['scaleOffset'] = 12 + sc_filter['scaleType'] = 'H5Z_SO_INT' + filters = [sc_filter, ] + cpl["filters"] = filters + try: + validateDatasetCreationProps(cpl, type_json, dset_json["shape"]) + except ValueError: + self.assertTrue(False) # shouldn't raise exception + 
def testGuessChunk(self): typesize = "H5T_VARIABLE" diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 9d89893c..1fc13a0f 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -78,6 +78,13 @@ def testGroup(self): self.assertTrue(isValidUuid(g2_id, obj_class="groups")) db.createHardLink(root_id, "g2", g2_id) + root_obj = db.getObjectById(root_id) + self.assertTrue("links" in root_obj) + root_links = root_obj["links"] + self.assertTrue("g1" in root_links) + self.assertTrue("g2" in root_links) + self.assertEqual(len(root_links), 2) + g1_1_id = db.createGroup() self.assertTrue(isSchema2Id(g1_1_id)) self.assertFalse(isRootObjId(g1_1_id)) From 413827603a5df0a2098f469d44bf7b486f944210 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 16 Mar 2026 16:07:13 +0100 Subject: [PATCH 117/129] vlen array fix --- src/h5json/array_util.py | 9 +++++++-- test/unit/array_util_test.py | 10 ++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 44f245ad..56fc15be 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -447,9 +447,14 @@ def readElement(buffer, offset, arr, index, dt): if count > 0: e_buffer = buffer[n:m] offset += count - arr[index] = bytes(e_buffer) + if vlenBaseType is str: + e_buffer = e_buffer.decode("utf-8") + arr[index] = e_buffer else: - arr[index] = b"" + if vlenBaseType is str: + arr[index] = "" + else: + arr[index] = b"" elif count > 0: e_buffer = buffer[n:m] offset += count diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index ba712d61..f6196168 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -590,14 +590,8 @@ def testToBytes(self): self.assertEqual(buffer, expected) # convert back to array arr_copy = bytesToArray(buffer, dt, (5,)) - print("arr_copy[0]:", arr_copy[0]) - print("arr_copy[0] type:", type(arr_copy[0])) - - for i in range(4): - 
self.assertTrue(isinstance(arr_copy[i], bytes)) - self.assertEqual(arr_copy[i].decode(), arr[i]) - self.assertTrue(isinstance(arr_copy[4], bytes)) - self.assertEqual(arr_copy[4], b"") + self.assertTrue(ndarray_compare(arr, arr_copy)) + # VLEN of bytes dt = special_dtype(vlen=bytes) arr = np.zeros((5,), dtype=dt) From adafa903bf06af88211328660d8f7a06aa2da364 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 17 Mar 2026 16:17:39 +0100 Subject: [PATCH 118/129] check that db.setvalue has same rank as dataset --- src/h5json/hdf5db.py | 5 ++++- src/h5json/selections.py | 10 ++++++++++ test/unit/hdf5db_test.py | 4 ++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 82d1dd92..39d2cc5d 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -673,7 +673,7 @@ def init_arr(dtype, cpl): # done with NULL and SCALAR cases return arr - # simple daaset + # simple dataset arr = None fetch = True @@ -708,6 +708,7 @@ def init_arr(dtype, cpl): # apply the update to the array to be returned src_sel = selections.translate(update_sel, x_sel) tgt_sel = selections.translate(sel, x_sel) + arr[tgt_sel.slices] = update_val[src_sel.slices] return arr @@ -741,6 +742,8 @@ def setDatasetValues(self, dset_id, sel, arr): dims = getShapeDims(shape_json) if sel.shape != dims: raise ValueError("Selection shape does not match dataset shape") + if len(arr.shape) != len(dims): + raise TypeError("Expected ndarray with same rank as dataset") updates = self._getDatasetUpdates(dset_id) if sel.select_type == selections.H5S_SELECT_ALL: # for select all, throw out any existing updates since this will overwrite them diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 75b06913..cfa70769 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -252,6 +252,11 @@ def mshape(self): """ Shape of selection (always 1-D for this class) """ return (self.nselect,) + @property + def tgtshape(self): + """ shape of 
selection in rank of dataspace""" + return self.mshape + def getSelectNpoints(self): npoints = None if self._select_type == H5S_SELECT_NONE: @@ -388,6 +393,11 @@ class SimpleSelection(Selection): def mshape(self): """ Shape of current selection """ return self._mshape + + @property + def tgtshape(self): + """ shape of selection in rank of dataspace""" + return [self.count[dim] for dim in range(len(self._shape))] @property def start(self): diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 1fc13a0f..a1fb27d7 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -448,11 +448,11 @@ def testSimpleDataset(self): self.assertEqual(arr.shape, shape) self.assertEqual(arr.min(), 0) self.assertEqual(arr.max(), 0) - row = np.zeros((ncols,), dtype=dtype) + row = np.zeros((1, ncols,), dtype=dtype) # set values row by row for i in range(nrows): - row[:] = list(range(i * 10, (i + 1) * 10)) + row[0, :] = list(range(i * 10, (i + 1) * 10)) row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) db.setDatasetValues(dset_id, row_sel, row) From a7b16133b032ac3f109bf433c4272665332d4544 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 24 Mar 2026 19:14:21 +0100 Subject: [PATCH 119/129] raise error when attempting to serialize object arrays --- src/h5json/array_util.py | 3 +++ src/h5json/hdf5db.py | 2 +- src/h5json/hdf5dtype.py | 3 +-- src/h5json/selections.py | 2 +- test/unit/array_util_test.py | 9 +++++++++ test/unit/hdf5db_test.py | 38 ++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py index 56fc15be..575d1968 100644 --- a/src/h5json/array_util.py +++ b/src/h5json/array_util.py @@ -514,6 +514,9 @@ def arrayToBytes(arr, encoding=None): offset = copyElement(e, arr1d.dtype, buffer, offset) data = bytes(buffer) else: + if arr.dtype.kind == "O": + # object array, can't convert to bytes + raise TypeError("Object arrays with no vlen are 
not supported for arrayToBytes") # fixed length type data = arr.tobytes() diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 39d2cc5d..49837069 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -743,7 +743,7 @@ def setDatasetValues(self, dset_id, sel, arr): if sel.shape != dims: raise ValueError("Selection shape does not match dataset shape") if len(arr.shape) != len(dims): - raise TypeError("Expected ndarray with same rank as dataset") + arr = arr.reshape(sel.mshape) # reshape to match dataset rank updates = self._getDatasetUpdates(dset_id) if sel.select_type == selections.H5S_SELECT_ALL: # for select all, throw out any existing updates since this will overwrite them diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index defd09a2..570d396e 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -215,8 +215,7 @@ def check_dtype(**kwds): vlen = dtype If the dtype represents an HDF5 vlen, returns the Python base class. - Currently only builting string vlens (str) are supported. Returns - None if the dtype does not represent an HDF5 vlen. + Returns None if the dtype does not represent an HDF5 vlen. 
enum = dtype If the dtype represents an HDF5 enumerated type, returns the dictionary diff --git a/src/h5json/selections.py b/src/h5json/selections.py index cfa70769..04e2ddbe 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -393,7 +393,7 @@ class SimpleSelection(Selection): def mshape(self): """ Shape of current selection """ return self._mshape - + @property def tgtshape(self): """ shape of selection in rank of dataspace""" diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py index f6196168..c61baf9f 100644 --- a/test/unit/array_util_test.py +++ b/test/unit/array_util_test.py @@ -567,6 +567,15 @@ def testToBytes(self): arr_copy = bytesToArray(buffer, dt, (4,)) self.assertTrue(ndarray_compare(arr, arr_copy)) + # VLEN of generic object ndarray + arr = np.zeros((4,), dtype=object) + + try: + arrayToBytes(arr) + self.assertTrue(False) # expected type error + except TypeError: + pass # expected, object arrays not supported for arrayToBytes + # VLEN of strings dt = special_dtype(vlen=str) arr = np.zeros((5,), dtype=dt) diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index a1fb27d7..208af9df 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -600,6 +600,44 @@ def testVlenStringDataset(self): db.close() + def testVlenIntDataset(self): + nrows = 4 + shape = (nrows,) + dtype = special_dtype(vlen=np.int32) + + init_arr = np.empty((nrows,), dtype=dtype) + for i in range(nrows): + init_arr[i] = np.array(list(range(i, 2 * i + 1)), dtype=np.int32) + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + sel_all = selections.select(shape, ...) 
+ arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + + db.setDatasetValues(dset_id, sel_all, init_arr) + + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.dtype.kind, 'O') + self.assertTrue("vlen" in arr.dtype.metadata) + self.assertEqual(arr.dtype.metadata["vlen"], np.dtype(np.int32)) + for i in range(nrows): + e = arr[i] + self.assertTrue(isinstance(e, np.ndarray)) + self.assertEqual(e.dtype, np.int32) + self.assertTrue(np.array_equal(e, init_arr[i])) + + sel_one = selections.select(shape, slice(2, 3)) + arr = db.getDatasetValues(dset_id, sel_one) + self.assertEqual(arr.shape, (1,)) + self.assertTrue(np.array_equal(arr[0], init_arr[2])) + + db.close() + def testScalarDataset(self): dtype = np.int32 From fbd0688db9582513df016558ff1d65b42f821e33 Mon Sep 17 00:00:00 2001 From: John Readey Date: Thu, 2 Apr 2026 19:21:08 +0200 Subject: [PATCH 120/129] added additional hdf5db tests --- test/unit/hdf5db_test.py | 69 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 208af9df..6e64419e 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -199,6 +199,12 @@ def testScalarAttribute(self): self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + self.assertEqual(value.shape, ()) + self.assertEqual(value.dtype, np.int32) + self.assertEqual(value[()], 42) db.close() def testFixedStringAttribute(self): @@ -218,7 +224,10 @@ def testFixedStringAttribute(self): now = int(time.time()) self.assertTrue(item["created"] > now - 1) ret_value = db.getAttributeValue(root_id, "A1") - self.assertEqual(ret_value, value.encode("ascii")) + self.assertTrue(isinstance(ret_value, np.ndarray)) + 
self.assertEqual(ret_value.shape, ()) + self.assertEqual(ret_value.dtype, np.dtype("S13")) + self.assertEqual(ret_value[()], value.encode("ascii")) db.close() def testVlenAsciiAttribute(self): @@ -240,6 +249,13 @@ def testVlenAsciiAttribute(self): self.assertEqual(item_type["length"], "H5T_VARIABLE") self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") self.assertEqual(item["value"], "Hello, world!") + + ret_value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(ret_value, np.ndarray)) + self.assertEqual(ret_value.shape, ()) + self.assertEqual(ret_value.dtype, dt) + self.assertEqual(ret_value[()], value) + now = int(time.time()) self.assertTrue(item["created"] > now - 1) db.close() @@ -263,6 +279,13 @@ def testVlenUtf8Attribute(self): self.assertEqual(item_type["length"], "H5T_VARIABLE") self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") self.assertEqual(item["value"], "Hello, world!") + + ret_value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(ret_value, np.ndarray)) + self.assertEqual(ret_value.shape, ()) + self.assertEqual(ret_value.dtype, dt) + self.assertEqual(ret_value[()].encode(), value) + now = int(time.time()) self.assertTrue(item["created"] > now - 1) db.close() @@ -282,6 +305,50 @@ def testIntAttribute(self): item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I16LE") + + ret_value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(ret_value, np.ndarray)) + self.assertEqual(ret_value.shape, (len(value),)) + self.assertEqual(ret_value.dtype, np.int16) + for i in range(len(value)): + self.assertEqual(ret_value[i], value[i]) + + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + db.close() + + def testCompoundAttribute(self): + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dt_compound = np.dtype([("field1", "S8"), ("field2", np.int32)]) + value = [("hello", 42), ('', 0), ("world", 99),] + 
db.createAttribute(root_id, "A1", value, dtype=dt_compound) + item = db.getAttribute(root_id, "A1") + item_value = item['value'] + self.assertEqual(len(item_value), 3) + for i in range(3): + e = item_value[i] + # self.assertTrue(isinstance(e, tuple)) # TBD + self.assertEqual(tuple(e), value[i]) + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [3,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_COMPOUND") + + ret_value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(ret_value, np.ndarray)) + self.assertEqual(ret_value.shape, (3,)) + self.assertEqual(ret_value.dtype, dt_compound) + for i in range(3): + e = ret_value[i] + self.assertEqual((e[0].decode(), e[1]), value[i]) + + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() def testCreateReferenceAttribute(self): From 79f78224411be5de9ba668dcac1d8c95fa4368a6 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Apr 2026 18:59:30 +0200 Subject: [PATCH 121/129] add getPathsForObjectId method --- src/h5json/hdf5db.py | 56 ++++++++++++++++++++++++++- src/h5json/jsonstore/h5json_writer.py | 40 +------------------ test/unit/hdf5db_test.py | 9 +++++ 3 files changed, 65 insertions(+), 40 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 49837069..cf3d6a4c 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -426,6 +426,52 @@ def getObjectByPath(self, path): obj_json = self.getObjectById(obj_id) return obj_json + def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix=""): + """ Return list of paths for the given object id starting from parent_id if set, + otherwise the root_id """ + # TBD: this function will be rather slow for domains with a large number + # of objects (it will search through the complete heirarchy). 
+ + if parent_id is None: + parent_id = self.root_id + else: + parent_id = getHashTagForId(parent_id) + + obj_json = self.getObjectById(parent_id) + if obj_json is None: + self.log.warning("getPathsForObjectId - parent_id not found") + raise KeyError("parent_id: {parent_id} not found") + + paths = [] + obj_id = getHashTagForId(obj_id) + searched_ids = set(obj_id) + + if parent_id == obj_id: + paths.append(path_prefix if path_prefix else "/") + + if 'links' in obj_json: + links = obj_json['links'] + for link_name in links: + link_tgt = links[link_name] + link_class = link_tgt['class'] + if link_class == 'H5L_TYPE_HARD': + # hard link + tgt_obj_id = link_tgt['id'] + if tgt_obj_id in searched_ids: + self.log.warning(f"circular reference using path: {path_prefix}/{link_name}") + continue + searched_ids.add(tgt_obj_id) + kwargs = {"parent_id": tgt_obj_id, "path_prefix": path_prefix + "/" + link_name} + paths.extend(self.getPathsForObjectId(obj_id, **kwargs)) + elif link_class == 'H5L_TYPE_SOFT': + self.log.warning("getPathsForObjectId can't follow soft links") + elif link_class == 'H5L_TYPE_EXTERNAL': + self.log.warning("getPathsForObjectId can't follow external links") + else: + self.log.error(f"link type: {link_class} not supported") + + return paths + def getDtype(self, obj_json): """ Return numpy data type for given dataset, datatype, or attribute """ @@ -535,7 +581,15 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): else: dtype = np.dtype(dtype) else: - value = np.asarray(value, dtype=dtype, order='C') + try: + value = np.asarray(value, dtype=dtype, order='C') + except ValueError: + # some special cases for compound and vlen types are handled + # by jsonToArray... 
+ if shape is None or dtype is None: + raise + print(f"calling jsonToArray for shape: {shape} dtype: {dtype} value: {value}") + value = jsonToArray(shape, dtype, value) if dtype is None: dtype = value.dtype else: diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py index f97df007..756ef578 100644 --- a/src/h5json/jsonstore/h5json_writer.py +++ b/src/h5json/jsonstore/h5json_writer.py @@ -39,7 +39,6 @@ def __init__( super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) if append: raise ValueError("H5JsonWriter does not support append mode") - self.alias_db = {} self.json = {} self._data_limit = data_limit self._root_id = None @@ -83,43 +82,8 @@ def isClosed(self): def getAliasList(self, obj_id): """ return list of alias """ - if obj_id not in self.alias_db: - self.alias_db[obj_id] = [] - return self.alias_db[obj_id] - - def updateAliasList(self): - """ update the alias list for each object """ - # clear exiting aliases - obj_ids = self.db.getCollection() - for obj_id in obj_ids: - self.alias_db[obj_id] = [] - - self._setAlias(self._root_uuid, set(), "/") - - def _setAlias(self, obj_id, id_set, h5path): - """ add the given h5path to the object's alias list - If the object is a group, recurse through each hard link """ - obj_json = self.db.getObjectById(obj_id) - alias_list = self.getAliasList(obj_id) - if h5path in alias_list: - return # nothing to do - alias_list.append(h5path) - if getCollectionForId(obj_id) != "groups": - return # done - id_set.add(obj_id) # keep track of objects we've visited to avoid loops - links = obj_json["links"] - if h5path[-1] != '/': - h5path += '/' - for link_name in links: - link_json = links[link_name] - if link_json["class"] == "H5L_TYPE_HARD": - tgt_id = link_json["id"] - if tgt_id in id_set: - self.log.info("_setAlias - circular loop found") - else: - self._setAlias(tgt_id, id_set, f"{h5path}{link_name}") - id_set.remove(obj_id) + return 
self.db.getPathsForObjectId(obj_id) def dumpAttribute(self, obj_id, attr_name): self.log.info(f"dumpAttribute: [{attr_name}]") @@ -299,8 +263,6 @@ def dumpFile(self): self.json["apiVersion"] = db_version_info["hdf5-json-version"] self.json["root"] = getUuidFromId(self._root_uuid) - self.updateAliasList() # create alias_db with obj_id to alias list dict - self.dumpGroups() self.dumpDatasets() diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 6e64419e..bccf7a52 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -66,11 +66,18 @@ def testGroup(self): db = Hdf5db(app_logger=self.log) root_id = db.open() + paths = db.getPathsForObjectId(root_id) + self.assertEqual(paths, ["/"]) + g1_id = db.createGroup() self.assertTrue(isSchema2Id(g1_id)) self.assertFalse(isRootObjId(g1_id)) self.assertTrue(isValidUuid(g1_id, obj_class="groups")) + paths = db.getPathsForObjectId(g1_id) + self.assertEqual(paths, []) db.createHardLink(root_id, "g1", g1_id) + paths = db.getPathsForObjectId(g1_id) + self.assertEqual(paths, ["/g1"]) g2_id = db.createGroup() self.assertTrue(isSchema2Id(g2_id)) @@ -90,6 +97,8 @@ def testGroup(self): self.assertFalse(isRootObjId(g1_1_id)) self.assertTrue(isValidUuid(g1_1_id, obj_class="groups")) db.createHardLink(g1_id, "g1.1", g1_1_id) + paths = db.getPathsForObjectId(g1_1_id) + self.assertEqual(paths, ["/g1/g1.1"]) self.assertEqual(db.getObjectIdByPath("g1"), g1_id) self.assertEqual(db.getObjectIdByPath("/g1"), g1_id) From f5c27f4240ed923a0b63649999348c86b9043f92 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Apr 2026 19:42:18 +0200 Subject: [PATCH 122/129] fix for circular links --- src/h5json/hdf5db.py | 23 +++++++++++------------ test/unit/hdf5db_test.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index cf3d6a4c..2a512804 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -375,7 +375,6 @@ 
def getObjectIdByPath(self, h5path, parent_id=None): raise KeyError("parent_id: {parent_id} not found") obj_id = parent_id - searched_ids = set(obj_id) link_names = h5path.split('/') self.log.debug(f"link_names: {link_names}") @@ -403,11 +402,7 @@ def getObjectIdByPath(self, h5path, parent_id=None): if link_class == 'H5L_TYPE_HARD': # hard link obj_id = link_tgt['id'] - if obj_id in searched_ids: - self.log.warning(f"circular reference using path: {h5path}") - raise KeyError(h5path) obj_json = self.getObjectById(obj_id) - searched_ids.add(obj_id) elif link_class == 'H5L_TYPE_SOFT': self.log.warning("getObjectIdByPath can't follow soft links") elif link_class == 'H5L_TYPE_EXTERNAL': @@ -426,7 +421,7 @@ def getObjectByPath(self, path): obj_json = self.getObjectById(obj_id) return obj_json - def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix=""): + def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix="", _visited=None): """ Return list of paths for the given object id starting from parent_id if set, otherwise the root_id """ # TBD: this function will be rather slow for domains with a large number @@ -437,6 +432,14 @@ def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix=""): else: parent_id = getHashTagForId(parent_id) + if _visited is None: + _visited = set() + + if parent_id in _visited: + self.log.warning(f"circular reference detected at path: {path_prefix}") + return [] + _visited.add(parent_id) + obj_json = self.getObjectById(parent_id) if obj_json is None: self.log.warning("getPathsForObjectId - parent_id not found") @@ -444,7 +447,6 @@ def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix=""): paths = [] obj_id = getHashTagForId(obj_id) - searched_ids = set(obj_id) if parent_id == obj_id: paths.append(path_prefix if path_prefix else "/") @@ -457,11 +459,8 @@ def getPathsForObjectId(self, obj_id, parent_id=None, path_prefix=""): if link_class == 'H5L_TYPE_HARD': # hard link tgt_obj_id = link_tgt['id'] - if 
tgt_obj_id in searched_ids: - self.log.warning(f"circular reference using path: {path_prefix}/{link_name}") - continue - searched_ids.add(tgt_obj_id) - kwargs = {"parent_id": tgt_obj_id, "path_prefix": path_prefix + "/" + link_name} + kwargs = {"parent_id": tgt_obj_id, "_visited": _visited} + kwargs["path_prefix"] = path_prefix + "/" + link_name paths.extend(self.getPathsForObjectId(obj_id, **kwargs)) elif link_class == 'H5L_TYPE_SOFT': self.log.warning("getPathsForObjectId can't follow soft links") diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index bccf7a52..7220e5f6 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -168,7 +168,41 @@ def testGroup(self): db.deleteAttribute(g1_id, "a1") self.assertEqual(len(db.getAttributes(g1_id)), 1) self.assertEqual(db.getAttribute(g1_id, "a1"), None) + db.close() + def testCircularLinks(self): + db = Hdf5db(app_logger=self.log) + root_id = db.open() + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(g1_id, "g2", g2_id) + # create circular link + db.createHardLink(g2_id, "g1", g1_id) + + g1_json = db.getObjectById(g1_id) + self.assertTrue("links" in g1_json) + g1_links = g1_json["links"] + self.assertTrue("g2" in g1_links) + self.assertEqual(len(g1_links), 1) + + g2_json = db.getObjectById(g2_id) + self.assertTrue("links" in g2_json) + g2_links = g2_json["links"] + self.assertTrue("g1" in g2_links) + self.assertEqual(len(g2_links), 1) + + paths = db.getPathsForObjectId(g2_id) + # only the canonical path is returned + self.assertEqual(paths, ["/g1/g2"]) + grp_id = db.getObjectIdByPath("/g1/g2") + self.assertEqual(grp_id, g2_id) + # you can still get objects via circular paths... 
+ grp_id = db.getObjectIdByPath("/g1/g2/g1") + self.assertEqual(grp_id, g1_id) + grp_id = db.getObjectIdByPath("/g1/g2/g1/g2") + self.assertEqual(grp_id, g2_id) + db.close() def testNullSpaceAttribute(self): From 59fea055672defb3739ecd5fc46bf56aa80e48bf Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Apr 2026 19:44:37 +0200 Subject: [PATCH 123/129] fix flake8 error --- test/unit/hdf5db_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 7220e5f6..446b1dda 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -202,7 +202,7 @@ def testCircularLinks(self): self.assertEqual(grp_id, g1_id) grp_id = db.getObjectIdByPath("/g1/g2/g1/g2") self.assertEqual(grp_id, g2_id) - + db.close() def testNullSpaceAttribute(self): From 70a1d8d0784f598ddc5499430581f3c4ab2e5074 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 7 Apr 2026 20:50:53 +0200 Subject: [PATCH 124/129] remove debug print --- src/h5json/hdf5db.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 2a512804..3723ff07 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -587,7 +587,6 @@ def createAttribute(self, obj_id, name, value, shape=None, dtype=None): # by jsonToArray... 
if shape is None or dtype is None: raise - print(f"calling jsonToArray for shape: {shape} dtype: {dtype} value: {value}") value = jsonToArray(shape, dtype, value) if dtype is None: dtype = value.dtype From c25dc3a51bc5daf5cafb70a1f381b9fdec88a728 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 13 Apr 2026 19:38:50 +0200 Subject: [PATCH 125/129] allow / for Reference constructor --- src/h5json/hdf5dtype.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py index 570d396e..7ee65462 100644 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -43,6 +43,11 @@ def __init__(self, bind): if not isinstance(bind, str): raise TypeError("Expected string id") + if bind.find('/') != -1: + parts = bind.split('/') + if parts[0] not in ("groups", "datasets", "datatypes"): + raise TypeError("Expected id to start with 'groups/', 'datasets/' or 'datatypes/'") + bind = parts[1] self._id = getHashTagForId(bind) def __repr__(self): From dfc0224c051d9698a11afdb5f423397c68cc4978 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 12 May 2026 18:00:00 +0200 Subject: [PATCH 126/129] updates for point selection --- src/h5json/h5pystore/h5py_reader.py | 9 +- src/h5json/hdf5db.py | 100 +++++++---- src/h5json/jsonstore/h5json_reader.py | 2 +- src/h5json/selections.py | 235 ++++++++++++++++++-------- test/unit/h5py_reader_test.py | 8 + test/unit/h5py_writer_test.py | 5 +- test/unit/hdf5db_test.py | 23 ++- 7 files changed, 273 insertions(+), 109 deletions(-) diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py index e0d5d825..23684ab3 100644 --- a/src/h5json/h5pystore/h5py_reader.py +++ b/src/h5json/h5pystore/h5py_reader.py @@ -548,10 +548,17 @@ def getDatasetValues(self, dset_id, sel=None, dtype=None): if isOpaqueDtype(dset.dtype): # TBD: Opaque data not supported yet return None - if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + if sel is None or sel.select_type == 
selections.H5S_SEL_ALL: arr = dset[...] elif isinstance(sel, selections.SimpleSelection): arr = dset[sel.slices] + elif isinstance(sel, selections.PointSelection): + # h5py has no native point-selection API, so read each point individually. + # sel.points rows are numpy arrays; wrap each in a tuple so h5py + # interprets it as a multi-dimensional index rather than fancy indexing. + arr = np.zeros((sel.nselect,), dtype=dset.dtype) + for i, pt in enumerate(selections._iter_points(sel)): + arr[i] = dset[pt] else: raise NotImplementedError("selection type not supported") diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 3723ff07..e65b877d 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -672,7 +672,10 @@ def getDatasetValues(self, dset_id, sel): def init_arr(dtype, cpl): """ create an ndarray with the give shape, dtype and fill_value (if the latter is found in the creation properties list) """ - arr_shape = sel.count if isinstance(sel.count, tuple) else (sel.count, ) + if hasattr(sel, "count"): + arr_shape = sel.count if isinstance(sel.count, tuple) else (sel.count, ) + else: + arr_shape = (sel.nselect,) arr = np.zeros(arr_shape, dtype=dtype) if "fillValue" in cpl: fillValue = cpl["fillValue"] @@ -707,7 +710,7 @@ def init_arr(dtype, cpl): raise ValueError("Selection shape does not match dataset shape") if shape_class == "H5S_SCALAR": - if sel.select_type != selections.H5S_SELECT_ALL: + if sel.select_type != selections.H5S_SEL_ALL: # TBD: support other selection types raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") if sel.shape != (): @@ -751,6 +754,7 @@ def init_arr(dtype, cpl): arr = init_arr(dtype, cpl) # apply any updates that impact this selection + for (update_sel, update_val) in updates: # get the part of the update that is in common with the requested selection x_sel = selections.intersect(sel, update_sel) @@ -758,10 +762,20 @@ def init_arr(dtype, cpl): # this update doesn't effect the selection, so 
ignore continue # apply the update to the array to be returned - src_sel = selections.translate(update_sel, x_sel) - tgt_sel = selections.translate(sel, x_sel) - - arr[tgt_sel.slices] = update_val[src_sel.slices] + if sel.select_type == selections.H5S_SEL_POINTS: + # For point selections apply each intersecting point individually. + # arr is 1-D with one entry per selected point; map each intersection + # point back to its position in sel and its offset in update_val. + rank = len(sel.shape) + sel_pts = list(selections._iter_points(sel)) + for pt in selections._iter_points(x_sel): + tgt_idx = sel_pts.index(pt) + src_coords = tuple(pt[d] - update_sel.start[d] for d in range(rank)) + arr[tgt_idx] = update_val[src_coords] + else: + src_sel = selections.translate(update_sel, x_sel) + tgt_sel = selections.translate(sel, x_sel) + arr[tgt_sel.slices] = update_val[src_sel.slices] return arr @@ -769,47 +783,63 @@ def setDatasetValues(self, dset_id, sel, arr): """ Write the given ndarray to the dataset using the selection """ - dset_json = self.getObjectById(dset_id) - shape_json = dset_json["shape"] + if not isinstance(sel, selections.Selection): raise TypeError("Expected Selection class") - if sel.select_type not in (selections.H5S_SELECT_HYPERSLABS, selections.H5S_SELECT_ALL): - # TBD: support other selection types - raise ValueError("Only hyperslab selections are currently supported") - if not isinstance(arr, np.ndarray): - raise TypeError("Expected ndarray for data value") - tgt_dt = self.getDtype(dset_json) - src_dt = arr.dtype - if src_dt != tgt_dt: - raise TypeError("arr.dtype doesn't match dataset dtype") + + dset_json = self.getObjectById(dset_id) + shape_json = dset_json["shape"] + shape_class = getShapeClass(shape_json) if shape_class == "H5S_NULL": raise ValueError("writing to null space dataset not supported") + + updates = self._getDatasetUpdates(dset_id) + if shape_class == "H5S_SCALAR": + if sel.select_type != selections.H5S_SEL_ALL: + # TBD: support other 
selection types + raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") if sel.shape != (): raise ValueError("Selection shape does not match dataset shape") - if len(arr.shape) > 0: - raise TypeError("Expected scalar ndarray for scalar dataset") - else: + + if arr.shape != (): + raise ValueError("Expected scalar array for scalar dataset") + + if not isinstance(arr, np.ndarray): + raise TypeError("Expected ndarray for data value") + + tgt_dt = self.getDtype(dset_json) + src_dt = arr.dtype + if src_dt != tgt_dt: + raise TypeError("arr.dtype doesn't match dataset dtype") + + if sel.select_type == selections.H5S_SEL_POINTS: + if sel.nselect != arr.shape[0]: + raise TypeError("Selection shape does not match number of points") + elif sel.select_type == selections.H5S_SEL_ALL: + if sel.shape != getShapeDims(shape_json): + raise TypeError("Selection shape does not match dataset shape") + elif sel.select_type == selections.H5S_SEL_HYPERSLABS: dims = getShapeDims(shape_json) if sel.shape != dims: - raise ValueError("Selection shape does not match dataset shape") + raise TypeError("Selection shape does not match dataset shape") if len(arr.shape) != len(dims): - arr = arr.reshape(sel.mshape) # reshape to match dataset rank - updates = self._getDatasetUpdates(dset_id) - if sel.select_type == selections.H5S_SELECT_ALL: + raise TypeError("Array shape does not match dataset shape") + try: + sel.broadcast(arr.shape) + except TypeError: + # selection can't be broadcast to array shape + raise + else: + raise TypeError("Unsupported selection type") + + if sel.select_type == selections.H5S_SEL_ALL or sel.shape == sel.mshape: # for select all, throw out any existing updates since this will overwrite them updates.clear() - arr = arr.copy() # make a copy in case the client updates it later - rank = len(sel.shape) - if len(arr.shape) < rank: - # reshape to keep compatiblity with dataset rank - if sel.select_type == selections.H5S_SELECT_ALL: - # this should not 
result in a dimension reduction - raise ValueError("unexpected selection shape") - if sel.select_type != selections.H5S_SELECT_HYPERSLABS: - raise ValueError("tbd") - arr = arr.reshape(sel.mshape) + + # make a copy in case the client updates it later + arr = arr.copy() updates.append((sel, arr)) self.make_dirty(dset_id) @@ -833,7 +863,7 @@ def resizeDataset(self, dset_id, shape): updates = self._getDatasetUpdates(dset_id) for i in range(len(updates)): (sel_update, arr) = updates[i] - if sel_update.select_type == selections.H5S_SELECT_HYPERSLABS: + if sel_update.select_type == selections.H5S_SEL_HYPERSLABS: slices = list(sel_update.slices) for dim in range(rank): s = slices[dim] diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py index b64a3d1d..2196eb10 100644 --- a/src/h5json/jsonstore/h5json_reader.py +++ b/src/h5json/jsonstore/h5json_reader.py @@ -208,7 +208,7 @@ def getDatasetValues(self, obj_id, sel=None, dtype=None): dims = shape_json["dims"] arr = jsonToArray(dims, dtype, json_value) - if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + if sel is None or sel.select_type == selections.H5S_SEL_ALL: pass # just return the entire array elif isinstance(sel, selections.SimpleSelection): arr = arr[sel.slices] diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 04e2ddbe..1ce10c29 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -21,16 +21,22 @@ import numpy as np -H5S_SEL_POINTS = 0 + +# Selection types +H5S_SEL_NONE = 0 +H5S_SEL_POINTS = 1 +H5S_SEL_HYPERSLABS = 2 +H5S_SEL_ALL = 3 +H5S_SEL_FANCY = 4 + + +# Boolean selection operations H5S_SELECT_SET = 1 H5S_SELECT_APPEND = 2 H5S_SELECT_PREPEND = 3 H5S_SELECT_OR = 4 H5S_SELECT_NONE = 5 -H5S_SELECT_ALL = 6 -H5S_SELECT_HYPERSLABS = 7 -H5S_SELECT_NOTB = 8 -H5S_SELLECT_FANCY = 9 +H5S_SELECT_NOTB = 6 def select(obj, args): @@ -73,14 +79,18 @@ def select(obj, args): if len(args) == 1: arg = args[0] + if hasattr(arg, "shape"): + 
obj_shape = obj.shape + else: + obj_shape = obj if isinstance(arg, Selection): - if arg.shape != obj.shape: + if arg.shape != obj_shape: raise TypeError("Mismatched selection shape") return arg elif isinstance(arg, np.ndarray) or isinstance(arg, list): - sel = PointSelection(obj.shape) + sel = PointSelection(obj_shape) sel[arg] return sel """ @@ -119,37 +129,119 @@ def select(obj, args): def _check_bool_args(s1, s2): """ verify argument for boolean operations """ # TBD: this is currently only working for simple selections with stride 1 - valid_select_types = (H5S_SELECT_HYPERSLABS, H5S_SELECT_ALL) + valid_s1_types = (H5S_SEL_HYPERSLABS, H5S_SEL_ALL) + valid_s2_types = (H5S_SEL_HYPERSLABS, H5S_SEL_POINTS, H5S_SEL_ALL) + if not isinstance(s1, Selection): raise TypeError("Expected selection type for first arg") if not isinstance(s2, Selection): raise TypeError("Expected selection type for second arg") - if s1.select_type not in valid_select_types: + if s1.select_type not in valid_s1_types: raise TypeError("Expected hyperslab selection for first arg") - if s2.select_type not in valid_select_types: + if s2.select_type not in valid_s2_types: raise TypeError("Expected hyperslab selection for second arg") if s1.shape != s2.shape: raise ValueError("selections have incompatible shapes") -def intersect(s1, s2): - """ Return the intersection of two selections """ - # TBD: this is currently only working for simple selections with stride 1 - _check_bool_args(s1, s2) +def _iter_points(point_sel): + """Yield each point in a PointSelection as a tuple of ints.""" + pts = point_sel.points + rank = len(point_sel.shape) + pts_arr = np.asarray(pts) + + if pts_arr.size == 0: + return + + if pts_arr.ndim == 1: + if rank == 1: + # Each scalar element is a coordinate in 1-D space + for p in pts_arr: + yield (int(p),) + else: + # Single point in rank-N space stored as a flat array [c0, c1, ..., c_{N-1}] + yield tuple(int(x) for x in pts_arr) + else: + # Shape (N, rank): each row is one point 
+ for row in pts_arr: + yield tuple(int(x) for x in row) + + +def _filter_points_by_hyperslab(point_sel, hyper_sel): + """Return a PointSelection of points from point_sel that lie within hyper_sel.""" + start = hyper_sel.start + count = hyper_sel.count + step = hyper_sel.step + rank = len(point_sel.shape) + + result_pts = [] + for pt in _iter_points(point_sel): + if all( + start[d] <= pt[d] < start[d] + count[d] * step[d] and (pt[d] - start[d]) % step[d] == 0 + for d in range(rank) + ): + result_pts.append(pt) + + result = PointSelection(point_sel.shape) + if rank == 1: + result.set([p[0] for p in result_pts] if result_pts else []) + else: + result.set(result_pts if result_pts else []) + return result + + +def _intersect_points_points(s1, s2): + """Return a PointSelection of points common to both s1 and s2.""" + common = sorted(set(_iter_points(s1)) & set(_iter_points(s2))) - slices = [] rank = len(s1.shape) - for dim in range(rank): - start = max(s1.start[dim], s2.start[dim]) - stop = min(s1.start[dim] + s1.count[dim], s2.start[dim] + s2.count[dim]) - if s1.step[dim] > 1 or s2.step[dim] > 1: - raise ValueError("stepped slices not currently supported") - if start > stop: - stop = start - slices.append(slice(start, stop, 1)) - slices = tuple(slices) + result = PointSelection(s1.shape) + if rank == 1: + result.set([p[0] for p in common] if common else []) + else: + result.set(common if common else []) + return result - return select(s1.shape, slices) + +def intersect(s1, s2): + """ Return the intersection of two selections. + + Supports hyperslab/hyperslab, hyperslab/point, and point/point combinations. 
+ """ + if not isinstance(s1, Selection): + raise TypeError("Expected selection type for first arg") + if not isinstance(s2, Selection): + raise TypeError("Expected selection type for second arg") + if s1.shape != s2.shape: + raise ValueError("selections have incompatible shapes") + + t1 = s1.select_type + t2 = s2.select_type + hyperslab_types = (H5S_SEL_HYPERSLABS, H5S_SEL_ALL) + + if t1 in hyperslab_types and t2 in hyperslab_types: + slices = [] + rank = len(s1.shape) + for dim in range(rank): + start = max(s1.start[dim], s2.start[dim]) + stop = min(s1.start[dim] + s1.count[dim], s2.start[dim] + s2.count[dim]) + if s1.step[dim] > 1 or s2.step[dim] > 1: + raise ValueError("stepped slices not currently supported") + if start > stop: + stop = start + slices.append(slice(start, stop, 1)) + return select(s1.shape, tuple(slices)) + + if t1 == H5S_SEL_POINTS and t2 in hyperslab_types: + return _filter_points_by_hyperslab(s1, s2) + + if t1 in hyperslab_types and t2 == H5S_SEL_POINTS: + return _filter_points_by_hyperslab(s2, s1) + + if t1 == H5S_SEL_POINTS and t2 == H5S_SEL_POINTS: + return _intersect_points_points(s1, s2) + + raise TypeError(f"Unsupported selection types for intersection: {t1}, {t2}") def contained(s1, s2): @@ -177,7 +269,7 @@ def contained(s1, s2): def translate(s1, s2): """ Given two selections, s1 and s2, return a new selection - definied by s2 relative to s1's stat and count. + definied by s2 relative to s1's start and count. 
s2 must be contained in s1 """ _check_bool_args(s1, s2) @@ -186,14 +278,25 @@ def translate(s1, s2): raise ValueError("translate - selections not overlapping") rank = len(s1.shape) - - slices = [] - for dim in range(rank): - start = s2.start[dim] - s1.start[dim] - count = s2.count[dim] - slices.append(slice(start, start + count, 1)) - slices = tuple(slices) - return select(s1.shape, slices) + args = [] + if s2.select_type == H5S_SEL_POINTS: + points = [] + for pt in _iter_points(sel_inter): + for d in range(rank): + if pt[d] < s1.start[d] or pt[d] >= s1.start[d] + s1.count[d]: + continue + points.append(tuple(pt[d] - s1.start[d] for d in range(rank))) + if len(points) == 0: + raise ValueError("translate - selections not overlapping") + args.append(points) + elif s2.select_type == H5S_SEL_HYPERSLABS: + for dim in range(rank): + start = s2.start[dim] - s1.start[dim] + count = s2.count[dim] + args.append(slice(start, start + count, 1)) + else: + raise TypeError("translate - unsupported selection type for s2") + return select(s1.shape, tuple(args)) class Selection(object): @@ -229,7 +332,7 @@ def __init__(self, shape, *args, **kwds): shape = tuple(shape) self._shape = shape - self._select_type = H5S_SELECT_ALL + self._select_type = H5S_SEL_ALL @property def select_type(self): @@ -259,9 +362,9 @@ def tgtshape(self): def getSelectNpoints(self): npoints = None - if self._select_type == H5S_SELECT_NONE: + if self._select_type == H5S_SEL_NONE: npoints = 0 - elif self._select_type == H5S_SELECT_ALL: + elif self._select_type == H5S_SEL_ALL: dims = self._shape npoints = 1 for nextent in dims: @@ -294,6 +397,7 @@ def __init__(self, shape, *args, **kwds): """ Create a Point selection. 
""" Selection.__init__(self, shape, *args, **kwds) self._points = [] + self._select_type = H5S_SEL_POINTS @property def points(self): @@ -302,9 +406,9 @@ def points(self): def getSelectNpoints(self): npoints = None - if self._select_type == H5S_SELECT_NONE: + if self._select_type == H5S_SEL_NONE: npoints = 0 - elif self._select_type == H5S_SELECT_ALL: + elif self._select_type == H5S_SEL_ALL: dims = self._shape npoints = 1 for nextent in dims: @@ -343,8 +447,6 @@ def _perform_selection(self, points, op): else: raise ValueError("Unsupported operation") - # def _perform_list_selection(points, H5S_SELECT_SET): - def __getitem__(self, arg): """ Perform point-wise selection from a NumPy boolean array """ if isinstance(arg, list): @@ -416,7 +518,7 @@ def __init__(self, shape, *args, **kwds): rank = len(self._shape) self._sel = ((0,) * rank, self._shape, (1,) * rank, (False,) * rank) self._mshape = self._shape - self._select_type = H5S_SELECT_ALL + self._select_type = H5S_SEL_ALL def __getitem__(self, args): @@ -426,13 +528,13 @@ def __getitem__(self, args): if self._shape == (): if len(args) > 0 and args[0] not in (Ellipsis, ()): raise TypeError("Invalid index for scalar dataset (only ..., () allowed)") - self._select_type = H5S_SELECT_ALL + self._select_type = H5S_SEL_ALL return self start, count, step, scalar = _handle_simple(self._shape, args) self._sel = (start, count, step, scalar) - self._select_type = H5S_SELECT_HYPERSLABS + self._select_type = H5S_SEL_HYPERSLABS self._mshape = tuple(x for x, y in zip(count, scalar) if not y) @@ -442,14 +544,14 @@ def getSelectNpoints(self): """Return number of elements in current selection """ npoints = None - if self._select_type == H5S_SELECT_NONE: + if self._select_type == H5S_SEL_NONE: npoints = 0 - elif self._select_type == H5S_SELECT_ALL: + elif self._select_type == H5S_SEL_ALL: dims = self._shape npoints = 1 for nextent in dims: npoints *= nextent - elif self._select_type == H5S_SELECT_HYPERSLABS: + elif self._select_type 
== H5S_SEL_HYPERSLABS: dims = self._shape npoints = 1 rank = len(dims) @@ -490,8 +592,7 @@ def broadcast(self, target_shape): if self._shape == (): if np.product(target_shape) != 1: raise TypeError(f"Can't broadcast {target_shape} to scalar") - self._id.select_all() - yield self._id + yield self._sel return start, count, step, scalar = self._sel @@ -513,17 +614,18 @@ def broadcast(self, target_shape): tshape = tuple(tshape) chunks = tuple(x // y for x, y in zip(count, tshape)) - nchunks = int(np.product(chunks)) + nchunks = int(np.prod(chunks)) if nchunks == 1: - yield self._id + yield self._sel else: - sid = self._id.copy() - sid.select_hyperslab((0,) * rank, tshape, step) for idx in range(nchunks): - offset = tuple(x * y * z + s for x, y, z, s in zip(np.unravel_index(idx, chunks), tshape, step, start)) - sid.offset_simple(offset) - yield sid + offset = [] + for x, y, z, s in zip(np.unravel_index(idx, chunks), tshape, step, start): + offset.append(int(x * y * z + s)) + offset = tuple(offset) + sel = [tuple([sum(x) for x in zip(offset, start)]), tshape, step, scalar] + yield sel @property def slices(self): @@ -567,6 +669,7 @@ def mshape(self): def __init__(self, shape, *args, **kwds): Selection.__init__(self, shape, *args, **kwds) self._slices = [] + self._select_type = H5S_SEL_FANCY def __getitem__(self, args): @@ -574,7 +677,7 @@ def __getitem__(self, args): args = (args,) args = _expand_ellipsis(args, len(self._shape)) - select_type = H5S_SELECT_HYPERSLABS # will adjust if we have a coord + select_type = H5S_SEL_HYPERSLABS # will adjust if we have a coord # Create list of slices and/or coordinates slices = [] @@ -611,7 +714,7 @@ def __getitem__(self, args): if sorted(arg) != list(arg): raise TypeError("Indexing elements must be in increasing order") mshape.append(len(arg)) - select_type = H5S_SELLECT_FANCY + select_type = H5S_SEL_FANCY elif isinstance(arg, list) or hasattr(arg, 'dtype'): # coordinate selection slices.append(arg) @@ -627,7 +730,7 @@ def 
__getitem__(self, args): # this shouldn't happen since HSDS would have thrown an error raise ValueError("coordinate num element missmatch") mshape.append(len(arg)) - select_type = H5S_SELLECT_FANCY + select_type = H5S_SEL_FANCY elif isinstance(arg, int): if arg < 0 or arg >= length: raise IndexError(f"Index ({arg}) out of range (0-{length - 1})") @@ -804,9 +907,9 @@ def guess_shape(sid): elif sel_class == 'H5S_SCALAR': # NumPy has no way of expressing empty 0-rank selections, so we use None - if sel_type == H5S_SELECT_NONE: + if sel_type == H5S_SEL_NONE: return None - if sel_type == H5S_SELECT_ALL: + if sel_type == H5S_SEL_ALL: return tuple() elif sel_class != 'H5S_SIMPLE': @@ -817,10 +920,10 @@ def guess_shape(sid): N = sid.get_select_npoints() rank = len(sid.shape) - if sel_type == H5S_SELECT_NONE: + if sel_type == H5S_SEL_NONE: return (0,) * rank - elif sel_type == H5S_SELECT_ALL: + elif sel_type == H5S_SEL_ALL: return sid.shape elif sel_type == H5S_SEL_POINTS: @@ -828,7 +931,7 @@ def guess_shape(sid): # the dataspace rank return (N,) - elif sel_type != H5S_SELECT_HYPERSLABS: + elif sel_type != H5S_SEL_HYPERSLABS: raise TypeError(f"Unrecognized selection method {sel_type}") # We have a hyperslab-based selection @@ -895,9 +998,9 @@ def __init__(self, shape, *args, **kwds): arg = args[0] if arg == (): self._mshape = None - self._select_type = H5S_SELECT_ALL + self._select_type = H5S_SEL_ALL elif arg == (Ellipsis,): self._mshape = () - self._select_type = H5S_SELECT_ALL + self._select_type = H5S_SEL_ALL else: raise ValueError("Illegal slicing argument for scalar dataspace") diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py index 74108313..baebcf23 100644 --- a/test/unit/h5py_reader_test.py +++ b/test/unit/h5py_reader_test.py @@ -92,6 +92,14 @@ def testSimple(self): self.assertEqual(arr.shape, (1, 10)) self.assertEqual(list(arr[0]), list(range(0, 40, 4))) + # do a point selection; dset1.1.1[i,j] = i*j, so diagonals are i*i + sel = 
selections.select(dims, [(0, 0), (1, 1), (2, 2), (3, 3)]) + arr = db.getDatasetValues(dset111_id, sel) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (4,)) + for i in range(4): + self.assertEqual(arr[i], i * i) + # try adding an attribute db.createAttribute(dset111_id, "attr3", value=42) dset_json = db.getObjectById(dset111_id) diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 36c1dbd9..4c2513b6 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -201,8 +201,8 @@ def testSimple(self): db.open() sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) - arr = np.zeros((), dtype=np.int32) - arr[()] = 42 + arr = np.zeros((1, 1), dtype=np.int32) + arr[0, 0] = 42 db.setDatasetValues(dset_111_id, sel, arr) db.close() @@ -726,6 +726,7 @@ def testReaderWithUpdate(self): db.open() arr = np.asarray(range(10), dtype=np.int32) + arr = arr.reshape(1, 10) sel = selections.select((10, 10), (slice(5, 6), slice(0, 10))) db.setDatasetValues(dset_id, sel, arr) db.close() diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 446b1dda..57074866 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -599,12 +599,24 @@ def testSimpleDataset(self): self.assertEqual(val.shape, (1, 1)) self.assertEqual(val[0, 0], i * 10 + j) + # do a point selection + sel = selections.select(shape, [(0, 0), (1, 1), (2, 2), (3, 3)]) + val = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(val, np.ndarray)) + self.assertEqual(val.shape, (4,)) + for i in range(4): + self.assertEqual(val[i], i * 10 + i) + + # point selection write + arr = np.zeros((4,), dtype=dtype) + db.setDatasetValues(dset_id, sel, arr) + # test select all write - sel = selections.select(shape, ...) + sel_all = selections.select(shape, ...) arr = np.zeros(shape, dtype=dtype) arr[...] 
= 42 - db.setDatasetValues(dset_id, sel, arr) - arr = db.getDatasetValues(dset_id, sel) + db.setDatasetValues(dset_id, sel_all, arr) + arr = db.getDatasetValues(dset_id, sel_all) for i in range(nrows): for j in range(ncols): self.assertEqual(arr[i, j], 42) @@ -612,7 +624,10 @@ def testSimpleDataset(self): # try with broadcasting arr_one_value = np.zeros((1, 1), dtype=dtype) arr_one_value[0, 0] = 7 - db.setDatasetValues(dset_id, sel, arr_one_value) + db.setDatasetValues(dset_id, sel_all, arr_one_value) + # check that entire dataset is updated to the single value + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue((arr == 7).all()) db.close() From 47cff958c4b8e4d3a22b7b84dd4e93ec39a01e22 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 13 May 2026 10:42:27 +0200 Subject: [PATCH 127/129] support point write selections in h5pywriter --- src/h5json/h5pystore/h5py_writer.py | 30 ++++++++++++----- test/unit/h5py_writer_test.py | 50 +++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py index 0bb7fc9d..6d128239 100644 --- a/src/h5json/h5pystore/h5py_writer.py +++ b/src/h5json/h5pystore/h5py_writer.py @@ -373,14 +373,28 @@ def updateDatasetValues(self, dset_id, dset): updates = self.db._getDatasetUpdates(dset_id) for (sel, val) in updates: - slices = [] - for dim in range(len(sel.shape)): - start = sel.start[dim] - stop = start + sel.count[dim] - step = sel.step[dim] - slices.append(slice(start, stop, step)) - slices = tuple(slices) - dset[slices] = val + if sel is None or sel.select_type == selections.H5S_SEL_NONE: + pass # no updates + elif sel.select_type == selections.H5S_SEL_ALL: + dset[...] 
= val + self.log.debug(f"h5py_writer dset {dset.name} updated with sel_all") + elif isinstance(sel, selections.SimpleSelection): + slices = [] + for dim in range(len(sel.shape)): + start = sel.start[dim] + stop = start + sel.count[dim] + step = sel.step[dim] + slices.append(slice(start, stop, step)) + slices = tuple(slices) + dset[slices] = val + elif isinstance(sel, selections.PointSelection): + for i in range(len(sel.points)): + point = tuple(sel.points[i]) + dset[point] = val[i] + self.log.debug(f"h5py_writer dset {dset.name} updated with point selection") + else: + raise TypeError(f"Unexpected selection type: {type(sel)}") + self.log.debug(f"h5py_writer dset {dset.name} updated") def initializeDatasetValues(self, dset_id, dset): diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py index 4c2513b6..f850f28d 100644 --- a/test/unit/h5py_writer_test.py +++ b/test/unit/h5py_writer_test.py @@ -96,12 +96,13 @@ def testSimple(self): g1_1_id = db.createGroup() db.createHardLink(g1_id, "g1.1", g1_1_id) - dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + shape = (10, 10) + dset_111_id = db.createDataset(shape=shape, dtype=np.int32) # try setting dset values with broadcasting arr_one_value = np.zeros((1, 1), dtype=np.int32) arr_one_value[0, 0] = 42 - sel_all = selections.select((10, 10), ...) + sel_all = selections.select(shape, ...) 
db.setDatasetValues(dset_111_id, sel_all, arr_one_value) db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) @@ -124,9 +125,9 @@ def testSimple(self): g11 = g1["g1.1"] self.assertTrue("dset1.1.1" in g11) dset = g11["dset1.1.1"] - self.assertEqual(dset.shape, (10, 10)) - for i in range(10): - for j in range(10): + self.assertEqual(dset.shape, shape) + for i in range(shape[0]): + for j in range(shape[1]): self.assertEqual(dset[i, j], 42) self.assertTrue("g2" in f) g2 = f["g2"] @@ -135,19 +136,19 @@ def testSimple(self): # write dataset values element by element db.open() - arr = np.zeros((10, 10), dtype=np.int32) - for i in range(10): - for j in range(10): + arr = np.zeros(shape, dtype=np.int32) + for i in range(shape[0]): + for j in range(shape[1]): arr[i, j] = i * j - sel_all = selections.select((10, 10), ...) + sel_all = selections.select(shape, ...) db.setDatasetValues(dset_111_id, sel_all, arr) db.close() # verify changes in h5py with h5py.File(filepath) as f: dset = f["/g1/g1.1/dset1.1.1"] - for i in range(10): - for j in range(10): + for i in range(shape[0]): + for j in range(shape[1]): self.assertEqual(dset[i, j], i * j) db.open() @@ -200,7 +201,7 @@ def testSimple(self): self.assertFalse("tmp_group" in g2) db.open() - sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + sel = selections.select(shape, (slice(4, 5), slice(4, 5))) arr = np.zeros((1, 1), dtype=np.int32) arr[0, 0] = 42 db.setDatasetValues(dset_111_id, sel, arr) @@ -208,8 +209,8 @@ def testSimple(self): with h5py.File(filepath) as f: dset = f["/g1/g1.1/dset1.1.1"] - for i in range(10): - for j in range(10): + for i in range(shape[0]): + for j in range(shape[1]): if i == 4 and j == 4: # this is the one element that was updated expected = 42 @@ -217,6 +218,27 @@ def testSimple(self): expected = i * j self.assertEqual(dset[i, j], expected) + # try a point write + db.open() + points = [] + for i in range(shape[0]): + points.append((i, i)) + sel = selections.select(shape, points) + arr = 
np.zeros((len(points),), dtype=np.int32) + db.setDatasetValues(dset_111_id, sel, arr) + db.close() + + with h5py.File(filepath) as f: + dset = f["/g1/g1.1/dset1.1.1"] + for i in range(shape[0]): + for j in range(shape[1]): + if i == j: + # the diagonal elements were updated to 0 + expected = 0 + else: + expected = i * j + self.assertEqual(dset[i, j], expected) + def testResizableDataset(self): filepath = "test/unit/out/h5py_writer_test_testResizableDataset.h5" if os.path.isfile(filepath): From a61eb095e3990c2f0e0b52f190ea1a4b2dfc2410 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 26 May 2026 18:34:05 +0200 Subject: [PATCH 128/129] updated selection code --- src/h5json/hdf5db.py | 95 +++-- src/h5json/selections.py | 60 +++- test/unit/hdf5db_test.py | 70 +++- test/unit/selection_test.py | 668 ++++++++++++++++++++++++++++++++++++ 4 files changed, 855 insertions(+), 38 deletions(-) create mode 100644 test/unit/selection_test.py diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index e65b877d..adcd9f10 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -90,7 +90,7 @@ def __init__( self._dirty_objects = set() # set of modified objects self._deleted_objects = set() # set of deleted objects self._resized_datasets = set() # set of dataset ids that have been resized - self._dataset_updates = {} # list of dataset values updates keyed by dset_id + self._dataset_updates = {} # list of dataset values updates keyed by dset_id self._root_id = None @@ -106,6 +106,14 @@ def __init__( else: self._writer = None + def _getDatasetUpdates(self, dset_id): + """ Return list of updates for the given dataset id """ + + if dset_id not in self._dataset_updates: + self._dataset_updates[dset_id] = [] + + return self._dataset_updates[dset_id] + @property def db(self): """ return object db dictionary """ @@ -184,14 +192,6 @@ def deleted_objects(self): def resized_datasets(self): return self._resized_datasets - def _getDatasetUpdates(self, dset_id): - """ Get list of 
update tuples """ - if getCollectionForId(dset_id) != "datasets": - raise TypeError("expected dataset id") - if dset_id not in self._dataset_updates: - self._dataset_updates[dset_id] = [] - return self._dataset_updates[dset_id] - def make_dirty(self, obj_id): """ Mark the object as dirty and update the lastModified timestamp """ obj_id = getHashTagForId(obj_id) @@ -222,6 +222,7 @@ def flush(self): self._deleted_objects.clear() self._resized_datasets.clear() self._dataset_updates.clear() + return True def readAll(self): @@ -711,7 +712,6 @@ def init_arr(dtype, cpl): if shape_class == "H5S_SCALAR": if sel.select_type != selections.H5S_SEL_ALL: - # TBD: support other selection types raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") if sel.shape != (): raise ValueError("Selection shape does not match dataset shape") @@ -754,28 +754,59 @@ def init_arr(dtype, cpl): arr = init_arr(dtype, cpl) # apply any updates that impact this selection - - for (update_sel, update_val) in updates: - # get the part of the update that is in common with the requested selection - x_sel = selections.intersect(sel, update_sel) - if x_sel.nselect == 0: - # this update doesn't effect the selection, so ignore - continue - # apply the update to the array to be returned - if sel.select_type == selections.H5S_SEL_POINTS: - # For point selections apply each intersecting point individually. - # arr is 1-D with one entry per selected point; map each intersection - # point back to its position in sel and its offset in update_val. 
- rank = len(sel.shape) - sel_pts = list(selections._iter_points(sel)) - for pt in selections._iter_points(x_sel): - tgt_idx = sel_pts.index(pt) - src_coords = tuple(pt[d] - update_sel.start[d] for d in range(rank)) - arr[tgt_idx] = update_val[src_coords] - else: - src_sel = selections.translate(update_sel, x_sel) - tgt_sel = selections.translate(sel, x_sel) - arr[tgt_sel.slices] = update_val[src_sel.slices] + if sel.select_type == selections.H5S_SEL_POINTS: + # For point selections apply each intersecting point individually. + # arr is 1-D with one entry per selected point; map each intersection + # point back to its position in sel and its offset in update_val. + points = sel.points + for tgt_idx in range(len(points)): + pt = points[tgt_idx] + pt_sel = selections.select(sel.shape, [pt]) + for (update_sel, update_val) in updates: + x_sel = selections.intersect(update_sel, pt_sel) + if x_sel.nselect == 0: + pass # no intersection, ignore + elif x_sel.nselect > 1: + raise ValueError("unexpected multiple points in intersection of point selection") + else: + if update_sel.select_type == selections.H5S_SEL_POINTS: + # update_val is 1-D indexed by position in update_sel.points + update_pts = list(selections._iter_points(update_sel)) + pt_tuple = next(iter(selections._iter_points(pt_sel))) + src_idx = update_pts.index(pt_tuple) + arr[tgt_idx] = update_val[src_idx] + else: + src_sel = selections.translate(update_sel, x_sel) + # src_sel is a PointSelection with 1 translated point + # index update_val using the full N-D coordinates + src_pt = next(iter(selections._iter_points(src_sel))) + arr[tgt_idx] = update_val[src_pt] if len(src_pt) > 1 else update_val[src_pt[0]] + else: + # hyperslab selections + for (update_sel, update_val) in updates: + # get the part of the update that is in common with the requested selection + x_sel = selections.intersect(sel, update_sel) + if x_sel.nselect == 0: + # this update doesn't effect the selection, so ignore + continue + if 
update_sel.select_type == selections.H5S_SEL_POINTS: + # update_val is 1-D indexed by position in update_sel.points + update_pts = list(selections._iter_points(update_sel)) + update_pt_to_idx = {pt: i for i, pt in enumerate(update_pts)} + rank = len(sel.shape) + sel_start = sel.start + for pt in selections._iter_points(x_sel): + src_idx = update_pt_to_idx[pt] + tgt_coords = tuple(pt[d] - sel_start[d] for d in range(rank)) + if rank == 1: + arr[tgt_coords[0]] = update_val[src_idx] + else: + arr[tgt_coords] = update_val[src_idx] + else: + # apply the update to the array to be returned + src_sel = selections.translate(update_sel, x_sel) + tgt_sel = selections.translate(sel, x_sel) + arr[tgt_sel.slices] = update_val[src_sel.slices] return arr diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 1ce10c29..93366937 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -167,8 +167,29 @@ def _iter_points(point_sel): yield tuple(int(x) for x in row) +def _bboxes_overlap(s1, s2): + """Return True if the bounding boxes of s1 and s2 overlap in every dimension.""" + min1, max1 = s1.bbox + if min1 is None: + return False + min2, max2 = s2.bbox + if min2 is None: + return False + return all(min1[d] < max2[d] and min2[d] < max1[d] for d in range(len(s1.shape))) + + +def _empty_point_sel(shape): + """Return an empty PointSelection for the given shape.""" + result = PointSelection(shape) + result.set([]) + return result + + def _filter_points_by_hyperslab(point_sel, hyper_sel): """Return a PointSelection of points from point_sel that lie within hyper_sel.""" + if not _bboxes_overlap(point_sel, hyper_sel): + return _empty_point_sel(point_sel.shape) + start = hyper_sel.start count = hyper_sel.count step = hyper_sel.step @@ -192,6 +213,9 @@ def _filter_points_by_hyperslab(point_sel, hyper_sel): def _intersect_points_points(s1, s2): """Return a PointSelection of points common to both s1 and s2.""" + if not _bboxes_overlap(s1, s2): + return 
_empty_point_sel(s1.shape) + common = sorted(set(_iter_points(s1)) & set(_iter_points(s2))) rank = len(s1.shape) @@ -344,6 +368,33 @@ def shape(self): """ Shape of whole dataspace """ return self._shape + @property + def bbox(self): + """ Bounding box of selection, as a tuple of (min, max) corner coordinates. + + For point-based selections, this is the smallest hyperslab that contains + all selected points. For hyperslab-based selections, this is the + smallest hyperslab that contains the selection (which may be larger than + the actual selection if stepped slices are used). + """ + if self._select_type == H5S_SEL_POINTS: + pts_arr = np.asarray(self._points) + if pts_arr.size == 0: + return None, None + # For rank-1, pts_arr is 1-D (shape (N,)); reshape so axis=0 reduces over points. + rank = len(self._shape) + if pts_arr.ndim == 1 and rank == 1: + pts_arr = pts_arr.reshape(-1, 1) + min_corner = tuple(int(x) for x in np.min(pts_arr, axis=0)) + max_corner = tuple(int(x) + 1 for x in np.max(pts_arr, axis=0)) + return min_corner, max_corner + elif self._select_type in (H5S_SEL_HYPERSLABS, H5S_SEL_ALL): + start = self.start + stop = tuple(start[dim] + (self.count[dim] - 1) * self.step[dim] + 1 for dim in range(len(self._shape))) + return start, stop + else: + raise TypeError("Bounding box is not defined for this selection type") + @property def nselect(self): """ Number of elements currently selected """ @@ -426,11 +477,10 @@ def getSelectNpoints(self): def _perform_selection(self, points, op): """ Internal method which actually performs the selection """ - if isinstance(points, np.ndarray) or True: - points = np.asarray(points, order='C', dtype='u8') - if len(points.shape) == 1: - # points.shape = (1,points.shape[0]) - pass + points = np.asarray(points, order='C', dtype='u8') + if len(points.shape) == 1: + # points.shape = (1,points.shape[0]) + pass if self._select_type != H5S_SEL_POINTS: op = H5S_SELECT_SET diff --git a/test/unit/hdf5db_test.py 
b/test/unit/hdf5db_test.py index 57074866..dea6f663 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -540,7 +540,75 @@ def testCommittedCompoundType(self): db.close() - def testSimpleDataset(self): + def test1DDataset(self): + nelements = 10 + shape = (nelements,) + dtype = np.int32 + + db = Hdf5db(app_logger=self.log) + root_id = db.open() + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + self.assertEqual(arr.min(), 0) + self.assertEqual(arr.max(), 0) + + # set values element by element + for i in range(nelements): + sel = selections.select(shape, slice(i, i + 1)) + db.setDatasetValues(dset_id, sel, np.array([i], dtype=dtype)) + + # read entire dataset + arr = db.getDatasetValues(dset_id, sel_all) + for i in range(nelements): + val = np.array([i], dtype=dtype) + np.testing.assert_array_equal(arr[i], val) + + # read element by element + for i in range(nelements): + sel = selections.select(shape, slice(i, i + 1)) + val = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(val, np.ndarray)) + self.assertEqual(val.shape, (1,)) + self.assertEqual(val[0], i) + + # do a point selection + sel = selections.select(shape, [2, 3, 5, 7]) + val = db.getDatasetValues(dset_id, sel) + self.assertTrue(isinstance(val, np.ndarray)) + self.assertEqual(val.shape, (4,)) + + self.assertEqual(val[0], 2) + self.assertEqual(val[1], 3) + self.assertEqual(val[2], 5) + self.assertEqual(val[3], 7) + + # point selection write + arr = np.zeros((4,), dtype=dtype) + db.setDatasetValues(dset_id, sel, arr) + arr = db.getDatasetValues(dset_id, sel_all) + for i in range(nelements): + if i in (2, 3, 5, 7): + self.assertEqual(arr[i], 0) # these were set to 0 by point selection write + else: + 
self.assertEqual(arr[i], i) + + # try with broadcasting + arr_one_value = np.zeros((1), dtype=dtype) + arr_one_value[0] = 42 + db.setDatasetValues(dset_id, sel_all, arr_one_value) + # check that entire dataset is updated to the single value + arr = db.getDatasetValues(dset_id, sel_all) + self.assertTrue((arr == 42).all()) + + db.close() + + def test2DDataset(self): nrows = 8 ncols = 10 shape = (nrows, ncols) diff --git a/test/unit/selection_test.py b/test/unit/selection_test.py new file mode 100644 index 00000000..7ca42225 --- /dev/null +++ b/test/unit/selection_test.py @@ -0,0 +1,668 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import logging +import numpy as np + +from h5json import selections +from h5json.selections import ( + H5S_SEL_POINTS, + H5S_SEL_ALL, + H5S_SEL_HYPERSLABS, + H5S_SEL_FANCY, + PointSelection, + SimpleSelection, + FancySelection, + ScalarSelection, +) + + +def make_point_sel(shape, mask): + """Build a PointSelection from a boolean ndarray mask.""" + sel = PointSelection(shape) + sel[mask] + return sel + + +class SimpleSelectionTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(SimpleSelectionTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testSelectAll(self): + shape = (10,) + sel = selections.select(shape, ...) 
+ self.assertIsInstance(sel, SimpleSelection) + # __getitem__ always sets HYPERSLABS even for a full-range ellipsis + self.assertEqual(sel.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(sel.shape, shape) + self.assertEqual(sel.nselect, 10) + self.assertEqual(sel.shape, sel.mshape) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0,)) + self.assertEqual(bbox[1], shape) + + def testSelectAll2D(self): + shape = (4, 5) + sel = selections.select(shape, ...) + self.assertIsInstance(sel, SimpleSelection) + self.assertEqual(sel.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(sel.nselect, 20) + self.assertEqual(sel.shape, sel.mshape) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0, 0)) + self.assertEqual(bbox[1], shape) + + def testSlice1D(self): + shape = (10,) + sel = selections.select(shape, slice(2, 7)) + self.assertIsInstance(sel, SimpleSelection) + self.assertEqual(sel.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(sel.start, (2,)) + self.assertEqual(sel.count, (5,)) + self.assertEqual(sel.step, (1,)) + self.assertEqual(sel.nselect, 5) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (2,)) + self.assertEqual(bbox[1], (7,)) + + def testSliceWithStep(self): + shape = (10,) + sel = selections.select(shape, slice(0, 10, 2)) + self.assertIsInstance(sel, SimpleSelection) + self.assertEqual(sel.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(sel.start, (0,)) + self.assertEqual(sel.count, (5,)) + self.assertEqual(sel.step, (2,)) + self.assertEqual(sel.nselect, 5) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0,)) + self.assertEqual(bbox[1], (9,)) + + def testSlice2D(self): + shape = (8, 10) + sel = selections.select(shape, (slice(1, 4), slice(2, 9))) + 
self.assertIsInstance(sel, SimpleSelection) + self.assertEqual(sel.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(sel.start, (1, 2)) + self.assertEqual(sel.count, (3, 7)) + self.assertEqual(sel.step, (1, 1)) + self.assertEqual(sel.nselect, 21) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (1, 2)) + self.assertEqual(bbox[1], (4, 9)) + + def testBroadcast1D(self): + shape = (10,) + sel = selections.select(shape, ...) + self.assertIsInstance(sel, SimpleSelection) + + it = sel.broadcast((1,)) + count = 0 + for x in it: + # start + self.assertTrue(x[0][0] >= 0 and x[0][0] < 10) + # count + self.assertEqual(x[1], (1,)) + # step + self.assertEqual(x[2], (1,)) + # scalar + self.assertEqual(x[3], (False,)) + count += 1 + self.assertEqual(count, 10) + + def testBroadcast2D(self): + shape = (8, 10) + sel = selections.select(shape, ...) + self.assertIsInstance(sel, SimpleSelection) + try: + sel.broadcast(4, 5) + self.assertTrue(False) + except TypeError: + pass + it = sel.broadcast((1, 10)) + count = 0 + for x in it: + # start + self.assertTrue(x[0][0] >= 0 and x[0][0] < 8) + self.assertEqual(x[0][1], 0) + # count + self.assertEqual(x[1], (1, 10)) + # step + self.assertEqual(x[2], (1, 1)) + # scalar + self.assertEqual(x[3], (False, False)) + count += 1 + self.assertEqual(count, 8) + + def testSlices(self): + shape = (8, 10) + sel = selections.select(shape, (slice(2, 5), slice(3, 7))) + self.assertEqual(sel.slices, (slice(2, 5, 1), slice(3, 7, 1))) + + def testNselect(self): + shape = (100,) + sel = selections.select(shape, slice(0, 100)) + self.assertEqual(sel.nselect, 100) + sel2 = selections.select(shape, slice(10, 20)) + self.assertEqual(sel2.nselect, 10) + + def testOutOfRangeRaises(self): + shape = (10,) + # integer index out of range raises IndexError; slices are silently clamped + with self.assertRaises(IndexError): + selections.select(shape, 15) + + def testGetQueryParam1D(self): + shape 
= (10,) + sel = selections.select(shape, slice(2, 8)) + param = sel.getQueryParam() + self.assertEqual(param, "[2:8]") + + def testGetQueryParam2D(self): + shape = (8, 10) + sel = selections.select(shape, (slice(1, 4), slice(0, 10))) + param = sel.getQueryParam() + self.assertEqual(param, "[1:4,0:10]") + + def testRepr(self): + shape = (10,) + sel = selections.select(shape, slice(0, 5)) + self.assertIn("SimpleSelection", repr(sel)) + + def testScalarDataset(self): + # select() routes to ScalarSelection when obj has .shape == () + scalar_ds = np.array(42) + sel = selections.select(scalar_ds, ...) + self.assertIsInstance(sel, ScalarSelection) + self.assertEqual(sel.select_type, H5S_SEL_ALL) + self.assertEqual(sel.nselect, 1) + + +class PointSelectionTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(PointSelectionTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testBoolMask1D(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[0, 3, 7]] = True + sel = make_point_sel(shape, mask) + self.assertIsInstance(sel, PointSelection) + self.assertEqual(sel.select_type, H5S_SEL_POINTS) + self.assertEqual(sel.nselect, 3) + points = sel.points + self.assertEqual(len(points), 3) + for i in range(len(points)): + pt = points[i] + self.assertTrue(isinstance(pt, np.ndarray)) + self.assertEqual(pt.shape, (1,)) + self.assertTrue(pt[0] in (0, 3, 7)) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0,)) + self.assertEqual(bbox[1], (8,)) + + def testBoolMask2D(self): + shape = (4, 5) + mask = np.zeros(shape, dtype=bool) + mask[0, 1] = True + mask[2, 3] = True + sel = make_point_sel(shape, mask) + self.assertEqual(sel.select_type, H5S_SEL_POINTS) + self.assertEqual(sel.nselect, 2) + pts = sel.points + self.assertEqual(pts.shape, (2, 2)) + self.assertEqual(list(pts[0]), [0, 1]) + self.assertEqual(list(pts[1]), 
[2, 3]) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0, 1)) + self.assertEqual(bbox[1], (3, 4)) + + def testListOfCoords2D(self): + shape = (8, 10) + sel = selections.select(shape, [(0, 0), (1, 1), (2, 2), (3, 3)]) + self.assertIsInstance(sel, PointSelection) + self.assertEqual(sel.select_type, H5S_SEL_POINTS) + self.assertEqual(sel.nselect, 4) + points = sel.points + self.assertEqual(len(points), 4) + for i in range(len(points)): + pt = points[i] + self.assertTrue(isinstance(pt, np.ndarray)) + self.assertEqual(pt.shape, (2,)) + self.assertTrue(pt[0] == pt[1]) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (0, 0)) + self.assertEqual(bbox[1], (4, 4)) + + def testEmptySet(self): + shape = (10,) + sel = PointSelection(shape) + sel.set([]) + self.assertEqual(sel.nselect, 0) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], None) + self.assertEqual(bbox[1], None) + + def testSetReplacesPoints(self): + shape = (10,) + mask1 = np.zeros(10, dtype=bool) + mask1[[1, 2, 3]] = True + sel = make_point_sel(shape, mask1) + self.assertEqual(sel.nselect, 3) + + mask2 = np.zeros(10, dtype=bool) + mask2[[5, 6]] = True + sel[mask2] + self.assertEqual(sel.nselect, 2) + + def testRepr(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[0, 1]] = True + sel = make_point_sel(shape, mask) + self.assertIn("PointSelection", repr(sel)) + + +class FancySelectionTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(FancySelectionTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testCoordList1D(self): + shape = (10,) + sel = FancySelection(shape) + sel[[2, 5, 8]] + self.assertEqual(sel.select_type, H5S_SEL_FANCY) + + def testGetQueryParamSlice(self): + shape = (10,) + 
sel = FancySelection(shape) + sel[slice(2, 8)] + param = sel.getQueryParam() + self.assertEqual(param, "[2:8]") + + def testGetQueryParamList(self): + shape = (10,) + sel = FancySelection(shape) + sel[[1, 3, 5]] + param = sel.getQueryParam() + self.assertEqual(param, "[[1,3,5]]") + + def testGetQueryParam2D(self): + shape = (10, 10) + sel = FancySelection(shape) + sel[(slice(1, 4), slice(2, 6))] + param = sel.getQueryParam() + self.assertEqual(param, "[1:4,2:6]") + + def testRepr(self): + shape = (10,) + sel = FancySelection(shape) + sel[slice(0, 5)] + self.assertIn("FancySelection", repr(sel)) + + +class IntersectHyperslabTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IntersectHyperslabTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testOverlapping1D(self): + shape = (10,) + s1 = selections.select(shape, slice(0, 6)) + s2 = selections.select(shape, slice(3, 10)) + result = selections.intersect(s1, s2) + self.assertIsInstance(result, SimpleSelection) + self.assertEqual(result.nselect, 3) + self.assertEqual(result.start, (3,)) + self.assertEqual(result.count, (3,)) + + def testNonOverlapping1D(self): + shape = (10,) + s1 = selections.select(shape, slice(0, 3)) + s2 = selections.select(shape, slice(5, 10)) + result = selections.intersect(s1, s2) + self.assertEqual(result.nselect, 0) + + def testOverlapping2D(self): + shape = (10, 10) + s1 = selections.select(shape, (slice(0, 6), slice(0, 6))) + s2 = selections.select(shape, (slice(3, 10), slice(3, 10))) + result = selections.intersect(s1, s2) + self.assertEqual(result.nselect, 9) + self.assertEqual(result.start, (3, 3)) + self.assertEqual(result.count, (3, 3)) + + def testFullOverlap(self): + shape = (10,) + s1 = selections.select(shape, slice(2, 8)) + s2 = selections.select(shape, slice(0, 10)) + result = selections.intersect(s1, s2) + self.assertEqual(result.nselect, 6) + self.assertEqual(result.start, (2,)) + 
self.assertEqual(result.count, (6,)) + + def testSelectAllWithHyperslab(self): + shape = (10,) + s_all = selections.select(shape, ...) + s_hyp = selections.select(shape, slice(3, 7)) + result = selections.intersect(s_all, s_hyp) + self.assertEqual(result.nselect, 4) + self.assertEqual(result.start, (3,)) + + def testSteppedSliceRaises(self): + shape = (10,) + s1 = selections.select(shape, slice(0, 10, 2)) + s2 = selections.select(shape, slice(0, 10, 2)) + with self.assertRaises(ValueError): + selections.intersect(s1, s2) + + def testShapeMismatchRaises(self): + s1 = selections.select((10,), slice(0, 5)) + s2 = selections.select((20,), slice(0, 5)) + with self.assertRaises(ValueError): + selections.intersect(s1, s2) + + def testBadArgRaises(self): + s1 = selections.select((10,), slice(0, 5)) + with self.assertRaises(TypeError): + selections.intersect(s1, "not a selection") + + +class IntersectPointHyperslabTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IntersectPointHyperslabTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testPointsInsideHyperslab1D(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[0, 1, 3, 5, 9]] = True + pts = make_point_sel(shape, mask) + hyp = selections.select(shape, slice(2, 8)) + result = selections.intersect(pts, hyp) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 2) + self.assertEqual(list(result.points.flatten()), [3, 5]) + + def testHyperslabIntersectPoints1D(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[0, 1, 3, 5, 9]] = True + pts = make_point_sel(shape, mask) + hyp = selections.select(shape, slice(2, 8)) + result = selections.intersect(hyp, pts) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 2) + self.assertEqual(list(result.points.flatten()), [3, 5]) + + def testAllPointsInsideHyperslab(self): + shape = (10,) + mask = np.zeros(10, 
dtype=bool) + mask[[2, 4, 6]] = True + pts = make_point_sel(shape, mask) + hyp = selections.select(shape, slice(0, 10)) + result = selections.intersect(pts, hyp) + self.assertEqual(result.nselect, 3) + + def testNoPointsInsideHyperslab(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[0, 1]] = True + pts = make_point_sel(shape, mask) + hyp = selections.select(shape, slice(5, 10)) + result = selections.intersect(pts, hyp) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 0) + + def testPoints2DIntersectHyperslab(self): + shape = (6, 6) + pts = selections.select(shape, [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]) + hyp = selections.select(shape, (slice(1, 4), slice(1, 4))) + result = selections.intersect(pts, hyp) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 3) + pts_list = [tuple(row) for row in result.points] + self.assertIn((1, 1), pts_list) + self.assertIn((2, 2), pts_list) + self.assertIn((3, 3), pts_list) + + def testPoints2DIntersectSelectAll(self): + shape = (5, 5) + pts = selections.select(shape, [(0, 0), (2, 3), (4, 4)]) + s_all = selections.select(shape, ...) + result = selections.intersect(pts, s_all) + self.assertEqual(result.nselect, 3) + + def testHyperslabWithStep1D(self): + shape = (20,) + mask = np.zeros(20, dtype=bool) + mask[[0, 2, 4, 6, 7]] = True + pts = make_point_sel(shape, mask) + # step-2 hyperslab covers 0,2,4,6,8,... 
+ hyp = selections.select(shape, slice(0, 10, 2)) + result = selections.intersect(pts, hyp) + self.assertEqual(result.nselect, 4) + self.assertEqual(list(result.points.flatten()), [0, 2, 4, 6]) + + def testHyperslabFirstArg2D(self): + # hyperslab as the first argument in 2-D + shape = (8, 10) + hyp = selections.select(shape, (slice(2, 6), slice(3, 8))) + pts = selections.select(shape, [(1, 1), (2, 3), (3, 5), (5, 7), (6, 9)]) + result = selections.intersect(hyp, pts) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 3) + pts_list = [tuple(row) for row in result.points] + self.assertIn((2, 3), pts_list) + self.assertIn((3, 5), pts_list) + self.assertIn((5, 7), pts_list) + + def testDisjointBboxReturnsEmpty(self): + # bounding boxes don't overlap at all — exercises the bbox fast path + shape = (20,) + mask = np.zeros(20, dtype=bool) + mask[[0, 1, 2, 3, 4]] = True # points in [0, 5) + pts = make_point_sel(shape, mask) + hyp = selections.select(shape, slice(10, 20)) # hyperslab in [10, 20) + result = selections.intersect(hyp, pts) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 0) + # commuted + result2 = selections.intersect(pts, hyp) + self.assertEqual(result2.nselect, 0) + + +class IntersectPointPointTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IntersectPointPointTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testOverlapping1D(self): + shape = (10,) + mask1 = np.zeros(10, dtype=bool) + mask1[[0, 1, 3, 5]] = True + mask2 = np.zeros(10, dtype=bool) + mask2[[1, 3, 7]] = True + s1 = make_point_sel(shape, mask1) + s2 = make_point_sel(shape, mask2) + result = selections.intersect(s1, s2) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 2) + self.assertEqual(list(result.points.flatten()), [1, 3]) + + def testNoOverlap1D(self): + shape = (10,) + mask1 = np.zeros(10, 
dtype=bool) + mask1[[0, 1]] = True + mask2 = np.zeros(10, dtype=bool) + mask2[[8, 9]] = True + result = selections.intersect(make_point_sel(shape, mask1), + make_point_sel(shape, mask2)) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 0) + + def testIdentical1D(self): + shape = (10,) + mask = np.zeros(10, dtype=bool) + mask[[2, 5, 8]] = True + s1 = make_point_sel(shape, mask) + s2 = make_point_sel(shape, mask) + result = selections.intersect(s1, s2) + self.assertEqual(result.nselect, 3) + self.assertEqual(list(result.points.flatten()), [2, 5, 8]) + + def testOverlapping2D(self): + shape = (6, 6) + s1 = selections.select(shape, [(0, 0), (1, 1), (2, 2), (3, 3)]) + s2 = selections.select(shape, [(1, 1), (2, 2), (5, 5)]) + result = selections.intersect(s1, s2) + self.assertIsInstance(result, PointSelection) + self.assertEqual(result.nselect, 2) + pts_list = [tuple(row) for row in result.points] + self.assertIn((1, 1), pts_list) + self.assertIn((2, 2), pts_list) + + def testNoOverlap2D(self): + shape = (6, 6) + s1 = selections.select(shape, [(0, 0), (1, 1)]) + s2 = selections.select(shape, [(3, 3), (4, 4)]) + result = selections.intersect(s1, s2) + self.assertEqual(result.nselect, 0) + + def testCommutativity(self): + shape = (10,) + mask1 = np.zeros(10, dtype=bool) + mask1[[0, 2, 4, 6]] = True + mask2 = np.zeros(10, dtype=bool) + mask2[[2, 4, 8]] = True + s1 = make_point_sel(shape, mask1) + s2 = make_point_sel(shape, mask2) + r_fwd = selections.intersect(s1, s2) + r_rev = selections.intersect(s2, s1) + self.assertEqual(r_fwd.nselect, r_rev.nselect) + self.assertEqual(list(r_fwd.points.flatten()), list(r_rev.points.flatten())) + + +class ContainedTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(ContainedTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testContainedTrue(self): + shape = (10,) + s1 = selections.select(shape, slice(2, 5)) + s2 
= selections.select(shape, slice(0, 10)) + self.assertTrue(selections.contained(s1, s2)) + + def testContainedFalse(self): + shape = (10,) + s1 = selections.select(shape, slice(0, 6)) + s2 = selections.select(shape, slice(3, 10)) + self.assertFalse(selections.contained(s1, s2)) + + def testContainedSelf(self): + shape = (10,) + s = selections.select(shape, slice(2, 8)) + self.assertTrue(selections.contained(s, s)) + + def testContained2D(self): + shape = (10, 10) + inner = selections.select(shape, (slice(2, 5), slice(2, 5))) + outer = selections.select(shape, (slice(0, 10), slice(0, 10))) + self.assertTrue(selections.contained(inner, outer)) + self.assertFalse(selections.contained(outer, inner)) + + +class TranslateTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(TranslateTest, self).__init__(*args, **kwargs) + self.logger = logging.getLogger() + self.logger.setLevel(logging.WARNING) + + def testTranslate1D(self): + shape = (10,) + s1 = selections.select(shape, slice(2, 8)) + s2 = selections.select(shape, slice(4, 7)) + result = selections.translate(s1, s2) + self.assertEqual(result.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(result.start, (2,)) + self.assertEqual(result.count, (3,)) + + def testTranslate2D(self): + shape = (10, 10) + s1 = selections.select(shape, (slice(2, 8), slice(2, 8))) + s2 = selections.select(shape, (slice(4, 6), slice(4, 6))) + result = selections.translate(s1, s2) + self.assertEqual(result.select_type, H5S_SEL_HYPERSLABS) + self.assertEqual(result.start, (2, 2)) + self.assertEqual(result.count, (2, 2)) + + def testTranslate2DWithPoints(self): + shape = (10, 10) + s1 = selections.select(shape, (slice(2, 8), slice(2, 8))) + s2 = selections.select(shape, [(2, 2), (3, 3), (9, 9)]) + + result = selections.translate(s1, s2) + self.assertEqual(result.select_type, H5S_SEL_POINTS) + self.assertEqual(result.nselect, 2) + + self.assertEqual(result.points.shape, (2, 2)) + self.assertEqual(list(result.points[0]), [0, 0]) 
+ self.assertEqual(list(result.points[1]), [1, 1]) + + def testTranslateNoOverlapRaises(self): + shape = (10,) + s1 = selections.select(shape, slice(0, 3)) + s2 = selections.select(shape, slice(5, 8)) + with self.assertRaises(ValueError): + selections.translate(s1, s2) + + +if __name__ == "__main__": + unittest.main() From 4b69b2ce8a546176e4dd3d42a3690b92f19fc929 Mon Sep 17 00:00:00 2001 From: John Readey Date: Mon, 1 Jun 2026 21:04:03 +0200 Subject: [PATCH 129/129] fix bug in selection creater --- src/h5json/hdf5db.py | 4 +++- src/h5json/selections.py | 27 ++++++++++++++++----------- test/unit/hdf5db_test.py | 2 ++ test/unit/selection_test.py | 18 ++++++++++++++++++ 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index adcd9f10..ab72ce9c 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -673,7 +673,9 @@ def getDatasetValues(self, dset_id, sel): def init_arr(dtype, cpl): """ create an ndarray with the give shape, dtype and fill_value (if the latter is found in the creation properties list) """ - if hasattr(sel, "count"): + if isinstance(sel, selections.ScalarSelection): + arr_shape = () + elif hasattr(sel, "count"): arr_shape = sel.count if isinstance(sel.count, tuple) else (sel.count, ) else: arr_shape = (sel.nselect,) diff --git a/src/h5json/selections.py b/src/h5json/selections.py index 93366937..fe0186ea 100644 --- a/src/h5json/selections.py +++ b/src/h5json/selections.py @@ -70,9 +70,16 @@ def select(obj, args): if not isinstance(args, tuple): args = (args,) - if hasattr(obj, "shape") and obj.shape == (): + if hasattr(obj, "shape"): + obj_shape = obj.shape + elif isinstance(obj, tuple): + obj_shape = obj + else: + raise TypeError("Object must be a dataset or a shape tuple") + + if len(obj_shape) == 0: # scalar object - sel = ScalarSelection(obj.shape, args) + sel = ScalarSelection(obj_shape, args) return sel # "Special" indexing objects @@ -80,12 +87,12 @@ def select(obj, args): arg = 
args[0] if hasattr(arg, "shape"): - obj_shape = obj.shape + arg_shape = arg.shape else: - obj_shape = obj + arg_shape = obj_shape if isinstance(arg, Selection): - if arg.shape != obj_shape: + if arg_shape != obj_shape: raise TypeError("Mismatched selection shape") return arg @@ -114,14 +121,12 @@ def select(obj, args): int(a) except Exception: use_fancy = True - if use_fancy and hasattr(obj, "shape"): - sel = FancySelection(obj.shape) + if use_fancy: + sel = FancySelection(obj_shape) sel[args] return sel - if hasattr(obj, "shape"): - sel = SimpleSelection(obj.shape) - else: - sel = SimpleSelection(obj) + sel = SimpleSelection(obj_shape) + sel[args] return sel diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index dea6f663..0b383c43 100644 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -579,6 +579,7 @@ def test1DDataset(self): # do a point selection sel = selections.select(shape, [2, 3, 5, 7]) + val = db.getDatasetValues(dset_id, sel) self.assertTrue(isinstance(val, np.ndarray)) self.assertEqual(val.shape, (4,)) @@ -840,6 +841,7 @@ def testScalarDataset(self): db.createHardLink(root_id, "dset", dset_id) db.createAttribute(dset_id, "a1", "Hello, world") sel_all = selections.select((), ...) 
+ arr = db.getDatasetValues(dset_id, sel_all) self.assertEqual(arr.dtype, dtype) self.assertEqual(arr.shape, ()) diff --git a/test/unit/selection_test.py b/test/unit/selection_test.py index 7ca42225..8cac5603 100644 --- a/test/unit/selection_test.py +++ b/test/unit/selection_test.py @@ -251,6 +251,24 @@ def testBoolMask2D(self): self.assertEqual(bbox[0], (0, 1)) self.assertEqual(bbox[1], (3, 4)) + def testListOfCoords1D(self): + shape = (10,) + sel = selections.select(shape, [2, 3, 5, 7]) + self.assertIsInstance(sel, PointSelection) + self.assertEqual(sel.select_type, H5S_SEL_POINTS) + self.assertEqual(sel.nselect, 4) + points = sel.points + self.assertEqual(len(points), 4) + for i in range(len(points)): + pt = points[i] + self.assertTrue(pt in (2, 3, 5, 7)) + + bbox = sel.bbox + self.assertTrue(isinstance(bbox, tuple)) + self.assertEqual(len(bbox), 2) + self.assertEqual(bbox[0], (2,)) + self.assertEqual(bbox[1], (8,)) + def testListOfCoords2D(self): shape = (8, 10) sel = selections.select(shape, [(0, 0), (1, 1), (2, 2), (3, 3)])