Skip to content

Commit bea8c31

Browse files
committed
Fix HDF5 datetime serialization to avoid false positives
Use a metadata-based approach instead of heuristic inference, so that plain numeric values are no longer incorrectly converted to datetime objects.
1 parent ba22dde commit bea8c31

2 files changed

Lines changed: 211 additions & 33 deletions

File tree

guidata/io/h5fmt.py

Lines changed: 55 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from __future__ import annotations
1111

1212
import datetime
13-
import numbers
1413
import sys
1514
from collections.abc import Callable, Sequence
1615
from typing import Any
@@ -463,9 +462,9 @@ def write(self, val: Any, group_name: str | None = None) -> None:
463462
elif isinstance(val, dict):
464463
self.write_dict(val)
465464
elif isinstance(val, datetime.datetime):
466-
self.write_float(val.timestamp())
465+
self.write_datetime(val)
467466
elif isinstance(val, datetime.date):
468-
self.write_int(val.toordinal())
467+
self.write_date(val)
469468
elif isinstance(val, np.ndarray):
470469
self.write_array(val)
471470
elif hasattr(val, "serialize") and isinstance(val.serialize, Callable):
@@ -504,6 +503,30 @@ def write_bool(self, val: bool) -> None:
504503
"""
505504
self.write_int(int(val))
506505

506+
def write_datetime(self, val: datetime.datetime) -> None:
    """
    Store a datetime as a numeric attribute tagged with its type.

    The value is saved as ``val.timestamp()`` under the current option name,
    and the string ``"datetime"`` is saved next to it under the same name
    suffixed with ``__type__``.

    Args:
        val: The datetime value to write.
    """
    target = self.get_parent_group().attrs
    name = self.option[-1]
    target[name] = val.timestamp()
    # Companion attribute recording what the number stands for
    target[f"{name}__type__"] = "datetime"
517+
518+
def write_date(self, val: datetime.date) -> None:
    """
    Store a date as a numeric attribute tagged with its type.

    The value is saved as ``val.toordinal()`` under the current option name,
    and the string ``"date"`` is saved next to it under the same name
    suffixed with ``__type__``.

    Args:
        val: The date value to write.
    """
    target = self.get_parent_group().attrs
    name = self.option[-1]
    target[name] = val.toordinal()
    # Companion attribute recording what the number stands for
    target[f"{name}__type__"] = "date"
529+
507530
def write_array(self, val: np.ndarray) -> None:
508531
"""
509532
Write the numpy array value to the HDF5 file.
@@ -584,32 +607,6 @@ class NoDefault:
584607
pass
585608

586609

587-
def infer_datetime(value: Any) -> Any:
    """Guess whether a bare number encodes a datetime or a date.

    Args:
        value: The value to inspect.

    Returns:
        A ``datetime.datetime`` for floats strictly between 1e8 and 2e9,
        a ``datetime.date`` for integers strictly between 50000 and 1000000,
        otherwise the value itself (NumPy scalars come back as the result of
        their ``item()`` call).
    """
    if not isinstance(value, (numbers.Real, np.generic)):
        return value

    # Unwrap NumPy scalars so the checks below see native Python types
    if isinstance(value, np.generic):
        value = value.item()

    # Floats in this window are read as seconds since the epoch
    if isinstance(value, float) and 1e8 < value < 2e9:
        return datetime.datetime.fromtimestamp(value)

    # Integers in this window are read as ordinal day numbers
    looks_like_ordinal = (
        isinstance(value, numbers.Integral) and 50000 < value < 1000000
    )
    if looks_like_ordinal:
        try:
            return datetime.date.fromordinal(value)
        except ValueError:
            return value
    return value
611-
612-
613610
class HDF5Reader(HDF5Handler):
614611
"""
615612
Reader for HDF5 files. Inherits from HDF5Handler.
@@ -687,17 +684,30 @@ def read_any(
687684
The read value.
688685
"""
689686
group = self.get_parent_group()
687+
attr_name = self.option[-1]
690688
try:
691-
value = group.attrs[self.option[-1]]
689+
value = group.attrs[attr_name]
692690
except KeyError:
693691
if self.read(SEQUENCE_NAME, func=self.read_int, default=None) is None:
694692
# No sequence found, this means that the data we are trying to read
695693
# is not here (e.g. compatibility issue), so we raise an error
696694
raise
697695
value = self.read_sequence()
696+
697+
# Check for type metadata
698+
type_key = f"{attr_name}__type__"
699+
if type_key in group.attrs:
700+
type_hint = group.attrs[type_key]
701+
if isinstance(type_hint, bytes):
702+
type_hint = type_hint.decode("utf-8")
703+
if type_hint == "datetime":
704+
return datetime.datetime.fromtimestamp(value)
705+
if type_hint == "date":
706+
return datetime.date.fromordinal(int(value))
707+
698708
if isinstance(value, bytes):
699709
return value.decode("utf-8")
700-
return infer_datetime(value)
710+
return value
701711

702712
def read_bool(self) -> bool | None:
703713
"""
@@ -791,9 +801,21 @@ def read_dict(self) -> dict[str, Any]:
791801
dict_group = group[self.option[-1]]
792802
dict_val = {}
793803
for key, value in dict_group.attrs.items():
794-
if key == DICT_NAME:
804+
if key == DICT_NAME or key.endswith("__type__"):
795805
continue
796-
dict_val[key] = infer_datetime(value)
806+
# Check for type metadata
807+
type_key = f"{key}__type__"
808+
if type_key in dict_group.attrs:
809+
type_hint = dict_group.attrs[type_key]
810+
if isinstance(type_hint, bytes):
811+
type_hint = type_hint.decode("utf-8")
812+
if type_hint == "datetime":
813+
dict_val[key] = datetime.datetime.fromtimestamp(value)
814+
continue
815+
if type_hint == "date":
816+
dict_val[key] = datetime.date.fromordinal(int(value))
817+
continue
818+
dict_val[key] = value
797819
for key in dict_group:
798820
with self.group(key):
799821
try:
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Licensed under the terms of the BSD 3-Clause
4+
# (see guidata/LICENSE for details)
5+
6+
"""
7+
Test HDF5 DateTime Serialization
8+
---------------------------------
9+
10+
Testing datetime/date serialization with metadata to avoid false positives.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import atexit
16+
import datetime
17+
import os
18+
import os.path as osp
19+
20+
import numpy as np
21+
22+
from guidata.io import HDF5Reader, HDF5Writer
23+
24+
25+
class DateTimeTestObject:
    """Fixture mixing real datetime/date values with look-alike plain numbers"""

    # Attributes written and read one at a time, in this order, each under
    # its own attribute name
    _SCALAR_NAMES = (
        "dt1",
        "dt2",
        "date1",
        "date2",
        "timestamp_like_float",
        "timestamp_like_int",
        "ordinal_like_int",
        "regular_float",
        "regular_int",
    )

    def __init__(self) -> None:
        # Genuine temporal values: expected to come back with the same type
        self.dt1 = datetime.datetime(2024, 1, 15, 10, 30, 45)
        self.dt2 = datetime.datetime.now()
        self.date1 = datetime.date(2024, 6, 1)
        self.date2 = datetime.date.today()

        # Plain numbers whose magnitude resembles a timestamp or an ordinal:
        # expected to come back as numbers
        self.timestamp_like_float = 1500000000.0  # Within datetime range
        self.timestamp_like_int = 1500000000  # Within datetime range
        self.ordinal_like_int = 737000  # Within date ordinal range
        self.regular_float = 3.14159
        self.regular_int = 42

        # Dictionary combining both kinds of values with a string
        self.metadata = {
            "created": datetime.datetime(2023, 12, 25, 12, 0, 0),
            "modified": datetime.date(2024, 1, 1),
            "count": 100000,  # Could be mistaken for date ordinal
            "score": 1.5e8,  # Could be mistaken for timestamp
            "name": "Test Object",
        }

    def __eq__(self, other: object) -> bool:
        """Compare every stored attribute, stopping at the first mismatch"""
        if not isinstance(other, DateTimeTestObject):
            return False
        for name in (*self._SCALAR_NAMES, "metadata"):
            same = getattr(self, name) == getattr(other, name)
            if not same:
                return same
        return same

    def serialize(self, writer: HDF5Writer) -> None:
        """Write all attributes through `writer`"""
        for name in self._SCALAR_NAMES:
            writer.write(getattr(self, name), name)
        with writer.group("metadata"):
            writer.write_dict(self.metadata)

    def deserialize(self, reader: HDF5Reader) -> None:
        """Reload all attributes through `reader`"""
        for name in self._SCALAR_NAMES:
            setattr(self, name, reader.read(name))
        with reader.group("metadata"):
            self.metadata = reader.read_dict()
96+
97+
98+
def test_h5fmt_datetime_serialization():
    """Test datetime serialization with metadata to avoid false positives

    Writes a `DateTimeTestObject` to an HDF5 file, reads it back into a second
    instance and checks that datetime/date values keep their type while plain
    numbers that merely look like timestamps or ordinals stay numeric.
    """
    path = osp.abspath("test_datetime.h5")

    def remove_test_file() -> None:
        # Tolerate a missing file: if the writer failed before creating it,
        # a bare os.unlink would raise FileNotFoundError at interpreter exit
        if osp.exists(path):
            os.unlink(path)

    atexit.register(remove_test_file)

    # Create and serialize the object; always release the file handle so a
    # failure here does not leave the file open (and undeletable) afterwards
    original = DateTimeTestObject()
    writer = HDF5Writer(path)
    try:
        original.serialize(writer)
    finally:
        writer.close()

    # Deserialize the object, again closing the file whatever happens
    loaded = DateTimeTestObject()
    reader = HDF5Reader(path)
    try:
        loaded.deserialize(reader)
    finally:
        reader.close()

    # Verify datetime objects are correctly restored
    assert isinstance(loaded.dt1, datetime.datetime)
    assert loaded.dt1 == original.dt1
    assert isinstance(loaded.dt2, datetime.datetime)
    assert loaded.dt2 == original.dt2

    # datetime.datetime is a subclass of datetime.date, so also rule it out
    # to make sure dates come back as plain dates
    assert isinstance(loaded.date1, datetime.date)
    assert not isinstance(loaded.date1, datetime.datetime)
    assert loaded.date1 == original.date1
    assert isinstance(loaded.date2, datetime.date)
    assert not isinstance(loaded.date2, datetime.datetime)
    assert loaded.date2 == original.date2

    # Verify numeric values are NOT converted to datetime (no false positives)
    assert isinstance(loaded.timestamp_like_float, (float, np.floating))
    assert loaded.timestamp_like_float == original.timestamp_like_float
    assert isinstance(loaded.timestamp_like_int, (int, np.integer))
    assert loaded.timestamp_like_int == original.timestamp_like_int
    assert isinstance(loaded.ordinal_like_int, (int, np.integer))
    assert loaded.ordinal_like_int == original.ordinal_like_int
    assert isinstance(loaded.regular_float, (float, np.floating))
    assert loaded.regular_float == original.regular_float
    assert isinstance(loaded.regular_int, (int, np.integer))
    assert loaded.regular_int == original.regular_int

    # Verify dictionary values
    assert isinstance(loaded.metadata["created"], datetime.datetime)
    assert loaded.metadata["created"] == original.metadata["created"]
    assert isinstance(loaded.metadata["modified"], datetime.date)
    assert not isinstance(loaded.metadata["modified"], datetime.datetime)
    assert loaded.metadata["modified"] == original.metadata["modified"]
    assert isinstance(loaded.metadata["count"], (int, np.integer))
    assert loaded.metadata["count"] == original.metadata["count"]
    assert isinstance(loaded.metadata["score"], (float, np.floating))
    assert loaded.metadata["score"] == original.metadata["score"]
    assert isinstance(loaded.metadata["name"], str)
    assert loaded.metadata["name"] == original.metadata["name"]

    # Overall equality check
    assert loaded == original

    print("All datetime serialization tests passed!")
153+
154+
155+
if __name__ == "__main__":
156+
test_h5fmt_datetime_serialization()

0 commit comments

Comments
 (0)