Skip to content

Commit 2192ae8

Browse files
authored
NaN handling improvements (#90)
1 parent ca2d3cd commit 2192ae8

9 files changed

Lines changed: 6478 additions & 236 deletions

File tree

CHANGES.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,5 +105,6 @@ v0.2.6b1:
105105
v0.2.6:
106106
2023-05-24 -- Use token for PyPi in Makefile
107107
v0.2.7b1:
108-
2025-03-01 -- https://github.com/hapi-server/client-python/issues/76
109-
2025-03-01 -- https://github.com/hapi-server/client-python/issues/78
108+
2025-03-01 -- Unrecognized ISO 8601 time format: '00:00:00.Z' https://github.com/hapi-server/client-python/issues/76
109+
2025-03-01 -- 'infer_datetime_format' is deprecated ... https://github.com/hapi-server/client-python/issues/78
110+
2026-04-08 -- Improve NaN handling https://github.com/hapi-server/client-python/issues/88

hapiclient/hapi.py

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def hapi(*args, **kwargs):
250250
Cadence = PT1M and request for
251251
252252
start/stop=1999-11-12T00:10:00/1999-11-12T12:09:00
253-
253+
254254
Chunk size is P1D and requested time range < 1/2 of this
255255
=> Default behavior
256256
@@ -812,13 +812,32 @@ def nhapi(SERVER, DATASET, PARAMETERS, pSTART, pDELTA, i, **opts):
812812
error('Malformed response? Could not read response: {}'.format(urlcsv))
813813
if opts['method'] == '' or opts['method'] == 'pandas':
814814
# Read file into Pandas DataFrame
815+
csv_kwargs = {
816+
'sep': ',',
817+
'header': None,
818+
'encoding': 'utf-8',
819+
'skipinitialspace': True,
820+
'keep_default_na': False,
821+
'na_values': ['NaN', 'nan', 'Nan', 'naN', ' "NaN"', ' "nan"', ' "Nan"', ' "naN"', '"NaN"', '"nan"', '"Nan"', '"naN"']
822+
}
823+
"""
824+
Note that this does not handle trailing whitespace after
825+
any of the na_values (pandas.read_csv has no skiptrailingspace option).
826+
Stripping trailing whitespace would require adding something
827+
like
828+
def strip_field(x):
829+
# Strip whitespace and normalize NaN values.
830+
x = x.strip()
831+
return np.nan if x.lower() == "nan" else x
832+
833+
ncols = cols[-1][1] + 1
834+
csv_kwargs["converters"] = {i: strip_field for i in range(ncols)}
835+
"""
815836
try:
816-
df = pandas.read_csv(fnamecsv,
817-
sep=',',
818-
header=None,
819-
encoding='utf-8')
837+
df = pandas.read_csv(fnamecsv, **csv_kwargs)
820838
except:
821839
error('Malformed response? Could not read response: {}'.format(urlcsv))
840+
822841
# Allocate output N-D array (It is not possible to pass dtype=dt
823842
# as computed to pandas.read_csv; pandas dtype is different
824843
# from numpy's dtype.)
@@ -865,7 +884,14 @@ def nhapi(SERVER, DATASET, PARAMETERS, pSTART, pDELTA, i, **opts):
865884
pickle.dump(meta, f, protocol=2)
866885

867886
log('Writing %s' % fnamenpy, opts)
868-
np.save(fnamenpy, data)
887+
with warnings.catch_warnings():
888+
# Ignore warning that occurs when saving Unicode data.
889+
warnings.filterwarnings("ignore",
890+
message=r"Stored array in format 3\.0.*",
891+
category=UserWarning,
892+
module=r"numpy\.lib\.format",
893+
)
894+
np.save(fnamenpy, data)
869895

870896
meta['x_totalTime'] = time.time() - tic_totalTime
871897

@@ -963,9 +989,28 @@ def parse_missing_length(fnamecsv, dt, cols, psizes, pnames, ptypes, opts):
963989
if opts['method'] == 'numpy' or opts['method'] == 'numpynolength':
964990
# If requested method was numpy, use numpynolength method.
965991

966-
# With dtype='None', the data type is determined automatically
967-
table = np.genfromtxt(fnamecsv, dtype=None, deletechars='',
968-
delimiter=',', encoding='utf-8')
992+
ncols = cols[-1][1] + 1
993+
994+
def normalize_field(value):
995+
if isinstance(value, bytes):
996+
value = value.decode('utf-8')
997+
value = value.strip()
998+
if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
999+
value = value[1:-1].strip()
1000+
if value.lower() == 'nan':
1001+
return 'nan'
1002+
return value
1003+
1004+
converters = {i: normalize_field for i in range(ncols)}
1005+
1006+
table = np.genfromtxt(fnamecsv,
1007+
dtype=None,
1008+
deletechars='',
1009+
replace_space=' ',
1010+
delimiter=',',
1011+
encoding='utf-8',
1012+
converters=converters)
1013+
9691014
# table is a 1-D array. Each element is a row in the file.
9701015
# - If the data types are not the same for each column,
9711016
# the elements are tuples with length equal to the number
@@ -1010,8 +1055,17 @@ def parse_missing_length(fnamecsv, dt, cols, psizes, pnames, ptypes, opts):
10101055
if opts['method'] == '' or opts['method'] == 'pandas' or opts['method'] == 'pandasnolength':
10111056
# If requested method was pandas, use pandasnolength method.
10121057

1058+
# TODO: Duplicate code.
10131059
# Read file into Pandas DataFrame
1014-
df = pandas.read_csv(fnamecsv, sep=',', header=None, encoding='utf-8')
1060+
csv_kwargs = {
1061+
'sep': ',',
1062+
'header': None,
1063+
'encoding': 'utf-8',
1064+
'skipinitialspace': True,
1065+
'keep_default_na': False,
1066+
'na_values': ['NaN', 'nan', 'Nan', 'naN', ' "NaN"', ' "nan"', ' "Nan"', ' "naN"', '"NaN"', '"nan"', '"Nan"', '"naN"']
1067+
}
1068+
df = pandas.read_csv(fnamecsv, **csv_kwargs)
10151069

10161070
# Allocate output N-D array (It is not possible to pass dtype=dt
10171071
# as computed to pandas.read_csv, so need to create new ND array.)

hapiclient/test/compare.log

Lines changed: 51 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -3,118 +3,88 @@ Dataset = dataset1; Parameter(s) = scalar; run = short. cache = False; usecache
33
___________________________________________________________
44
Method total d/l->buff parse buff
55
___________________________________________________________
6-
binary 0.1405 0.1138 0.0001
7-
csv; pandas 0.1644 0.1104 0.0393
8-
csv; pandas; no len. 0.1285 0.1107 0.0026
9-
csv; numpy 0.1448 0.1293 0.0006
10-
csv; numpy; no len. 0.1528 0.1366 0.0005
6+
binary 0.1548 0.1229 0.0004
7+
csv; pandas 0.2000 0.1644 0.0166
8+
csv; pandas; no len. 0.1888 0.1641 0.0061
9+
csv; numpy 0.1809 0.1590 0.0017
10+
csv; numpy; no len. 0.1929 0.1725 0.0020
1111

1212
Dataset = dataset1; Parameter(s) = scalar,vector; run = short. cache = False; usecache = False
1313
___________________________________________________________
1414
Method total d/l->buff parse buff
1515
___________________________________________________________
16-
binary 0.1406 0.1120 0.0001
17-
csv; pandas 0.1477 0.1280 0.0042
18-
csv; pandas; no len. 0.1406 0.1207 0.0032
19-
csv; numpy 0.1837 0.1678 0.0007
20-
csv; numpy; no len. 0.1488 0.1316 0.0008
16+
binary 0.1703 0.1293 0.0002
17+
csv; pandas 0.1936 0.1694 0.0066
18+
csv; pandas; no len. 0.1802 0.1545 0.0057
19+
csv; numpy 0.1839 0.1629 0.0012
20+
csv; numpy; no len. 0.1489 0.1258 0.0016
2121

2222
Dataset = dataset1; Parameter(s) = ; run = short. cache = False; usecache = False
2323
___________________________________________________________
2424
Method total d/l->buff parse buff
2525
___________________________________________________________
26-
binary 0.2053 0.1687 0.0004
27-
csv; pandas 0.1406 0.1138 0.0089
28-
csv; pandas; no len. 0.1630 0.1371 0.0081
29-
csv; numpy 0.1675 0.1461 0.0042
30-
csv; numpy; no len. 0.1487 0.1238 0.0069
26+
binary 0.2016 0.1568 0.0013
27+
csv; pandas 0.2054 0.1538 0.0274
28+
csv; pandas; no len. 0.1879 0.1448 0.0257
29+
csv; numpy 0.1544 0.1323 0.0063
30+
csv; numpy; no len. 0.1882 0.1580 0.0101
3131

3232
Dataset = dataset1; Parameter(s) = scalar; run = short. cache = True; usecache = False
3333
_____________________________________________________________
3434
Method total d/l->file read & parse file
3535
_____________________________________________________________
36-
binary 0.2047 0.1489 0.0109
37-
csv; pandas 0.1789 0.1320 0.0142
38-
csv; pandas; no len. 0.2006 0.1373 0.0066
39-
csv; numpy 0.1893 0.1298 0.0045
40-
csv; numpy; no len. 0.1836 0.1260 0.0053
36+
binary 0.1788 0.1163 0.0110
37+
csv; pandas 0.1874 0.0966 0.0156
38+
csv; pandas; no len. 0.1971 0.1236 0.0097
39+
csv; numpy 0.1892 0.1209 0.0061
40+
csv; numpy; no len. 0.1884 0.1235 0.0051
4141

4242
Dataset = dataset1; Parameter(s) = scalar,vector; run = short. cache = True; usecache = False
4343
_____________________________________________________________
4444
Method total d/l->file read & parse file
4545
_____________________________________________________________
46-
binary 0.1687 0.1163 0.0117
47-
csv; pandas 0.1722 0.1198 0.0143
48-
csv; pandas; no len. 0.2083 0.1543 0.0073
49-
csv; numpy 0.1875 0.1297 0.0057
50-
csv; numpy; no len. 0.1960 0.1360 0.0058
46+
binary 0.2295 0.1647 0.0102
47+
csv; pandas 0.1900 0.0980 0.0170
48+
csv; pandas; no len. 0.2089 0.1368 0.0097
49+
csv; numpy 0.1739 0.1215 0.0071
50+
csv; numpy; no len. 0.1863 0.1193 0.0058
5151

5252
Dataset = dataset1; Parameter(s) = ; run = short. cache = True; usecache = False
5353
_____________________________________________________________
5454
Method total d/l->file read & parse file
5555
_____________________________________________________________
56-
binary 0.1738 0.1211 0.0087
57-
csv; pandas 0.1728 0.1248 0.0148
58-
csv; pandas; no len. 0.2127 0.1561 0.0131
59-
csv; numpy 0.1439 0.0850 0.0103
60-
csv; numpy; no len. 0.1147 0.0782 0.0110
56+
binary 0.2398 0.1726 0.0102
57+
csv; pandas 0.2350 0.1312 0.0378
58+
csv; pandas; no len. 0.2157 0.1226 0.0293
59+
csv; numpy 0.2084 0.1470 0.0114
60+
csv; numpy; no len. 0.1379 0.0799 0.0135
6161

62-
Dataset = dataset1-Aα☃; Parameter(s) = Time; run = short. cache = False; usecache = False
62+
Dataset = dataset1; Parameter(s) = scalar,vector,spectra; run = long. cache = False; usecache = False
6363
___________________________________________________________
6464
Method total d/l->buff parse buff
6565
___________________________________________________________
66-
binary 0.1639 0.1324 0.0001
67-
csv; pandas 0.1486 0.1280 0.0027
68-
csv; pandas; no len. 0.1335 0.1132 0.0027
69-
csv; numpy 0.1447 0.1268 0.0004
70-
csv; numpy; no len. 0.1046 0.0866 0.0006
66+
binary 4.1120 4.0676 0.0107
67+
csv; pandas 1.6467 1.4163 0.2102
68+
csv; pandas; no len. 1.8892 1.6304 0.2431
69+
csv; numpy 2.2088 1.5670 0.6262
70+
csv; numpy; no len. 3.6609 1.4510 2.1950
7171

72-
Dataset = dataset1-Aα☃; Parameter(s) = unicodescalar-1-byte (A); run = short. cache = False; usecache = False
73-
___________________________________________________________
74-
Method total d/l->buff parse buff
75-
___________________________________________________________
76-
binary 0.1387 0.1048 0.0001
77-
csv; pandas 0.0882 0.0660 0.0017
78-
csv; pandas; no len. 0.0993 0.0777 0.0022
79-
csv; numpy 0.1299 0.1115 0.0003
80-
csv; numpy; no len. 0.1212 0.1044 0.0007
81-
82-
Dataset = dataset1-Aα☃; Parameter(s) = unicodescalar-2-byte (α); run = short. cache = False; usecache = False
83-
___________________________________________________________
84-
Method total d/l->buff parse buff
85-
___________________________________________________________
86-
binary 0.1160 0.0692 0.0001
87-
csv; pandas 0.1288 0.1082 0.0021
88-
csv; pandas; no len. 0.1348 0.1153 0.0021
89-
csv; numpy 0.1344 0.1175 0.0003
90-
csv; numpy; no len. 0.2600 0.2430 0.0007
91-
92-
Dataset = dataset1-Aα☃; Parameter(s) = unicodescalar-3-byte (☃); run = short. cache = False; usecache = False
93-
___________________________________________________________
94-
Method total d/l->buff parse buff
95-
___________________________________________________________
96-
binary 0.1149 0.0827 0.0001
97-
csv; pandas 0.1327 0.1142 0.0021
98-
csv; pandas; no len. 0.0873 0.0678 0.0020
99-
csv; numpy 0.1518 0.1332 0.0005
100-
csv; numpy; no len. 0.0924 0.0728 0.0008
101-
102-
Dataset = dataset1-Aα☃; Parameter(s) = unicodescalar-4-byte (👍); run = short. cache = False; usecache = False
103-
___________________________________________________________
104-
Method total d/l->buff parse buff
105-
___________________________________________________________
106-
binary 0.1154 0.0808 0.0001
107-
csv; pandas 0.0945 0.0724 0.0022
108-
csv; pandas; no len. 0.1325 0.1134 0.0019
109-
csv; numpy 0.1116 0.0938 0.0003
110-
csv; numpy; no len. 0.1330 0.1148 0.0008
72+
Dataset = dataset1; Parameter(s) = scalar,vector,spectra; run = long. cache = True; usecache = False
73+
_____________________________________________________________
74+
Method total d/l->file read & parse file
75+
_____________________________________________________________
76+
binary 3.6810 3.6196 0.0259
77+
csv; pandas 1.8802 1.6261 0.2286
78+
csv; pandas; no len. 1.6921 1.4040 0.2675
79+
csv; numpy 2.3263 1.6836 0.6233
80+
csv; numpy; no len. 3.5179 1.3522 2.1474
11181

112-
Dataset = dataset1-Aα☃; Parameter(s) = unicodevector (A;α;☃;👍); run = short. cache = False; usecache = False
82+
Dataset = dataset1; Parameter(s) = scalar,vector,spectra; run = long. cache = False; usecache = True
11383
___________________________________________________________
11484
Method total d/l->buff parse buff
11585
___________________________________________________________
116-
binary 0.1416 0.1073 0.0002
117-
csv; pandas 0.1306 0.1102 0.0027
118-
csv; pandas; no len. 0.0904 0.0697 0.0024
119-
csv; numpy 0.0921 0.0738 0.0005
120-
csv; numpy; no len. 0.0990 0.0811 0.0008
86+
binary 0.0192 1.3522 2.1474
87+
csv; pandas 0.0019 1.3522 2.1474
88+
csv; pandas; no len. 0.0023 1.3522 2.1474
89+
csv; numpy 0.0021 1.3522 2.1474
90+
csv; numpy; no len. 0.0019 1.3522 2.1474

hapiclient/test/compare.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from hapiclient import hapi
66

7-
debug = False
7+
debug = True
88

99
def comparisonOK(a, b, nolength=False, a_name="First", b_name="Second"):
1010

@@ -56,9 +56,21 @@ def comparisonOK(a, b, nolength=False, a_name="First", b_name="Second"):
5656
def equal(a, b):
5757
allequal = True
5858
for name in a.dtype.names:
59-
if not np.array_equal(a[name], b[name]):
59+
if np.issubdtype(a[name].dtype, np.double) or np.issubdtype(a[name].dtype, np.floating):
60+
try:
61+
np.testing.assert_array_equal(a[name], b[name])
62+
except AssertionError:
63+
allequal = False
64+
# Before NumPy 1.19, only assert_array_equal treats NaNs as equal (np.array_equal gained equal_nan in 1.19).
65+
#ok = np.array_equal(a[name], b[name], equal_nan=True)
66+
else:
67+
ok = np.array_equal(a[name], b[name])
68+
if not ok:
6069
allequal = False
61-
if debug: print(name + ' values differ.')
70+
if debug:
71+
print(name + ' values differ.')
72+
print(a[name])
73+
print(b[name])
6274

6375
return allequal
6476

0 commit comments

Comments
 (0)