Skip to content

Commit acb59a0

Browse files
authored
Merge pull request #3 from ahamptonTIA/20240320
v2.0.1
2 parents 023d8d2 + 3843bb5 commit acb59a0

File tree

1 file changed

+41
-11
lines changed

1 file changed

+41
-11
lines changed

src/schema_validata.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,29 @@ def get_spreadsheet_metadata(file_path):
277277

278278
# ----------------------------------------------------------------------------------
279279

280+
def is_numeric_type(value):
    """
    Report whether a value is a numeric scalar.

    A value counts as numeric when it is a built-in Python number
    (int, float or complex) or when NumPy classifies its type under
    np.number.

    Parameters:
    ----------
    value: The value to inspect.
    Returns:
    -------
    bool: True when the value is numeric, otherwise False.
    """
    # Built-in numbers are accepted directly; bool passes here as well
    # because it is a subclass of int.
    builtin_match = isinstance(value, (int, float, complex))
    # Only consult NumPy's dtype hierarchy when the built-in check fails.
    return bool(builtin_match or np.issubdtype(type(value), np.number))
300+
301+
# ----------------------------------------------------------------------------------
302+
280303
def downcast_ints(value):
281304
"""
282305
Downcast a numeric value to an integer if it is equal to
@@ -1214,7 +1237,7 @@ def series_hasNull(series,
12141237

12151238
#----------------------------------------------------------------------------------
12161239

1217-
def get_numeric_range(series,
1240+
def get_numeric_range(pd_series,
12181241
attribute,
12191242
na_val=None
12201243
):
@@ -1223,7 +1246,7 @@ def get_numeric_range(series,
12231246
numerical and non-numerical cases.
12241247
12251248
Parameters:
1226-
series (pd.Series):
1249+
pd_series (pd.Series):
12271250
The Pandas Series to process.
12281251
attribute (str):
12291252
The desired statistical attribute, either 'min' or 'max'.
@@ -1238,12 +1261,17 @@ def get_numeric_range(series,
12381261
value as an integer if possible; otherwise, returns it as a float. If the
12391262
Series is empty or non-numeric, returns (na_val).
12401263
"""
1241-
_s = series.dropna()
1264+
# Check for integers or float
1265+
_s = pd_series.replace(r'^\s+$', pd.NA, regex=True)
1266+
_s.fillna(pd.NA)
12421267
try:
12431268
_s = pd.to_numeric(_s)
1269+
_s.fillna(pd.NA)
12441270
except:
12451271
pass
1246-
1272+
1273+
_s = _s.dropna()
1274+
12471275
if not pd.api.types.is_numeric_dtype(_s):
12481276
return na_val # Return `na_val` for non-numeric Series
12491277

@@ -1817,9 +1845,9 @@ def schema_validate_range(attribute,
18171845
"""
18181846

18191847
# Check if the expected range is a numeric value
1820-
if isinstance(p_errors[attribute]['expected'], (int, float)):
1848+
if is_numeric_type(p_errors[attribute]['expected']):
18211849
# Check if the observed value is also a numeric value
1822-
if isinstance(p_errors[attribute]['observed'], (int, float)):
1850+
if is_numeric_type(p_errors[attribute]['observed']):
18231851
exp_val = p_errors[attribute]['expected']
18241852
obs_val = p_errors[attribute]['observed']
18251853

@@ -2290,9 +2318,11 @@ def value_errors_out_of_range(df,
22902318
results = []
22912319

22922320
# Check for integers or float
2293-
numeric_column = df[column_name].notna()
2321+
numeric_column = df[column_name].replace(r'^\s+$', pd.NA, regex=True)
2322+
numeric_column.fillna(pd.NA)
22942323
try:
22952324
numeric_column = pd.to_numeric(numeric_column)
2325+
numeric_column.fillna(pd.NA)
22962326
except:
22972327
pass
22982328

@@ -2463,20 +2493,20 @@ def get_value_errors(dataset_path,
24632493
)
24642494
if 'range_max' in flagged_errs \
24652495
and 'range_max' not in ignore_errors:
2466-
max_len = errors['range_max']['expected']
2496+
rng_max = errors['range_max']['expected']
24672497
sheet_v_errors.append(
24682498
value_errors_out_of_range(df, col,
24692499
test_type='max',
2470-
value=max_len,
2500+
value=rng_max,
24712501
unique_column=unique_column)
24722502
)
24732503
if 'range_min' in flagged_errs \
24742504
and 'range_min' not in ignore_errors:
2475-
min_len = errors['range_min']['expected']
2505+
rng_min = errors['range_min']['expected']
24762506
sheet_v_errors.append(
24772507
value_errors_out_of_range(df, col,
24782508
test_type='min',
2479-
value=min_len,
2509+
value=rng_min,
24802510
unique_column=unique_column)
24812511
)
24822512
if 'allowed_value_list' in flagged_errs \

0 commit comments

Comments
 (0)