3 changes: 1 addition & 2 deletions .github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [ '3.8' ]
        python-version: [ '3.11' ]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
@@ -22,7 +22,6 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install "numpy<1.19.0"
          pip install -r test_requirements.txt
          pip install pytest-cov
      - name: Test with pytest
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ aodntools/_version.py

# PyCharm settings
.idea
/build
/.eggs
38 changes: 20 additions & 18 deletions aodntools/ncwriter/schema.py
@@ -1,55 +1,57 @@
"""This module holds schema definitions for validating the various :py:class:`dicts` that make up parts of a
template, and also the helper functions necessary to validate an object against their respective schema.
"""

import json

import numpy as np
from jsonschema import validators, Draft4Validator, FormatChecker, ValidationError
from pkg_resources import resource_filename

# helper function that will later be used to tell the schema validator how to validate objects of type "array"
def is_array(checker, instance):
    return isinstance(instance, (list, np.ndarray))

# Create a new validator class (based on Draft4Validator) to allow templates to use
# * Python types or numpy dtypes to specify variable data types; and
# * numpy arrays to specify variable data.
TemplateValidator = validators.create(meta_schema=Draft4Validator.META_SCHEMA,
                                      validators=Draft4Validator.VALIDATORS)
format_checker = FormatChecker()
# Extend the default type checker by redefining "array"
# whenever a schema expects a value of type "array", it will now use the is_array function to check if the value is acceptable.
custom_type_checker = Draft4Validator.TYPE_CHECKER.redefine("array", is_array)

# Create a custom validator that uses the new type checker.
# any validation performed with CustomValidator will use the custom array checker
CustomValidator = validators.extend(Draft4Validator, type_checker=custom_type_checker)
format_checker = FormatChecker()

# Define a custom format checker
# called when a JSON schema specifies that a value should have the format "datatype"
@format_checker.checks('datatype')
def is_python_datatype(value):
    """Return whether the given value is a valid data type specification for a NetCDF variable"""
    if isinstance(value, np.dtype):
        return True
    if isinstance(value, type):
        return issubclass(value, np.number)

    return False


TYPES = {'array': (list, np.ndarray)}

# Load JSON schema file
TEMPLATE_SCHEMA_JSON = resource_filename(__name__, 'template_schema.json')
with open(TEMPLATE_SCHEMA_JSON) as f:
    TEMPLATE_SCHEMA = json.load(f)
TemplateValidator.check_schema(TEMPLATE_SCHEMA)

template_validator = TemplateValidator(TEMPLATE_SCHEMA, types=TYPES, format_checker=format_checker)
# Use the custom validator to check it is valid according to Draft 4 rules
CustomValidator.check_schema(TEMPLATE_SCHEMA)

# ready-to-use validator that applies both custom type and format checks
template_validator = CustomValidator(TEMPLATE_SCHEMA, format_checker=format_checker)


# Validation checks
def validate_template(t):
    template_validator.validate(t)


def validate_dimensions(d):
    validate_template({'_dimensions': d})


def validate_variables(v):
    validate_template({'_variables': v})



def validate_global_attributes(a):
    if hasattr(a, 'keys'):
        special = [k for k in a.keys() if k.startswith('_')]
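
Note for reviewers: a quick, self-contained sketch of what the redefined checkers accept (illustrative only; it just exercises the helper functions defined above):

    import numpy as np
    from aodntools.ncwriter import schema

    # The redefined "array" type accepts plain lists and numpy arrays alike.
    assert schema.is_array(None, [1, 2, 3])
    assert schema.is_array(None, np.array([1, 2, 3]))

    # The "datatype" format accepts numpy dtypes and numpy scalar types, but not arbitrary Python types.
    assert schema.is_python_datatype(np.dtype('float32'))
    assert schema.is_python_datatype(np.float64)
    assert not schema.is_python_datatype(str)

    # template_validator.validate(template) still raises jsonschema.ValidationError on failure,
    # but numpy arrays and numpy dtypes are now valid wherever the schema allows them.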
2 changes: 1 addition & 1 deletion aodntools/ncwriter/template.py
@@ -298,7 +298,7 @@ def create_variables(self, **kwargs):

        # variable attributes to convert to the same type as the variable
        # datatype
        varattrs_to_convert_to_datatype = ['valid_min', 'valid_max', 'valid_range']
        varattrs_to_convert_to_datatype = ['valid_min', 'valid_max', 'valid_range', 'flag_values']

        for varname, varattr in self.variables.items():
            if not varattr['_dimensions']: # no kwargs in createVariable
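
Note: the practical effect of adding 'flag_values' here is that QC flag attributes end up with the same data type as their variable, which CF compliance checkers expect. The snippet below only illustrates the idea with hypothetical values; the actual conversion code lives in create_variables and is not shown in this diff:

    import numpy as np

    # Hypothetical flag attribute for a QC variable declared with an int8 datatype.
    flag_values = [0, 1, 2, 3, 4]
    converted = np.array(flag_values, dtype=np.int8)
    print(converted.dtype)  # int8, matching the variable's datatype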
10 changes: 7 additions & 3 deletions aodntools/timeseries_products/common.py
@@ -2,6 +2,7 @@
from datetime import datetime, timezone

import numpy as np
import xarray as xr

# Common date/time format strings
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
@@ -179,7 +180,7 @@ def in_water_index(nc):
    """
    time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
    time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
    TIME = nc['TIME'][:]
    TIME = nc['TIME'].values
    return (TIME >= time_deployment_start) & (TIME <= time_deployment_end)

def in_water(nc):
@@ -189,8 +190,11 @@ def in_water(nc):
    :param nc: xarray dataset
    :return: xarray dataset
    """
    return nc.where(in_water_index(nc), drop=True)

    condition = in_water_index(nc) # NumPy boolean array
    # Get the integer indices where condition is True.
    indices = np.nonzero(condition)[0]
    # Use positional indexing to select the TIME entries that satisfy the condition.
    return nc.isel(TIME=indices)

def current_utc_timestamp(format=TIMESTAMP_FORMAT):
    return datetime.now(timezone.utc).strftime(format)
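
Note: one likely motivation for replacing where(..., drop=True) with positional selection is that where() inserts NaNs and can silently promote integer or flag variables to float, while isel() keeps dtypes intact. A toy comparison with made-up data:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {'TEMP': ('TIME', [10.0, 11.0, 12.0, 13.0])},
        coords={'TIME': np.array(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04'],
                                 dtype='datetime64[ns]')},
    )
    keep = np.array([False, True, True, False])  # stand-in for in_water_index(ds)

    # Old approach: boolean masking, then dropping the fully-masked TIME entries.
    subset_where = ds.where(xr.DataArray(keep, dims='TIME'), drop=True)

    # New approach: positional selection along TIME only.
    subset_isel = ds.isel(TIME=np.nonzero(keep)[0])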
43 changes: 23 additions & 20 deletions aodntools/timeseries_products/hourly_timeseries.py
@@ -30,27 +30,27 @@ def check_files(file_list, site_code, parameter_names_accepted, input_dir=''):
    :param input_dir: base path where source files are stored
    :return: list of good files in chronological order, and a dictionary mapping each rejected file to its failed tests
    """

    file_list_dataframe = pd.DataFrame(columns=["url", "deployment_date"])
    rows = []
    error_dict = {}

    for file in file_list:
        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
            error_list = check_file(nc, site_code, parameter_names_accepted)
            if error_list:
                error_dict.update({file: error_list})
                error_dict[file] = error_list
            else:
                file_list_dataframe = file_list_dataframe.append({'url': file,
                                                                  'deployment_date': parse(nc.time_deployment_start)},
                                                                 ignore_index=True)
                rows.append({
                    'url': file,
                    'deployment_date': parse(nc.time_deployment_start)
                })

    file_list_dataframe = pd.DataFrame(rows, columns=["url", "deployment_date"])
    file_list_dataframe = file_list_dataframe.sort_values(by='deployment_date')
    file_list = file_list_dataframe['url'].to_list()
    if file_list == []:
    sorted_files = file_list_dataframe['url'].to_list()
    if not sorted_files:
        raise NoInputFilesError("no valid input files to aggregate")

    return file_list, error_dict

    return sorted_files, error_dict


def get_parameter_names(nc):
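
Note: this rewrite of check_files (and the matching df_metadata change further down) replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0; collecting plain dicts and building the frame once is the usual replacement and avoids re-copying the frame on every iteration. A standalone sketch with made-up file names:

    import pandas as pd

    rows = []
    for url, start in [('file_a.nc', '2019-01-01'), ('file_b.nc', '2018-06-15')]:
        rows.append({'url': url, 'deployment_date': pd.Timestamp(start)})

    # Build the DataFrame once, then sort chronologically, as check_files now does.
    df = pd.DataFrame(rows, columns=['url', 'deployment_date']).sort_values(by='deployment_date')
    print(df['url'].to_list())  # ['file_b.nc', 'file_a.nc']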
Expand Down Expand Up @@ -308,7 +308,7 @@ def PDresample_by_hour(df, function_dict, function_stats):
df_data = pd.DataFrame(index=pd.DatetimeIndex([]))
for variable in varnames:
ds_var = df[variable]
ds_var_resample = ds_var.resample('1H', base=0.5) # shift by half hour to centre bin on the hour
ds_var_resample = ds_var.resample('1h', offset='30min') # shift by half hour to centre bin on the hour
ds_var_mean = ds_var_resample.apply(function_dict[variable]).astype(np.float32)
df_data = pd.concat([df_data, ds_var_mean], axis=1, sort=False)
for stat_method in function_stats:
Expand Down Expand Up @@ -366,8 +366,6 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
variable_attribute_dictionary = json.load(json_file)['_variables']

df_data = pd.DataFrame()


## create empty DF with dtypes
metadata_df_types = [('source_file', str),
('instrument_id', str),
@@ -380,6 +378,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
    parameter_names_all = []
    applied_offset = []
    qc_count_all = {}
    metadata_rows = []

    for file_index, file in enumerate(files_to_aggregate):
        print(file_index)
@@ -398,13 +397,16 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
            qc_count = get_QCcount(nc_clean, qcflags)
            qc_count_all = update_QCcount(qc_count_all, qc_count)
            nc_clean = good_data_only(nc_clean, qcflags) # good quality data only
            df_metadata = df_metadata.append({'source_file': file,
                                              'instrument_id': utils.get_instrument_id(nc),
                                              'LONGITUDE': nc.LONGITUDE.squeeze().values,
                                              'LATITUDE': nc.LATITUDE.squeeze().values,
                                              'NOMINAL_DEPTH': get_nominal_depth(nc)},
                                             ignore_index=True)


            # Append a new row as a dictionary to the list.
            metadata_rows.append({
                'source_file': file,
                'instrument_id': utils.get_instrument_id(nc),
                'LONGITUDE': nc.LONGITUDE.squeeze().values,
                'LATITUDE': nc.LATITUDE.squeeze().values,
                'NOMINAL_DEPTH': get_nominal_depth(nc)
            })

            # If TIME had out-of-range values before cleaning, nc_clean would now have a CFTimeIndex, which
            # breaks the resampling further down. Here we reset it to a DatetimeIndex as suggested here:
            # https://stackoverflow.com/questions/55786995/converting-cftime-datetimejulian-to-datetime/55787899#55787899
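
Note: the exact conversion call is collapsed out of this hunk, so the sketch below is an assumption based on the linked Stack Overflow answer; it needs the cftime package installed:

    import xarray as xr

    # A toy dataset whose TIME axis is held as cftime objects, as happens when a source
    # file contains out-of-range timestamps.
    times = xr.cftime_range('2000-01-01', periods=3, freq='D')
    ds = xr.Dataset({'TEMP': ('TIME', [1.0, 2.0, 3.0])}, coords={'TIME': times})

    print(type(ds.indexes['TIME']).__name__)  # CFTimeIndex
    ds['TIME'] = ds.indexes['TIME'].to_datetimeindex()
    print(type(ds.indexes['TIME']).__name__)  # DatetimeIndex, so resample('1h', ...) works again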
@@ -421,6 +423,7 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
            df_temp['instrument_index'] = np.repeat(file_index, len(df_temp)).astype(np.int32)
            df_data = pd.concat([df_data, df_temp.reset_index()], ignore_index=True, sort=False)

    df_metadata = pd.DataFrame(metadata_rows, columns=['source_file', 'instrument_id', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'])
    df_metadata.index.rename('INSTRUMENT', inplace=True)
    df_data.index.rename('OBSERVATION', inplace=True)
    ## rename index to TIME
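
Note: for anyone reading the aggregator for the first time, each observation row carries an instrument_index pointing back into the per-instrument metadata table, which is why the frames are given the INSTRUMENT and OBSERVATION index names above. A toy sketch of that relationship (made-up values):

    import numpy as np
    import pandas as pd

    # Per-instrument metadata, one row per source file.
    df_metadata = pd.DataFrame([
        {'source_file': 'a.nc', 'NOMINAL_DEPTH': 10.0},
        {'source_file': 'b.nc', 'NOMINAL_DEPTH': 20.0},
    ])
    df_metadata.index.rename('INSTRUMENT', inplace=True)

    # Observations from both files, each tagged with the index of its instrument.
    df_data = pd.DataFrame({
        'TEMP': [14.1, 14.3, 13.8],
        'instrument_index': np.array([0, 0, 1], dtype=np.int32),
    })
    df_data.index.rename('OBSERVATION', inplace=True)

    # Look up the nominal depth for every observation via its instrument.
    depths = df_metadata.loc[df_data['instrument_index'], 'NOMINAL_DEPTH'].to_numpy()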
@@ -58,7 +58,7 @@ def append_resampled_values(nc_cell, ds, slice_start, binning_functions):
    # shift the index forward 30min to centre the bins on the hour
    df_cell.index = df_cell.index + pd.Timedelta(minutes=30)

    df_cell_1H = df_cell.resample('1H')
    df_cell_1H = df_cell.resample('1h')
    slice_end = len(df_cell_1H) + slice_start

    # set binned timestamps
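
Note: both resample changes in this PR ('1H' to '1h', and base=0.5 to offset='30min') track pandas deprecations: the base argument was removed in pandas 2.0 in favour of offset/origin, and the uppercase 'H' alias is deprecated in favour of 'h'. A quick runnable check of the bin placement with toy data:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2020-01-01 00:00', periods=13, freq='10min')
    s = pd.Series(np.arange(13.0), index=idx)

    # offset='30min' moves the bin edges to HH:30, so each one-hour bin spans
    # (HH-1):30 to HH:30 and is therefore centred on the hour HH.
    binned = s.resample('1h', offset='30min').mean()
    print(binned.head())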
7 changes: 7 additions & 0 deletions aodntools/vocab/__init__.py
@@ -0,0 +1,7 @@
from .platform_code_vocab import PlatformVocabHelper, platform_altlabels_per_preflabel, platform_type_uris_by_category

__all__ = [
    'PlatformVocabHelper',
    'platform_altlabels_per_preflabel',
    'platform_type_uris_by_category'
]
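
Note: with this __init__.py the vocab helpers become importable from the package root, e.g. (constructor arguments are not part of this diff, so none are shown):

    from aodntools.vocab import PlatformVocabHelper, platform_altlabels_per_preflabel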