diff --git a/my.pkl b/my.pkl new file mode 100644 index 0000000..d054621 Binary files /dev/null and b/my.pkl differ diff --git a/pyessv/__init__.py b/pyessv/__init__.py index 873ec5d..02d55e3 100644 --- a/pyessv/__init__.py +++ b/pyessv/__init__.py @@ -11,7 +11,7 @@ """ __title__ = 'pyessv' -__version__ = '0.7.0.0' +__version__ = '0.7.0.1' __author__ = 'ES-DOC' __license__ = 'GPL' __copyright__ = 'Copyright 2017 ES-DOC' @@ -62,18 +62,28 @@ from pyessv._governance import reset from pyessv._initializer import init +from pyessv._initializer import load_cv from pyessv._loader import load_random from pyessv._loader import load +from pyessv._loader import all_scopes from pyessv._model import Authority from pyessv._model import Collection from pyessv._model import Scope from pyessv._model import Term +from pyessv._builders import build_dataset_identifier +from pyessv._builders import build_directory +from pyessv._builders import build_filename + from pyessv._parser import parse -from pyessv._parsers import parse_dataset_identifer -from pyessv._parsers import parse_dataset_identifers +from pyessv._parsers import parse_dataset_identifier +from pyessv._parsers import parse_dataset_identifiers +from pyessv._parsers import parse_directory +from pyessv._parsers import parse_directories +from pyessv._parsers import parse_filename +from pyessv._parsers import parse_filenames from pyessv._utils.logger import log from pyessv._utils.logger import log_error diff --git a/pyessv/_builder_template.py b/pyessv/_builder_template.py new file mode 100644 index 0000000..085355b --- /dev/null +++ b/pyessv/_builder_template.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._model._builder_template.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: A vocabulary constrained template builder, e.g. a dataset identifier. + +.. 
moduleauthor:: Mark Conway-Greenslade + + +""" +from pyessv._model import Collection, Term +from pyessv._exceptions import TemplateParsingError, TemplateValueError +from pyessv._constants import BUILDER_FIELDS +from pyessv._utils.compat import basestring, str + + +class TemplateBuilder(object): + """A vocabulary template builder. + + """ + def __init__(self, template, collections, strictness, separator='.'): + """Instance constructor. + + :param str template: Identifier template. + :param tuple collections: pyessv collection identifiers. + :param int strictness: Strictness level to apply when applying name matching rules. + :param str seprarator: Separator to apply when parsing. + + """ + from pyessv._loader import load + + self.separator = separator + self.template_parts = template.split(separator) + self.template = template + self.strictness = strictness + + # Inject pyessv collections into template. + collection_idx = 0 + for idx, part in enumerate(self.template_parts): + if part == '{}': + self.template_parts[idx] = load(collections[collection_idx]) + collection_idx += 1 + + def build(self, terms, att='label', alt_name=0): + """Build template instance from a list of pyessv terms. + + :returns: Template instance string. + + """ + assert isinstance(alt_name, int), 'Invalid alternative name index' + assert att in BUILDER_FIELDS, 'Invalid name' + + # Instantiate string parts. + string_parts = list() + + # Iterate template. + for template_part in self.template_parts: + + # Append constant match. + if isinstance(template_part, basestring): + string_parts.append(template_part) + continue + + # Append term match. + collection = template_part + term = None + for term in terms: + if term.collection.namespace == collection.namespace: + break + + # Append term from associations. 
+ if not term: + for term in [association for association in t.associations for t in terms]: + if term.collection.namespace == collection.namespace: + break + + # Verify collection is found among terms. + if not term: + raise TemplateValueError('Collection not found among terms :: {}'.format(collection)) + + # Get term field. + if att == 'alternative_names': + string_parts.append(getattr(term, att)[alt_name]) + else: + string_parts.append(getattr(term, att)) + + + return self.separator.join(string_parts) \ No newline at end of file diff --git a/pyessv/_builders/__init__.py b/pyessv/_builders/__init__.py new file mode 100644 index 0000000..ca3d83f --- /dev/null +++ b/pyessv/_builders/__init__.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._builders.__init__.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Expression builders. + +.. moduleauthor:: Mark Conway-Greenslade + + +""" + +from pyessv._builders.dataset_id import build_dataset_identifier +from pyessv._builders.directory import build_directory +from pyessv._builders.filename import build_filename \ No newline at end of file diff --git a/pyessv/_builders/dataset_id.py b/pyessv/_builders/dataset_id.py new file mode 100644 index 0000000..20d9c51 --- /dev/null +++ b/pyessv/_builders/dataset_id.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._builders.dataset_id.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates building of an ESGF dataset identifier. + +.. 
moduleauthor:: Mark Conway-Greenslade + +""" +from pyessv._model.term import Term +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._factory import create_template_builder +from pyessv._utils.compat import basestring + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated builder. +_BUILDER = None + + +def build_dataset_identifier(project, terms): + """Builds a dataset identifier. + + :param str project: Project code. + :param set terms: Dataset identifier terms. + + :returns: Dataset identifier. + :rtype: str + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(terms, set), 'Invalid terms' + + global _PROJECT, _BUILDER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'dataset_id' in scope.data.keys(), 'Dataset ID parser not found' + assert 'template' in scope.data['dataset_id'].keys(), 'Dataset ID parser template not found' + assert 'collections' in scope.data['dataset_id'].keys(), 'Dataset ID parser template collections not found' + + # Get template from data scope. + _TEMPLATE = scope.data['dataset_id']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['dataset_id']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _BUILDER = create_template_builder(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1) + + # Cached project. 
+ _PROJECT = project + + for term in terms: + assert isinstance(term, Term), 'Invalid term :: {}'.format(term) + + return _BUILDER.build(terms) diff --git a/pyessv/_builders/directory.py b/pyessv/_builders/directory.py new file mode 100644 index 0000000..71f23b9 --- /dev/null +++ b/pyessv/_builders/directory.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._builders.dataset_id.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates building of an ESGF dataset identifier. + +.. moduleauthor:: Mark Conway-Greenslade + +""" +from pyessv._model.term import Term +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._factory import create_template_builder +from pyessv._utils.compat import basestring + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated builder. +_BUILDER = None + + +def build_directory(project, terms): + """Builds a directory. + + :param str project: Project code. + :param set terms: Directory terms. + + :returns: Directory string. + :rtype: str + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(terms, set), 'Invalid terms' + + global _PROJECT, _BUILDER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'directory_structure' in scope.data.keys(), 'Directory parser not found' + assert 'template' in scope.data['directory_structure'].keys(), 'Directory parser template not found' + assert 'collections' in scope.data['directory_structure'].keys(), 'Directory parser template collections not found' + + # Get template from data scope. 
+ _TEMPLATE = scope.data['directory_structure']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['directory_structure']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _BUILDER = create_template_builder(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1, separator='/') + + # Cached project. + _PROJECT = project + + for term in terms: + assert isinstance(term, Term), 'Invalid term :: {}'.format(term) + + return _BUILDER.build(terms) diff --git a/pyessv/_builders/filename.py b/pyessv/_builders/filename.py new file mode 100644 index 0000000..22f4fe9 --- /dev/null +++ b/pyessv/_builders/filename.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._builders.dataset_id.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates building of an ESGF dataset identifier. + +.. moduleauthor:: Mark Conway-Greenslade + +""" +from pyessv._model.term import Term +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._factory import create_template_builder +from pyessv._utils.compat import basestring + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated builder. +_BUILDER = None + + +def build_filename(project, terms): + """Builds a filename. + + :param str project: Project code. + :param set terms: Filename terms. + + :returns: Filename string. 
+ :rtype: str + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(terms, set), 'Invalid terms' + + global _PROJECT, _BUILDER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'filename' in scope.data.keys(), 'Filename parser not found' + assert 'template' in scope.data['filename'].keys(), 'Filename parser template not found' + assert 'collections' in scope.data['filename'].keys(), 'Filename parser template collections not found' + + # Get template from data scope. + _TEMPLATE = scope.data['filename']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['filename']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _BUILDER = create_template_builder(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1, separator='_') + + # Cached project. 
+ _PROJECT = project + + for term in terms: + assert isinstance(term, Term), 'Invalid term :: {}'.format(term) + + return _BUILDER.build(terms) diff --git a/pyessv/_codecs/json_codec/encoder.py b/pyessv/_codecs/json_codec/encoder.py index fd6a51d..cf83a0d 100644 --- a/pyessv/_codecs/json_codec/encoder.py +++ b/pyessv/_codecs/json_codec/encoder.py @@ -20,7 +20,7 @@ from pyessv._utils import convert from pyessv._utils.compat import json from pyessv._utils.compat import numeric_types -from pyessv._utils.compat import str +from pyessv._utils.compat import str, basestring diff --git a/pyessv/_constants.py b/pyessv/_constants.py index 8b04660..239e6f2 100644 --- a/pyessv/_constants.py +++ b/pyessv/_constants.py @@ -105,6 +105,15 @@ # Regular expression for validating a canonical name. REGEX_CANONICAL_NAME = r'^[a-z0-9\-]*$' +# Builder fields. +BUILDER_FIELDS = ( + 'name', + 'canonical_name', + 'raw_name', + 'alternative_names', + 'label' + ) + # Standard node fields. STANDARD_NODE_FIELDS = ( 'alternative_names', diff --git a/pyessv/_exceptions.py b/pyessv/_exceptions.py index 25865ce..7a5e8ef 100644 --- a/pyessv/_exceptions.py +++ b/pyessv/_exceptions.py @@ -35,6 +35,18 @@ def __init__(self, val): super(TemplateParsingError, self).__init__(msg) +class TemplateValueError(ValueError): + """A template parsing error raised. + + """ + def __init__(self, val): + """Object constructor. + + """ + msg = 'A template value error has occurred: {}'.format(val) + super(TemplateValueError, self).__init__(msg) + + class ValidationError(ValueError): """A validation error raised by the package validator. 
diff --git a/pyessv/_factory.py b/pyessv/_factory.py index 7afbf41..1fb9c22 100644 --- a/pyessv/_factory.py +++ b/pyessv/_factory.py @@ -23,6 +23,7 @@ from pyessv._model import Scope from pyessv._model import Term from pyessv._parser_template import TemplateParser +from pyessv._builder_template import TemplateBuilder from pyessv._utils.compat import basestring from pyessv._utils.compat import str from pyessv._utils.formatter import format_canonical_name @@ -131,7 +132,7 @@ def create_collection( :param list alternative_names: Collection of associated alternative names. :param str|tuple term_regex: Regular expression information to be applied to terms. - :returns: A vocabulary collection, e.g. insitution-id. + :returns: A vocabulary collection, e.g. institution-id. :rtype: pyessv.Collection """ @@ -197,13 +198,37 @@ def _callback(instance): ) -def create_template_parser(template, collections, strictness=PARSING_STRICTNESS_2, seperator='.'): +def create_template_builder(template, collections, strictness=PARSING_STRICTNESS_2, separator='.'): + """Instantiates, initialises & returns a template builder. + + :param str template: An expression template. + :param tuple collections: Collections that the template maps to. + :param int strictness: Strictness level to apply when applying name matching rules. + :param str separator: Separator to apply when parsing. + + :returns: A vocabulary expression builder. 
+ :rtype: pyessv.TemplateBuilder + + """ + assert isinstance(template, basestring), 'Invalid template' + assert isinstance(collections, tuple), 'Invalid collections' + assert len(template) > 0, 'Invalid template' + assert template.count('{}') > 0, 'Invalid template' + assert len(collections) > 0, 'Invalid collections' + assert template.count('{}') == len(collections), 'Invalid template: collection count mismatch' + assert strictness in PARSING_STRICTNESS_SET, 'Invalid parsing strictness: {}'.format(strictness) + assert isinstance(separator, basestring), 'Invalid separator' + + return TemplateBuilder(template, collections, strictness, separator) + + +def create_template_parser(template, collections, strictness=PARSING_STRICTNESS_2, separator='.'): """Instantiates, initialises & returns a template parser. :param str template: An expression template. :param tuple collections: Collections that the template maps to. :param int strictness: Strictness level to apply when applying name matching rules. - :param str seprarator: Seperator to apply when parsing. + :param str separator: Separator to apply when parsing. :returns: A vocabulary expression parser. 
:rtype: pyessv.TemplateParser @@ -216,9 +241,9 @@ def create_template_parser(template, collections, strictness=PARSING_STRICTNESS_ assert len(collections) > 0, 'Invalid collections' assert template.count('{}') == len(collections), 'Invalid template: collection count mismatch' assert strictness in PARSING_STRICTNESS_SET, 'Invalid parsing strictness: {}'.format(strictness) - assert isinstance(seperator, basestring), 'Invalid seperator' + assert isinstance(separator, basestring), 'Invalid separator' - return TemplateParser(template, collections, strictness, seperator) + return TemplateParser(template, collections, strictness, separator) def _create_node( diff --git a/pyessv/_initializer.py b/pyessv/_initializer.py index 72365a7..f75e842 100644 --- a/pyessv/_initializer.py +++ b/pyessv/_initializer.py @@ -22,67 +22,63 @@ from pyessv._utils import logger - def init(): - """Library initializer. - - """ - # Verify archive folder exists. - if not os.path.isdir(DIR_ARCHIVE): - raise EnvironmentError('{} directory does not exists'.format(DIR_ARCHIVE)) - - # Load set of authorities from file system. - authorities = _load_authorities() - - # Mixin pseudo-constants. - _mixin_constants(authorities) - - # Set scope level accessor functions. - _mixin_scope_accessors(authorities) - - -def _load_authorities(): - """Loads vocabulary authorities from archive. - - """ - logger.log('Loading vocabularies from {}:'.format(DIR_ARCHIVE)) - authorities = [] - for authority in read(): - authorities.append(authority) - logger.log('... loaded: {}'.format(authority)) - cache(authority) - - return authorities - + """Library initializer. + + """ + # Verify archive folder exists. 
+ if not os.path.isdir(DIR_ARCHIVE): + raise EnvironmentError('{} directory does not exists'.format(DIR_ARCHIVE)) + + +def load_cv(authority=None, scope=None): + authorities = [] + if authority: + if scope: + authority = read(authority=authority, scope=scope) + else: + authority = read(authority=authority) + authorities.append(authority) + cache(authority) + else: + for authority in read(): + authorities.append(authority) + cache(authority) + + # Mixin pseudo-constants. + _mixin_constants(authorities) + + # Set scope level accessor functions. + _mixin_scope_accessors(authorities) def _mixin_constants(authorities): - """Mixes in authorities as pseudo-constants to pyessv. + """Mixes in authorities as pseudo-constants to pyessv. - """ - for authority in authorities: - attr_name = authority.canonical_name.replace('-', '_').upper() - setattr(pyessv, attr_name, authority) + """ + for authority in authorities: + attr_name = authority.canonical_name.replace('-', '_').upper() + setattr(pyessv, attr_name, authority) def _mixin_scope_accessors(authorities): - """Mixes in scope level vocab accessors functions. - - """ - # In pyessv._accessors sub-package are modules that expose helper functions for accessing vocabularies, - # here we are ensuring that those functions are easily accessed. - targets = [] - for authority in authorities: - for scope in authority: - try: - accessor = ACCESSORS[authority.canonical_name][scope.canonical_name] - except KeyError: - pass - else: - targets.append((scope, accessor)) - - # Mixin accessor functions with scope. - for scope, accessor in targets: - funcs = [i for i in inspect.getmembers(accessor) - if inspect.isfunction(i[1]) and not i[0].startswith('_')] - for name, func in funcs: - setattr(scope, name, func) + """Mixes in scope level vocab accessors functions. + + """ + # In pyessv._accessors sub-package are modules that expose helper functions for accessing vocabularies, + # here we are ensuring that those functions are easily accessed. 
+ targets = [] + for authority in authorities: + for scope in authority: + try: + accessor = ACCESSORS[authority.canonical_name][scope.canonical_name] + except KeyError: + pass + else: + targets.append((scope, accessor)) + + # Mixin accessor functions with scope. + for scope, accessor in targets: + funcs = [i for i in inspect.getmembers(accessor) + if inspect.isfunction(i[1]) and not i[0].startswith('_')] + for name, func in funcs: + setattr(scope, name, func) diff --git a/pyessv/_io_manager.py b/pyessv/_io_manager.py index 4ec11ba..ee32985 100644 --- a/pyessv/_io_manager.py +++ b/pyessv/_io_manager.py @@ -71,20 +71,28 @@ def delete(target): pass -def read(archive_dir=DIR_ARCHIVE): +def read(authority=None, scope=None, archive_dir=DIR_ARCHIVE): """Reads vocabularies from archive folder (~/.esdoc/pyessv-archive) upon file system. :returns: List of vocabulary authorities loaded from archive folder. :rtype: list """ - return [_read_authority(i) for i in glob.glob('{}/*'.format(archive_dir)) if isdir(i)] + if authority: + assert '{}/{}'.format(archive_dir, authority), 'Invalid authority' + if scope: + assert '{}/{}/{}'.format(archive_dir, authority, scope), 'Invalid scope' + return _read_authority('{}/{}'.format(archive_dir, authority), scope) + return _read_authority('{}/{}'.format(archive_dir, authority)) + else: + return [_read_authority(i) for i in glob.glob('{}/*'.format(archive_dir)) if isdir(i)] -def _read_authority(dpath): +def _read_authority(dpath, scope=None): """Reads authority CV data from file system. :param str dpath: Path to a directory to which an authority's vocabularies have been written. + :param str scope: Select a scope's vocabularies to load (default loads all scopes). :returns: Authority vocabulary data. :rtype: pyessv.Authority @@ -99,11 +107,18 @@ def _read_authority(dpath): # Read terms. 
term_cache = {} - for scope in authority: + try: + scope = [s for s in authority if s.name == scope][0] for collection in scope: for term in _read_terms(dpath, scope, collection, term_cache): term.collection = collection collection.terms.append(term) + except IndexError: + for scope in authority: + for collection in scope: + for term in _read_terms(dpath, scope, collection, term_cache): + term.collection = collection + collection.terms.append(term) # Set inter-term hierarchies. for term in term_cache.values(): diff --git a/pyessv/_loader.py b/pyessv/_loader.py index e3b622c..4be953e 100644 --- a/pyessv/_loader.py +++ b/pyessv/_loader.py @@ -26,6 +26,16 @@ from pyessv._utils.formatter import format_string +def all_scopes(): + """Returns all scopes. + + """ + scopes = set() + for authority in load(): + for scope in authority.scopes: + scopes.add(scope) + return scopes + def load(identifier=None, verbose=True): """Loads a vocabulary node from archive. diff --git a/pyessv/_model/collection.py b/pyessv/_model/collection.py index 4d52a80..87124d4 100644 --- a/pyessv/_model/collection.py +++ b/pyessv/_model/collection.py @@ -65,8 +65,7 @@ def is_virtual(self): """Gets flag indicating whether the collection is a virtual one (i.e. simply constrained by a reg-ex). """ - return len(self) == 0 - + return self.term_regex is not None def get_validators(self): """Returns set of validators. 
diff --git a/pyessv/_model/node.py b/pyessv/_model/node.py index 68dd808..cea06c3 100644 --- a/pyessv/_model/node.py +++ b/pyessv/_model/node.py @@ -16,7 +16,7 @@ import arrow from pyessv._constants import NODE_TYPEKEY_SET -from pyessv._utils.compat import str +from pyessv._utils.compat import str, basestring from pyessv._utils.formatter import format_io_name from pyessv._utils.validation import assert_iterable from pyessv._utils.validation import assert_string @@ -61,6 +61,17 @@ def __getattr__(self, name): except KeyError: raise AttributeError('{} unknown attribute'.format(name)) + def __getstate__(self): + """Serializing method used by Pickle. + + """ + return self.__dict__ + + def __setstate__(self, d): + """Unserializing used by Pickle. + + """ + self.__dict__.update(d) @property def name(self): diff --git a/pyessv/_parser.py b/pyessv/_parser.py index 5221f8c..635a251 100644 --- a/pyessv/_parser.py +++ b/pyessv/_parser.py @@ -26,7 +26,7 @@ def parse( namespace, strictness=PARSING_STRICTNESS_2, - field='canonical_name' + field=None ): """Parses a namespace within a vocabulary hierachy. @@ -36,7 +36,8 @@ def parse( """ assert strictness in PARSING_STRICTNESS_SET, 'Invalid parsing strictness' - assert field in PARSING_NODE_FIELDS, 'Invalid field' + if field: + assert field in PARSING_NODE_FIELDS, 'Invalid field' # Set namespace ns = str(namespace).strip().split(':') @@ -63,8 +64,10 @@ def parse( node = load(namespace) target.set_node(node) - return getattr(target.node, field) - + if field: + return getattr(target.node, field) + else: + return target.node class _NodeInfo(object): """Information about a node whose name is being parsed. 
diff --git a/pyessv/_parser_template.py b/pyessv/_parser_template.py index 69a46ed..ccc73b7 100644 --- a/pyessv/_parser_template.py +++ b/pyessv/_parser_template.py @@ -11,30 +11,30 @@ """ -from pyessv._exceptions import TemplateParsingError -from pyessv._model.collection import Collection +import pyessv +from pyessv._exceptions import TemplateParsingError, TemplateValueError from pyessv._model.term import Term -from pyessv._utils.compat import basestring - +from pyessv._utils.compat import basestring class TemplateParser(object): """A vocabulary template parser. """ - def __init__(self, template, collections, strictness, seperator='.'): + + def __init__(self, template, collections, strictness, separator='.'): """Instance constructor. :param str template: Identifier template. - :param list collections: pyessv collection identifiers. + :param tuple collections: pyessv collection identifiers. :param int strictness: Strictness level to apply when applying name matching rules. - :param str seprarator: Seperator to apply when parsing. + :param str seprarator: Separator to apply when parsing. """ from pyessv._loader import load - self.seperator = seperator - self.template_parts = template.split(seperator) + self.separator = separator + self.template_parts = template.split(separator) self.template = template self.strictness = strictness @@ -45,7 +45,6 @@ def __init__(self, template, collections, strictness, seperator='.'): self.template_parts[idx] = load(collections[collection_idx]) collection_idx += 1 - def parse(self, val): """Parses a val against a template. @@ -53,7 +52,7 @@ def parse(self, val): """ # Verify that number of parts is equal. - parts = val.split(self.seperator) + parts = val.split(self.separator) if len(parts) != len(self.template_parts): raise TemplateParsingError('Number of elements is invalid: {}: is {}, expected {}'.format(val, len(parts), len(self.template_parts))) @@ -63,15 +62,20 @@ def parse(self, val): # Verify constant match. 
if isinstance(template_part, basestring): if template_part != name: - raise TemplateParsingError('{} :: {}'.format(name, val)) + raise TemplateValueError('{} :: {}'.format(name, val)) continue # Verify collection match. collection = template_part term = collection.is_matched(name, self.strictness) if term == False: - raise TemplateParsingError('vocab={} :: strictness={} :: value={}'.format(collection, self.strictness, val)) + raise TemplateValueError( + 'vocab={} :: strictness={} :: value={}'.format(collection, self.strictness, name)) + + # Create a virtual term if needed. if isinstance(term, Term): terms.add(term) + else: + terms.add(pyessv.create_term(collection, name)) return terms diff --git a/pyessv/_parsers/__init__.py b/pyessv/_parsers/__init__.py index ccca2f0..5402385 100644 --- a/pyessv/_parsers/__init__.py +++ b/pyessv/_parsers/__init__.py @@ -11,58 +11,10 @@ """ -import collections -from pyessv._parsers.cmip5_dataset_id import parse as parse_cmip5_dataset_id -from pyessv._parsers.cmip6_dataset_id import parse as parse_cmip6_dataset_id -from pyessv._parsers.cordex_dataset_id import parse as parse_cordex_dataset_id -from pyessv._parsers.input4mips_dataset_id import parse as parse_input4mips_dataset_id -from pyessv._utils.compat import basestring - - - -# Map of dataset id parsers to projects. -_DATASET_ID_PARSERS = { - 'cmip5': parse_cmip5_dataset_id, - 'cmip6': parse_cmip6_dataset_id, - 'cordex': parse_cordex_dataset_id, - 'input4mips': parse_input4mips_dataset_id -} - - -def parse_dataset_identifers(project, identifiers): - """Parses a collection of dataset identifiers. - - :param str project: Project code. - :param iterable identifiers: Dataset identifiers. - - :returns: Facets extracted from the identifiers. 
- :rtype: list - - """ - assert isinstance(identifiers, collections.Iterable), 'Invalid identifiers' - - result = set() - for identifier in identifiers: - result = result.union(parse_dataset_identifer(project, identifier)) - - return result - - -def parse_dataset_identifer(project, identifier): - """Parses a dataset identifier. - - :param str project: Project code. - :param str identifier: Dataset identifier. - - :returns: Set of terms extracted from the identifier. - :rtype: set - - """ - assert isinstance(project, basestring), 'Invalid project' - assert project in _DATASET_ID_PARSERS, 'Unsupported project' - assert isinstance(identifier, basestring), 'Invalid identifier' - - parser = _DATASET_ID_PARSERS[project] - - return parser(identifier) +from pyessv._parsers.dataset_id import parse_dataset_identifier +from pyessv._parsers.dataset_id import parse_dataset_identifiers +from pyessv._parsers.directory import parse_directories +from pyessv._parsers.directory import parse_directory +from pyessv._parsers.filename import parse_filename +from pyessv._parsers.filename import parse_filenames diff --git a/pyessv/_parsers/cmip5_dataset_id.py b/pyessv/_parsers/cmip5_dataset_id.py deleted file mode 100644 index 74ad009..0000000 --- a/pyessv/_parsers/cmip5_dataset_id.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -.. module:: pyessv._parsers.cmip6_dataset_id.py - :copyright: Copyright "December 01, 2016", IPSL - :license: GPL/CeCIL - :platform: Unix, Windows - :synopsis: Encapsulates parsing of a CMIP5 dataset identifier. - -.. moduleauthor:: Mark Conway-Greenslade - -""" -from pyessv._factory import create_template_parser -from pyessv._constants import PARSING_STRICTNESS_1 - - - -# Template extracted from esgf ini file (for reference purpose only). -_INI_PATTERN = 'cmip5.%(product)s.%(institute)s.%(model)s.%(experiment)s.%(time_frequency)s.%(realm)s.%(cmor_table)s.%(ensemble)s' - -# Test identifier (for reference purpose only). 
-_TEST_IDENTIFIER = 'cmip5.output2.IPSL.IPSL-CM5A-LR.historicalMisc.mon.ocean.Omon.r2i1p1.v20150504' - -# Template that identifiers must conform to. -_TEMPLATE = 'cmip5.{}.{}.{}.{}.{}.{}.{}.{}' - -# Collections injected into template. -_COLLECTIONS = ( - 'wcrp:cmip5:product', - 'wcrp:cmip5:institute', - 'wcrp:cmip5:model', - 'wcrp:cmip5:experiment', - 'wcrp:cmip5:time-frequency', - 'wcrp:cmip5:realm', - 'wcrp:cmip5:cmor-table', - 'wcrp:cmip5:ensemble' - ) - -# Instantiated & cached parser instance. -_PARSER = None - - -def parse(identifier): - """Parses a CMIP6 dataset identifier. - - """ - # Instantiate parser JIT. - global _PARSER - if _PARSER is None: - _PARSER = create_template_parser(_TEMPLATE, _COLLECTIONS, PARSING_STRICTNESS_1) - - # Strip version suffix. - if '#' in identifier: - identifier = identifier.split('#')[0] - - return _PARSER.parse(identifier.split('#')[0]) diff --git a/pyessv/_parsers/cmip6_dataset_id.py b/pyessv/_parsers/cmip6_dataset_id.py deleted file mode 100644 index e7cd2c0..0000000 --- a/pyessv/_parsers/cmip6_dataset_id.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -.. module:: pyessv._parsers.cmip6_dataset_id.py - :copyright: Copyright "December 01, 2016", IPSL - :license: GPL/CeCIL - :platform: Unix, Windows - :synopsis: Encapsulates parsing of a CMIP6 dataset identifier. - -.. moduleauthor:: Mark Conway-Greenslade - -""" -from pyessv._factory import create_template_parser -from pyessv._constants import PARSING_STRICTNESS_1 - - - -# Template extracted from esgf ini file (for reference purpose only). -_INI_PATTERN = 'CMIP6.%(activity_id)s.%(institution_id)s.%(source_id)s.%(experiment_id)s.%(member_id)s.%(table_id)s.%(variable_id)s.%(grid_label)s' - -# Template that identifiers must conform to. -_TEMPLATE = 'CMIP6.{}.{}.{}.{}.{}.{}.{}.{}' - -# Collections injected into template. 
-_COLLECTIONS = ( - 'wcrp:cmip6:activity-id', - 'wcrp:cmip6:institution-id', - 'wcrp:cmip6:source-id', - 'wcrp:cmip6:experiment-id', - 'wcrp:cmip6:member-id', - 'wcrp:cmip6:table-id', - 'wcrp:cmip6:variable-id', - 'wcrp:cmip6:grid-label' - ) - -# Instantiated & cached parser instance. -_PARSER = None - - -def parse(identifier): - """Parses a CMIP6 dataset identifier. - - """ - # Instantiate parser JIT. - global _PARSER - if _PARSER is None: - _PARSER = create_template_parser(_TEMPLATE, _COLLECTIONS, PARSING_STRICTNESS_1) - - # Strip version suffix. - if '#' in identifier: - identifier = identifier.split('#')[0] - - return _PARSER.parse(identifier) diff --git a/pyessv/_parsers/cordex_dataset_id.py b/pyessv/_parsers/cordex_dataset_id.py deleted file mode 100644 index 5cf01bb..0000000 --- a/pyessv/_parsers/cordex_dataset_id.py +++ /dev/null @@ -1,57 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -.. module:: pyessv._parsers.cordex_dataset_id.py - :copyright: Copyright "December 01, 2016", IPSL - :license: GPL/CeCIL - :platform: Unix, Windows - :synopsis: Encapsulates parsing of a CORDEX dataset identifier. - -.. moduleauthor:: Mark Conway-Greenslade - -""" -from pyessv._factory import create_template_parser -from pyessv._constants import PARSING_STRICTNESS_1 - - - -# Template extracted from esgf ini file (for reference purpose only). -_INI_PATTERN = 'cordex.%(product)s.%(domain)s.%(institute)s.%(driving_model)s.%(experiment)s.%(ensemble)s.%(rcm_name)s.%(rcm_version)s.%(time_frequency)s.%(variable)s' - -# Test identifier (for reference purpose only). -_TEST_IDENTIFIER = 'cordex.output.AFR-44.MOHC.MOHC-HadGEM2-ES.rcp60.r12i1p1.hadgem3-ra.v1.mon.areacella' - -# Template that identifiers must conform to. -_TEMPLATE = 'cordex.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}' - -# Collections injected into template. 
-_COLLECTIONS = ( - 'wcrp:cordex:product', - 'wcrp:cordex:domain', - 'wcrp:cordex:institute', - 'wcrp:cordex:driving-model', - 'wcrp:cordex:experiment', - 'wcrp:cordex:ensemble', - 'wcrp:cordex:rcm-name', - 'wcrp:cordex:rcm-version', - 'wcrp:cordex:time-frequency', - 'wcrp:cordex:variable', - ) - -# Instantiated & cached parser instance. -_PARSER = None - -def parse(identifier): - """Parses a CMIP6 dataset identifier. - - """ - # Instantiate parser JIT. - global _PARSER - if _PARSER is None: - _PARSER = create_template_parser(_TEMPLATE, _COLLECTIONS, PARSING_STRICTNESS_1) - - # Strip version suffix. - if '#' in identifier: - identifier = identifier.split('#')[0] - - return _PARSER.parse(identifier) diff --git a/pyessv/_parsers/dataset_id.py b/pyessv/_parsers/dataset_id.py new file mode 100644 index 0000000..ccdd0ea --- /dev/null +++ b/pyessv/_parsers/dataset_id.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._parsers.dataset_id.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates parsing of an ESGF dataset identifier. + +.. moduleauthor:: Mark Conway-Greenslade + +""" +import collections + +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._factory import create_template_parser +from pyessv._utils.compat import basestring + +# Test identifier (for reference purpose only). +_TEST_CMIP5_IDENTIFIER = 'cmip5.output2.IPSL.IPSL-CM5A-LR.historicalMisc.mon.ocean.Omon.r2i1p1.v20150504' +_TEST_CORDEX_IDENTIFIER = 'cordex.output.AFR-44.MOHC.MOHC-HadGEM2-ES.rcp60.r12i1p1.HadGEM3-RA.v1.mon.areacella#20190804' + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated parser. +_PARSER = None + + +def parse_dataset_identifiers(project, identifiers): + """Parses a collection of dataset identifiers. 
+ + :param str project: Project code. + :param iterable identifiers: Dataset identifiers. + + :returns: Facets extracted from the identifiers. + :rtype: list + + """ + assert isinstance(identifiers, collections.Iterable), 'Invalid identifiers' + + result = set() + for identifier in identifiers: + result = result.union(parse_dataset_identifier(project, identifier)) + + return result + + +def parse_dataset_identifier(project, identifier): + """Parses a dataset identifier. + + :param str project: Project code. + :param str identifier: Dataset identifier. + + :returns: Set of terms extracted from the identifier. + :rtype: set + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(identifier, basestring), 'Invalid identifier' + + global _PROJECT, _PARSER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'dataset_id' in scope.data.keys(), 'Dataset ID parser not found' + assert 'template' in scope.data['dataset_id'].keys(), 'Dataset ID parser template not found' + assert 'collections' in scope.data['dataset_id'].keys(), 'Dataset ID parser template collections not found' + + # Get template from data scope. + _TEMPLATE = scope.data['dataset_id']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['dataset_id']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _PARSER = create_template_parser(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1) + + # Cached project. 
+ _PROJECT = project + + # Convert version suffix to an identifier element. + identifier = identifier.replace('#', '.v') + + return _PARSER.parse(identifier) diff --git a/pyessv/_parsers/directory.py b/pyessv/_parsers/directory.py new file mode 100644 index 0000000..30c27c5 --- /dev/null +++ b/pyessv/_parsers/directory.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._parsers.directory.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates parsing of an ESGF directory. + +.. moduleauthor:: Mark Conway-Greenslade + +""" +import collections + +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._factory import create_template_parser +from pyessv._utils.compat import basestring + +# Template extracted from esgf ini file (for reference purpose only). +_INI_PATTERN = '%(root)s/%(project)s/%(product)s/%(institute)s/%(model)s/%(experiment)s/%(time_frequency)s/%(realm)s/%(cmor_table)s/%(ensemble)s/%(version)s/%(variable)s' + +# Test directory (for reference purpose only). +_TEST_DIRECTORY = 'CMIP5/output1/IPSL/IPSL-CM5A-LR/1pctCO2/mon/atmos/Amon/r1i1p1/v20110427/tas' +_TEST_DIRECTORY = 'CMIP5/output1/IPSL/IPSL-CM5A-LR/1pctCO2/mon/atmos/Amon/r1i1p1/latest/tas' + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated parser. +_PARSER = None + + +def parse_directories(project, directories): + """Parses a collection of directories. + + :param str project: Project code. + :param iterable directories: Data directories. + + :returns: Facets extracted from the directories. 
+ :rtype: list + + """ + assert isinstance(directories, collections.Iterable), 'Invalid directories' + + result = set() + for directory in directories: + result = result.union(parse_directory(project, directory)) + + return result + + +def parse_directory(project, directory): + """Parses a directory. + + :param str project: Project code. + :param str directory: Data directory. + + :returns: Set of terms extracted from the directory. + :rtype: set + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(directory, basestring), 'Invalid directory' + + global _PROJECT, _PARSER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Instantiated template + _TEMPLATE = None + + # Instantiated template collections + _COLLECTIONS = None + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'directory_structure' in scope.data.keys(), 'Directory parser not found' + assert 'template' in scope.data['directory_structure'].keys(), 'Directory parser template not found' + assert 'collections' in scope.data['directory_structure'].keys(), 'Directory parser template collections not found' + + # Get template from data scope. + _TEMPLATE = scope.data['directory_structure']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['directory_structure']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _PARSER = create_template_parser(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1, separator='/') + + # Cached project. 
+ _PROJECT = project + + return _PARSER.parse(directory) diff --git a/pyessv/_parsers/filename.py b/pyessv/_parsers/filename.py new file mode 100644 index 0000000..1d0213d --- /dev/null +++ b/pyessv/_parsers/filename.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: pyessv._parsers.filename.py + :copyright: Copyright "December 01, 2016", IPSL + :license: GPL/CeCIL + :platform: Unix, Windows + :synopsis: Encapsulates parsing of an ESGF filename. + +.. moduleauthor:: Mark Conway-Greenslade + +""" +import collections +from os.path import splitext + +from pyessv import all_scopes +from pyessv._constants import PARSING_STRICTNESS_1 +from pyessv._exceptions import TemplateParsingError +from pyessv._factory import create_template_parser +from pyessv._utils.compat import basestring + +# Template extracted from esgf ini file (for reference purpose only). +_INI_PATTERN = '%(variable)s_%(cmor_table)s_%(model)s_%(experiment)s_%(ensemble)s[_%(period_start)s-%(period_end)s].nc' + +# Test filename (for reference purpose only). +_TEST_FILENAME = 'tas_Amon_IPSL-CM5A-LR_1pctCO2_r1i1p1_185001-198912.nc' +_TEST_FILENAME = 'orog_fx_IPSL-CM5A-LR_1pctCO2_r0i0p0.nc' +_TEST_FILENAME = 'tas_Amon_IPSL-CM5A-LR_1pctCO2_r1i1p1_185001-198912-clim.nc' + +# Instantiated template +_TEMPLATE = None + +# Instantiated template collections +_COLLECTIONS = None + +# Instantiated project. +_PROJECT = None + +# Instantiated parser. +_PARSER = None + + +def parse_filenames(project, filenames): + """Parses a collection of filenames. + + :param str project: Project code. + :param iterable filenames: Filenames. + + :returns: Facets extracted from the filenames. + :rtype: list + + """ + assert isinstance(filenames, collections.Iterable), 'Invalid filenames' + + result = set() + for filename in filenames: + result = result.union(parse_filename(project, filename)) + + return result + + +def parse_filename(project, filename): + """Parses a filename. + + :param str project: Project code. 
+ :param str filename: Filename. + + :returns: Set of terms extracted from the filename. + :rtype: set + + """ + assert isinstance(project, basestring), 'Invalid project' + assert isinstance(filename, basestring), 'Invalid filename' + + global _PROJECT, _PARSER, _TEMPLATE, _COLLECTIONS + + if _PROJECT != project: + + # Get scope corresponding to the project code. + scopes = all_scopes() + assert project in [scope.name for scope in scopes], 'Unsupported project' + scope = [scope for scope in scopes if scope.name == project][0] + + assert 'filename' in scope.data.keys(), 'Filename parser not found' + assert 'template' in scope.data['filename'].keys(), 'Filename parser template not found' + assert 'collections' in scope.data['filename'].keys(), 'Filename parser template collections not found' + + # Get template from data scope. + _TEMPLATE = scope.data['filename']['template'] + assert isinstance(_TEMPLATE, basestring), 'Invalid template' + + # Get template collections from data scope. + _COLLECTIONS = list() + for name in scope.data['filename']['collections']: + _COLLECTIONS.append([collection.namespace for collection in scope.collections if collection.name == name.replace('_','-')][0]) + assert _COLLECTIONS, 'Invalid collections' + + # Instantiate parser JIT. + _PARSER = create_template_parser(_TEMPLATE, tuple(_COLLECTIONS), PARSING_STRICTNESS_1, separator='_') + + # Cached project. + _PROJECT = project + + # Strip file extension. + filename = splitext(filename)[0] + + try: + return _PARSER.parse(filename) + except TemplateParsingError: + # Add suffix to filename without file period. + return _PARSER.parse(filename + '_fixed') diff --git a/pyessv/_parsers/input4mips_dataset_id.py b/pyessv/_parsers/input4mips_dataset_id.py deleted file mode 100644 index 92f9a03..0000000 --- a/pyessv/_parsers/input4mips_dataset_id.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -.. 
module:: pyessv._parsers.cmip6_dataset_id.py - :copyright: Copyright "December 01, 2016", IPSL - :license: GPL/CeCIL - :platform: Unix, Windows - :synopsis: Encapsulates parsing of a CMIP6 dataset identifier. - -.. moduleauthor:: Mark Conway-Greenslade - -""" -from pyessv._factory import create_template_parser -from pyessv._constants import PARSING_STRICTNESS_3 - - - -# Template extracted from esgf ini file (for reference purpose only). -_INI_PATTERN = '%(activity_id)s.%(mip_era)s.%(target_mip)s.%(institution_id)s.%(source_id)s.%(realm)s.%(frequency)s.%(variable_id)s.%(grid_label)s' - -# Template that identifiers must conform to. -_TEMPLATE = 'input4MIPs.CMIP6.{}.{}.{}.{}.{}.{}.{}' - -# Collections injected into template. -_COLLECTIONS = ( - 'wcrp:input4mips:target-mip', - 'wcrp:input4mips:institution-id', - 'wcrp:input4mips:source-id', - 'wcrp:input4mips:realm', - 'wcrp:input4mips:frequency', - 'wcrp:input4mips:variable-id', - 'wcrp:input4mips:grid-label', - ) - -# Instantiated & cached parser instance. -_PARSER = None - - -def parse(identifier): - """Parses a CMIP6 dataset identifier. - - """ - print identifier - # Instantiate parser JIT. - global _PARSER - if _PARSER is None: - _PARSER = create_template_parser(_TEMPLATE, _COLLECTIONS, PARSING_STRICTNESS_3) - - # Strip version suffix. - if '#' in identifier: - identifier = identifier.split('#')[0] - - return _PARSER.parse(identifier) diff --git a/sh/writers/esgf/map.py b/sh/writers/esgf/map.py index 623cc25..83c129a 100644 --- a/sh/writers/esgf/map.py +++ b/sh/writers/esgf/map.py @@ -24,7 +24,6 @@ import map_cmip6 import map_cordex import map_cordex_adjust -import map_e3sm import map_euclipse import map_geomip import map_input4mips @@ -34,6 +33,10 @@ import map_pmip3 import map_primavera import map_tamip +import warnings +from arrow.factory import ArrowParseWarning + +warnings.simplefilter("ignore", ArrowParseWarning) # Define command line options. 
_ARGS = argparse.ArgumentParser('Maps ESGF publisher ini files to normalized pyessv vocabulary format.') @@ -57,7 +60,6 @@ map_cmip6, map_cordex, map_cordex_adjust, - map_e3sm, map_euclipse, map_geomip, map_input4mips, @@ -83,6 +85,18 @@ def _main(args): if not os.path.isdir(args.source): raise ValueError('ESGF vocab directory does not exist: {}'.format(args.source)) + # Load vocabulary. + pyessv.load_cv() + + # CV authority = ECMWF. + #_AUTHORITY = pyessv.create_authority( + # 'ECMWF', + # 'European Center for Medium-Range Weather Forecasts', + # label='ECMWF', + # url='https://www.ecmwf.int/', + # create_date=_CREATE_DATE + #) + # Process project modules: for module in _MODULES: # Set project. @@ -103,9 +117,10 @@ def _main(args): scope = _create_scope(authority, project) # Set scope data. - scope.data = scope.data or dict() - for field in module.SCOPE_DATA: - scope.data[field] = ini_section.get_option(field, raw=True) + scope.data = module.SCOPE_DATA or dict() + #scope.data = scope.data or dict() + #for field in module.SCOPE_DATA: + # scope.data[field] = ini_section.get_option(field, raw=True) # Create regex collections. collections = [i for i in module.COLLECTIONS if not inspect.isfunction(i[1])] @@ -122,7 +137,16 @@ def _main(args): except TypeError: pass for term_data in term_factory(ctx): - _get_term(collection, term_data) + try: + term_src, term_dst = term_data + t = _get_term(collection, term_dst) + s = pyessv.load(term_src) + if t not in s.associations: + s.associations.append(t) + if s not in t.associations: + t.associations.append(s) + except (ValueError, AttributeError): + _get_term(collection, term_data) # Add to archive & persist to file system. 
pyessv.archive(authority) @@ -209,7 +233,7 @@ def _create_collection(module, scope, collection_id, term_regex=None): data = None if collection_id.lower().replace('_', '-') in [collection.name for collection in scope.collections]: - collection = scope[collection_id] + collection = scope[collection_id.lower().replace('_', '-')] collection.description = "ESGF publisher-config CV collection: ".format(collection_id), collection.label = collection_id.title().replace('_', ' ').replace('Rcm', 'RCM').replace('Cmor', 'CMOR') collection.term_regex = term_regex @@ -248,11 +272,18 @@ def _get_term(collection, term_info): alternative_names = [] if synonym is None else [synonym] - return pyessv.create_term(collection, name, - label=label, - description=description, - alternative_names=alternative_names - ) + if name.lower().replace('_', '-') in [term.name for term in collection.terms]: + term = collection[name.lower().replace('_', '-')] + term.label = label + term.description = description + term.alternative_names = alternative_names + return term + else: + return pyessv.create_term(collection, name, + label=label, + description=description, + alternative_names=alternative_names + ) # Entry point. diff --git a/sh/writers/esgf/map_c3s_cmip5.py b/sh/writers/esgf/map_c3s_cmip5.py index e96d2dd..f4f4e09 100644 --- a/sh/writers/esgf/map_c3s_cmip5.py +++ b/sh/writers/esgf/map_c3s_cmip5.py @@ -13,47 +13,85 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) - - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('cmor_table', yield_comma_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), - ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('model', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('product', yield_comma_delimited_options), - ('realm', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} +COLLECTIONS = [ + ('cmor_table', yield_comma_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), + ('experiment', yield_pipe_delimited_options), + ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), + ('time_frequency', yield_comma_delimited_options), + ('product', yield_comma_delimited_options), + ('realm', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Arbitrary data associated with a collection. COLLECTION_DATA = { - 'experiment': { - 'cim_document_type': 'cim.1.activity.NumericalExperiment', - 'cim_document_type_alternative_name': 'experiment' - }, - 'model': { - 'cim_document_type': 'cim.1.software.ModelComponent', - 'cim_document_type_alternative_name': 'model' - } + 'experiment': { + 'cim_document_type': 'cim.1.activity.NumericalExperiment', + 'cim_document_type_alternative_name': 'experiment' + }, + 'model': { + 'cim_document_type': 'cim.1.software.ModelComponent', + 'cim_document_type_alternative_name': 'model' + } } # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'C3S-CMIP5/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'variable', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'c3s-cmip5.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'variable', + 'dataset_version' + ) + } } def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. + """Yields institute information to be converted to pyessv terms. - """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'ecmwf:c3s-cmip5:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute diff --git a/sh/writers/esgf/map_c3s_cordex.py b/sh/writers/esgf/map_c3s_cordex.py index cb4d119..b233118 100644 --- a/sh/writers/esgf/map_c3s_cordex.py +++ b/sh/writers/esgf/map_c3s_cordex.py @@ -12,45 +12,87 @@ from utils import yield_comma_delimited_options from utils import yield_pipe_delimited_options - -# TODO process map: rcm_name_map = map(project, rcm_model : rcm_name) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('domain', lambda: yield_domain), - ('driving_model', yield_comma_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), - ('experiment', yield_pipe_delimited_options), - ('institute', yield_comma_delimited_options), - ('product', yield_comma_delimited_options), - ('rcm_model', yield_comma_delimited_options), - ('rcm_name', lambda: yield_rcm_name), - ('rcm_version', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'v^[0-9]*$') -} +COLLECTIONS = [ + ('domain', lambda: yield_domain), + ('driving_model', yield_comma_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), + ('experiment', yield_pipe_delimited_options), + ('institute', yield_comma_delimited_options), + ('product', yield_comma_delimited_options), + ('rcm_model', yield_comma_delimited_options), + ('rcm_name', lambda: yield_rcm_name), + ('rcm_version', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('time_frequency', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|v^[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'domain', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'rcm_version', + 'time_frequency', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'C3S-CORDEX/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'rcm_version', + 'time_frequency', + 'variable', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'c3s-cordex.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_name', + 'rcm_version', + 'time_frequency', + 'variable', + 'dataset_version' + ) + } } def yield_domain(ctx): - """Yields domain information to be converted to pyessv terms. + """Yields domain information to be converted to pyessv terms. - """ - for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): - yield domain_name, domain_name, domain_description + """ + for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): + yield domain_name, domain_name, domain_description def yield_rcm_name(ctx): - """Yields rcm name information to be converted to pyessv terms. + """Yields rcm name information to be converted to pyessv terms. 
- """ - for _, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): - yield rcm_name + """ + for rcm_model, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): + src_namespace = 'ecmwf:c3s-cordex:rcm_model:{}'.format(rcm_model.lower().replace('_', '-')) + yield src_namespace, rcm_name diff --git a/sh/writers/esgf/map_cc4e.py b/sh/writers/esgf/map_cc4e.py index d6c75a2..dcd418e 100644 --- a/sh/writers/esgf/map_cc4e.py +++ b/sh/writers/esgf/map_cc4e.py @@ -11,24 +11,61 @@ """ from utils import yield_comma_delimited_options - # Vocabulary collections extracted from ini file. -COLLECTIONS = { - ('work_package', yield_comma_delimited_options), - ('product', yield_comma_delimited_options), - ('source_type', yield_comma_delimited_options), - ('source_data_id', yield_comma_delimited_options), - ('realization', r'r[0-9]+'), - ('domain', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} +COLLECTIONS = [ + ('work_package', yield_comma_delimited_options), + ('product', yield_comma_delimited_options), + ('source_type', yield_comma_delimited_options), + ('source_data_id', yield_comma_delimited_options), + ('realization', r'r[0-9]+'), + ('domain', yield_comma_delimited_options), + ('time_frequency', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_cc4e_{}_{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'work_package', + 'product', + 'source_type', + 'source_data_id', + 'realization', + 'domain', + 'time_frequency', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'CC4E/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'work_package', + 'product', + 'source_type', + 'source_data_id', + 'realization', + 'time_frequency', + 'variable', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'CC4E.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'work_package', + 'product', + 'source_type', + 'source_data_id', + 'realization', + 'time_frequency', + 'variable', + 'dataset_version' + ) + } } diff --git a/sh/writers/esgf/map_cmip5.py b/sh/writers/esgf/map_cmip5.py index fcc374c..5169b55 100644 --- a/sh/writers/esgf/map_cmip5.py +++ b/sh/writers/esgf/map_cmip5.py @@ -12,57 +12,93 @@ from utils import yield_comma_delimited_options from utils import yield_pipe_delimited_options - -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('cmor_table', yield_comma_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), - ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), - ('model', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('product', yield_comma_delimited_options), - ('realm', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} +COLLECTIONS = [ + ('cmor_table', yield_comma_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), + ('experiment', yield_pipe_delimited_options), + ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), + ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), + ('product', yield_comma_delimited_options), + ('realm', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Arbitrary data associated with a collection. COLLECTION_DATA = { - 'experiment': { - 'cim_document_type': 'cim.1.activity.NumericalExperiment', - 'cim_document_type_alternative_name': 'experiment' - }, - 'model': { - 'cim_document_type': 'cim.1.software.ModelComponent', - 'cim_document_type_alternative_name': 'model' - } + 'experiment': { + 'cim_document_type': 'cim.1.activity.NumericalExperiment', + 'cim_document_type_alternative_name': 'experiment' + }, + 'model': { + 'cim_document_type': 'cim.1.software.ModelComponent', + 'cim_document_type_alternative_name': 'model' + } } # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'CMIP5/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'cmip5.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. - """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:cmip5:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. 
- """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:cmip5:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute diff --git a/sh/writers/esgf/map_cmip6.py b/sh/writers/esgf/map_cmip6.py index ccf092a..1234d22 100644 --- a/sh/writers/esgf/map_cmip6.py +++ b/sh/writers/esgf/map_cmip6.py @@ -11,27 +11,71 @@ """ from utils import yield_comma_delimited_options - -# TODO process maps: institute_map, las_time_delta_map, model_cohort_map -# TODO process map: las_time_delta_map = las_time_delta_map = map(frequency : las_time_delta) - # Vocabulary collections extracted from ini file. -COLLECTIONS = { - ('las_time_delta', lambda: yield_las_time_delta), - ('thredds_exclude_variables', yield_comma_delimited_options), -} +COLLECTIONS = [ + ('las_time_delta', lambda: yield_las_time_delta), + ('model_cohort', lambda: yield_model_cohort), + ('thredds_exclude_variables', yield_comma_delimited_options), +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable_id', + 'table_id', + 'source_id', + 'experiment_id', + 'member_id', + 'grid_label', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'CMIP6/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'activity_id', + 'institution_id', + 'source_id', + 'experiment_id', + 'member_id', + 'table_id', + 'variable_id', + 'grid_label', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'CMIP6.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'activity_id', + 'institution_id', + 'source_id', + 'experiment_id', + 'member_id', + 'table_id', + 'variable_id', + 'grid_label', + 'dataset_version' + ) + } } +def yield_model_cohort(ctx): + """Yields model cohort information to be converted to pyessv terms. + + """ + for source_id, model_cohort in ctx.ini_section.get_option('model_cohort_map', '\n', '|'): + src_namespace = 'wcrp:cmip6:source-id:{}'.format(source_id.lower().replace('_', '-')) + yield src_namespace, model_cohort + + def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:cmip6:frequency:{}'.format(frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta diff --git a/sh/writers/esgf/map_cordex.py b/sh/writers/esgf/map_cordex.py index 85f6c75..d861e65 100644 --- a/sh/writers/esgf/map_cordex.py +++ b/sh/writers/esgf/map_cordex.py @@ -13,54 +13,97 @@ from utils import yield_pipe_delimited_options -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) -# TODO process map: rcm_name_map = map(project, rcm_model : rcm_name) - # Vocabulary collections extracted from ini file. -COLLECTIONS = { - ('domain', lambda: yield_domain), - ('driving_model', yield_comma_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), - ('experiment', yield_pipe_delimited_options), - ('institute', yield_comma_delimited_options), - ('las_time_delta', lambda: yield_las_time_delta), - ('product', yield_comma_delimited_options), - ('rcm_model', yield_comma_delimited_options), - ('rcm_name', lambda: yield_rcm_name), - ('rcm_version', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'v^[0-9]*$') -} +COLLECTIONS = [ + ('domain', lambda: yield_domain), + ('driving_model', yield_comma_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), + ('experiment', yield_pipe_delimited_options), + ('institute', yield_comma_delimited_options), + ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), + ('product', yield_comma_delimited_options), + ('rcm_model', yield_comma_delimited_options), + ('rcm_name', lambda: yield_rcm_name), + ('rcm_version', yield_comma_delimited_options), + 
('thredds_exclude_variables', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'domain', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'rcm_version', + 'time_frequency', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'CORDEX/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'rcm_version', + 'time_frequency', + 'variable', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'cordex.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_name', + 'rcm_version', + 'time_frequency', + 'variable', + 'dataset_version' + ) + } } def yield_domain(ctx): - """Yields domain information to be converted to pyessv terms. + """Yields domain information to be converted to pyessv terms. - """ - for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): - yield domain_name, domain_name, domain_description + """ + for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): + yield domain_name, domain_name, domain_description def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:cordex:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta def yield_rcm_name(ctx): - """Yields rcm name information to be converted to pyessv terms. + """Yields rcm name information to be converted to pyessv terms. - """ - for _, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): - yield rcm_name + """ + for rcm_model, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): + src_namespace = 'wcrp:cordex:rcm_model:{}'.format(rcm_model.lower().replace('_', '-')) + yield src_namespace, rcm_name diff --git a/sh/writers/esgf/map_cordex_adjust.py b/sh/writers/esgf/map_cordex_adjust.py index 37e0350..561d84c 100644 --- a/sh/writers/esgf/map_cordex_adjust.py +++ b/sh/writers/esgf/map_cordex_adjust.py @@ -13,54 +13,97 @@ from utils import yield_pipe_delimited_options -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) -# TODO process map: rcm_name_map = map(project, rcm_model : rcm_name) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('domain', lambda: yield_domain), - ('driving_model', yield_comma_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), - ('experiment', yield_pipe_delimited_options), - ('institute', yield_comma_delimited_options), - ('las_time_delta', lambda: yield_las_time_delta), - ('product', yield_comma_delimited_options), - ('rcm_model', yield_comma_delimited_options), - ('rcm_name', lambda: yield_rcm_name), - ('bias_adjustment', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'v^[0-9]*$') -} +COLLECTIONS = [ + ('domain', lambda: yield_domain), + ('driving_model', yield_comma_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), + ('experiment', yield_pipe_delimited_options), + ('institute', yield_comma_delimited_options), + ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), + ('product', yield_comma_delimited_options), + ('rcm_model', yield_comma_delimited_options), + ('rcm_name', lambda: yield_rcm_name), + ('bias_adjustment', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'domain', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'bias_adjustment', + 'time_frequency', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'CORDEX-Adjust/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_model', + 'bias_adjustment', + 'time_frequency', + 'variable', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'cordex-adjust.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'domain', + 'institute', + 'driving_model', + 'experiment', + 'ensemble', + 'rcm_name', + 'bias_adjustment', + 'time_frequency', + 'variable', + 'dataset_version' + ) + } } def yield_domain(ctx): - """Yields domain information to be converted to pyessv terms. + """Yields domain information to be converted to pyessv terms. - """ - for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): - yield domain_name, domain_name, domain_description + """ + for domain_name, domain_description in ctx.ini_section.get_option('domain_description_map', '\n', '|'): + yield domain_name, domain_name, domain_description def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:cordex-adjust:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta def yield_rcm_name(ctx): - """Yields rcm name information to be converted to pyessv terms. + """Yields rcm name information to be converted to pyessv terms. - """ - for _, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): - yield rcm_name + """ + for rcm_model, rcm_name in ctx.ini_section.get_option('rcm_name_map', '\n', '|'): + src_namespace = 'wcrp:cordex-adjust:rcm_model:{}'.format(rcm_model.lower().replace('_', '-')) + yield src_namespace, rcm_name diff --git a/sh/writers/esgf/map_e3sm.py b/sh/writers/esgf/map_e3sm.py deleted file mode 100644 index 1cfd0c5..0000000 --- a/sh/writers/esgf/map_e3sm.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -.. module:: map_e3sm.py - :license: GPL/CeCIL - :platform: Unix, Windows - :synopsis: Maps E3SM ESGF publisher ini file to normalized pyessv format. - -.. moduleauthor:: Mark Conway-Greenslade - -""" -from utils import yield_comma_delimited_options -from utils import yield_pipe_delimited_options - - -# Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('source', yield_comma_delimited_options), - ('model_version', yield_comma_delimited_options), - ('experiment', yield_pipe_delimited_options), - ('atmos_grid_resolution', yield_comma_delimited_options), - ('ocean_grid_resolution', yield_comma_delimited_options), - ('realm', yield_comma_delimited_options), - ('regridding', yield_comma_delimited_options), - ('data_type', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('ensemble_member', r'^[A-Za-z0-9]*$'), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} - -# Fields extracted from ini file & appended as data to the scope. -SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' -} diff --git a/sh/writers/esgf/map_euclipse.py b/sh/writers/esgf/map_euclipse.py index 98af265..8afa008 100644 --- a/sh/writers/esgf/map_euclipse.py +++ b/sh/writers/esgf/map_euclipse.py @@ -13,45 +13,82 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { +COLLECTIONS = [ ('cmor_table', yield_comma_delimited_options), ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), - ('model', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), + ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), + ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), ('product', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), ('thredds_exclude_variables', yield_comma_delimited_options), ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'EUCLIPSE/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'euclipse.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:euclipse:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. - """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:euclipse:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute diff --git a/sh/writers/esgf/map_geomip.py b/sh/writers/esgf/map_geomip.py index 485cdf3..5ca8516 100644 --- a/sh/writers/esgf/map_geomip.py +++ b/sh/writers/esgf/map_geomip.py @@ -13,44 +13,82 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { +COLLECTIONS = [ ('cmor_table', yield_comma_delimited_options), ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), ('product', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), ('thredds_exclude_variables', yield_comma_delimited_options), ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'GeoMIP/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'geomip.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:geomip:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. + + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:geomip:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute - """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta diff --git a/sh/writers/esgf/map_input4mips.py b/sh/writers/esgf/map_input4mips.py index 4d160b1..4b9a77e 100644 --- a/sh/writers/esgf/map_input4mips.py +++ b/sh/writers/esgf/map_input4mips.py @@ -12,29 +12,68 @@ from utils import get_ini_option from utils import yield_comma_delimited_options - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('variable_id', lambda: yield_variable_id_options), - ('activity_id', yield_comma_delimited_options), - ('dataset_category', r'^[A-Za-z0-9]*$'), - ('target_mip', yield_comma_delimited_options), - ('source_id', yield_comma_delimited_options), - ('grid_label', yield_comma_delimited_options), - ('institution_id', yield_comma_delimited_options), - ('realm', yield_comma_delimited_options), - ('frequency', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('version', r'^v[0-9]{8}$'), -} +COLLECTIONS = [ + ('variable_id', lambda: yield_variable_id_options), + ('activity_id', yield_comma_delimited_options), + ('dataset_category', r'^[A-Za-z0-9]*$'), + ('target_mip', yield_comma_delimited_options), + ('source_id', yield_comma_delimited_options), + ('grid_label', yield_comma_delimited_options), + ('institution_id', yield_comma_delimited_options), + ('realm', yield_comma_delimited_options), + ('mip_era', yield_comma_delimited_options), + ('frequency', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]{8}$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable_id', + 'activity_id', + 'dataset_category', + 'target_mip', + 'source_id', + 'grid_label', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'input4MIPs/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'mip_era', + 'target_mip', + 'institution_id', + 'source_id', + 'realm', + 'frequency', + 'variable_id', + 'grid_label', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'input4MIPs.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'mip_era', + 'target_mip', + 'institution_id', + 'source_id', + 'realm', + 'frequency', + 'variable_id', + 'grid_label', + 'dataset_version' + ) + } } + def yield_variable_id_options(ctx): # Decode options from ini file. opts = get_ini_option(ctx) diff --git a/sh/writers/esgf/map_isimip_ft.py b/sh/writers/esgf/map_isimip_ft.py index ab0e368..d2b2891 100644 --- a/sh/writers/esgf/map_isimip_ft.py +++ b/sh/writers/esgf/map_isimip_ft.py @@ -13,46 +13,99 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('product', yield_comma_delimited_options), - ('model', yield_comma_delimited_options), - ('impact_model', yield_comma_delimited_options), - ('experiment', yield_pipe_delimited_options), - ('sector_short', yield_comma_delimited_options), - ('social_forcing', yield_comma_delimited_options), - ('co2_forcing', yield_comma_delimited_options), - ('irrigation_forcing', yield_comma_delimited_options), - ('land_use_short', yield_comma_delimited_options), - ('time_frequency', yield_comma_delimited_options), - ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$'), - ('thredds_exclude_variables', yield_comma_delimited_options) -} - +COLLECTIONS = [ + ('product', yield_comma_delimited_options), + ('model', yield_comma_delimited_options), + ('impact_model', yield_comma_delimited_options), + ('experiment', yield_pipe_delimited_options), + ('sector_short', lambda: yield_sector), + ('social_forcing', yield_comma_delimited_options), + ('institute', lambda: yield_institute), + ('co2_forcing', yield_comma_delimited_options), + ('irrigation_forcing', yield_comma_delimited_options), + ('land_use_short', lambda: yield_land_use), + ('time_frequency', yield_comma_delimited_options), + ('variable', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'impact_model', + 'model', + 'experiment', + 'social_forcing', + 'co2_forcing', + 'irrigation_forcing', + 'land_use_short', + 'variable', + 'time_frequency', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'ISIMIP-FT/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'model', + 'experiment', + 'impact_model', + 'sector_short', + 'social_forcing', + 'co2_forcing', + 'irrigation_forcing', + 'land_use_short', + 'variable', + 'time_frequency', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'isimip-ft.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'impact_model', + 'sector_short', + 'model', + 'experiment', + 'social_forcing', + 'co2_forcing', + 'irrigation_forcing', + 'time_frequency', + 'land_use_short', + 'variable', + 'dataset_version' + ) + } } def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. + """Yields institute information to be converted to pyessv terms. + + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:isimip-ft:impact-model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute + + +def yield_land_use(ctx): + """Yields land use information to be converted to pyessv terms. - """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for land_use_acronym, land_use in ctx.ini_section.get_option('land_use_map', '\n', '|'): + yield land_use_acronym, land_use_acronym, land_use -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_sector(ctx): + """Yields sector information to be converted to pyessv terms. 
- """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for sector_acronym, sector in ctx.ini_section.get_option('sector_map', '\n', '|'): + yield sector_acronym, sector_acronym, sector diff --git a/sh/writers/esgf/map_lucid.py b/sh/writers/esgf/map_lucid.py index 984957e..556907a 100644 --- a/sh/writers/esgf/map_lucid.py +++ b/sh/writers/esgf/map_lucid.py @@ -13,44 +13,82 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. -COLLECTIONS = { +COLLECTIONS = [ ('cmor_table', yield_comma_delimited_options), ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), ('product', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), ('thredds_exclude_variables', yield_comma_delimited_options), ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'LUCID/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'lucid.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. - """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:lucid:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. 
+ + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:lucid:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute - """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta diff --git a/sh/writers/esgf/map_obs4mips.py b/sh/writers/esgf/map_obs4mips.py index 260019e..07b56b0 100644 --- a/sh/writers/esgf/map_obs4mips.py +++ b/sh/writers/esgf/map_obs4mips.py @@ -6,33 +6,65 @@ :platform: Unix, Windows :synopsis: Maps obs4MIPs ESGF publisher ini file to normalized pyessv format. -.. moduleauthor:: Mark Conway-Greenslade +.. moduleauthor:: Mark Conway_Greenslade """ from utils import yield_comma_delimited_options -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. -COLLECTIONS = { +COLLECTIONS = [ ('product', yield_comma_delimited_options), ('institute', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), - ('variable', lambda: yield_variable), ('var', yield_comma_delimited_options), + ('variable', lambda: yield_variable), ('time_frequency', yield_comma_delimited_options), ('data_structure', yield_comma_delimited_options), ('source_id', yield_comma_delimited_options), - ('version', r'^v[0-9]*$'), + ('dataset_version', r'latest|^v[0-9]*$'), + ('processing_level', r'^[A-Za-z0-9]*$'), + ('processing_version', r'^[A-Za-z0-9]*$'), ('las_time_delta', lambda: yield_las_time_delta), - ('thredds_exclude_variables', yield_comma_delimited_options) -} + ('thredds_exclude_variables', yield_comma_delimited_options), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'source_id', + 'processing_level', + 'processing_version', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'obs4MIPs/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'realm', + 'var', + 'time_frequency', + 'data_structure', + 'institute', + 'source_id', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'obs4MIPs.{}.{}.{}.{}.{}', + 'collections': ( + 'institute', + 'source_id', + 'variable', + 'time_frequency', + 'dataset_version' + ) + } } @@ -40,13 +72,15 @@ def yield_variable(ctx): """Yields institute information to be converted to pyessv terms. """ - for var, _ in ctx.ini_section.get_option('variable_map', '\n', '|'): - yield var + for var, variable in ctx.ini_section.get_option('variable_map', '\n', '|'): + src_namespace = 'wcrp:obs4mips:var:{}'.format(var.lower().replace('_','-')) + yield src_namespace, variable def yield_las_time_delta(ctx): """Yields las time delta information to be converted to pyessv terms. """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:obs4mips:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta diff --git a/sh/writers/esgf/map_pmip3.py b/sh/writers/esgf/map_pmip3.py index 680fcf1..fd727e6 100644 --- a/sh/writers/esgf/map_pmip3.py +++ b/sh/writers/esgf/map_pmip3.py @@ -13,44 +13,81 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { +COLLECTIONS = [ ('cmor_table', yield_comma_delimited_options), ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), ('product', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), ('thredds_exclude_variables', yield_comma_delimited_options), ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'PMIP3/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'pmip3.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. 
- """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:pmip3:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. - """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:pmip3:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute diff --git a/sh/writers/esgf/map_primavera.py b/sh/writers/esgf/map_primavera.py index 65f4843..b7a0d39 100644 --- a/sh/writers/esgf/map_primavera.py +++ b/sh/writers/esgf/map_primavera.py @@ -12,34 +12,70 @@ from utils import yield_comma_delimited_options from utils import yield_pipe_delimited_options - -# TODO process map: institute_map = map(model : institute) - # Vocabulary collections extracted from ini file. 
-COLLECTIONS = { - ('activity', yield_comma_delimited_options), - ('institute', lambda: yield_institute), - ('model', yield_comma_delimited_options), - ('experiment', yield_pipe_delimited_options), - ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+f[0-9]+'), - ('cmor_table', yield_comma_delimited_options), - ('variable', r'^[A-Za-z0-9]*$'), - ('grid_label', yield_comma_delimited_options), - ('thredds_exclude_variables', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} +COLLECTIONS = [ + ('activity', yield_comma_delimited_options), + ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), + ('experiment', yield_pipe_delimited_options), + ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+f[0-9]+'), + ('cmor_table', yield_comma_delimited_options), + ('variable', r'^[A-Za-z0-9]*$'), + ('grid_label', yield_comma_delimited_options), + ('thredds_exclude_variables', yield_comma_delimited_options), + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'grid_label', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'PRIMAVERA/{}/{}=/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'activity', + 'institute', + 'model', + 'experiment', + 'ensemble', + 'cmor_table', + 'variable', + 'grid_version', + 'dataset_version' + ) + }, + 'dataset_id': { + 'template': 'PRIMAVERA.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'activity', + 'institute', + 'model', + 'experiment', + 'ensemble', + 'cmor_table', + 'variable', + 'grid_version', + 'dataset_version' + ) + } } def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. + """Yields institute information to be converted to pyessv terms. 
- """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:primavera:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute diff --git a/sh/writers/esgf/map_tamip.py b/sh/writers/esgf/map_tamip.py index a4e3dd1..6687ac0 100644 --- a/sh/writers/esgf/map_tamip.py +++ b/sh/writers/esgf/map_tamip.py @@ -13,44 +13,82 @@ from utils import yield_pipe_delimited_options -# TODO process map: institute_map = map(model : institute) -# TODO process map: las_time_delta_map = map(time_frequency : las_time_delta) - # Vocabulary collections extracted from ini file. -COLLECTIONS = { +COLLECTIONS = [ ('cmor_table', yield_comma_delimited_options), ('ensemble', r'r[0-9]+i[0-9]+p[0-9]+'), ('experiment', yield_pipe_delimited_options), - ('institute', lambda: yield_institute), - ('las_time_delta', lambda: yield_las_time_delta), ('model', yield_comma_delimited_options), + ('institute', lambda: yield_institute), ('time_frequency', yield_comma_delimited_options), + ('las_time_delta', lambda: yield_las_time_delta), ('product', yield_comma_delimited_options), ('realm', yield_comma_delimited_options), ('thredds_exclude_variables', yield_comma_delimited_options), ('variable', yield_comma_delimited_options), - ('version', r'^v[0-9]*$') -} + ('dataset_version', r'latest|^v[0-9]*$'), + ('file_period', r'fixed|^\d+-\d+(-clim)?$') +] # Fields extracted from ini file & appended as data to the scope. 
SCOPE_DATA = { - 'filename_format', - 'directory_format', - 'dataset_id' + 'filename': { + 'template': '{}_{}_{}_{}_{}_{}', + 'collections': ( + 'variable', + 'cmor_table', + 'model', + 'experiment', + 'ensemble', + 'file_period' + ) + }, + 'directory_structure': { + 'template': 'TAMIP/{}/{}/{}/{}/{}/{}/{}/{}/{}/{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version', + 'variable' + ), + }, + 'dataset_id': { + 'template': 'tamip.{}.{}.{}.{}.{}.{}.{}.{}.{}', + 'collections': ( + 'product', + 'institute', + 'model', + 'experiment', + 'time_frequency', + 'realm', + 'cmor_table', + 'ensemble', + 'dataset_version' + ) + } } -def yield_institute(ctx): - """Yields institute information to be converted to pyessv terms. +def yield_las_time_delta(ctx): + """Yields las time delta information to be converted to pyessv terms. - """ - for _, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): - yield institute + """ + for time_frequency, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): + src_namespace = 'wcrp:tamip:time-frequency:{}'.format(time_frequency.lower().replace('_', '-')) + yield src_namespace, las_time_delta -def yield_las_time_delta(ctx): - """Yields las time delta information to be converted to pyessv terms. +def yield_institute(ctx): + """Yields institute information to be converted to pyessv terms. 
+ + """ + for model, institute in ctx.ini_section.get_option('institute_map', '\n', '|'): + src_namespace = 'wcrp:tamip:model:{}'.format(model.lower().replace('_', '-')) + yield src_namespace, institute - """ - for _, las_time_delta in ctx.ini_section.get_option('las_time_delta_map', '\n', '|'): - yield las_time_delta diff --git a/sh/writers/wcrp/cmip6/write.py b/sh/writers/wcrp/cmip6/write.py index dd349c5..5798954 100644 --- a/sh/writers/wcrp/cmip6/write.py +++ b/sh/writers/wcrp/cmip6/write.py @@ -188,16 +188,25 @@ 'is_virtual': True, 'label': None, 'ommitted': [], - 'term_regex': r'^[A-Za-z0-9]*$', + 'term_regex': r'^[A-Za-z0-9]*$' }, - 'version': { + 'dataset_version': { 'cim_document_type': None, 'cim_document_type_alternative_name': None, 'data_factory': None, 'is_virtual': True, 'label': None, 'ommitted': [], - 'term_regex': r'^[0-9]{8}$', + 'term_regex': r'latest|^[0-9]{8}$' + }, + 'file_period': { + 'cim_document_type': None, + 'cim_document_type_alternative_name': None, + 'data_factory': None, + 'is_virtual': True, + 'label': None, + 'ommitted': [], + 'term_regex': r'fixed|^\d+-\d+(-clim)?$' } }, _SCOPE_GLOBAL: { @@ -313,7 +322,7 @@ def _get_institution_data(_, name): """ obj = _INSTITUTIONAL_DATA.get(name, {}) - del obj['code'] + #del obj['code'] return obj diff --git a/tests/test_interface.py b/tests/test_interface.py index b3ad69c..d090a76 100644 --- a/tests/test_interface.py +++ b/tests/test_interface.py @@ -85,8 +85,8 @@ 'log_warning', # ... parsing 'parse', - 'parse_dataset_identifer', - 'parse_dataset_identifers', + 'parse_dataset_identifier', + 'parse_dataset_identifiers', # ... validation 'get_errors', 'is_valid', diff --git a/tests/test_parse_identifiers.py b/tests/test_parse_identifiers.py index 626f666..adada98 100644 --- a/tests/test_parse_identifiers.py +++ b/tests/test_parse_identifiers.py @@ -21,14 +21,14 @@ # Test configuration: project, parsing function, template seperator, strictness, identifiers. 
_CONFIG = { - ('cmip5', LIB.parse_dataset_identifer, '.', ( + ('cmip5', LIB.parse_dataset_identifier, '.', ( 'cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.3hr.atmos.3hr.r2i1p1', 'cmip5.output2.IPSL.IPSL-CM5A-LR.historicalMisc.mon.ocean.Omon.r2i1p1' )), - ('cmip6', LIB.parse_dataset_identifer, '.', ( + ('cmip6', LIB.parse_dataset_identifier, '.', ( 'cmip6.FAFMIP.IPSL.IPSL-CM6A-LR.amip.r1i1p1f1.Amon.abs550aer.gm', )), - ('cordex', LIB.parse_dataset_identifer, '.', ( + ('cordex', LIB.parse_dataset_identifier, '.', ( 'cordex.output.AFR-44.MOHC.MOHC-HadGEM2-ES.rcp60.r12i1p1.HadGEM3-RA.v1.mon.areacella', 'cordex.output.EUR-11.SMHI.ICHEC-EC-EARTH.rcp85.r12i1p1.RCA4.v1.sem.rsdt' )),