From 855152d8b8a805fba9610e6c637ff653ff7346a3 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Thu, 20 Aug 2020 10:27:22 -0700 Subject: [PATCH 1/4] - use manifest file to specify files to be included in release - update DJORNL parser to apply manifest file - add tests for manifest file validation - small refactor of parser to apply the same QC to every file - small formatting updates to DJORNL source files --- CHANGELOG.md | 23 ++ importers/djornl/parser.py | 357 ++++++++++++------ importers/test/test_djornl_parser.py | 46 +-- spec/README.md | 1 + spec/collections/djornl/djornl_edge.yaml | 5 + spec/collections/djornl/djornl_node.yaml | 18 +- spec/datasets/djornl/manifest.schema.json | 102 +++++ .../djornl/djornl_fetch_clusters.yaml | 1 + .../djornl/djornl_search_nodes.yaml | 16 +- ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv | 2 +- .../djornl/col_count_errors/manifest.yaml | 8 + spec/test/djornl/empty_files/manifest.yaml | 20 + spec/test/djornl/invalid_file/edges.tsv/empty | 0 spec/test/djornl/invalid_file/manifest.yaml | 12 + .../invalid_manifest/cluster_no_prefix.yaml | 18 + .../invalid_manifest/date_not_in_quotes.yaml | 17 + .../invalid_manifest/invalid_format.yaml | 17 + .../djornl/invalid_manifest/manifest.yaml | 14 + .../djornl/invalid_manifest/missing_path.yaml | 21 ++ .../invalid_manifest/no_file_format.yaml | 16 + .../djornl/invalid_manifest/no_file_list.yaml | 3 + ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv | 3 +- spec/test/djornl/invalid_types/manifest.yaml | 8 + .../merged_edges-AMW-060820_AF.tsv | 1 + spec/test/djornl/missing_files/manifest.yaml | 12 + spec/test/djornl/results.json | 28 +- spec/test/djornl/test_data/I2_named.tsv | 9 + spec/test/djornl/test_data/I4_named.tsv | 4 + spec/test/djornl/test_data/I6_named.tsv | 8 + ...p10percent_anno_AF_082919.abc.I2_named.tsv | 5 - ...p10percent_anno_AF_082919.abc.I4_named.tsv | 1 - ...p10percent_anno_AF_082919.abc.I6_named.tsv | 4 - ...rged_edges-AMW-060820_AF.tsv => edges.tsv} | 2 - 
spec/test/djornl/test_data/extra_node.tsv | 3 + .../test/djornl/test_data/hithruput-edges.csv | 3 + spec/test/djornl/test_data/manifest.yaml | 35 ++ ...-AMW-v2_091319_nodeTable.csv => nodes.csv} | 8 +- spec/test/djornl/test_data/pheno_nodes.csv | 5 + .../djornl/valid_manifest/no_file_ext.yaml | 40 ++ .../djornl/valid_manifest/no_file_format.yaml | 39 ++ .../valid_manifest/with_descriptions.yaml | 39 ++ spec/test/stored_queries/test_djornl.py | 4 +- spec/test/test_manifest_schema.py | 82 ++++ spec/views/djornl/djornl_node_view.json | 14 +- 44 files changed, 877 insertions(+), 197 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 spec/datasets/djornl/manifest.schema.json create mode 100644 spec/test/djornl/col_count_errors/manifest.yaml create mode 100644 spec/test/djornl/empty_files/manifest.yaml create mode 100644 spec/test/djornl/invalid_file/edges.tsv/empty create mode 100644 spec/test/djornl/invalid_file/manifest.yaml create mode 100644 spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml create mode 100644 spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml create mode 100644 spec/test/djornl/invalid_manifest/invalid_format.yaml create mode 100644 spec/test/djornl/invalid_manifest/manifest.yaml create mode 100644 spec/test/djornl/invalid_manifest/missing_path.yaml create mode 100644 spec/test/djornl/invalid_manifest/no_file_format.yaml create mode 100644 spec/test/djornl/invalid_manifest/no_file_list.yaml create mode 100644 spec/test/djornl/invalid_types/manifest.yaml create mode 100644 spec/test/djornl/missing_files/manifest.yaml create mode 100644 spec/test/djornl/test_data/I2_named.tsv create mode 100644 spec/test/djornl/test_data/I4_named.tsv create mode 100644 spec/test/djornl/test_data/I6_named.tsv delete mode 100644 spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv delete mode 100644 
spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv delete mode 100644 spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv rename spec/test/djornl/test_data/{merged_edges-AMW-060820_AF.tsv => edges.tsv} (80%) create mode 100644 spec/test/djornl/test_data/extra_node.tsv create mode 100644 spec/test/djornl/test_data/hithruput-edges.csv create mode 100644 spec/test/djornl/test_data/manifest.yaml rename spec/test/djornl/test_data/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (72%) create mode 100644 spec/test/djornl/test_data/pheno_nodes.csv create mode 100644 spec/test/djornl/valid_manifest/no_file_ext.yaml create mode 100644 spec/test/djornl/valid_manifest/no_file_format.yaml create mode 100644 spec/test/djornl/valid_manifest/with_descriptions.yaml create mode 100644 spec/test/test_manifest_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..ad4fa3fc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,23 @@ +## 0.0.6 + +### `relation_engine_server` + +- `relation_engine_server/api_versions/api_v1.py`: add and/or document API endpoints: + - /api/v1/specs/stored_queries + - /api/v1/specs/data_sources + +- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file + +### `importers` + +- `importers/djornl`: use manifest file to specify the list of files to be parsed to create a dataset. Refactor file parsing for more flexibility. 
+ +### `spec` + +- add `datasets` folder for dataset-specific schemas + +---- + +## 0.0.5 + +Last release with RE components in two repositories, https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 771b359a..928b55fe 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -8,8 +8,10 @@ import requests import os import csv +import yaml import importers.utils.config as config +from relation_engine_server.utils.json_validation import run_validator class DJORNL_Parser(object): @@ -28,45 +30,119 @@ def _configure(self): configuration = config.load_from_env(extra_required=['ROOT_DATA_PATH']) # Collection name config - configuration['_NODE_NAME'] = 'djornl_node' - configuration['_EDGE_NAME'] = 'djornl_edge' + configuration['node_name'] = 'djornl_node' + configuration['edge_name'] = 'djornl_edge' - # Path config - configuration['_NODE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv' - ) - configuration['_NODE_FILE_COL_COUNT'] = 20 + # fetch the manifest and make sure all the files listed actually exist + manifest = self._get_manifest(configuration) + for type in ['node', 'edge', 'cluster']: + configuration[type + '_files'] = [] + + error_list = [] + for file in manifest['file_list']: + file_path = os.path.join(configuration['ROOT_DATA_PATH'], file['path']) + + if not os.path.exists(file_path): + error_list.append(f"{file_path}: file does not exist") + continue + + if not os.path.isfile(file_path): + error_list.append(f"{file_path}: not a file") + continue + + # add the file to the appropriate list + file['file_path'] = file_path + configuration[file['data_type'] + '_files'].append(file) + + if error_list: + raise RuntimeError("\n".join(error_list)) - configuration['_EDGE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'merged_edges-AMW-060820_AF.tsv' - ) - 
configuration['_EDGE_FILE_COL_COUNT'] = 5 - - _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data') - configuration['_CLUSTER_PATHS'] = { - 'markov_i2': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv' - ), - 'markov_i4': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv' - ), - 'markov_i6': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv' - ), - } self._config = configuration return self._config + def _get_manifest_schema_file(self): + + return os.path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json') + + def _get_manifest(self, configuration): + """ + Read the manifest file, which contains path and file type info, and validate it. + The manifest is expected to be at ROOT_DATA_PATH/manifest.yaml + """ + + schema_file = self._get_manifest_schema_file() + + # load the manifest and validate it against the schema + manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml') + + try: + with open(manifest_file) as fd: + manifest = yaml.safe_load(fd) + except FileNotFoundError: + raise RuntimeError( + f"No manifest file found at {manifest_file}.\n" + + "Please ensure that you have created a manifest that lists the files " + + "in the release" + ) + + try: + validated_manifest = run_validator( + schema_file=schema_file, + data=manifest + ) + except Exception as err: + print(err) + raise RuntimeError( + "The manifest file failed validation. Please recheck the file and try again." 
+ ) + + return validated_manifest + + def _get_file_reader(self, fd, file): + '''Given a dict containing file information, instantiate the correct type of parser''' + + delimiter = '\t' + if 'file_format' in file and file['file_format'].lower() == 'csv' or file['path'].lower().endswith('.csv'): + delimiter = ',' + return csv.reader(fd, delimiter=delimiter) + + def parser_gen(self, file): + """generator function to parse a file""" + expected_col_count = 0 + with open(file['file_path']) as fd: + csv_reader = self._get_file_reader(fd, file) + line_no = 0 + for row in csv_reader: + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + + cols = [c.strip() for c in row] + + if len(cols) == expected_col_count: + yield (line_no, cols) + continue + + # if we didn't get the expected number of cols: + if expected_col_count == 0: + # this is the header row; set up the expected column count + expected_col_count = len(cols) + yield (line_no, [c.lower() for c in cols]) + continue + + # otherwise, this row does not have the correct number of columns + n_cols = len(cols) + raise RuntimeError( + f"{file['path']} line {line_no}: " + + f"expected {expected_col_count} cols, found {n_cols}" + ) + def load_edges(self): # Headers and sample row: # node1 node2 edge edge_descrip layer_descrip # AT1G01370 AT1G57820 4.40001558779779 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi - edge_remap = { + edge_type_remap = { 'AraGWAS-Phenotype_Associations': 'pheno_assn', 'AraNetv2-CX_pairwise-gene-coexpression': 'gene_coexpr', 'AraNetv2-DC_domain-co-occurrence': 'domain_co_occur', @@ -77,34 +153,56 @@ def load_edges(self): # dict of nodes, indexed by node ID (node1 and node2 from the file) node_ix = {} edges = [] - node_name = self.config('_NODE_NAME') - expected_col_count = self.config('_EDGE_FILE_COL_COUNT') + node_name = self.config('node_name') + + def edge_type(row): + if row['layer_descrip'] not in edge_type_remap: + raise RuntimeError( + 
f"{file['path']} line {line_no}: invalid edge type: {row['layer_descrip']}" + ) + return edge_type_remap[row['layer_descrip']] + + def _key(row): + return '__'.join([ + row['node1'], + row['node2'], + edge_type(row), + row['edge'], + ]) + + # these functions remap the values in the columns of the input file to + # appropriate values to go into Arango + remap_functions = { + '_from': lambda row: node_name + '/' + row['node1'], + '_to': lambda row: node_name + '/' + row['node2'], + 'score': lambda row: float(row['edge']), + 'edge_type': edge_type, + '_key': _key, + } - with open(self.config('_EDGE_PATH')) as fd: - csv_reader = csv.reader(fd, delimiter='\t') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 + for file in self.config('edge_files'): + file_parser = self.parser_gen(file) + headers = [] - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - node_ix[cols[0]] = 1 - node_ix[cols[1]] = 1 - edge_type = cols[4] - if edge_type not in edge_remap: - raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}") - - edges.append({ - '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}', - '_from': f'{node_name}/{cols[0]}', - '_to': f'{node_name}/{cols[1]}', - 'score': float(cols[2]), - 'edge_type': edge_remap[edge_type], - }) + while True: + try: + (line_no, cols) = next(file_parser) + except StopIteration: + break + + if len(headers) == 0: + headers = cols + continue + + # merge headers with cols to create an object + row_object = dict(zip(headers, cols)) + # transform it using the remap_functions + datum = {key: func(row_object) for (key, func) in remap_functions.items()} + edges.append(datum) + + # keep track of the nodes mentioned in this edge set + for node in ["1", "2"]: + node_ix[row_object[f"node{node}"]] = 1 return { 'nodes': [{'_key': n} for n in 
node_ix.keys()], @@ -116,49 +214,69 @@ def load_node_metadata(self): nodes = [] valid_node_types = ['gene', 'pheno'] - expected_col_count = self.config('_NODE_FILE_COL_COUNT') - with open(self.config('_NODE_PATH')) as fd: - csv_reader = csv.reader(fd, delimiter=',') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - _key = cols[0] - node_type = cols[1] - if node_type not in valid_node_types: - raise RuntimeError(f"line {line_no}: invalid node type: {node_type}") - - go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] - - doc = { - '_key': _key, - 'node_type': node_type, - 'transcript': cols[2], - 'gene_symbol': cols[3], - 'gene_full_name': cols[4], - 'gene_model_type': cols[5], - 'tair_computational_desc': cols[6], - 'tair_curator_summary': cols[7], - 'tair_short_desc': cols[8], - 'go_descr': cols[9], - 'go_terms': go_terms, - 'mapman_bin': cols[11], - 'mapman_name': cols[12], - 'mapman_desc': cols[13], - 'pheno_aragwas_id': cols[14], - 'pheno_desc1': cols[15], - 'pheno_desc2': cols[16], - 'pheno_desc3': cols[17], - 'pheno_ref': cols[18], - 'user_notes': cols[19], - } - nodes.append(doc) + def go_terms(row): + if len(row['go_terms']): + return [c.strip() for c in row_object['go_terms'].split(',')] + return [] + + def node_type(row): + if row['node_type'] not in valid_node_types: + raise RuntimeError( + f"{file['path']} line {line_no}: invalid node type: {row['node_type']}" + ) + return row['node_type'] + + remap_functions = { + # these pass straight through + 'transcript': None, + 'gene_symbol': None, + 'gene_full_name': None, + 'gene_model_type': None, + 'tair_computational_description': None, + 'tair_short_description': None, + 'tair_curator_summary': None, + 'mapman_bin': None, + 'mapman_name': None, + 
'pheno_aragwas_id': None, + 'pheno_ref': None, + 'user_notes': None, + # rename + '_key': lambda row: row['node_id'], + 'go_description': lambda row: row['go_descr'], + 'mapman_description': lambda row: row['mapman_descr'], + 'pheno_description': lambda row: row['pheno_descrip1'], + 'pheno_pto_name': lambda row: row['pheno_descrip2'], + 'pheno_pto_description': lambda row: row['pheno_descrip3'], + # see functions above + 'node_type': node_type, + 'go_terms': go_terms, + } + + for file in self.config('node_files'): + file_parser = self.parser_gen(file) + headers = [] + + while True: + try: + (line_no, cols) = next(file_parser) + except StopIteration: + break + + if len(headers) == 0: + headers = cols + continue + + # merge with headers to form an object, then remap to create Arango-ready data + row_object = dict(zip(headers, cols)) + + datum = {} + for (key, func) in remap_functions.items(): + if func is None: + datum[key] = row_object[key] + else: + datum[key] = func(row_object) + nodes.append(datum) return {'nodes': nodes} @@ -167,22 +285,29 @@ def load_cluster_data(self): # index of nodes node_ix = {} - - cluster_paths = self.config('_CLUSTER_PATHS') - for (cluster_label, path) in cluster_paths.items(): - with open(path) as fd: - csv_reader = csv.reader(fd, delimiter='\t') - for row in csv_reader: - if len(row) > 1: - # remove the 'Cluster' text and replace it with cluster_label - cluster_id = cluster_label + ':' + row[0].replace('Cluster', '') - - node_keys = row[1:] - for key in node_keys: - if key not in node_ix: - node_ix[key] = [cluster_id] - elif cluster_id not in node_ix[key]: - node_ix[key].append(cluster_id) + for file in self.config('cluster_files'): + cluster_label = file['cluster_prefix'] + headers = [] + file_parser = self.parser_gen(file) + + while True: + try: + (line_no, cols) = next(file_parser) + except StopIteration: + break + + if len(headers) == 0: + headers = cols + continue + + # remove the 'Cluster' text and replace it with cluster_label + 
cluster_id = cluster_label + ':' + cols[0].replace('Cluster', '') + node_keys = [n.strip() for n in cols[1].split(',')] + for key in node_keys: + if key not in node_ix: + node_ix[key] = [cluster_id] + elif cluster_id not in node_ix[key]: + node_ix[key].append(cluster_id) # gather a list of cluster IDs for each node nodes = [{ @@ -195,10 +320,10 @@ def load_cluster_data(self): def save_dataset(self, dataset): if 'nodes' in dataset and len(dataset['nodes']) > 0: - self.save_docs(self.config('_NODE_NAME'), dataset['nodes']) + self.save_docs(self.config('node_name'), dataset['nodes']) if 'edges' in dataset and len(dataset['edges']) > 0: - self.save_docs(self.config('_EDGE_NAME'), dataset['edges']) + self.save_docs(self.config('edge_name'), dataset['edges']) def save_docs(self, coll_name, docs, on_dupe='update'): diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py index 2ad9184e..e2442c8d 100644 --- a/importers/test/test_djornl_parser.py +++ b/importers/test/test_djornl_parser.py @@ -33,18 +33,29 @@ def init_parser_with_path(self, root_path): parser._configure() return parser + def test_load_no_manifest(self): + """ test loading when the manifest does not exist """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest') + err_str = 'No manifest file found at ' + os.path.join(RES_ROOT_DATA_PATH, 'manifest.yaml') + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + def test_load_invalid_manifest(self): + """ test an invalid manifest file """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_manifest') + err_str = "The manifest file failed validation" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + def test_load_invalid_file(self): """ test loading when what is supposed to be a file is actually a directory """ RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file') # edges: 
directory, not a file - edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv") - err_str = f"Is a directory: '{edges_file_path}'" - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - - with self.assertRaisesRegex(IsADirectoryError, err_str): - parser.load_edges() + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) def test_load_empty_files(self): """ test loading files containing no data """ @@ -60,20 +71,11 @@ def test_load_empty_files(self): def test_load_missing_files(self): """ test loading when files cannot be found """ - # this dir does not contain the correct file structure - # path: test/djornl/empty_files/cluster_data - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files', 'cluster_data') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - - err_str = "No such file or directory: '" + RES_ROOT_DATA_PATH - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_edges() - - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_node_metadata() - - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_cluster_data() + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'missing_files') + # not found + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ': file does not exist' + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) def test_load_invalid_types(self): """ test file format errors """ @@ -83,12 +85,12 @@ def test_load_invalid_types(self): parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) # invalid edge type - edge_err_msg = 'line 2: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' + edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' with self.assertRaisesRegex(RuntimeError, edge_err_msg): 
parser.load_edges() # invalid node type - node_err_msg = 'line 4: invalid node type: Monkey' + node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() diff --git a/spec/README.md b/spec/README.md index 7eb975bc..27b2c8ee 100644 --- a/spec/README.md +++ b/spec/README.md @@ -7,6 +7,7 @@ These specifications are used by the [Relation Engine API](relation_engine_serve * **[Stored queries](spec/stored_queries)** are stored [AQL queries](https://docs.arangodb.com/3.5/AQL/index.html) that can be used by KBase apps to fetch data from the database. * **[Collections, or document schemas,](spec/collections)** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in the database's collections. +* **[Datasets](spec/datasets)** contain partial and full schemas specific to a certain dataset. * **[Data sources](spec/data_sources)** contain general information about where some of our imported data comes from. * **[Views](spec/views)** are raw ArangoSearch view configuration files diff --git a/spec/collections/djornl/djornl_edge.yaml b/spec/collections/djornl/djornl_edge.yaml index 3afa987f..1c956fdc 100644 --- a/spec/collections/djornl/djornl_edge.yaml +++ b/spec/collections/djornl/djornl_edge.yaml @@ -33,15 +33,20 @@ schema: type: string oneOf: - const: domain_co_occur + title: AraNetv2-DC_domain-co-occurrence description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). - const: gene_coexpr + title: AraNetv2-CX_pairwise-gene-coexpression description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. 
The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). - const: pheno_assn + title: AraGWAS-Phenotype_Associations description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction. - const: ppi_hithru + title: AraNetv2-HT_high-throughput-ppi description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). - const: ppi_liter + title: AraNetv2-LC_lit-curated-ppi description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). 
diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml index 13b14541..e0a744a7 100644 --- a/spec/collections/djornl/djornl_node.yaml +++ b/spec/collections/djornl/djornl_node.yaml @@ -46,7 +46,7 @@ schema: type: string title: Gene model type examples: ["protein_coding"] - tair_computational_desc: + tair_computational_description: type: string title: TAIR computational description examples: ["NAC domain containing protein 1;(source:Araport11)"] @@ -54,11 +54,11 @@ schema: type: string title: TAIR curator summary examples: ["Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed."] - tair_short_desc: + tair_short_description: type: string title: TAIR short description examples: ["NAC domain containing protein 1"] - go_descr: + go_description: type: string title: GO descriptions examples: ["DNA-binding transcription factor activity"] @@ -75,7 +75,7 @@ schema: type: string title: Mapman name examples: [".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)"] - mapman_desc: + mapman_description: type: string title: Mapman description examples: ["transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])"] @@ -83,17 +83,19 @@ schema: type: string title: AraGWAS ID examples: ["10.21958/phenotype:67"] - pheno_desc1: + pheno_description: type: string - title: Phenotype description 1 + title: Phenotype description examples: ["Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008"] - pheno_desc2: + pheno_pto_name: type: string title: PTO name + description: Plant Trait Ontology name examples: ["arsenic concentration"] - pheno_desc3: + pheno_pto_description: type: string title: PTO description + description: Plant Trait Ontology description examples: ["A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]"] pheno_ref: type: string diff --git a/spec/datasets/djornl/manifest.schema.json b/spec/datasets/djornl/manifest.schema.json new file mode 100644 index 00000000..beb7309f --- /dev/null +++ b/spec/datasets/djornl/manifest.schema.json @@ -0,0 +1,102 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Exascale parser file manifest", + "type": "object", + "required": ["name", "file_list"], + "properties": { + "name": { + "title": "Dataset name", + "type": "string", + "description": "The name of the dataset", + "examples": ["Dan Jacobson Exascale dataset"] + }, + "release_date": { + "title": "Release date", + "type": "string", + "description": "Date of the dataset release, in YYYY-MM-DD format", + "format": "date" + }, + "notes": { + "type": "string", + "title": "Release notes", + "description": "Free text describing the release and any notes, or comments relevant to consumers of the data." 
+ }, + "file_list": { + "type": "array", + "items": { + "type": "object", + "required": ["data_type", "path"], + "oneOf": [{ + "properties": { + "data_type": { + "enum": ["cluster"] + } + }, + "required": ["cluster_prefix"] + }, + { + "properties": { + "data_type": { + "enum": ["node", "edge"] + } + } + } + ], + "anyOf": [{ + "properties": { + "file_format": { + "enum": ["tsv", "csv"] + } + }, + "required": ["file_format"] + }, + { + "properties": { + "path": { + "format": "regex", + "pattern": ".[ct]sv" + } + }, + "required": ["path"] + } + ], + "properties": { + "path": { + "title": "File path", + "type": "string" + }, + "data_type": { + "title": "Data type", + "type": "string", + "enum": ["node", "edge", "cluster"] + }, + "file_format": { + "title": "File format", + "type": "string", + "enum": ["tsv", "csv"] + }, + "date": { + "title": "File creation date", + "description": "date of file creation in the format YYYY-MM-DD", + "type": "string", + "format": "date" + }, + "description": { + "title": "Description of the file contents", + "type": "string" + }, + "cluster_prefix": { + "title": "Prefix", + "type": "string", + "description": "The prefix to be used for clusters, e.g. markov_i2:4. Required for cluster data, not used for node or edge data" + }, + "cluster_title": { + "title": "Cluster set name", + "description": "Human-readable name of the cluster set. 
Not used for edge or node data", + "type": "string" + } + } + } + } + } +} diff --git a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml index 1fadca36..4aa6070e 100644 --- a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml +++ b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml @@ -5,6 +5,7 @@ params: required: [cluster_ids] properties: cluster_ids: + type: array title: Cluster IDs description: Cluster IDs, in the form "clustering_system_name:cluster_id" items: {type: string} diff --git a/spec/stored_queries/djornl/djornl_search_nodes.yaml b/spec/stored_queries/djornl/djornl_search_nodes.yaml index 7cd36c4f..9c8d6a1d 100644 --- a/spec/stored_queries/djornl/djornl_search_nodes.yaml +++ b/spec/stored_queries/djornl/djornl_search_nodes.yaml @@ -19,15 +19,15 @@ query: | LET node_ids = ( FOR g IN djornl_node_view SEARCH ANALYZER( - PHRASE(g.tair_computational_desc, @search_text) OR - PHRASE(g.tair_short_desc, @search_text) OR - PHRASE(g.mapman_desc, @search_text) OR - PHRASE(g.go_desc, @search_text) OR + PHRASE(g.tair_computational_description, @search_text) OR + PHRASE(g.tair_short_description, @search_text) OR + PHRASE(g.mapman_description, @search_text) OR + PHRASE(g.go_description, @search_text) OR PHRASE(g.mapman_name, @search_text) OR - PHRASE(g.mapman_desc, @search_text) OR - PHRASE(g.pheno_desc1, @search_text) OR - PHRASE(g.pheno_desc2, @search_text) OR - PHRASE(g.pheno_desc3, @search_text) OR + PHRASE(g.mapman_description, @search_text) OR + PHRASE(g.pheno_description, @search_text) OR + PHRASE(g.pheno_pto_name, @search_text) OR + PHRASE(g.pheno_pto_description, @search_text) OR PHRASE(g.user_notes, @search_text), 'text_en' ) diff --git a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index 48e5ab19..7727cd8a 100644 --- 
a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1,4 +1,4 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, diff --git a/spec/test/djornl/col_count_errors/manifest.yaml b/spec/test/djornl/col_count_errors/manifest.yaml new file mode 100644 index 00000000..50c5f454 --- /dev/null +++ b/spec/test/djornl/col_count_errors/manifest.yaml @@ -0,0 +1,8 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + + - data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/spec/test/djornl/empty_files/manifest.yaml b/spec/test/djornl/empty_files/manifest.yaml new file mode 100644 index 00000000..fb231666 --- /dev/null +++ b/spec/test/djornl/empty_files/manifest.yaml @@ -0,0 +1,20 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + + - data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv diff --git a/spec/test/djornl/invalid_file/edges.tsv/empty b/spec/test/djornl/invalid_file/edges.tsv/empty new file mode 100644 index 00000000..e69de29b diff --git 
a/spec/test/djornl/invalid_file/manifest.yaml b/spec/test/djornl/invalid_file/manifest.yaml new file mode 100644 index 00000000..d79e76a7 --- /dev/null +++ b/spec/test/djornl/invalid_file/manifest.yaml @@ -0,0 +1,12 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: clusters.tsv diff --git a/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml new file mode 100644 index 00000000..b8993731 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml @@ -0,0 +1,18 @@ +# first cluster file has no prefix +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv diff --git a/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml new file mode 100644 index 00000000..d40e9e6f --- /dev/null +++ b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml @@ -0,0 +1,17 @@ +# edge date is not quoted (pyyaml creates a datetime.date object) +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: 2020-12-25 + path: edge_data + file_format: csv + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/invalid_format.yaml b/spec/test/djornl/invalid_manifest/invalid_format.yaml new file mode 100644 index 00000000..125227ce --- /dev/null +++ b/spec/test/djornl/invalid_manifest/invalid_format.yaml @@ -0,0 +1,17 @@ +# invalid node file format +name: 
Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + path: edge_data.tsv + + - data_type: node + path: nodes.csv + date: "2019-01-01" + file_format: txt + + - data_type: cluster + cluster_prefix: markov_i2 + file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/manifest.yaml b/spec/test/djornl/invalid_manifest/manifest.yaml new file mode 100644 index 00000000..7abfa0e2 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/manifest.yaml @@ -0,0 +1,14 @@ +# multiple errors +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + + - data_type: cluster + path: clusters.tsv + + - data_type: ping-pong balls + path: where? diff --git a/spec/test/djornl/invalid_manifest/missing_path.yaml b/spec/test/djornl/invalid_manifest/missing_path.yaml new file mode 100644 index 00000000..c93bec17 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/missing_path.yaml @@ -0,0 +1,21 @@ +# edge file path missing +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv diff --git a/spec/test/djornl/invalid_manifest/no_file_format.yaml b/spec/test/djornl/invalid_manifest/no_file_format.yaml new file mode 100644 index 00000000..ee8b9082 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/no_file_format.yaml @@ -0,0 +1,16 @@ +# edge file has no indicator of file format +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + date: "2020-12-25" + path: edge_data + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + 
file_format: tsv + path: clusters diff --git a/spec/test/djornl/invalid_manifest/no_file_list.yaml b/spec/test/djornl/invalid_manifest/no_file_list.yaml new file mode 100644 index 00000000..ebaf9fb8 --- /dev/null +++ b/spec/test/djornl/invalid_manifest/no_file_list.yaml @@ -0,0 +1,3 @@ +# missing file_list +name: Dan Jacobson Exascale data +release_date: "2020-06-06" diff --git a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index af5fa6cb..e98310e5 100644 --- a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1,4 +1,5 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes +# data_type: node As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, diff --git a/spec/test/djornl/invalid_types/manifest.yaml b/spec/test/djornl/invalid_types/manifest.yaml new file mode 100644 index 00000000..50c5f454 --- /dev/null +++ b/spec/test/djornl/invalid_types/manifest.yaml @@ -0,0 +1,8 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + + - data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv index f9857bde..a98f49f9 100644 --- a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv +++ b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv @@ -1,3 +1,4 @@ +# data_type: edge node1 node2 edge edge_descrip layer_descrip As2 AT1G01020 8.422046084731258 AraGWAS-Association_score AraGWAS-Some-Old-Rubbish-I-Made-Up As2 AT1G01040 5.422046084731258 AraGWAS-Association_score AraGWAS-Phenotype_Associations diff --git a/spec/test/djornl/missing_files/manifest.yaml 
b/spec/test/djornl/missing_files/manifest.yaml new file mode 100644 index 00000000..d79e76a7 --- /dev/null +++ b/spec/test/djornl/missing_files/manifest.yaml @@ -0,0 +1,12 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +file_list: + - data_type: edge + path: edges.tsv + + - data_type: node + path: nodes.csv + + - data_type: cluster + cluster_prefix: markov_i2 + path: clusters.tsv diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json index 7fd3a4d5..e15408d2 100644 --- a/spec/test/djornl/results.json +++ b/spec/test/djornl/results.json @@ -40,20 +40,20 @@ }, "load_node_metadata": { "nodes": [ - {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_desc1": "", "pheno_desc2": "bacterial disease resistance", "pheno_desc3": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, - {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_desc1": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_desc2": "arsenic concentration", "pheno_desc3": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, - {"_key": "AT1G01010", "node_type": "gene", "transcript": "AT1G01010.1", "gene_symbol": "NTL10", "gene_full_name": "NAC domain containing protein 1", "gene_model_type": "protein_coding", "tair_computational_desc": "NAC domain containing protein 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "NAC domain containing protein 1", "go_descr": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.17", "mapman_name": ".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)", "mapman_desc": "transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01020", "node_type": "gene", "transcript": "AT1G01020.6", "gene_symbol": "ARV1", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "ARV1 family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "molecular_function", "go_terms": ["GO:0003674"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein 
[Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01030", "node_type": "gene", "transcript": "AT1G01030.2", "gene_symbol": "NGA3", "gene_full_name": "NGATHA3", "gene_model_type": "protein_coding", "tair_computational_desc": "AP2/B3-like transcriptional factor family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.5.3", "mapman_name": ".RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA)", "mapman_desc": "transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01040", "node_type": "gene", "transcript": "AT1G01040.2", "gene_symbol": "SUS1", "gene_full_name": "SUSPENSOR 1", "gene_model_type": "protein_coding", "tair_computational_desc": "dicer-like 1;(source:Araport11)", "tair_curator_summary": "Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. 
The mRNA is cell-to-cell mobile.", "tair_short_desc": "dicer-like 1", "go_descr": "metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding", "go_terms": ["GO:0046872", "GO:0005515", "GO:0004525", "GO:0008026", "GO:0005524", "GO:0003723", "GO:0004386", "GO:0003725", "GO:0003677"], "mapman_bin": "16.10.2.1.1", "mapman_name": ".RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1", "mapman_desc": "endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01050", "node_type": "gene", "transcript": "AT1G01050.2", "gene_symbol": "PPa1", "gene_full_name": "pyrophosphorylase 1", "gene_model_type": "protein_coding", "tair_computational_desc": "pyrophosphorylase 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "inorganic diphosphatase activity", "go_terms": ["GO:0004427"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01060", "node_type": "gene", 
"transcript": "AT1G01060.8", "gene_symbol": "LHY1", "gene_full_name": "LATE ELONGATED HYPOCOTYL 1", "gene_model_type": "protein_coding", "tair_computational_desc": "Homeodomain-like superfamily protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding", "go_terms": ["GO:0003700", "GO:0003677", "GO:0044212"], "mapman_bin": "27.1.1", "mapman_name": ".Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1)", "mapman_desc": "circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01070", "node_type": "gene", "transcript": "AT1G01070.2", "gene_symbol": "UMAMIT28", "gene_full_name": "Usually multiple acids move in and out Transporters 28", "gene_model_type": "protein_coding", "tair_computational_desc": "nodulin MtN21 /EamA-like transporter family protein;(source:Araport11)", "tair_curator_summary": "Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.", "tair_short_desc": "nodulin MtN21 /EamA-like transporter family protein", "go_descr": "L-glutamine transmembrane transporter activity", "go_terms": ["GO:0015186"], "mapman_bin": "24.2.1.5", "mapman_name": ".Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT)", "mapman_desc": "solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein 
[Source:UniProtKB/TrEMBL;Acc:A0A178WFU3])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01080", "node_type": "gene", "transcript": "AT1G01080.3", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "RNA binding, mRNA binding", "go_terms": ["GO:0003723", "GO:0003729"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01090", "node_type": "gene", "transcript": "AT1G01090.1", "gene_symbol": "PDH-E1 ALPHA", "gene_full_name": "pyruvate dehydrogenase E1 alpha", "gene_model_type": "protein_coding", "tair_computational_desc": "pyruvate dehydrogenase E1 alpha;(source:Araport11)", "tair_curator_summary": "pyruvate dehydrogenase E1 alpha subunit", "tair_short_desc": "pyruvate dehydrogenase E1 alpha", "go_descr": "pyruvate dehydrogenase (acetyl-transferring) activity, protein binding", "go_terms": ["GO:0004739", "GO:0005515"], "mapman_bin": "5.1.2.2.1.1", "mapman_name": ".Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha", "mapman_desc": "subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 
gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "AT1G01100", "node_type": "gene", "transcript": "AT1G01100.4", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "60S acidic ribosomal protein family;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "60S acidic ribosomal protein family", "go_descr": "structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity", "go_terms": ["GO:0003735", "GO:0043021", "GO:0030295"], "mapman_bin": "17.1.2.1.46", "mapman_name": ".Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1", "mapman_desc": "component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""}, - {"_key": "Na23", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:5", "pheno_desc1": "Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_desc2": "sodium concentration", "pheno_desc3": "The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, - {"_key": "SDV", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:104", "pheno_desc1": "Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200", "pheno_desc2": "days to flowering trait", "pheno_desc3": "A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""} + {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_description": "", "pheno_pto_name": "bacterial disease resistance", "pheno_pto_description": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. 
[GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, + {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_description": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "arsenic concentration", "pheno_pto_description": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, + {"_key": "AT1G01010", "node_type": "gene", "transcript": "AT1G01010.1", "gene_symbol": "NTL10", "gene_full_name": "NAC domain containing protein 1", "gene_model_type": "protein_coding", "tair_computational_description": "NAC domain containing protein 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "NAC domain containing protein 1", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.17", "mapman_name": ".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)", "mapman_description": "transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": 
"AT1G01020", "node_type": "gene", "transcript": "AT1G01020.6", "gene_symbol": "ARV1", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "ARV1 family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "molecular_function", "go_terms": ["GO:0003674"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01030", "node_type": "gene", "transcript": "AT1G01030.2", "gene_symbol": "NGA3", "gene_full_name": "NGATHA3", "gene_model_type": "protein_coding", "tair_computational_description": "AP2/B3-like transcriptional factor family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.5.3", "mapman_name": ".RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA)", "mapman_description": "transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01040", "node_type": "gene", 
"transcript": "AT1G01040.2", "gene_symbol": "SUS1", "gene_full_name": "SUSPENSOR 1", "gene_model_type": "protein_coding", "tair_computational_description": "dicer-like 1;(source:Araport11)", "tair_curator_summary": "Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.", "tair_short_description": "dicer-like 1", "go_description": "metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding", "go_terms": ["GO:0046872", "GO:0005515", "GO:0004525", "GO:0008026", "GO:0005524", "GO:0003723", "GO:0004386", "GO:0003725", "GO:0003677"], "mapman_bin": "16.10.2.1.1", "mapman_name": ".RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1", "mapman_description": "endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01050", "node_type": "gene", "transcript": "AT1G01050.2", "gene_symbol": "PPa1", "gene_full_name": "pyrophosphorylase 1", "gene_model_type": "protein_coding", "tair_computational_description": "pyrophosphorylase 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "inorganic 
diphosphatase activity", "go_terms": ["GO:0004427"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01060", "node_type": "gene", "transcript": "AT1G01060.8", "gene_symbol": "LHY1", "gene_full_name": "LATE ELONGATED HYPOCOTYL 1", "gene_model_type": "protein_coding", "tair_computational_description": "Homeodomain-like superfamily protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding", "go_terms": ["GO:0003700", "GO:0003677", "GO:0044212"], "mapman_bin": "27.1.1", "mapman_name": ".Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1)", "mapman_description": "circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01070", "node_type": "gene", "transcript": "AT1G01070.2", "gene_symbol": "UMAMIT28", "gene_full_name": "Usually multiple acids move in and out Transporters 28", "gene_model_type": "protein_coding", "tair_computational_description": "nodulin MtN21 /EamA-like 
transporter family protein;(source:Araport11)", "tair_curator_summary": "Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.", "tair_short_description": "nodulin MtN21 /EamA-like transporter family protein", "go_description": "L-glutamine transmembrane transporter activity", "go_terms": ["GO:0015186"], "mapman_bin": "24.2.1.5", "mapman_name": ".Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT)", "mapman_description": "solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01080", "node_type": "gene", "transcript": "AT1G01080.3", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "RNA binding, mRNA binding", "go_terms": ["GO:0003723", "GO:0003729"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01090", "node_type": "gene", "transcript": "AT1G01090.1", 
"gene_symbol": "PDH-E1 ALPHA", "gene_full_name": "pyruvate dehydrogenase E1 alpha", "gene_model_type": "protein_coding", "tair_computational_description": "pyruvate dehydrogenase E1 alpha;(source:Araport11)", "tair_curator_summary": "pyruvate dehydrogenase E1 alpha subunit", "tair_short_description": "pyruvate dehydrogenase E1 alpha", "go_description": "pyruvate dehydrogenase (acetyl-transferring) activity, protein binding", "go_terms": ["GO:0004739", "GO:0005515"], "mapman_bin": "5.1.2.2.1.1", "mapman_name": ".Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha", "mapman_description": "subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "AT1G01100", "node_type": "gene", "transcript": "AT1G01100.4", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "60S acidic ribosomal protein family;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "60S acidic ribosomal protein family", "go_description": "structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity", "go_terms": ["GO:0003735", "GO:0043021", "GO:0030295"], "mapman_bin": "17.1.2.1.46", "mapman_name": ".Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1", "mapman_description": "component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 
transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""}, + {"_key": "Na23", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:5", "pheno_description": "Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "sodium concentration", "pheno_pto_description": "The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, + {"_key": "SDV", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:104", "pheno_description": "Number of days following stratification to opening of first flower. 
The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200", "pheno_pto_name": "days to flowering trait", "pheno_pto_description": "A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""} ] }, "fetch_all": { diff --git a/spec/test/djornl/test_data/I2_named.tsv b/spec/test/djornl/test_data/I2_named.tsv new file mode 100644 index 00000000..3f129851 --- /dev/null +++ b/spec/test/djornl/test_data/I2_named.tsv @@ -0,0 +1,9 @@ +cluster ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +Cluster3 AT1G01080,AT1G01090 +Cluster4 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/test_data/I4_named.tsv b/spec/test/djornl/test_data/I4_named.tsv new file mode 100644 index 00000000..39e46deb --- /dev/null +++ b/spec/test/djornl/test_data/I4_named.tsv @@ -0,0 +1,4 @@ +cluster ids +# cluster_prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster diff --git a/spec/test/djornl/test_data/I6_named.tsv b/spec/test/djornl/test_data/I6_named.tsv new file mode 100644 index 00000000..d504f6b4 --- /dev/null +++ b/spec/test/djornl/test_data/I6_named.tsv @@ -0,0 +1,8 @@ +cluster ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1 AT1G01040,AT1G01090 +Cluster2 AT1G01070 +Cluster3 AT1G01010,AT1G01020,AT1G01030 +Cluster4 diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv 
b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv deleted file mode 100644 index 086a9209..00000000 --- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv +++ /dev/null @@ -1,5 +0,0 @@ -Cluster1 AT1G01010 AT1G01030 AT1G01040 -Cluster2 AT1G01050 AT1G01060 AT1G01070 -Cluster3 AT1G01080 AT1G01090 -Cluster4 -Cluster5 AT1G01020 diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv deleted file mode 100644 index 8b137891..00000000 --- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv +++ /dev/null @@ -1 +0,0 @@ - diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv deleted file mode 100644 index 389cae2e..00000000 --- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv +++ /dev/null @@ -1,4 +0,0 @@ -Cluster1 AT1G01040 AT1G01090 -Cluster2 AT1G01070 -Cluster3 AT1G01010 AT1G01020 AT1G01030 -Cluster4 diff --git a/spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/test_data/edges.tsv similarity index 80% rename from spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv rename to spec/test/djornl/test_data/edges.tsv index 44acc6ff..ee443140 100644 --- a/spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv +++ b/spec/test/djornl/test_data/edges.tsv @@ -2,8 +2,6 @@ node1 node2 edge edge_descrip layer_descrip As2 AT1G01020 8.4 AraGWAS-Association_score AraGWAS-Phenotype_Associations As2 AT1G01040 5.4 AraGWAS-Association_score 
AraGWAS-Phenotype_Associations As75 AT1G01020 39.9 AraGWAS-Association_score AraGWAS-Phenotype_Associations -AT1G01010 AT1G01020 2.3 AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi -AT1G01010 AT1G01030 2.4 AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score AraNetv2-DC_domain-co-occurrence AT1G01010 AT1G01040 170.5 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi AT1G01030 AT1G01050 2.6 AraNetv2_log-likelihood-score AraNetv2-CX_pairwise-gene-coexpression diff --git a/spec/test/djornl/test_data/extra_node.tsv b/spec/test/djornl/test_data/extra_node.tsv new file mode 100644 index 00000000..664425c5 --- /dev/null +++ b/spec/test/djornl/test_data/extra_node.tsv @@ -0,0 +1,3 @@ +# data_type: node +node_id node_type transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_descr GO_terms MapMan_bin MapMan_name MapMan_descr pheno_AraGWAS_ID pheno_descrip1 pheno_descrip2 pheno_descrip3 pheno_ref User_Notes +AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/test_data/hithruput-edges.csv b/spec/test/djornl/test_data/hithruput-edges.csv new file mode 100644 index 00000000..586476ab --- /dev/null +++ b/spec/test/djornl/test_data/hithruput-edges.csv @@ -0,0 
+1,3 @@ +node1,node2,edge,edge_descrip,layer_descrip +AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi +AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi diff --git a/spec/test/djornl/test_data/manifest.yaml b/spec/test/djornl/test_data/manifest.yaml new file mode 100644 index 00000000..1762a86d --- /dev/null +++ b/spec/test/djornl/test_data/manifest.yaml @@ -0,0 +1,35 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +home_url: "https://github.com/kbase/exascale_data" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: edge + path: hithruput-edges.csv + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv + + - data_type: node + path: pheno_nodes.csv + date: "2019-01-01" + + - data_type: node + path: extra_node.tsv + date: "2019-01-01" diff --git a/spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/test_data/nodes.csv similarity index 72% rename from spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv rename to spec/test/djornl/test_data/nodes.csv index 5bc0e1d8..2245bd59 100644 --- a/spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/test_data/nodes.csv @@ -1,6 +1,5 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes -As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants 
(population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", -As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description 
available(sp|q5mk24|arv1_arath : 99.4),,,,,, AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, @@ -10,6 +9,3 @@ AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeod AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein 
[Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, -AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]),,,,,, -Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", -SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/test_data/pheno_nodes.csv b/spec/test/djornl/test_data/pheno_nodes.csv new file mode 100644 index 00000000..d8bb15a9 --- /dev/null +++ b/spec/test/djornl/test_data/pheno_nodes.csv @@ -0,0 +1,5 @@ +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes +As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/valid_manifest/no_file_ext.yaml b/spec/test/djornl/valid_manifest/no_file_ext.yaml new file mode 100644 index 00000000..3120e553 --- /dev/null +++ b/spec/test/djornl/valid_manifest/no_file_ext.yaml @@ -0,0 +1,40 @@ +# nodes does not have a file extension, so file format must be used +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. AraGWAS phenotype-GWAS layer has an FDR filter was applied, removing some of the edges. 
The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer. + date_created: "2020-06-08" + + - data_type: node + path: nodes + file_format: csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/djornl/valid_manifest/no_file_format.yaml b/spec/test/djornl/valid_manifest/no_file_format.yaml new file mode 100644 index 00000000..d3010deb --- /dev/null +++ b/spec/test/djornl/valid_manifest/no_file_format.yaml @@ -0,0 +1,39 @@ +# node file has no format specified, so the path must be parsed +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. AraGWAS phenotype-GWAS layer has an FDR filter was applied, removing some of the edges. 
The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer. + date_created: "2020-06-08" + + - data_type: node + path: nodes.csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/djornl/valid_manifest/with_descriptions.yaml b/spec/test/djornl/valid_manifest/with_descriptions.yaml new file mode 100644 index 00000000..6c39a234 --- /dev/null +++ b/spec/test/djornl/valid_manifest/with_descriptions.yaml @@ -0,0 +1,39 @@ +name: Dan Jacobson Exascale data +release_date: "2020-08-06" +description: Preliminary Jacobson dataset +file_list: + - data_type: edge + path: merged_edges-AMW-060820_AF.tsv + file_format: tsv + description: Merged edge data. AraGWAS phenotype-GWAS layer has an FDR filter was applied, removing some of the edges. 
The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer. + date_created: "2020-06-08" + + - data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + file_format: csv + description: Merged AraNet AraGWAS gene and phenotype data + date_created: "2019-09-13" + + - data_type: cluster + cluster_prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + file_format: tsv + title: Markov clustering, inflation 2 + description: Iterative random forest Markov clustering, inflation set to 2 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + file_format: tsv + title: Markov clustering, inflation 4 + description: Iterative random forest Markov clustering, inflation set to 4 + date_created: "2019-08-19" + + - data_type: cluster + cluster_prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + file_format: tsv + title: Markov clustering, inflation 6 + description: Iterative random forest Markov clustering, inflation set to 6 + date_created: "2019-08-19" diff --git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py index 8e492fbe..cde4d0c2 100644 --- a/spec/test/stored_queries/test_djornl.py +++ b/spec/test/stored_queries/test_djornl.py @@ -40,8 +40,8 @@ def setUpClass(cls): root_path = os.path.join(_TEST_DIR, 'djornl', 'test_data') with modified_environ(RES_ROOT_DATA_PATH=root_path): parser = DJORNL_Parser() - node_name = parser.config('_NODE_NAME') - edge_name = parser.config('_EDGE_NAME') + node_name = parser.config('node_name') + edge_name = parser.config('edge_name') edge_data = parser.load_edges() r = create_test_docs(node_name, edge_data['nodes']) diff --git 
a/spec/test/test_manifest_schema.py b/spec/test/test_manifest_schema.py new file mode 100644 index 00000000..4237624d --- /dev/null +++ b/spec/test/test_manifest_schema.py @@ -0,0 +1,82 @@ +""" +Tests for manifest.schema.json + +Ensure that the manifest schema correctly validates data +""" +import unittest +import os.path as os_path +from relation_engine_server.utils.json_validation import run_validator +from jsonschema.exceptions import ValidationError + +schema_file = os_path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json') +_TEST_DIR = os_path.join('/app', 'spec', 'test', 'djornl') + + +class Test_Manifest_Schema(unittest.TestCase): + + def test_load_invalid_manifest(self): + """ test an invalid manifest file """ + + invalid_dir = os_path.join(_TEST_DIR, 'invalid_manifest') + + error_list = [ + { + # no file list provided + 'file': 'no_file_list', + 'msg': "'file_list' is a required property", + }, + { + # a cluster file entry should have a prefix + 'file': 'cluster_no_prefix', + 'msg': r"{'data_type': 'cluster', 'path': 'I2_named.tsv'} is not valid under any of the given schemas", + }, + { + # each file_list entry has to have a path + 'file': 'missing_path', + 'msg': "'path' is a required property", + }, + { + # if the date is not quoted, pyyaml will turn it into a date object. Doh! 
+ 'file': 'date_not_in_quotes', + 'msg': "datetime.date\(2020, 12, 25\) is not of type 'string'", + }, + { + # file format is invalid + 'file': 'invalid_format', + 'msg': "'txt' is not one of \['tsv', 'csv'\]" + }, + { + # there must be an indicator of file format + 'file': 'no_file_format', + 'msg': r"{'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'}" + + " is not valid under any of the given schemas", + }, + ] + + for entry in error_list: + data_file = os_path.join(invalid_dir, entry['file'] + '.yaml') + print('looking at ' + data_file) + + with self.assertRaisesRegex(ValidationError, entry['msg']): + run_validator( + schema_file=schema_file, + data_file=data_file, + nicer_errors=True + ) + + def test_load_valid_manifests(self): + + valid_dir = os_path.join(_TEST_DIR, 'valid_manifest') + file_list = ['with_descriptions', 'no_file_ext', 'no_file_format'] + + for file in file_list: + data_file = os_path.join(valid_dir, file + '.yaml') + print('looking at ' + data_file) + + self.assertTrue( + run_validator( + schema_file=schema_file, + data_file=data_file, + nicer_errors=True + ) + ) diff --git a/spec/views/djornl/djornl_node_view.json b/spec/views/djornl/djornl_node_view.json index 7330fd3a..2287f615 100644 --- a/spec/views/djornl/djornl_node_view.json +++ b/spec/views/djornl/djornl_node_view.json @@ -19,19 +19,19 @@ ], "fields": { "transcript": {}, - "tair_computational_desc": { + "tair_computational_description": { "analyzers": [ "text_en" ] }, - "tair_short_desc": { + "tair_short_description": { "analyzers": [ "text_en" ] }, "gene_model_type": {}, "go_terms": {}, - "go_desc": { + "go_description": { "analyzers": [ "text_en" ] @@ -41,22 +41,22 @@ "text_en" ] }, - "mapman_desc": { + "mapman_description": { "analyzers": [ "text_en" ] }, - "pheno_desc1": { + "pheno_description": { "analyzers": [ "text_en" ] }, - "pheno_desc2": { + "pheno_pto_name": { "analyzers": [ "text_en" ] }, - "pheno_desc3": { + "pheno_pto_description": { "analyzers": [ "text_en" 
] From b2357f1e580c2b98cfc7242952ed35a62febd66f Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Thu, 20 Aug 2020 16:12:29 -0700 Subject: [PATCH 2/4] Update changelog to use Keep a Changelog format --- CHANGELOG.md | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad4fa3fc..ad51ca09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,23 +1,33 @@ -## 0.0.6 +# Changelog for kbase/relation_engine -### `relation_engine_server` +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.0.6] - 2020-08-20 + +### Added - `relation_engine_server/api_versions/api_v1.py`: add and/or document API endpoints: - - /api/v1/specs/stored_queries - - /api/v1/specs/data_sources + - `/api/v1/specs/stored_queries` + - `/api/v1/specs/data_sources` +- `spec/datasets`: new directory for dataset-specific schemas, e.g. DJORNL parser manifest, `spec/datasets/djornl/manifest.schema.json` +- `spec/test/test_manifest_schema.py`: to test the manifest schema against example input +- `importers/djornl/parser.py`: use manifest file to specify the files to be parsed and loaded into ArangoDB -- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file +### Changed -### `importers` +- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file +- `importers/djornl/parser.py`: refactor parsing code to be more flexible and parse multiple files +- `spec/collections/djornl/*`, `spec/stored_queries/djornl/*`, `spec/views/djornl/*`, and `spec/test/djornl`: rename DB fields and headers in test files -- `importers/djornl`: use manifest file to specify the list of files to be parsed to create a dataset. Refactor file parsing for more flexibility. 
+### Removed -### `spec` +- `spec/test/djornl`: delete unneeded test files -- add `datasets` folder for dataset-specific schemas ----- -## 0.0.5 +## [0.0.5] Last release with RE components in two repositories, https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec From 541dd4e08b953ea83022f6380401c38425ee850c Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 26 Aug 2020 17:01:41 -0700 Subject: [PATCH 3/4] Refactor parsing method to be more generic Refactor spec files to use a definitions file Add tests for duplicated data --- importers/djornl/parser.py | 345 +++++++++++------- importers/test/test_djornl_parser.py | 106 +++++- relation_engine_server/utils/bulk_import.py | 6 +- spec/collections/djornl/djornl_edge.yaml | 36 +- spec/collections/djornl/djornl_node.yaml | 5 +- spec/datasets/djornl/csv_cluster.yaml | 15 + spec/datasets/djornl/csv_edge.yaml | 23 ++ spec/datasets/djornl/csv_node.yaml | 52 +++ spec/datasets/djornl/definitions.yaml | 129 +++++++ spec/datasets/djornl/edge_type.yaml | 25 ++ spec/datasets/djornl/node_type.yaml | 10 + ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv | 2 +- spec/test/djornl/duplicate_data/I2_named.tsv | 9 + spec/test/djornl/duplicate_data/I4_named.tsv | 8 + spec/test/djornl/duplicate_data/I6_copy.csv | 8 + spec/test/djornl/duplicate_data/I6_named.tsv | 8 + spec/test/djornl/duplicate_data/edges.tsv | 11 + .../test/djornl/duplicate_data/extra_node.tsv | 5 + .../djornl/duplicate_data/hithruput-edges.csv | 9 + spec/test/djornl/duplicate_data/manifest.yaml | 39 ++ spec/test/djornl/duplicate_data/nodes.csv | 13 + .../djornl/duplicate_data/pheno_nodes.csv | 5 + ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv | 2 +- .../empty_files/cluster_data/comment_only.tsv | 1 + .../empty_files/cluster_data/headers_only.tsv | 4 + ...082919.abc.I2_named.tsv => no_content.tsv} | 0 ...p10percent_anno_AF_082919.abc.I4_named.tsv | 1 - ...p10percent_anno_AF_082919.abc.I6_named.tsv | 1 - 
spec/test/djornl/empty_files/manifest.yaml | 6 +- .../merged_edges-AMW-060820_AF.tsv | 5 +- spec/test/djornl/invalid_types/edges.tsv | 10 + spec/test/djornl/invalid_types/manifest.yaml | 8 +- .../djornl/invalid_types/markov2_named.tsv | 9 + .../merged_edges-AMW-060820_AF.tsv | 10 - ...-AMW-v2_091319_nodeTable.csv => nodes.csv} | 2 +- spec/test/djornl/results.json | 218 +++++------ spec/test/djornl/test_data/I2_named.tsv | 5 +- spec/test/djornl/test_data/I4_named.tsv | 3 +- spec/test/djornl/test_data/I6_named.tsv | 4 +- spec/test/djornl/test_data/extra_node.tsv | 4 +- spec/test/djornl/test_data/nodes.csv | 2 +- spec/test/djornl/test_data/pheno_nodes.csv | 10 +- 42 files changed, 856 insertions(+), 318 deletions(-) create mode 100644 spec/datasets/djornl/csv_cluster.yaml create mode 100644 spec/datasets/djornl/csv_edge.yaml create mode 100644 spec/datasets/djornl/csv_node.yaml create mode 100644 spec/datasets/djornl/definitions.yaml create mode 100644 spec/datasets/djornl/edge_type.yaml create mode 100644 spec/datasets/djornl/node_type.yaml create mode 100644 spec/test/djornl/duplicate_data/I2_named.tsv create mode 100644 spec/test/djornl/duplicate_data/I4_named.tsv create mode 100644 spec/test/djornl/duplicate_data/I6_copy.csv create mode 100644 spec/test/djornl/duplicate_data/I6_named.tsv create mode 100644 spec/test/djornl/duplicate_data/edges.tsv create mode 100644 spec/test/djornl/duplicate_data/extra_node.tsv create mode 100644 spec/test/djornl/duplicate_data/hithruput-edges.csv create mode 100644 spec/test/djornl/duplicate_data/manifest.yaml create mode 100644 spec/test/djornl/duplicate_data/nodes.csv create mode 100644 spec/test/djornl/duplicate_data/pheno_nodes.csv create mode 100644 spec/test/djornl/empty_files/cluster_data/comment_only.tsv create mode 100644 spec/test/djornl/empty_files/cluster_data/headers_only.tsv rename spec/test/djornl/empty_files/cluster_data/{out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv => no_content.tsv} 
(100%) delete mode 100644 spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv delete mode 100644 spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv create mode 100644 spec/test/djornl/invalid_types/edges.tsv create mode 100644 spec/test/djornl/invalid_types/markov2_named.tsv delete mode 100644 spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv rename spec/test/djornl/invalid_types/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (98%) diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 928b55fe..990c745a 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -11,7 +11,7 @@ import yaml import importers.utils.config as config -from relation_engine_server.utils.json_validation import run_validator +from relation_engine_server.utils.json_validation import run_validator, get_schema_validator class DJORNL_Parser(object): @@ -62,7 +62,15 @@ def _configure(self): def _get_manifest_schema_file(self): - return os.path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json') + return os.path.join(self._get_dataset_schema_dir(), 'manifest.schema.json') + + def _get_dataset_schema_dir(self): + + if not hasattr(self, '_dataset_schema_dir'): + dir_path = os.path.dirname(os.path.realpath(__file__)) + self._dataset_schema_dir = os.path.join(dir_path, '../', '../', 'spec', 'datasets', 'djornl') + + return self._dataset_schema_dir def _get_manifest(self, configuration): """ @@ -81,8 +89,8 @@ def _get_manifest(self, configuration): except FileNotFoundError: raise RuntimeError( f"No manifest file found at {manifest_file}.\n" - + "Please ensure that you have created a manifest that lists the files " - + "in the release" + "Please ensure that you have created a manifest that lists the files " + "in the release" ) try: @@ -109,205 +117,282 @@ def _get_file_reader(self, fd, file): def 
parser_gen(self, file): """generator function to parse a file""" expected_col_count = 0 - with open(file['file_path']) as fd: + with open(file['file_path'], newline='') as fd: csv_reader = self._get_file_reader(fd, file) line_no = 0 for row in csv_reader: line_no += 1 - if len(row) <= 1 or row[0][0] == '#': + if not len(row) or row[0][0] == '#': # comment / metadata continue cols = [c.strip() for c in row] if len(cols) == expected_col_count: - yield (line_no, cols) + yield (line_no, cols, None) continue # if we didn't get the expected number of cols: if expected_col_count == 0: # this is the header row; set up the expected column count expected_col_count = len(cols) - yield (line_no, [c.lower() for c in cols]) + yield (line_no, [c.lower() for c in cols], None) continue # otherwise, this row does not have the correct number of columns - n_cols = len(cols) - raise RuntimeError( - f"{file['path']} line {line_no}: " - + f"expected {expected_col_count} cols, found {n_cols}" - ) + col_count = len(cols) + msg = f"expected {expected_col_count} cols, found {col_count}" + yield(line_no, None, f"{file['path']} line {line_no}: {msg}") + + def remap_object(self, raw_data, remap_functions): + """ Given a dict, raw_data, create a new dict, remapped_data, using the functions in the + dictionary `remap_functions`. """ + remapped_data = {} + for (key, function) in remap_functions.items(): + # these keys get copied over unchanged to the new object if they exist in the input obj + if function is None: + if key in raw_data: + remapped_data[key] = raw_data[key] + else: + remapped_data[key] = function(raw_data) + + return remapped_data + + def process_file(self, file, remap_fn, store_fn, err_list, validator=None): + """ process an input file to generate a dataset and possibly an error list + + Each valid line in the file is turned into a dictionary using the header row, and then + validated against the csv validation schema in spec/datasets/djornl/csv_. 
+ If that completes successfully, it is transformed using the functions in the dictionary + `remap_fn`, checked for uniqueness against existing data, and saved to a dictionary. Once + all files of a certain type have been processed, results can be saved to Arango. + + Any errors that occur during parsing and processing are accumulated in `err_list`. + + :param file: (dict) file data + :param remap_fn: (dict) mapping of output param names to functions + each function should take the row data object as input and + return the value for the output parameter + + :param store_fn: (func) function to store the results of the remapping + + :param err_list: (list) error list + + :param validator: (Validator) jsonschema validator object + + """ + file_parser = self.parser_gen(file) + try: + (line_no, cols, err_str) = next(file_parser) + except StopIteration: + # no valid lines found in the file + err_list.append(f"{file['path']}: no header line found") + return + + headers = cols + n_stored = 0 + for (line_no, cols, err_str) in file_parser: + # mismatch in number of cols + if cols is None: + err_list.append(err_str) + continue + + # merge headers with cols to create an object + row_object = dict(zip(headers, cols)) + + if validator is not None: + # validate the object + if not validator.is_valid(row_object): + err_msg = "".join( + f"{file['path']} line {line_no}: " + e.message + for e in sorted(validator.iter_errors(row_object), key=str) + ) + err_list.append(err_msg) + continue + + # transform it using the remap_functions + datum = self.remap_object(row_object, remap_fn) + + # and store it + storage_error = store_fn(datum) + if storage_error is None: + n_stored += 1 + else: + err_list.append(f"{file['path']} line {line_no}: " + storage_error) + + if not n_stored: + err_list.append(f"{file['path']}: no valid data found") def load_edges(self): - # Headers and sample row: - # node1 node2 edge edge_descrip layer_descrip - # AT1G01370 AT1G57820 4.40001558779779 
AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi - edge_type_remap = { - 'AraGWAS-Phenotype_Associations': 'pheno_assn', - 'AraNetv2-CX_pairwise-gene-coexpression': 'gene_coexpr', - 'AraNetv2-DC_domain-co-occurrence': 'domain_co_occur', - 'AraNetv2-HT_high-throughput-ppi': 'ppi_hithru', - 'AraNetv2-LC_lit-curated-ppi': 'ppi_liter', - } + """Load edge data from the set of edge files""" # dict of nodes, indexed by node ID (node1 and node2 from the file) node_ix = {} - edges = [] - node_name = self.config('node_name') + # dict of edges, indexed by node1__node2__edge_type + edge_ix = {} + # error accumulator + err_list = [] - def edge_type(row): - if row['layer_descrip'] not in edge_type_remap: - raise RuntimeError( - f"{file['path']} line {line_no}: invalid edge type: {row['layer_descrip']}" - ) - return edge_type_remap[row['layer_descrip']] - - def _key(row): - return '__'.join([ - row['node1'], - row['node2'], - edge_type(row), - row['edge'], - ]) + schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_edge.yaml') + validator = get_schema_validator(schema_file=schema_file) + node_name = self.config('node_name') # these functions remap the values in the columns of the input file to # appropriate values to go into Arango remap_functions = { + # create a unique key for each record + '_key': lambda row: '__'.join([row[_] for _ in ['node1', 'node2', 'layer_descrip', 'edge']]), + 'node1': None, # this will be deleted in the 'store' step + 'node2': None, # as will this '_from': lambda row: node_name + '/' + row['node1'], '_to': lambda row: node_name + '/' + row['node2'], 'score': lambda row: float(row['edge']), - 'edge_type': edge_type, - '_key': _key, + 'edge_type': lambda row: row['layer_descrip'], } - for file in self.config('edge_files'): - file_parser = self.parser_gen(file) - headers = [] + # store edge data, checking for potential duplicates + def store_edges(datum): + # there should only be one value for each node<->node edge of a given type + 
edge_key = "__".join([datum['node1'], datum['node2'], datum['edge_type']]) - while True: - try: - (line_no, cols) = next(file_parser) - except StopIteration: - break + if edge_key in edge_ix: + # ignore duplicate lines; report non-matching data + if datum['score'] != edge_ix[edge_key]['score']: + return f"duplicate data for edge {edge_key}" + return None - if len(headers) == 0: - headers = cols - continue + # keep track of the nodes mentioned in this edge set + for node_n in ["1", "2"]: + node_ix[datum[f"node{node_n}"]] = 1 + del datum[f"node{node_n}"] - # merge headers with cols to create an object - row_object = dict(zip(headers, cols)) - # transform it using the remap_functions - datum = {key: func(row_object) for (key, func) in remap_functions.items()} - edges.append(datum) + edge_ix[edge_key] = datum + return None - # keep track of the nodes mentioned in this edge set - for node in ["1", "2"]: - node_ix[row_object[f"node{node}"]] = 1 + for file in self.config('edge_files'): + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=store_edges, + err_list=err_list, + validator=validator, + ) + + if len(err_list): + raise RuntimeError('\n'.join(err_list)) return { 'nodes': [{'_key': n} for n in node_ix.keys()], - 'edges': edges, + 'edges': edge_ix.values(), } def load_node_metadata(self): """Load node metadata""" - nodes = [] - valid_node_types = ['gene', 'pheno'] + node_ix = {} + err_list = [] + + schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_node.yaml') + validator = get_schema_validator(schema_file=schema_file) def go_terms(row): - if len(row['go_terms']): - return [c.strip() for c in row_object['go_terms'].split(',')] + if 'go_terms' in row and len(row['go_terms']): + return [c.strip() for c in row['go_terms'].split(',')] return [] - def node_type(row): - if row['node_type'] not in valid_node_types: - raise RuntimeError( - f"{file['path']} line {line_no}: invalid node type: {row['node_type']}" - ) - return row['node_type'] - 
remap_functions = { # these pass straight through - 'transcript': None, - 'gene_symbol': None, 'gene_full_name': None, 'gene_model_type': None, - 'tair_computational_description': None, - 'tair_short_description': None, - 'tair_curator_summary': None, + 'gene_symbol': None, + 'go_description': None, 'mapman_bin': None, + 'mapman_description': None, 'mapman_name': None, + 'node_type': None, 'pheno_aragwas_id': None, + 'pheno_description': None, + 'pheno_pto_description': None, + 'pheno_pto_name': None, 'pheno_ref': None, + 'tair_computational_description': None, + 'tair_curator_summary': None, + 'tair_short_description': None, + 'transcript': None, 'user_notes': None, # rename '_key': lambda row: row['node_id'], - 'go_description': lambda row: row['go_descr'], - 'mapman_description': lambda row: row['mapman_descr'], - 'pheno_description': lambda row: row['pheno_descrip1'], - 'pheno_pto_name': lambda row: row['pheno_descrip2'], - 'pheno_pto_description': lambda row: row['pheno_descrip3'], # see functions above - 'node_type': node_type, 'go_terms': go_terms, } - for file in self.config('node_files'): - file_parser = self.parser_gen(file) - headers = [] - - while True: - try: - (line_no, cols) = next(file_parser) - except StopIteration: - break - - if len(headers) == 0: - headers = cols - continue + # store nodes in a dict indexed by _key + def store_nodes(datum): + # check whether we have this node already + if datum['_key'] in node_ix: + # report non-matching data + if datum != node_ix[datum['_key']]: + return f"duplicate data for node {datum['_key']}" + # otherwise, it's duplicated line: ignore + return None - # merge with headers to form an object, then remap to create Arango-ready data - row_object = dict(zip(headers, cols)) + node_ix[datum['_key']] = datum + return None - datum = {} - for (key, func) in remap_functions.items(): - if func is None: - datum[key] = row_object[key] - else: - datum[key] = func(row_object) - nodes.append(datum) + for file in 
self.config('node_files'): + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=store_nodes, + err_list=err_list, + validator=validator, + ) - return {'nodes': nodes} + if len(err_list): + raise RuntimeError('\n'.join(err_list)) + return {'nodes': node_ix.values()} def load_cluster_data(self): """Annotate genes with cluster ID fields.""" # index of nodes node_ix = {} + err_list = [] + + schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_cluster.yaml') + validator = get_schema_validator(schema_file=schema_file) + + # these functions remap the values in the columns of the input file to + # appropriate values to go into Arango + remap_functions = { + 'node_ids': lambda row: [n.strip() for n in row['node_ids'].split(',')] + } + + # store clusters in a dictionary with key node_id and value list of cluster IDs to which + # the node is assigned + def store_clusters(datum): + cluster_id = datum['cluster_id'] + for node_id in datum['node_ids']: + if node_id not in node_ix: + node_ix[node_id] = [cluster_id] + elif cluster_id not in node_ix[node_id]: + node_ix[node_id].append(cluster_id) + return None + for file in self.config('cluster_files'): - cluster_label = file['cluster_prefix'] - headers = [] - file_parser = self.parser_gen(file) - - while True: - try: - (line_no, cols) = next(file_parser) - except StopIteration: - break - - if len(headers) == 0: - headers = cols - continue + prefix = file['cluster_prefix'] + remap_functions['cluster_id'] = lambda row: prefix + ':' + row['cluster_id'].replace('Cluster', '') + + self.process_file( + file=file, + remap_fn=remap_functions, + store_fn=store_clusters, + err_list=err_list, + validator=validator, + ) - # remove the 'Cluster' text and replace it with cluster_label - cluster_id = cluster_label + ':' + cols[0].replace('Cluster', '') - node_keys = [n.strip() for n in cols[1].split(',')] - for key in node_keys: - if key not in node_ix: - node_ix[key] = [cluster_id] - elif cluster_id not in 
node_ix[key]: - node_ix[key].append(cluster_id) + if len(err_list): + raise RuntimeError('\n'.join(err_list)) # gather a list of cluster IDs for each node nodes = [{ @@ -345,6 +430,7 @@ def load_data(self): self.save_dataset(self.load_edges()) self.save_dataset(self.load_node_metadata()) self.save_dataset(self.load_cluster_data()) + return True def check_data_delta(self): edge_data = self.load_edges() @@ -374,3 +460,8 @@ def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}): print("Dataset contains " + str(len(edge_data['edges'])) + " edges") # count all nodes print("Dataset contains " + str(len(all_nodes)) + " nodes") + + +if __name__ == '__main__': + parser = DJORNL_Parser() + parser.load_data() diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py index e2442c8d..a3c8ecfa 100644 --- a/importers/test/test_djornl_parser.py +++ b/importers/test/test_djornl_parser.py @@ -64,9 +64,24 @@ def test_load_empty_files(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []}) - self.assertEqual(parser.load_node_metadata(), {"nodes": []}) - self.assertEqual(parser.load_cluster_data(), {"nodes": []}) + # header only, no content + err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found' + with self.assertRaisesRegex(RuntimeError, err_str): + parser.load_node_metadata() + + # comments only + err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found' + with self.assertRaisesRegex(RuntimeError, err_str): + parser.load_edges() + + # mix of problems + err_str = "\n".join([ + 'cluster_data/headers_only.tsv: no valid data found', + 'cluster_data/no_content.tsv: no header line found', + 'cluster_data/comment_only.tsv: no header line found', + ]) + with self.assertRaisesRegex(RuntimeError, err_str): + parser.load_cluster_data() def 
test_load_missing_files(self): """ test loading when files cannot be found """ @@ -77,23 +92,47 @@ def test_load_missing_files(self): with self.assertRaisesRegex(RuntimeError, err_str): self.init_parser_with_path(RES_ROOT_DATA_PATH) - def test_load_invalid_types(self): + def test_load_invalid_edges(self): """ test file format errors """ # path: test/djornl/invalid_types RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - # invalid edge type - edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' + # invalid edge type, invalid scores + edge_err_msg = "\n".join([ + r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas", + r"edges.tsv line 7: '2.' does not match .*?", + r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas", + r"edges.tsv line 10: 'score!' does not match .*?" + ]) with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() + def test_load_invalid_nodes(self): + """ test file format errors """ + + # path: test/djornl/invalid_types + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + # invalid node type - node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey' + node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas" with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() + def test_load_invalid_clusters(self): + """ test file format errors """ + + # path: test/djornl/invalid_types + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + # invalid node type + cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match" + with self.assertRaisesRegex(RuntimeError, 
cluster_err_msg): + parser.load_cluster_data() + def test_load_col_count_errors(self): """ test files with invalid numbers of columns """ @@ -101,13 +140,13 @@ def test_load_col_count_errors(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - # invalid edge type - edge_err_msg = 'line 6: expected 5 cols, found 3' + # not enough cols + edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3' with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() - # invalid node type - node_err_msg = 'line 3: expected 20 cols, found 22' + # too many cols + node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() @@ -144,10 +183,53 @@ def test_load_valid_cluster_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + cluster_data = parser.load_cluster_data() self.assertEqual( cluster_data, self.json_data["load_cluster_data"] ) - parser.check_data_delta() + def test_duplicate_edge_data(self): + """ test files with duplicate edge data, which should throw an error """ + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + err_msg = "\n".join([ + "hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__AraNetv2-HT_.*?", + "hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__AraNetv2-CX_.*?" 
+ ]) + with self.assertRaisesRegex(RuntimeError, err_msg): + parser.load_edges() + + def test_duplicate_node_data(self): + """ test files with duplicate node data, which should throw an error """ + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080" + with self.assertRaisesRegex(RuntimeError, err_msg): + parser.load_node_metadata() + + def test_duplicate_cluster_data(self): + """ test files with duplicate cluster data, which should be seamlessly merged """ + + # path: test/djornl/col_count_errors + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + cluster_data = parser.load_cluster_data() + self.assertEqual( + cluster_data, + self.json_data["load_cluster_data"] + ) + + def test_the_full_shebang(self): + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + parser.load_data() + self.assertEqual(True, parser.load_data()) diff --git a/relation_engine_server/utils/bulk_import.py b/relation_engine_server/utils/bulk_import.py index 8e520726..fbd1c6c3 100644 --- a/relation_engine_server/utils/bulk_import.py +++ b/relation_engine_server/utils/bulk_import.py @@ -6,7 +6,7 @@ import hashlib from relation_engine_server.utils.json_validation import get_schema_validator -from relation_engine_server.utils import spec_loader +from relation_engine_server.utils.spec_loader import get_collection from relation_engine_server.utils.arango_client import import_from_file @@ -16,8 +16,8 @@ def bulk_import(query_params): schema, then write them into a temporary file that can be passed into the arango client. 
""" - schema = spec_loader.get_collection(query_params['collection']) - validator = get_schema_validator(schema=schema['schema']) + schema_file = get_collection(query_params['collection'], path_only=True) + validator = get_schema_validator(schema_file=schema_file, validate_at='/schema') # We can't use a context manager here # We need to close the file to have the file contents readable # and we need to prevent deletion of the temp file on close (default behavior of tempfiles) diff --git a/spec/collections/djornl/djornl_edge.yaml b/spec/collections/djornl/djornl_edge.yaml index 1c956fdc..a6f93d51 100644 --- a/spec/collections/djornl/djornl_edge.yaml +++ b/spec/collections/djornl/djornl_edge.yaml @@ -14,39 +14,15 @@ schema: description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data type: object required: [score, edge_type, _from, _to, _key] + additionalProperties: false properties: _key: - type: string - title: Key + $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_key _from: - type: string - title: Gene ID + $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_from _to: - type: string - title: Gene or Phenotype ID + $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_to score: - title: Edge Score (Weight) - # (float) - type: number + $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/score edge_type: - title: Edge Type - type: string - oneOf: - - const: domain_co_occur - title: AraNetv2-DC_domain-co-occurrence - description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). 
- - const: gene_coexpr - title: AraNetv2-CX_pairwise-gene-coexpression - description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were - calculated from Pearson correlation coefficients to normalize the data - for comparison across studies and different types of data layers (Lee et - al, 2015). - - const: pheno_assn - title: AraGWAS-Phenotype_Associations - description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction. - - const: ppi_hithru - title: AraNetv2-HT_high-throughput-ppi - description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). - - const: ppi_liter - title: AraNetv2-LC_lit-curated-ppi - description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). 
+ $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/edge_type diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml index e0a744a7..62d17dda 100644 --- a/spec/collections/djornl/djornl_node.yaml +++ b/spec/collections/djornl/djornl_node.yaml @@ -3,8 +3,8 @@ type: vertex delta: false indexes: - - type: hash - fields: ["clusters[*]"] + - type: hash + fields: ["clusters[*]"] schema: "$schema": http://json-schema.org/draft-07/schema# @@ -12,6 +12,7 @@ schema: description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab type: object required: [_key] + additionalProperties: false properties: _key: type: string diff --git a/spec/datasets/djornl/csv_cluster.yaml b/spec/datasets/djornl/csv_cluster.yaml new file mode 100644 index 00000000..c60f1dfd --- /dev/null +++ b/spec/datasets/djornl/csv_cluster.yaml @@ -0,0 +1,15 @@ +"$schema": http://json-schema.org/draft-07/schema# +name: csv_cluster +title: Cluster data +description: Cluster ID to node ID mappings +type: object +required: [cluster_id, node_ids] +additionalProperties: false +properties: + cluster_id: + type: string + format: regex + pattern: "^Cluster\\d+" + # pre-transform node_ids + node_ids: + type: string diff --git a/spec/datasets/djornl/csv_edge.yaml b/spec/datasets/djornl/csv_edge.yaml new file mode 100644 index 00000000..c2416262 --- /dev/null +++ b/spec/datasets/djornl/csv_edge.yaml @@ -0,0 +1,23 @@ +"$schema": http://json-schema.org/draft-07/schema# +name: csv_edge +title: Arabidopsis gene-gene or gene-phenotype edge +description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data +type: object +required: [node1, node2, edge, layer_descrip] +properties: + node1: + $ref: definitions.yaml#definitions/djornl_edge/_from + node2: + $ref: definitions.yaml#definitions/djornl_edge/_to + edge: + type: string + format: regex + pattern: "^\\d*(\\.\\d+)?$" + layer_descrip: + type: string + oneOf: + - const: 
AraNetv2-DC_domain-co-occurrence + - const: AraNetv2-CX_pairwise-gene-coexpression + - const: AraGWAS-Phenotype_Associations + - const: AraNetv2-HT_high-throughput-ppi + - const: AraNetv2-LC_lit-curated-ppi diff --git a/spec/datasets/djornl/csv_node.yaml b/spec/datasets/djornl/csv_node.yaml new file mode 100644 index 00000000..c957ff98 --- /dev/null +++ b/spec/datasets/djornl/csv_node.yaml @@ -0,0 +1,52 @@ +"$schema": http://json-schema.org/draft-07/schema# +name: csv_node +title: CSV node file syntax +description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab +type: object +required: [node_id, node_type] +additionalProperties: false +properties: + node_id: + $ref: definitions.yaml#definitions/djornl_node/_key + node_type: + $ref: definitions.yaml#definitions/djornl_node/node_type + clusters: + $ref: definitions.yaml#definitions/djornl_node/clusters + transcript: + $ref: definitions.yaml#definitions/djornl_node/transcript + gene_symbol: + $ref: definitions.yaml#definitions/djornl_node/gene_symbol + gene_full_name: + $ref: definitions.yaml#definitions/djornl_node/gene_full_name + gene_model_type: + $ref: definitions.yaml#definitions/djornl_node/gene_model_type + tair_computational_description: + $ref: definitions.yaml#definitions/djornl_node/tair_computational_description + tair_curator_summary: + $ref: definitions.yaml#definitions/djornl_node/tair_curator_summary + tair_short_description: + $ref: definitions.yaml#definitions/djornl_node/tair_short_description + go_terms: + type: string + format: regex + pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$" + go_description: + $ref: definitions.yaml#definitions/djornl_node/go_description + mapman_bin: + $ref: definitions.yaml#definitions/djornl_node/mapman_bin + mapman_name: + $ref: definitions.yaml#definitions/djornl_node/mapman_name + mapman_description: + $ref: definitions.yaml#definitions/djornl_node/mapman_description + pheno_aragwas_id: + $ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id + 
pheno_description: + $ref: definitions.yaml#definitions/djornl_node/pheno_description + pheno_pto_name: + $ref: definitions.yaml#definitions/djornl_node/pheno_pto_name + pheno_pto_description: + $ref: definitions.yaml#definitions/djornl_node/pheno_pto_description + pheno_ref: + $ref: definitions.yaml#definitions/djornl_node/pheno_ref + user_notes: + $ref: definitions.yaml#definitions/djornl_node/user_notes diff --git a/spec/datasets/djornl/definitions.yaml b/spec/datasets/djornl/definitions.yaml new file mode 100644 index 00000000..81ee5e29 --- /dev/null +++ b/spec/datasets/djornl/definitions.yaml @@ -0,0 +1,129 @@ +"$schema": "http://json-schema.org/draft-07/schema#" +name: definitions +title: DJORNL schema definitions +description: Node and edge metadata definitions for the Dan Jacobson Exascale dataset +definitions: + cluster_id: + type: string + format: regex + pattern: ^\w+:\d+$ + examples: ["markov_i2:1", "markov_i4:5", "markov_i6:3"] + go_term: + type: string + format: regex + pattern: ^GO:\d{7}$ + examples: ["GO:0003700", "GO:0005515"] + djornl_edge: + _key: + type: string + title: Key + format: regex + pattern: ^(\S+__){3}(\S+)$ + _from: + type: string + title: Gene ID + _to: + type: string + title: Gene or Phenotype ID + score: + title: Edge Score (Weight) + # (float) + type: number + edge_type: + $ref: edge_type.yaml + djornl_node: + _key: + type: string + title: Key + examples: ["AT1G01010"] + clusters: + type: array + title: Clusters + description: Clusters to which the node has been assigned + items: + $ref: "#/definitions/cluster_id" + examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]] + node_type: + type: string + title: Node type + oneOf: + - const: gene + title: Gene + - const: pheno + title: Phenotype + examples: ["gene", "pheno"] + transcript: + type: string + title: Transcript + examples: ["AT1G01010.1"] + gene_symbol: + type: string + title: Gene symbol + examples: ["NTL10"] + gene_full_name: + type: string + title: Gene full name + 
examples: ["NAC domain containing protein 1"] + gene_model_type: + type: string + title: Gene model type + examples: ["protein_coding"] + tair_computational_description: + type: string + title: TAIR computational description + examples: ["NAC domain containing protein 1;(source:Araport11)"] + tair_curator_summary: + type: string + title: TAIR curator summary + examples: ["Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed."] + tair_short_description: + type: string + title: TAIR short description + examples: ["NAC domain containing protein 1"] + go_description: + type: string + title: GO descriptions + examples: ["DNA-binding transcription factor activity"] + go_terms: + type: array + title: GO term IDs + items: + $ref: "#/definitions/go_term" + mapman_bin: + type: string + title: Mapman bin + examples: ["15.5.17"] + mapman_name: + type: string + title: Mapman name + examples: [".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)"] + mapman_description: + type: string + title: Mapman description + examples: ["transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])"] + pheno_aragwas_id: + type: string + title: AraGWAS ID + examples: ["10.21958/phenotype:67"] + pheno_description: + type: string + title: Phenotype description + examples: ["Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008"] + pheno_pto_name: + type: string + title: PTO name + description: Plant Trait Ontology name + examples: ["arsenic concentration"] + pheno_pto_description: + type: string + title: PTO description + description: Plant Trait Ontology description + examples: ["A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]"] + pheno_ref: + type: string + title: Phenotype reference + examples: ["Atwell et. al, Nature 2010"] + user_notes: + type: string + title: User Notes + examples: ["flowering time related"] diff --git a/spec/datasets/djornl/edge_type.yaml b/spec/datasets/djornl/edge_type.yaml new file mode 100644 index 00000000..3ac6d62a --- /dev/null +++ b/spec/datasets/djornl/edge_type.yaml @@ -0,0 +1,25 @@ +$schema: "http://json-schema.org/draft-07/schema#" +name: edge_type +title: Edge Type +description: Edge types in Dan Jacobson Arabidopsis Exascale dataset +type: string +oneOf: + - const: AraGWAS-Phenotype_Associations + title: AraGWAS phenotype associations + description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction. + + - const: AraNetv2-CX_pairwise-gene-coexpression + title: AraNetv2 pairwise gene coexpression + description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). + + - const: AraNetv2-DC_domain-co-occurrence + title: AraNetv2 domain co-occurrence + description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. 
The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). + + - const: AraNetv2-HT_high-throughput-ppi + title: AraNetv2 high-throughput protein-protein interaction + description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). + + - const: AraNetv2-LC_lit-curated-ppi + title: AraNetv2 literature-curated protein-protein interaction + description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015). 
diff --git a/spec/datasets/djornl/node_type.yaml b/spec/datasets/djornl/node_type.yaml new file mode 100644 index 00000000..e839c2d0 --- /dev/null +++ b/spec/datasets/djornl/node_type.yaml @@ -0,0 +1,10 @@ +"$schema": "http://json-schema.org/draft-07/schema#" +name: node_type +title: Node Type +description: Node types in Dan Jacobson Exascale dataset +type: string +oneOf: + - const: gene + title: Gene + - const: pheno + title: Phenotype diff --git a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index 7727cd8a..e989f2ca 100644 --- a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1,4 +1,4 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, diff --git a/spec/test/djornl/duplicate_data/I2_named.tsv b/spec/test/djornl/duplicate_data/I2_named.tsv new file mode 100644 index 00000000..bef48c38 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I2_named.tsv @@ -0,0 +1,9 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +Cluster3 AT1G01090 +# Cluster4 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/duplicate_data/I4_named.tsv b/spec/test/djornl/duplicate_data/I4_named.tsv new file mode 100644 index 00000000..29b6cd36 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I4_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# cluster_prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster +# Cluster1 +# Cluster2 +Cluster3 AT1G01080 +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/I6_copy.csv b/spec/test/djornl/duplicate_data/I6_copy.csv new file mode 100644 index 00000000..a21bd2af --- /dev/null +++ b/spec/test/djornl/duplicate_data/I6_copy.csv @@ -0,0 +1,8 @@ +cluster_id,node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1,"AT1G01040,AT1G01090" 
+Cluster2,AT1G01070 +Cluster3,"AT1G01010,AT1G01020,AT1G01030" +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/I6_named.tsv b/spec/test/djornl/duplicate_data/I6_named.tsv new file mode 100644 index 00000000..e7688f17 --- /dev/null +++ b/spec/test/djornl/duplicate_data/I6_named.tsv @@ -0,0 +1,8 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i6 +# title: Markov clustering, inflation = 6 +Cluster1 AT1G01040,AT1G01090 +Cluster2 AT1G01070 +Cluster3 AT1G01010,AT1G01020,AT1G01030 +# Cluster4 diff --git a/spec/test/djornl/duplicate_data/edges.tsv b/spec/test/djornl/duplicate_data/edges.tsv new file mode 100644 index 00000000..432c2deb --- /dev/null +++ b/spec/test/djornl/duplicate_data/edges.tsv @@ -0,0 +1,11 @@ +node1 node2 edge edge_descrip layer_descrip +As2 AT1G01020 8.4 AraGWAS-Association_score AraGWAS-Phenotype_Associations +As2 AT1G01040 5.4 AraGWAS-Association_score AraGWAS-Phenotype_Associations +As75 AT1G01020 39.9 AraGWAS-Association_score AraGWAS-Phenotype_Associations +AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score AraNetv2-DC_domain-co-occurrence +AT1G01010 AT1G01040 170.5 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi +AT1G01030 AT1G01050 2.6 AraNetv2_log-likelihood-score AraNetv2-CX_pairwise-gene-coexpression +AT1G01050 AT1G01060 2.7 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi +# duplicated line +AT1G01010 AT1G01040 2.5 AraNetv2_log-likelihood-score AraNetv2-DC_domain-co-occurrence +AT1G01080 AT1G01090 2.8 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi diff --git a/spec/test/djornl/duplicate_data/extra_node.tsv b/spec/test/djornl/duplicate_data/extra_node.tsv new file mode 100644 index 00000000..a1a28b69 --- /dev/null +++ b/spec/test/djornl/duplicate_data/extra_node.tsv @@ -0,0 +1,5 @@ +# data_type: node +node_id node_type transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_description GO_terms 
MapMan_bin MapMan_name MapMan_description +AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) +# duplicated line with alterations +AT1G01080 gene AT1G01080.3 whatever! protein_coding RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11) "RNA binding, mRNA binding" "GO:0003723, GO:0003729" 35.1 not assigned.annotated "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)" diff --git a/spec/test/djornl/duplicate_data/hithruput-edges.csv b/spec/test/djornl/duplicate_data/hithruput-edges.csv new file mode 100644 index 00000000..c11caa3d --- /dev/null +++ b/spec/test/djornl/duplicate_data/hithruput-edges.csv @@ -0,0 +1,9 @@ +node1,node2,edge,edge_descrip,layer_descrip +AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi +AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi +# potentially erroneous line +AT1G01010,AT1G01030,2.7,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi +# duplicated line from the other file 
+AT1G01050,AT1G01060,2.7,AraNetv2_log-likelihood-score,AraNetv2-LC_lit-curated-ppi +# potentially erroneous duplication from the other file +AT1G01030,AT1G01050,2.6000001,AraNetv2_log-likelihood-score,AraNetv2-CX_pairwise-gene-coexpression diff --git a/spec/test/djornl/duplicate_data/manifest.yaml b/spec/test/djornl/duplicate_data/manifest.yaml new file mode 100644 index 00000000..beffb367 --- /dev/null +++ b/spec/test/djornl/duplicate_data/manifest.yaml @@ -0,0 +1,39 @@ +name: Dan Jacobson Exascale data +release_date: "2020-06-06" +home_url: "https://github.com/kbase/exascale_data" +file_list: + - data_type: edge + path: edges.tsv + date: "2020-12-25" + + - data_type: edge + path: hithruput-edges.csv + date: "2020-12-25" + + - data_type: node + path: nodes.csv + date: "2019-01-01" + + - data_type: cluster + cluster_prefix: markov_i2 + path: I2_named.tsv + + - data_type: cluster + cluster_prefix: markov_i4 + path: I4_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_named.tsv + + - data_type: cluster + cluster_prefix: markov_i6 + path: I6_copy.csv + + - data_type: node + path: pheno_nodes.csv + date: "2019-01-01" + + - data_type: node + path: extra_node.tsv + date: "2019-01-01" diff --git a/spec/test/djornl/duplicate_data/nodes.csv b/spec/test/djornl/duplicate_data/nodes.csv new file mode 100644 index 00000000..b1938272 --- /dev/null +++ b/spec/test/djornl/duplicate_data/nodes.csv @@ -0,0 +1,13 @@ +# data_type: node +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes +AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA 
binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, +AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,, +AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, +AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. 
DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,, +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, +AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 
[Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,, +AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,, +AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,, +AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 
transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,, +# duplicated line +AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,, diff --git a/spec/test/djornl/duplicate_data/pheno_nodes.csv b/spec/test/djornl/duplicate_data/pheno_nodes.csv new file mode 100644 index 00000000..83fbf4be --- /dev/null +++ b/spec/test/djornl/duplicate_data/pheno_nodes.csv @@ -0,0 +1,5 @@ +node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes +As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. 
Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index 118cfbcc..ab31e045 100644 --- a/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1 +1 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes diff --git a/spec/test/djornl/empty_files/cluster_data/comment_only.tsv 
b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv new file mode 100644 index 00000000..9ce2fbf1 --- /dev/null +++ b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv @@ -0,0 +1 @@ +# what? diff --git a/spec/test/djornl/empty_files/cluster_data/headers_only.tsv b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv new file mode 100644 index 00000000..3233ca40 --- /dev/null +++ b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv @@ -0,0 +1,4 @@ +cluster_id node_ids +# comment +# comment +# comment diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/spec/test/djornl/empty_files/cluster_data/no_content.tsv similarity index 100% rename from spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv rename to spec/test/djornl/empty_files/cluster_data/no_content.tsv diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv deleted file mode 100644 index 8b137891..00000000 --- a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv +++ /dev/null @@ -1 +0,0 @@ - diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv deleted file mode 100644 index 8b137891..00000000 --- a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv +++ /dev/null @@ -1 +0,0 @@ - diff --git a/spec/test/djornl/empty_files/manifest.yaml b/spec/test/djornl/empty_files/manifest.yaml index fb231666..ae04f7a4 100644 --- a/spec/test/djornl/empty_files/manifest.yaml 
+++ b/spec/test/djornl/empty_files/manifest.yaml @@ -9,12 +9,12 @@ file_list: - data_type: cluster cluster_prefix: markov_i2 - path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + path: cluster_data/headers_only.tsv - data_type: cluster cluster_prefix: markov_i4 - path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + path: cluster_data/no_content.tsv - data_type: cluster cluster_prefix: markov_i6 - path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv + path: cluster_data/comment_only.tsv diff --git a/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv index 8b137891..4b2bca02 100644 --- a/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv +++ b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv @@ -1 +1,4 @@ - +# this line is a comment +# so is this line +# oh no +# there's no content in this file! diff --git a/spec/test/djornl/invalid_types/edges.tsv b/spec/test/djornl/invalid_types/edges.tsv new file mode 100644 index 00000000..06bbe9fd --- /dev/null +++ b/spec/test/djornl/invalid_types/edges.tsv @@ -0,0 +1,10 @@ +# data_type: edge +node1 node2 edge edge_descrip layer_descrip +As2 AT1G01020 8.422046084731258 AraGWAS-Association_score Same-Old-Stuff +As2 AT1G01040 6 AraGWAS-Association_score AraGWAS-Phenotype_Associations +As75 AT1G01020 39.98573324312915 AraGWAS-Association_score AraGWAS-Phenotype_Associations +AT1G01010 AT1G01020 2.39322646755088 AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi +AT1G01010 AT1G01030 2. AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi +AT1G01010 AT1G01040 "2.39322646755088" AraNetv2_log-likelihood-score raNetv2-DC_ +AT1G01030 AT1G01050 25494618241936697 AraNetv2_log-likelihood-score AraNetv2-CX_pairwise-gene-coexpression +AT1G01050 AT1G01060 score! 
AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi diff --git a/spec/test/djornl/invalid_types/manifest.yaml b/spec/test/djornl/invalid_types/manifest.yaml index 50c5f454..e37ca783 100644 --- a/spec/test/djornl/invalid_types/manifest.yaml +++ b/spec/test/djornl/invalid_types/manifest.yaml @@ -2,7 +2,11 @@ name: Dan Jacobson Exascale data release_date: "2020-06-06" file_list: - data_type: edge - path: merged_edges-AMW-060820_AF.tsv + path: edges.tsv - data_type: node - path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + path: nodes.csv + + - data_type: cluster + path: markov2_named.tsv + cluster_prefix: markov_i2 diff --git a/spec/test/djornl/invalid_types/markov2_named.tsv b/spec/test/djornl/invalid_types/markov2_named.tsv new file mode 100644 index 00000000..f82190fd --- /dev/null +++ b/spec/test/djornl/invalid_types/markov2_named.tsv @@ -0,0 +1,9 @@ +cluster_id node_ids +# data_type: cluster +# cluster_prefix: markov_i2 +# title: Markov clustering, inflation = 2 +Cluster1 AT1G01010,AT1G01030,AT1G01040 +Cluster2 AT1G01050,AT1G01060,AT1G01070 +HoneyNutCluster3 AT1G01080,AT1G01090 +Cluster4 +Cluster5 AT1G01020 diff --git a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv deleted file mode 100644 index a98f49f9..00000000 --- a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv +++ /dev/null @@ -1,10 +0,0 @@ -# data_type: edge -node1 node2 edge edge_descrip layer_descrip -As2 AT1G01020 8.422046084731258 AraGWAS-Association_score AraGWAS-Some-Old-Rubbish-I-Made-Up -As2 AT1G01040 5.422046084731258 AraGWAS-Association_score AraGWAS-Phenotype_Associations -As75 AT1G01020 39.98573324312915 AraGWAS-Association_score AraGWAS-Phenotype_Associations -AT1G01010 AT1G01020 2.39322646755088 AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi -AT1G01010 AT1G01030 2.39322646755088 AraNetv2_log-likelihood-score AraNetv2-HT_high-throughput-ppi -AT1G01010 AT1G01040 
2.39322646755088 AraNetv2_log-likelihood-score raNetv2-DC_domain-co-occurrence -AT1G01030 AT1G01050 2.5494618241936697 AraNetv2_log-likelihood-score AraNetv2-CX_pairwise-gene-coexpression -AT1G01050 AT1G01060 4.34242054808616 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi diff --git a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/invalid_types/nodes.csv similarity index 98% rename from spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv rename to spec/test/djornl/invalid_types/nodes.csv index e98310e5..e469f003 100644 --- a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/spec/test/djornl/invalid_types/nodes.csv @@ -1,4 +1,4 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes # data_type: node As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json index e15408d2..9eefcebc 100644 --- a/spec/test/djornl/results.json +++ b/spec/test/djornl/results.json @@ -13,16 +13,16 @@ {"_key": "AT1G01090"} ], "edges": [ - {"_key": "As2__AT1G01020__pheno_assn__8.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01020", "edge_type": "pheno_assn", "score": 8.4}, - {"_key": "As2__AT1G01040__pheno_assn__5.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01040", "edge_type": "pheno_assn", "score": 5.4}, - {"_key": "As75__AT1G01020__pheno_assn__39.9", "_from": "djornl_node/As75", "_to": "djornl_node/AT1G01020", "edge_type": "pheno_assn", "score": 39.9}, - {"_key": "AT1G01010__AT1G01020__ppi_hithru__2.3", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01020", "edge_type": "ppi_hithru", "score": 2.3}, - {"_key": "AT1G01010__AT1G01030__ppi_hithru__2.4", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01030", "edge_type": "ppi_hithru", "score": 2.4}, - {"_key": "AT1G01010__AT1G01040__domain_co_occur__2.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "domain_co_occur", "score": 2.5}, - {"_key": "AT1G01010__AT1G01040__ppi_liter__170.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "ppi_liter", "score": 170.5}, - {"_key": "AT1G01030__AT1G01050__gene_coexpr__2.6", "_from": "djornl_node/AT1G01030", "_to": "djornl_node/AT1G01050", "edge_type": "gene_coexpr", "score": 2.6}, - {"_key": "AT1G01050__AT1G01060__ppi_liter__2.7", "_from": "djornl_node/AT1G01050", "_to": "djornl_node/AT1G01060", "edge_type": "ppi_liter", "score": 2.7}, - {"_key": "AT1G01080__AT1G01090__ppi_liter__2.8", "_from": 
"djornl_node/AT1G01080", "_to": "djornl_node/AT1G01090", "edge_type": "ppi_liter", "score": 2.8} + {"_key": "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01020", "edge_type": "AraGWAS-Phenotype_Associations", "score": 8.4}, + {"_key": "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01040", "edge_type": "AraGWAS-Phenotype_Associations", "score": 5.4}, + {"_key": "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", "_from": "djornl_node/As75", "_to": "djornl_node/AT1G01020", "edge_type": "AraGWAS-Phenotype_Associations", "score": 39.9}, + {"_key": "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01020", "edge_type": "AraNetv2-HT_high-throughput-ppi", "score": 2.3}, + {"_key": "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01030", "edge_type": "AraNetv2-HT_high-throughput-ppi", "score": 2.4}, + {"_key": "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "AraNetv2-DC_domain-co-occurrence", "score": 2.5}, + {"_key": "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 170.5}, + {"_key": "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", "_from": "djornl_node/AT1G01030", "_to": "djornl_node/AT1G01050", "edge_type": "AraNetv2-CX_pairwise-gene-coexpression", "score": 2.6}, + {"_key": "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7", "_from": "djornl_node/AT1G01050", "_to": "djornl_node/AT1G01060", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.7}, + {"_key": "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8", "_from": "djornl_node/AT1G01080", "_to": 
"djornl_node/AT1G01090", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.8} ] }, "load_cluster_data": { @@ -33,9 +33,9 @@ {"_key": "AT1G01050", "clusters": ["markov_i2:2"]}, {"_key": "AT1G01060", "clusters": ["markov_i2:2"]}, {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]}, - {"_key": "AT1G01080", "clusters": ["markov_i2:3"]}, {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]}, - {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]} + {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]}, + {"_key": "AT1G01080", "clusters": ["markov_i4:3"]} ] }, "load_node_metadata": { @@ -74,16 +74,16 @@ "SDV" ], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7", + "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8" ] }, "fetch_genes": { @@ -109,24 +109,24 @@ "AT1G01040" ], "edges": [ - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5" + 
"AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7" ] } } @@ -140,23 +140,23 @@ "1": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - 
"AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7" ] } } @@ -181,22 +181,22 @@ "1": { "nodes": ["As2", "AT1G01020", "AT1G01040"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + 
"AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7" ] } } @@ -210,22 +210,22 @@ "1": { "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7" ] } } @@ -250,25 +250,25 @@ "1": { "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + 
"AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7", + "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8" ] } } @@ -294,25 +294,25 @@ "1": { "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], "edges": [ - 
"As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7", + "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8" ] } } @@ -326,23 +326,23 @@ "1": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" + "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3" ] }, "5": { "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" + 
"As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", + "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", + "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", + "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", + "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", + "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", + "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", + "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", + "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7" ] } } diff --git a/spec/test/djornl/test_data/I2_named.tsv b/spec/test/djornl/test_data/I2_named.tsv index 3f129851..c7a9c200 100644 --- a/spec/test/djornl/test_data/I2_named.tsv +++ b/spec/test/djornl/test_data/I2_named.tsv @@ -1,9 +1,8 @@ -cluster ids +cluster_id node_ids # data_type: cluster # cluster_prefix: markov_i2 # title: Markov clustering, inflation = 2 Cluster1 AT1G01010,AT1G01030,AT1G01040 Cluster2 AT1G01050,AT1G01060,AT1G01070 -Cluster3 AT1G01080,AT1G01090 -Cluster4 +Cluster3 AT1G01090 Cluster5 AT1G01020 diff --git a/spec/test/djornl/test_data/I4_named.tsv b/spec/test/djornl/test_data/I4_named.tsv index 39e46deb..6e7d91e4 100644 --- a/spec/test/djornl/test_data/I4_named.tsv +++ b/spec/test/djornl/test_data/I4_named.tsv @@ -1,4 +1,5 @@ -cluster ids +cluster_id node_ids # cluster_prefix: markov_i4 # title: Markov clustering, inflation = 4 # data_type: cluster +Cluster3 AT1G01080 diff --git a/spec/test/djornl/test_data/I6_named.tsv b/spec/test/djornl/test_data/I6_named.tsv index d504f6b4..e7688f17 100644 --- a/spec/test/djornl/test_data/I6_named.tsv +++ b/spec/test/djornl/test_data/I6_named.tsv @@ -1,8 +1,8 @@ -cluster ids +cluster_id node_ids # data_type: cluster # cluster_prefix: markov_i6 # title: Markov clustering, inflation = 6 Cluster1 AT1G01040,AT1G01090 Cluster2 AT1G01070 Cluster3 AT1G01010,AT1G01020,AT1G01030 -Cluster4 +# Cluster4 diff --git a/spec/test/djornl/test_data/extra_node.tsv 
b/spec/test/djornl/test_data/extra_node.tsv index 664425c5..de069d70 100644 --- a/spec/test/djornl/test_data/extra_node.tsv +++ b/spec/test/djornl/test_data/extra_node.tsv @@ -1,3 +1,3 @@ # data_type: node -node_id node_type transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_descr GO_terms MapMan_bin MapMan_name MapMan_descr pheno_AraGWAS_ID pheno_descrip1 pheno_descrip2 pheno_descrip3 pheno_ref User_Notes -AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) +node_id node_type transcript gene_symbol gene_full_name gene_model_type TAIR_Computational_description TAIR_Curator_summary TAIR_short_description GO_description GO_terms MapMan_bin MapMan_name MapMan_description +AT1G01100 gene AT1G01100.4 protein_coding 60S acidic ribosomal protein family;(source:Araport11) 60S acidic ribosomal protein family structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity GO:0003735, GO:0043021, GO:0030295 17.1.2.1.46 .Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1 component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding 
gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]) diff --git a/spec/test/djornl/test_data/nodes.csv b/spec/test/djornl/test_data/nodes.csv index 2245bd59..92f60761 100644 --- a/spec/test/djornl/test_data/nodes.csv +++ b/spec/test/djornl/test_data/nodes.csv @@ -1,5 +1,5 @@ # data_type: node -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes +node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description 
available(sp|q5mk24|arv1_arath : 99.4),,,,,, AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3 transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,, diff --git a/spec/test/djornl/test_data/pheno_nodes.csv b/spec/test/djornl/test_data/pheno_nodes.csv index d8bb15a9..83fbf4be 100644 --- a/spec/test/djornl/test_data/pheno_nodes.csv +++ b/spec/test/djornl/test_data/pheno_nodes.csv @@ -1,5 +1,5 @@ -node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes -As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", -As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", -Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", -SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", +node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes +As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", +As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", +Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. 
Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010", +SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010", From 4875b0528df9dbbe038309d5c68ceef438bf1e2c Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Fri, 28 Aug 2020 07:04:07 -0700 Subject: [PATCH 4/4] Rename some functions and variables for greater consistency Add a couple more parser tests --- importers/README.md | 2 +- importers/djornl/main.py | 11 ----- importers/djornl/parser.py | 63 +++++++++++++++---------- importers/test/test_djornl_parser.py | 37 ++++++++++----- importers/utils/config.py | 3 +- spec/test/djornl/results.json | 4 +- spec/test/stored_queries/test_djornl.py | 6 +-- 7 files changed, 69 insertions(+), 57 deletions(-) delete mode 100644 importers/djornl/main.py diff --git a/importers/README.md b/importers/README.md index 53df13cc..e54bb379 100644 --- a/importers/README.md +++ b/importers/README.md @@ -15,5 +15,5 @@ Global env vars: ```sh RES_ROOT_DATA_PATH=/path/to/djornl_data \ -python -m importers.djornl.main +python -m importers.djornl.parser ``` diff --git a/importers/djornl/main.py b/importers/djornl/main.py deleted file mode 100644 index ba1bb005..00000000 --- a/importers/djornl/main.py +++ /dev/null 
@@ -1,11 +0,0 @@ -""" -Loads the Dan Jacobson/ORNL group's gene and phenotype network data into -arangodb. - -Running this requires a set of source files provided by the ORNL group. -""" -from importers.djornl.parser import DJORNL_Parser - -if __name__ == '__main__': - parser = DJORNL_Parser() - parser.load_data() diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 990c745a..c62c6793 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -252,14 +252,16 @@ def load_edges(self): # store edge data, checking for potential duplicates def store_edges(datum): - # there should only be one value for each node<->node edge of a given type + # there should only be one value for each node<->node edge of a given type, + # so use these values as an index key edge_key = "__".join([datum['node1'], datum['node2'], datum['edge_type']]) if edge_key in edge_ix: - # ignore duplicate lines; report non-matching data - if datum['score'] != edge_ix[edge_key]['score']: - return f"duplicate data for edge {edge_key}" - return None + # duplicate lines can be ignored + if datum['score'] == edge_ix[edge_key]['score']: + return None + # report non-matching data + return f"duplicate data for edge {edge_key}" # keep track of the nodes mentioned in this edge set for node_n in ["1", "2"]: @@ -286,7 +288,7 @@ def store_edges(datum): 'edges': edge_ix.values(), } - def load_node_metadata(self): + def load_nodes(self): """Load node metadata""" node_ix = {} @@ -352,7 +354,7 @@ def store_nodes(datum): raise RuntimeError('\n'.join(err_list)) return {'nodes': node_ix.values()} - def load_cluster_data(self): + def load_clusters(self): """Annotate genes with cluster ID fields.""" # index of nodes @@ -428,33 +430,38 @@ def save_docs(self, coll_name, docs, on_dupe='update'): def load_data(self): self.save_dataset(self.load_edges()) - self.save_dataset(self.load_node_metadata()) - self.save_dataset(self.load_cluster_data()) + self.save_dataset(self.load_nodes()) + 
self.save_dataset(self.load_clusters()) return True def check_data_delta(self): edge_data = self.load_edges() - node_metadata = self.load_node_metadata() - clusters = self.load_cluster_data() + node_data = self.load_nodes() + clusters = self.load_clusters() + + self.check_deltas(edge_data=edge_data, node_data=node_data, cluster_data=clusters) - self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters) + def check_deltas(self, edge_data={}, node_data={}, cluster_data={}): - def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}): + edges_nodelist = set([e['_key'] for e in edge_data['nodes']]) + nodes_nodelist = set([e['_key'] for e in node_data['nodes']]) + clusters_nodelist = set([e['_key'] for e in cluster_data['nodes']]) + all_nodes = edges_nodelist.union(nodes_nodelist).union(clusters_nodelist) - edge_nodes = set([e['_key'] for e in edge_data['nodes']]) - node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']]) - cluster_nodes = set([e['_key'] for e in cluster_data['nodes']]) - all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes) + # check all nodes in cluster_data have node data + cluster_no_node_set = clusters_nodelist.difference(nodes_nodelist) + if cluster_no_node_set: + print({'clusters with no node metadata': cluster_no_node_set}) - # check all nodes in cluster_data have node_metadata - clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes) - if clstr_no_node_md_set: - print({'clusters with no node metadata': clstr_no_node_md_set}) + # check all nodes in the edge_data have node data + edge_no_node_set = edges_nodelist.difference(nodes_nodelist) + if edge_no_node_set: + print({'edges with no node metadata': edge_no_node_set}) - # check all nodes in the edge_data have node_metadata - edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes) - if edge_no_node_md_set: - print({'edges with no node metadata': edge_no_node_md_set}) + # check all nodes are in the 
edge_data set + node_no_edge_set = nodes_nodelist.difference(edges_nodelist) + if node_no_edge_set: + print({'nodes not in an edge': node_no_edge_set}) # count all edges print("Dataset contains " + str(len(edge_data['edges'])) + " edges") @@ -464,4 +471,8 @@ def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}): if __name__ == '__main__': parser = DJORNL_Parser() - parser.load_data() + try: + parser.load_data() + except Exception as err: + print(err) + exit(1) diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py index a3c8ecfa..daf2b417 100644 --- a/importers/test/test_djornl_parser.py +++ b/importers/test/test_djornl_parser.py @@ -33,6 +33,19 @@ def init_parser_with_path(self, root_path): parser._configure() return parser + def test_missing_required_env_var(self): + '''test that the parser raises a RuntimeError if the RES_ROOT_DATA_PATH env var is not set''' + with self.assertRaisesRegex(RuntimeError, 'Missing required env var: RES_ROOT_DATA_PATH'): + parser = DJORNL_Parser() + parser.load_edges() + + def test_config(self): + '''test that the parser raises an error if a config value cannot be found''' + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + with self.assertRaisesRegex(KeyError, 'No such config value: bananas'): + parser.config('bananas') + def test_load_no_manifest(self): """ test loading when the manifest does not exist """ RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest') @@ -67,7 +80,7 @@ def test_load_empty_files(self): # header only, no content err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found' with self.assertRaisesRegex(RuntimeError, err_str): - parser.load_node_metadata() + parser.load_nodes() # comments only err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found' @@ -81,7 +94,7 @@ def test_load_empty_files(self): 'cluster_data/comment_only.tsv: no header 
line found', ]) with self.assertRaisesRegex(RuntimeError, err_str): - parser.load_cluster_data() + parser.load_clusters() def test_load_missing_files(self): """ test loading when files cannot be found """ @@ -119,7 +132,7 @@ def test_load_invalid_nodes(self): # invalid node type node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas" with self.assertRaisesRegex(RuntimeError, node_err_msg): - parser.load_node_metadata() + parser.load_nodes() def test_load_invalid_clusters(self): """ test file format errors """ @@ -131,7 +144,7 @@ def test_load_invalid_clusters(self): # invalid node type cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match" with self.assertRaisesRegex(RuntimeError, cluster_err_msg): - parser.load_cluster_data() + parser.load_clusters() def test_load_col_count_errors(self): """ test files with invalid numbers of columns """ @@ -148,7 +161,7 @@ def test_load_col_count_errors(self): # too many cols node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22' with self.assertRaisesRegex(RuntimeError, node_err_msg): - parser.load_node_metadata() + parser.load_nodes() def test_load_valid_edge_data(self): @@ -169,8 +182,8 @@ def test_load_valid_node_metadata(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - node_metadata = parser.load_node_metadata() - expected = self.json_data["load_node_metadata"] + node_metadata = parser.load_nodes() + expected = self.json_data["load_nodes"] for data_structure in [node_metadata, expected]: for k in data_structure.keys(): @@ -184,10 +197,10 @@ def test_load_valid_cluster_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - cluster_data = parser.load_cluster_data() + cluster_data = parser.load_clusters() self.assertEqual( cluster_data, - 
self.json_data["load_cluster_data"] + self.json_data["load_clusters"] ) def test_duplicate_edge_data(self): @@ -211,7 +224,7 @@ def test_duplicate_node_data(self): err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080" with self.assertRaisesRegex(RuntimeError, err_msg): - parser.load_node_metadata() + parser.load_nodes() def test_duplicate_cluster_data(self): """ test files with duplicate cluster data, which should be seamlessly merged """ @@ -220,10 +233,10 @@ def test_duplicate_cluster_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - cluster_data = parser.load_cluster_data() + cluster_data = parser.load_clusters() self.assertEqual( cluster_data, - self.json_data["load_cluster_data"] + self.json_data["load_clusters"] ) def test_the_full_shebang(self): diff --git a/importers/utils/config.py b/importers/utils/config.py index 878b9fee..1f30080e 100644 --- a/importers/utils/config.py +++ b/importers/utils/config.py @@ -20,8 +20,7 @@ def load_from_env(extra_required=None, extra_optional=None, prefix='RES_'): optional = list(OPTIONAL) + (extra_optional or []) for field in required: if (prefix + field) not in os.environ: - print(f"Missing required env var: {prefix + field}") - exit(1) + raise RuntimeError(f"Missing required env var: {prefix + field}") for field in required + optional: if (prefix + field) in os.environ: conf[field] = os.environ[prefix + field] diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json index 9eefcebc..fe9a613c 100644 --- a/spec/test/djornl/results.json +++ b/spec/test/djornl/results.json @@ -25,7 +25,7 @@ {"_key": "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8", "_from": "djornl_node/AT1G01080", "_to": "djornl_node/AT1G01090", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.8} ] }, - "load_cluster_data": { + "load_clusters": { "nodes": [ {"_key": "AT1G01010", "clusters": ["markov_i2:1", 
"markov_i6:3"]}, {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]}, @@ -38,7 +38,7 @@ {"_key": "AT1G01080", "clusters": ["markov_i4:3"]} ] }, - "load_node_metadata": { + "load_nodes": { "nodes": [ {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_description": "", "pheno_pto_name": "bacterial disease resistance", "pheno_pto_description": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}, {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_description": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "arsenic concentration", "pheno_pto_description": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. 
al, Nature 2010", "user_notes": ""}, diff --git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py index cde4d0c2..befe46cd 100644 --- a/spec/test/stored_queries/test_djornl.py +++ b/spec/test/stored_queries/test_djornl.py @@ -49,11 +49,11 @@ def setUpClass(cls): r = create_test_docs(edge_name, edge_data['edges']) print_db_update(r, edge_name) - node_metadata = parser.load_node_metadata() + node_metadata = parser.load_nodes() r = create_test_docs(node_name, node_metadata['nodes'], True) print_db_update(r, node_name) - cluster_data = parser.load_cluster_data() + cluster_data = parser.load_clusters() r = create_test_docs(node_name, cluster_data['nodes'], True) print_db_update(r, node_name) @@ -93,7 +93,7 @@ def test_fetch_all(self): # ensure that all the cluster data is returned OK node_data = response['results'][0]['nodes'] - expected_node_data = self.json_data['load_cluster_data']['nodes'] + expected_node_data = self.json_data['load_clusters']['nodes'] self.assertEqual( {n['_key']: n['clusters'] for n in node_data if 'clusters' in n}, {n['_key']: n['clusters'] for n in expected_node_data if 'clusters' in n},