From 855152d8b8a805fba9610e6c637ff653ff7346a3 Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Thu, 20 Aug 2020 10:27:22 -0700
Subject: [PATCH 1/4] - use manifest file to specify files to be included in
 release - update DJORNL parser to apply manifest file - add tests for
 manifest file validation - small refactor of parser to apply the same QC to
 every file - small formatting updates to DJORNL source files

---
 CHANGELOG.md                                  |  23 ++
 importers/djornl/parser.py                    | 357 ++++++++++++------
 importers/test/test_djornl_parser.py          |  46 +--
 spec/README.md                                |   1 +
 spec/collections/djornl/djornl_edge.yaml      |   5 +
 spec/collections/djornl/djornl_node.yaml      |  18 +-
 spec/datasets/djornl/manifest.schema.json     | 102 +++++
 .../djornl/djornl_fetch_clusters.yaml         |   1 +
 .../djornl/djornl_search_nodes.yaml           |  16 +-
 ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv |   2 +-
 .../djornl/col_count_errors/manifest.yaml     |   8 +
 spec/test/djornl/empty_files/manifest.yaml    |  20 +
 spec/test/djornl/invalid_file/edges.tsv/empty |   0
 spec/test/djornl/invalid_file/manifest.yaml   |  12 +
 .../invalid_manifest/cluster_no_prefix.yaml   |  18 +
 .../invalid_manifest/date_not_in_quotes.yaml  |  17 +
 .../invalid_manifest/invalid_format.yaml      |  17 +
 .../djornl/invalid_manifest/manifest.yaml     |  14 +
 .../djornl/invalid_manifest/missing_path.yaml |  21 ++
 .../invalid_manifest/no_file_format.yaml      |  16 +
 .../djornl/invalid_manifest/no_file_list.yaml |   3 +
 ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv |   3 +-
 spec/test/djornl/invalid_types/manifest.yaml  |   8 +
 .../merged_edges-AMW-060820_AF.tsv            |   1 +
 spec/test/djornl/missing_files/manifest.yaml  |  12 +
 spec/test/djornl/results.json                 |  28 +-
 spec/test/djornl/test_data/I2_named.tsv       |   9 +
 spec/test/djornl/test_data/I4_named.tsv       |   4 +
 spec/test/djornl/test_data/I6_named.tsv       |   8 +
 ...p10percent_anno_AF_082919.abc.I2_named.tsv |   5 -
 ...p10percent_anno_AF_082919.abc.I4_named.tsv |   1 -
 ...p10percent_anno_AF_082919.abc.I6_named.tsv |   4 -
 ...rged_edges-AMW-060820_AF.tsv => edges.tsv} |   2 -
 spec/test/djornl/test_data/extra_node.tsv     |   3 +
 .../test/djornl/test_data/hithruput-edges.csv |   3 +
 spec/test/djornl/test_data/manifest.yaml      |  35 ++
 ...-AMW-v2_091319_nodeTable.csv => nodes.csv} |   8 +-
 spec/test/djornl/test_data/pheno_nodes.csv    |   5 +
 .../djornl/valid_manifest/no_file_ext.yaml    |  40 ++
 .../djornl/valid_manifest/no_file_format.yaml |  39 ++
 .../valid_manifest/with_descriptions.yaml     |  39 ++
 spec/test/stored_queries/test_djornl.py       |   4 +-
 spec/test/test_manifest_schema.py             |  82 ++++
 spec/views/djornl/djornl_node_view.json       |  14 +-
 44 files changed, 877 insertions(+), 197 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 spec/datasets/djornl/manifest.schema.json
 create mode 100644 spec/test/djornl/col_count_errors/manifest.yaml
 create mode 100644 spec/test/djornl/empty_files/manifest.yaml
 create mode 100644 spec/test/djornl/invalid_file/edges.tsv/empty
 create mode 100644 spec/test/djornl/invalid_file/manifest.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/invalid_format.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/manifest.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/missing_path.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/no_file_format.yaml
 create mode 100644 spec/test/djornl/invalid_manifest/no_file_list.yaml
 create mode 100644 spec/test/djornl/invalid_types/manifest.yaml
 create mode 100644 spec/test/djornl/missing_files/manifest.yaml
 create mode 100644 spec/test/djornl/test_data/I2_named.tsv
 create mode 100644 spec/test/djornl/test_data/I4_named.tsv
 create mode 100644 spec/test/djornl/test_data/I6_named.tsv
 delete mode 100644 spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
 delete mode 100644 spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
 delete mode 100644 spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
 rename spec/test/djornl/test_data/{merged_edges-AMW-060820_AF.tsv => edges.tsv} (80%)
 create mode 100644 spec/test/djornl/test_data/extra_node.tsv
 create mode 100644 spec/test/djornl/test_data/hithruput-edges.csv
 create mode 100644 spec/test/djornl/test_data/manifest.yaml
 rename spec/test/djornl/test_data/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (72%)
 create mode 100644 spec/test/djornl/test_data/pheno_nodes.csv
 create mode 100644 spec/test/djornl/valid_manifest/no_file_ext.yaml
 create mode 100644 spec/test/djornl/valid_manifest/no_file_format.yaml
 create mode 100644 spec/test/djornl/valid_manifest/with_descriptions.yaml
 create mode 100644 spec/test/test_manifest_schema.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..ad4fa3fc
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,23 @@
+## 0.0.6
+
+### `relation_engine_server`
+
+- `relation_engine_server/api_versions/api_v1.py`: add and/or document API endpoints:
+  - /api/v1/specs/stored_queries
+  - /api/v1/specs/data_sources
+
+- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file
+
+### `importers`
+
+- `importers/djornl`: use manifest file to specify the list of files to be parsed to create a dataset. Refactor file parsing for more flexibility.
+
+### `spec`
+
+- add `datasets` folder for dataset-specific schemas
+
+----
+
+## 0.0.5
+
+Last release with RE components in two repositories, https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec
diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index 771b359a..928b55fe 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -8,8 +8,10 @@
 import requests
 import os
 import csv
+import yaml
 
 import importers.utils.config as config
+from relation_engine_server.utils.json_validation import run_validator
 
 
 class DJORNL_Parser(object):
@@ -28,45 +30,119 @@ def _configure(self):
         configuration = config.load_from_env(extra_required=['ROOT_DATA_PATH'])
 
         # Collection name config
-        configuration['_NODE_NAME'] = 'djornl_node'
-        configuration['_EDGE_NAME'] = 'djornl_edge'
+        configuration['node_name'] = 'djornl_node'
+        configuration['edge_name'] = 'djornl_edge'
 
-        # Path config
-        configuration['_NODE_PATH'] = os.path.join(
-            configuration['ROOT_DATA_PATH'],
-            'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv'
-        )
-        configuration['_NODE_FILE_COL_COUNT'] = 20
+        # fetch the manifest and make sure all the files listed actually exist
+        manifest = self._get_manifest(configuration)
+        for type in ['node', 'edge', 'cluster']:
+            configuration[type + '_files'] = []
+
+        error_list = []
+        for file in manifest['file_list']:
+            file_path = os.path.join(configuration['ROOT_DATA_PATH'], file['path'])
+
+            if not os.path.exists(file_path):
+                error_list.append(f"{file_path}: file does not exist")
+                continue
+
+            if not os.path.isfile(file_path):
+                error_list.append(f"{file_path}: not a file")
+                continue
+
+            # add the file to the appropriate list
+            file['file_path'] = file_path
+            configuration[file['data_type'] + '_files'].append(file)
+
+        if error_list:
+            raise RuntimeError("\n".join(error_list))
 
-        configuration['_EDGE_PATH'] = os.path.join(
-            configuration['ROOT_DATA_PATH'],
-            'merged_edges-AMW-060820_AF.tsv'
-        )
-        configuration['_EDGE_FILE_COL_COUNT'] = 5
-
-        _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
-        configuration['_CLUSTER_PATHS'] = {
-            'markov_i2': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
-            ),
-            'markov_i4': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
-            ),
-            'markov_i6': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
-            ),
-        }
         self._config = configuration
         return self._config
 
+    def _get_manifest_schema_file(self):
+        """Return the hard-coded path (under /app/spec/datasets/djornl) of the manifest JSON schema."""
+        return os.path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json')
+
+    def _get_manifest(self, configuration):
+        """
+        Read the manifest file, which contains path and file type info, and validate it.
+        The manifest is expected to be at ROOT_DATA_PATH/manifest.yaml. Returns the result
+        of run_validator; raises RuntimeError if the file is missing or fails validation.
+        """
+        schema_file = self._get_manifest_schema_file()
+
+        # load the manifest and validate it against the schema
+        manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml')
+
+        try:
+            with open(manifest_file) as fd:
+                manifest = yaml.safe_load(fd)
+        except FileNotFoundError:
+            raise RuntimeError(
+                f"No manifest file found at {manifest_file}.\n"
+                + "Please ensure that you have created a manifest that lists the files "
+                + "in the release"
+            )
+
+        try:
+            validated_manifest = run_validator(
+                schema_file=schema_file,
+                data=manifest
+            )
+        except Exception as err:  # any failure raised by run_validator is treated as invalid
+            print(err)  # the RuntimeError below is generic, so print the underlying error
+            raise RuntimeError(
+                "The manifest file failed validation. Please recheck the file and try again."
+            )
+
+        return validated_manifest
+
+    def _get_file_reader(self, fd, file):
+        '''Return a csv.reader for fd; file['file_format'] wins, else the extension of file['path']'''
+        if 'file_format' in file:
+            is_csv = file['file_format'].lower() == 'csv'
+        else:
+            is_csv = file['path'].lower().endswith('.csv')
+        return csv.reader(fd, delimiter=',' if is_csv else '\t')
+
+    def parser_gen(self, file):
+        """generator yielding (line_no, cols) per row; the first row yielded is the lowercased header"""
+        expected_col_count = 0
+        with open(file['file_path']) as fd:
+            csv_reader = self._get_file_reader(fd, file)
+            line_no = 0
+            for row in csv_reader:
+                line_no += 1
+                if len(row) <= 1 or row[0].startswith('#'):
+                    # comment / metadata, blank line, or single-column row: skip it
+                    continue
+
+                cols = [c.strip() for c in row]
+
+                if len(cols) == expected_col_count:
+                    yield (line_no, cols)
+                    continue
+
+                # if we didn't get the expected number of cols:
+                if expected_col_count == 0:
+                    # this is the header row; set up the expected column count
+                    expected_col_count = len(cols)
+                    yield (line_no, [c.lower() for c in cols])
+                    continue
+
+                # otherwise, this row does not have the correct number of columns
+                n_cols = len(cols)
+                raise RuntimeError(
+                    f"{file['path']} line {line_no}: "
+                    + f"expected {expected_col_count} cols, found {n_cols}"
+                )
+
     def load_edges(self):
         # Headers and sample row:
         # node1	node2	edge	edge_descrip	layer_descrip
         # AT1G01370	AT1G57820	4.40001558779779	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
-        edge_remap = {
+        edge_type_remap = {
             'AraGWAS-Phenotype_Associations': 'pheno_assn',
             'AraNetv2-CX_pairwise-gene-coexpression': 'gene_coexpr',
             'AraNetv2-DC_domain-co-occurrence': 'domain_co_occur',
@@ -77,34 +153,56 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config('_NODE_NAME')
-        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')
+        node_name = self.config('node_name')
+
+        def edge_type(row):
+            if row['layer_descrip'] not in edge_type_remap:
+                raise RuntimeError(
+                    f"{file['path']} line {line_no}: invalid edge type: {row['layer_descrip']}"
+                )
+            return edge_type_remap[row['layer_descrip']]
+
+        def _key(row):
+            return '__'.join([
+                row['node1'],
+                row['node2'],
+                edge_type(row),
+                row['edge'],
+            ])
+
+        # these functions remap the values in the columns of the input file to
+        # appropriate values to go into Arango
+        remap_functions = {
+            '_from': lambda row: node_name + '/' + row['node1'],
+            '_to': lambda row: node_name + '/' + row['node2'],
+            'score': lambda row: float(row['edge']),
+            'edge_type': edge_type,
+            '_key': _key,
+        }
 
-        with open(self.config('_EDGE_PATH')) as fd:
-            csv_reader = csv.reader(fd, delimiter='\t')
-            next(csv_reader, None)  # skip headers
-            line_no = 1
-            for row in csv_reader:
-                line_no += 1
+        for file in self.config('edge_files'):
+            file_parser = self.parser_gen(file)
+            headers = []
 
-                cols = [c.strip() for c in row]
-                if len(cols) != expected_col_count:
-                    n_cols = len(cols)
-                    raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")
-
-                node_ix[cols[0]] = 1
-                node_ix[cols[1]] = 1
-                edge_type = cols[4]
-                if edge_type not in edge_remap:
-                    raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}")
-
-                edges.append({
-                    '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}',
-                    '_from': f'{node_name}/{cols[0]}',
-                    '_to': f'{node_name}/{cols[1]}',
-                    'score': float(cols[2]),
-                    'edge_type': edge_remap[edge_type],
-                })
+            while True:
+                try:
+                    (line_no, cols) = next(file_parser)
+                except StopIteration:
+                    break
+
+                if len(headers) == 0:
+                    headers = cols
+                    continue
+
+                # merge headers with cols to create an object
+                row_object = dict(zip(headers, cols))
+                # transform it using the remap_functions
+                datum = {key: func(row_object) for (key, func) in remap_functions.items()}
+                edges.append(datum)
+
+                # keep track of the nodes mentioned in this edge set
+                for node in ["1", "2"]:
+                    node_ix[row_object[f"node{node}"]] = 1
 
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
@@ -116,49 +214,69 @@ def load_node_metadata(self):
 
         nodes = []
         valid_node_types = ['gene', 'pheno']
-        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
-        with open(self.config('_NODE_PATH')) as fd:
-            csv_reader = csv.reader(fd, delimiter=',')
-            next(csv_reader, None)  # skip headers
-            line_no = 1
-            for row in csv_reader:
-                line_no += 1
 
-                cols = [c.strip() for c in row]
-                if len(cols) != expected_col_count:
-                    n_cols = len(cols)
-                    raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")
-
-                _key = cols[0]
-                node_type = cols[1]
-                if node_type not in valid_node_types:
-                    raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")
-
-                go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
-
-                doc = {
-                    '_key': _key,
-                    'node_type': node_type,
-                    'transcript': cols[2],
-                    'gene_symbol': cols[3],
-                    'gene_full_name': cols[4],
-                    'gene_model_type': cols[5],
-                    'tair_computational_desc': cols[6],
-                    'tair_curator_summary': cols[7],
-                    'tair_short_desc': cols[8],
-                    'go_descr': cols[9],
-                    'go_terms': go_terms,
-                    'mapman_bin': cols[11],
-                    'mapman_name': cols[12],
-                    'mapman_desc': cols[13],
-                    'pheno_aragwas_id': cols[14],
-                    'pheno_desc1': cols[15],
-                    'pheno_desc2': cols[16],
-                    'pheno_desc3': cols[17],
-                    'pheno_ref': cols[18],
-                    'user_notes': cols[19],
-                }
-                nodes.append(doc)
+        def go_terms(row):  # split the comma-separated 'go_terms' column into stripped terms
+            if len(row['go_terms']):
+                return [c.strip() for c in row['go_terms'].split(',')]
+            return []
+
+        def node_type(row):
+            if row['node_type'] not in valid_node_types:
+                raise RuntimeError(
+                    f"{file['path']} line {line_no}: invalid node type: {row['node_type']}"
+                )
+            return row['node_type']
+
+        remap_functions = {
+            # these pass straight through
+            'transcript': None,
+            'gene_symbol': None,
+            'gene_full_name': None,
+            'gene_model_type': None,
+            'tair_computational_description': None,
+            'tair_short_description': None,
+            'tair_curator_summary': None,
+            'mapman_bin': None,
+            'mapman_name': None,
+            'pheno_aragwas_id': None,
+            'pheno_ref': None,
+            'user_notes': None,
+            # rename
+            '_key': lambda row: row['node_id'],
+            'go_description': lambda row: row['go_descr'],
+            'mapman_description': lambda row: row['mapman_descr'],
+            'pheno_description': lambda row: row['pheno_descrip1'],
+            'pheno_pto_name': lambda row: row['pheno_descrip2'],
+            'pheno_pto_description': lambda row: row['pheno_descrip3'],
+            # see functions above
+            'node_type': node_type,
+            'go_terms': go_terms,
+        }
+
+        for file in self.config('node_files'):
+            file_parser = self.parser_gen(file)
+            headers = []
+
+            while True:
+                try:
+                    (line_no, cols) = next(file_parser)
+                except StopIteration:
+                    break
+
+                if len(headers) == 0:
+                    headers = cols
+                    continue
+
+                # merge with headers to form an object, then remap to create Arango-ready data
+                row_object = dict(zip(headers, cols))
+
+                datum = {}
+                for (key, func) in remap_functions.items():
+                    if func is None:
+                        datum[key] = row_object[key]
+                    else:
+                        datum[key] = func(row_object)
+                nodes.append(datum)
 
         return {'nodes': nodes}
 
@@ -167,22 +285,29 @@ def load_cluster_data(self):
 
         # index of nodes
         node_ix = {}
-
-        cluster_paths = self.config('_CLUSTER_PATHS')
-        for (cluster_label, path) in cluster_paths.items():
-            with open(path) as fd:
-                csv_reader = csv.reader(fd, delimiter='\t')
-                for row in csv_reader:
-                    if len(row) > 1:
-                        # remove the 'Cluster' text and replace it with cluster_label
-                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
-
-                        node_keys = row[1:]
-                        for key in node_keys:
-                            if key not in node_ix:
-                                node_ix[key] = [cluster_id]
-                            elif cluster_id not in node_ix[key]:
-                                node_ix[key].append(cluster_id)
+        for file in self.config('cluster_files'):
+            cluster_label = file['cluster_prefix']
+            headers = []
+            file_parser = self.parser_gen(file)
+
+            while True:
+                try:
+                    (line_no, cols) = next(file_parser)
+                except StopIteration:
+                    break
+
+                if len(headers) == 0:
+                    headers = cols
+                    continue
+
+                # remove the 'Cluster' text and replace it with cluster_label
+                cluster_id = cluster_label + ':' + cols[0].replace('Cluster', '')
+                node_keys = [n.strip() for n in cols[1].split(',')]
+                for key in node_keys:
+                    if key not in node_ix:
+                        node_ix[key] = [cluster_id]
+                    elif cluster_id not in node_ix[key]:
+                        node_ix[key].append(cluster_id)
 
         # gather a list of cluster IDs for each node
         nodes = [{
@@ -195,10 +320,10 @@ def load_cluster_data(self):
     def save_dataset(self, dataset):
 
         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])
+            self.save_docs(self.config('node_name'), dataset['nodes'])
 
         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])
+            self.save_docs(self.config('edge_name'), dataset['edges'])
 
     def save_docs(self, coll_name, docs, on_dupe='update'):
 
diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py
index 2ad9184e..e2442c8d 100644
--- a/importers/test/test_djornl_parser.py
+++ b/importers/test/test_djornl_parser.py
@@ -33,18 +33,29 @@ def init_parser_with_path(self, root_path):
             parser._configure()
             return parser
 
+    def test_load_no_manifest(self):
+        """ test loading when the manifest does not exist """
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest')
+        err_str = 'No manifest file found at ' + os.path.join(RES_ROOT_DATA_PATH, 'manifest.yaml')
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+    def test_load_invalid_manifest(self):
+        """ test an invalid manifest file """
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_manifest')
+        err_str = "The manifest file failed validation"
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
     def test_load_invalid_file(self):
         """ test loading when what is supposed to be a file is actually a directory """
 
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
 
         # edges: directory, not a file
-        edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv")
-        err_str = f"Is a directory: '{edges_file_path}'"
-        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
-
-        with self.assertRaisesRegex(IsADirectoryError, err_str):
-            parser.load_edges()
+        err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file"
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
     def test_load_empty_files(self):
         """ test loading files containing no data """
@@ -60,20 +71,11 @@ def test_load_empty_files(self):
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
 
-        # this dir does not contain the correct file structure
-        # path: test/djornl/empty_files/cluster_data
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files', 'cluster_data')
-        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
-
-        err_str = "No such file or directory: '" + RES_ROOT_DATA_PATH
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_edges()
-
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_node_metadata()
-
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_cluster_data()
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'missing_files')
+        # not found
+        err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ': file does not exist'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
     def test_load_invalid_types(self):
         """ test file format errors """
@@ -83,12 +85,12 @@ def test_load_invalid_types(self):
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
         # invalid edge type
-        edge_err_msg = 'line 2: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
+        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
         # invalid node type
-        node_err_msg = 'line 4: invalid node type: Monkey'
+        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
diff --git a/spec/README.md b/spec/README.md
index 7eb975bc..27b2c8ee 100644
--- a/spec/README.md
+++ b/spec/README.md
@@ -7,6 +7,7 @@ These specifications are used by the [Relation Engine API](relation_engine_serve
 * **[Stored queries](spec/stored_queries)** are stored [AQL queries](https://docs.arangodb.com/3.5/AQL/index.html) that can be used
 by KBase apps to fetch data from the database.
 * **[Collections, or document schemas,](spec/collections)** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in the database's collections.
+* **[Datasets](spec/datasets)** contain partial and full schemas specific to a certain dataset.
 * **[Data sources](spec/data_sources)** contain general information about where some of our imported data comes from.
 * **[Views](spec/views)** are raw ArangoSearch view configuration files
 
diff --git a/spec/collections/djornl/djornl_edge.yaml b/spec/collections/djornl/djornl_edge.yaml
index 3afa987f..1c956fdc 100644
--- a/spec/collections/djornl/djornl_edge.yaml
+++ b/spec/collections/djornl/djornl_edge.yaml
@@ -33,15 +33,20 @@ schema:
       type: string
       oneOf:
         - const: domain_co_occur
+          title: AraNetv2-DC_domain-co-occurrence
           description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
         - const: gene_coexpr
+          title: AraNetv2-CX_pairwise-gene-coexpression
           description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
             calculated from Pearson correlation coefficients to normalize the data
             for comparison across studies and different types of data layers (Lee et
             al, 2015).
         - const: pheno_assn
+          title: AraGWAS-Phenotype_Associations
           description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
         - const: ppi_hithru
+          title: AraNetv2-HT_high-throughput-ppi
           description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
         - const: ppi_liter
+          title: AraNetv2-LC_lit-curated-ppi
           description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml
index 13b14541..e0a744a7 100644
--- a/spec/collections/djornl/djornl_node.yaml
+++ b/spec/collections/djornl/djornl_node.yaml
@@ -46,7 +46,7 @@ schema:
       type: string
       title: Gene model type
       examples: ["protein_coding"]
-    tair_computational_desc:
+    tair_computational_description:
       type: string
       title: TAIR computational description
       examples: ["NAC domain containing protein 1;(source:Araport11)"]
@@ -54,11 +54,11 @@ schema:
       type: string
       title: TAIR curator summary
       examples: ["Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed."]
-    tair_short_desc:
+    tair_short_description:
       type: string
       title: TAIR short description
       examples: ["NAC domain containing protein 1"]
-    go_descr:
+    go_description:
       type: string
       title: GO descriptions
       examples: ["DNA-binding transcription factor activity"]
@@ -75,7 +75,7 @@ schema:
       type: string
       title: Mapman name
       examples: [".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)"]
-    mapman_desc:
+    mapman_description:
       type: string
       title: Mapman description
       examples: ["transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])"]
@@ -83,17 +83,19 @@ schema:
       type: string
       title: AraGWAS ID
       examples: ["10.21958/phenotype:67"]
-    pheno_desc1:
+    pheno_description:
       type: string
-      title: Phenotype description 1
+      title: Phenotype description
       examples: ["Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008"]
-    pheno_desc2:
+    pheno_pto_name:
       type: string
       title: PTO name
+      description: Plant Trait Ontology name
       examples: ["arsenic concentration"]
-    pheno_desc3:
+    pheno_pto_description:
       type: string
       title: PTO description
+      description: Plant Trait Ontology description
       examples: ["A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]"]
     pheno_ref:
       type: string
diff --git a/spec/datasets/djornl/manifest.schema.json b/spec/datasets/djornl/manifest.schema.json
new file mode 100644
index 00000000..beb7309f
--- /dev/null
+++ b/spec/datasets/djornl/manifest.schema.json
@@ -0,0 +1,102 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Exascale parser file manifest",
+  "type": "object",
+  "required": ["name", "file_list"],
+  "properties": {
+    "name": {
+      "title": "Dataset name",
+      "type": "string",
+      "description": "The name of the dataset",
+      "examples": ["Dan Jacobson Exascale dataset"]
+    },
+    "release_date": {
+      "title": "Release date",
+      "type": "string",
+      "description": "Date of the dataset release, in YYYY-MM-DD format",
+      "format": "date"
+    },
+    "notes": {
+      "type": "string",
+      "title": "Release notes",
+      "description": "Free text describing the release, plus any notes or comments relevant to consumers of the data."
+    },
+    "file_list": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["data_type", "path"],
+        "oneOf": [{
+            "properties": {
+              "data_type": {
+                "enum": ["cluster"]
+              }
+            },
+            "required": ["cluster_prefix"]
+          },
+          {
+            "properties": {
+              "data_type": {
+                "enum": ["node", "edge"]
+              }
+            }
+          }
+        ],
+        "anyOf": [{
+            "properties": {
+              "file_format": {
+                "enum": ["tsv", "csv"]
+              }
+            },
+            "required": ["file_format"]
+          },
+          {
+            "properties": {
+              "path": {
+                "format": "regex",
+                "pattern": ".[ct]sv"
+              }
+            },
+            "required": ["path"]
+          }
+        ],
+        "properties": {
+          "path": {
+            "title": "File path",
+            "type": "string"
+          },
+          "data_type": {
+            "title": "Data type",
+            "type": "string",
+            "enum": ["node", "edge", "cluster"]
+          },
+          "file_format": {
+            "title": "File format",
+            "type": "string",
+            "enum": ["tsv", "csv"]
+          },
+          "date": {
+            "title": "File creation date",
+            "description": "Date of file creation, in YYYY-MM-DD format",
+            "type": "string",
+            "format": "date"
+          },
+          "description": {
+            "title": "Description of the file contents",
+            "type": "string"
+          },
+          "cluster_prefix": {
+            "title": "Prefix",
+            "type": "string",
+            "description": "The prefix to be used for clusters, e.g. markov_i2 (as in the cluster ID markov_i2:4). Required for cluster data; not used for node or edge data."
+          },
+          "cluster_title": {
+            "title": "Cluster set name",
+            "description": "Human-readable name of the cluster set. Not used for edge or node data",
+            "type": "string"
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml
index 1fadca36..4aa6070e 100644
--- a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml
+++ b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml
@@ -5,6 +5,7 @@ params:
   required: [cluster_ids]
   properties:
     cluster_ids:
+      type: array
       title: Cluster IDs
       description: Cluster IDs, in the form "clustering_system_name:cluster_id"
       items: {type: string}
diff --git a/spec/stored_queries/djornl/djornl_search_nodes.yaml b/spec/stored_queries/djornl/djornl_search_nodes.yaml
index 7cd36c4f..9c8d6a1d 100644
--- a/spec/stored_queries/djornl/djornl_search_nodes.yaml
+++ b/spec/stored_queries/djornl/djornl_search_nodes.yaml
@@ -19,15 +19,15 @@ query: |
   LET node_ids = (
     FOR g IN djornl_node_view
         SEARCH ANALYZER(
-            PHRASE(g.tair_computational_desc, @search_text) OR
-            PHRASE(g.tair_short_desc, @search_text) OR
-            PHRASE(g.mapman_desc, @search_text) OR
-            PHRASE(g.go_desc, @search_text) OR
+            PHRASE(g.tair_computational_description, @search_text) OR
+            PHRASE(g.tair_short_description, @search_text) OR
+            PHRASE(g.mapman_description, @search_text) OR
+            PHRASE(g.go_description, @search_text) OR
             PHRASE(g.mapman_name, @search_text) OR
-            PHRASE(g.mapman_desc, @search_text) OR
-            PHRASE(g.pheno_desc1, @search_text) OR
-            PHRASE(g.pheno_desc2, @search_text) OR
-            PHRASE(g.pheno_desc3, @search_text) OR
+            PHRASE(g.mapman_description, @search_text) OR
+            PHRASE(g.pheno_description, @search_text) OR
+            PHRASE(g.pheno_pto_name, @search_text) OR
+            PHRASE(g.pheno_pto_description, @search_text) OR
             PHRASE(g.user_notes, @search_text),
             'text_en'
         )
diff --git a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
index 48e5ab19..7727cd8a 100644
--- a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
@@ -1,4 +1,4 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
 AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,,
diff --git a/spec/test/djornl/col_count_errors/manifest.yaml b/spec/test/djornl/col_count_errors/manifest.yaml
new file mode 100644
index 00000000..50c5f454
--- /dev/null
+++ b/spec/test/djornl/col_count_errors/manifest.yaml
@@ -0,0 +1,8 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+
+  - data_type: node
+    path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
diff --git a/spec/test/djornl/empty_files/manifest.yaml b/spec/test/djornl/empty_files/manifest.yaml
new file mode 100644
index 00000000..fb231666
--- /dev/null
+++ b/spec/test/djornl/empty_files/manifest.yaml
@@ -0,0 +1,20 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+
+  - data_type: node
+    path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
diff --git a/spec/test/djornl/invalid_file/edges.tsv/empty b/spec/test/djornl/invalid_file/edges.tsv/empty
new file mode 100644
index 00000000..e69de29b
diff --git a/spec/test/djornl/invalid_file/manifest.yaml b/spec/test/djornl/invalid_file/manifest.yaml
new file mode 100644
index 00000000..d79e76a7
--- /dev/null
+++ b/spec/test/djornl/invalid_file/manifest.yaml
@@ -0,0 +1,12 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+
+  - data_type: node
+    path: nodes.csv
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: clusters.tsv
diff --git a/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml
new file mode 100644
index 00000000..b8993731
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/cluster_no_prefix.yaml
@@ -0,0 +1,18 @@
+# first cluster file has no prefix
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+    date: "2020-12-25"
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    path: I2_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: I4_named.tsv
diff --git a/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml
new file mode 100644
index 00000000..d40e9e6f
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/date_not_in_quotes.yaml
@@ -0,0 +1,17 @@
+# edge date is not quoted (pyyaml creates a datetime.date object)
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    date: 2020-12-25
+    path: edge_data
+    file_format: csv
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    file_format: tsv
+    path: clusters
diff --git a/spec/test/djornl/invalid_manifest/invalid_format.yaml b/spec/test/djornl/invalid_manifest/invalid_format.yaml
new file mode 100644
index 00000000..125227ce
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/invalid_format.yaml
@@ -0,0 +1,17 @@
+# invalid node file format
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    date: "2020-12-25"
+    path: edge_data.tsv
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+    file_format: txt
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    file_format: tsv
+    path: clusters
diff --git a/spec/test/djornl/invalid_manifest/manifest.yaml b/spec/test/djornl/invalid_manifest/manifest.yaml
new file mode 100644
index 00000000..7abfa0e2
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/manifest.yaml
@@ -0,0 +1,14 @@
+# multiple errors
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+
+  - data_type: node
+
+  - data_type: cluster
+    path: clusters.tsv
+
+  - data_type: ping-pong balls
+    path: where?
diff --git a/spec/test/djornl/invalid_manifest/missing_path.yaml b/spec/test/djornl/invalid_manifest/missing_path.yaml
new file mode 100644
index 00000000..c93bec17
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/missing_path.yaml
@@ -0,0 +1,21 @@
+# edge file path missing
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    date: "2020-12-25"
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: I4_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: I6_named.tsv
diff --git a/spec/test/djornl/invalid_manifest/no_file_format.yaml b/spec/test/djornl/invalid_manifest/no_file_format.yaml
new file mode 100644
index 00000000..ee8b9082
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/no_file_format.yaml
@@ -0,0 +1,16 @@
+# edge file has no indicator of file format
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    date: "2020-12-25"
+    path: edge_data
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    file_format: tsv
+    path: clusters
diff --git a/spec/test/djornl/invalid_manifest/no_file_list.yaml b/spec/test/djornl/invalid_manifest/no_file_list.yaml
new file mode 100644
index 00000000..ebaf9fb8
--- /dev/null
+++ b/spec/test/djornl/invalid_manifest/no_file_list.yaml
@@ -0,0 +1,3 @@
+# missing file_list
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
diff --git a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
index af5fa6cb..e98310e5 100644
--- a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
@@ -1,4 +1,5 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
+# data_type: node
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
 AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,,
diff --git a/spec/test/djornl/invalid_types/manifest.yaml b/spec/test/djornl/invalid_types/manifest.yaml
new file mode 100644
index 00000000..50c5f454
--- /dev/null
+++ b/spec/test/djornl/invalid_types/manifest.yaml
@@ -0,0 +1,8 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+
+  - data_type: node
+    path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
diff --git a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
index f9857bde..a98f49f9 100644
--- a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
+++ b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
@@ -1,3 +1,4 @@
+# data_type: edge
 node1	node2	edge	edge_descrip	layer_descrip
 As2	AT1G01020	8.422046084731258	AraGWAS-Association_score	AraGWAS-Some-Old-Rubbish-I-Made-Up
 As2	AT1G01040	5.422046084731258	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
diff --git a/spec/test/djornl/missing_files/manifest.yaml b/spec/test/djornl/missing_files/manifest.yaml
new file mode 100644
index 00000000..d79e76a7
--- /dev/null
+++ b/spec/test/djornl/missing_files/manifest.yaml
@@ -0,0 +1,12 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+
+  - data_type: node
+    path: nodes.csv
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: clusters.tsv
diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json
index 7fd3a4d5..e15408d2 100644
--- a/spec/test/djornl/results.json
+++ b/spec/test/djornl/results.json
@@ -40,20 +40,20 @@
   },
   "load_node_metadata": {
     "nodes": [
-      {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_desc1": "", "pheno_desc2": "bacterial disease resistance", "pheno_desc3": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
-      {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_desc1": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_desc2": "arsenic concentration", "pheno_desc3": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
-      {"_key": "AT1G01010", "node_type": "gene", "transcript": "AT1G01010.1", "gene_symbol": "NTL10", "gene_full_name": "NAC domain containing protein 1", "gene_model_type": "protein_coding", "tair_computational_desc": "NAC domain containing protein 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "NAC domain containing protein 1", "go_descr": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.17", "mapman_name": ".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)", "mapman_desc": "transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01020", "node_type": "gene", "transcript": "AT1G01020.6", "gene_symbol": "ARV1", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "ARV1 family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "molecular_function", "go_terms": ["GO:0003674"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01030", "node_type": "gene", "transcript": "AT1G01030.2", "gene_symbol": "NGA3", "gene_full_name": "NGATHA3", "gene_model_type": "protein_coding", "tair_computational_desc": "AP2/B3-like transcriptional factor family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.5.3", "mapman_name": ".RNA biosynthesis.transcriptional regulation.B3  transcription factor superfamily.transcription factor (RAV/NGATHA)", "mapman_desc": "transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01040", "node_type": "gene", "transcript": "AT1G01040.2", "gene_symbol": "SUS1", "gene_full_name": "SUSPENSOR 1", "gene_model_type": "protein_coding", "tair_computational_desc": "dicer-like 1;(source:Araport11)", "tair_curator_summary": "Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.", "tair_short_desc": "dicer-like 1", "go_descr": "metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding", "go_terms": ["GO:0046872", "GO:0005515", "GO:0004525", "GO:0008026", "GO:0005524", "GO:0003723", "GO:0004386", "GO:0003725", "GO:0003677"], "mapman_bin": "16.10.2.1.1", "mapman_name": ".RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1", "mapman_desc": "endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01050", "node_type": "gene", "transcript": "AT1G01050.2", "gene_symbol": "PPa1", "gene_full_name": "pyrophosphorylase 1", "gene_model_type": "protein_coding", "tair_computational_desc": "pyrophosphorylase 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "inorganic diphosphatase activity", "go_terms": ["GO:0004427"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01060", "node_type": "gene", "transcript": "AT1G01060.8", "gene_symbol": "LHY1", "gene_full_name": "LATE ELONGATED HYPOCOTYL 1", "gene_model_type": "protein_coding", "tair_computational_desc": "Homeodomain-like superfamily protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding", "go_terms": ["GO:0003700", "GO:0003677", "GO:0044212"], "mapman_bin": "27.1.1", "mapman_name": ".Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1)", "mapman_desc": "circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01070", "node_type": "gene", "transcript": "AT1G01070.2", "gene_symbol": "UMAMIT28", "gene_full_name": "Usually multiple acids move in and out Transporters 28", "gene_model_type": "protein_coding", "tair_computational_desc": "nodulin MtN21 /EamA-like transporter family protein;(source:Araport11)", "tair_curator_summary": "Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.", "tair_short_desc": "nodulin MtN21 /EamA-like transporter family protein", "go_descr": "L-glutamine transmembrane transporter activity", "go_terms": ["GO:0015186"], "mapman_bin": "24.2.1.5", "mapman_name": ".Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT)", "mapman_desc": "solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01080", "node_type": "gene", "transcript": "AT1G01080.3", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "RNA binding, mRNA binding", "go_terms": ["GO:0003723", "GO:0003729"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_desc": "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01090", "node_type": "gene", "transcript": "AT1G01090.1", "gene_symbol": "PDH-E1 ALPHA", "gene_full_name": "pyruvate dehydrogenase E1 alpha", "gene_model_type": "protein_coding", "tair_computational_desc": "pyruvate dehydrogenase E1 alpha;(source:Araport11)", "tair_curator_summary": "pyruvate dehydrogenase E1 alpha subunit", "tair_short_desc": "pyruvate dehydrogenase E1 alpha", "go_descr": "pyruvate dehydrogenase (acetyl-transferring) activity, protein binding", "go_terms": ["GO:0004739", "GO:0005515"], "mapman_bin": "5.1.2.2.1.1", "mapman_name": ".Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha", "mapman_desc": "subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "AT1G01100", "node_type": "gene", "transcript": "AT1G01100.4", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_desc": "60S acidic ribosomal protein family;(source:Araport11)", "tair_curator_summary": "", "tair_short_desc": "60S acidic ribosomal protein family", "go_descr": "structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity", "go_terms": ["GO:0003735", "GO:0043021", "GO:0030295"], "mapman_bin": "17.1.2.1.46", "mapman_name": ".Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1", "mapman_desc": "component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])", "pheno_aragwas_id": "", "pheno_desc1": "", "pheno_desc2": "", "pheno_desc3": "", "pheno_ref": "", "user_notes": ""},
-      {"_key": "Na23", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:5", "pheno_desc1": "Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_desc2": "sodium concentration", "pheno_desc3": "The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
-      {"_key": "SDV", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_desc": "", "tair_curator_summary": "", "tair_short_desc": "", "go_descr": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_desc": "", "pheno_aragwas_id": "10.21958/phenotype:104", "pheno_desc1": "Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200", "pheno_desc2": "days to flowering trait", "pheno_desc3": "A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}
+      {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_description": "", "pheno_pto_name": "bacterial disease resistance", "pheno_pto_description": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
+      {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_description": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "arsenic concentration", "pheno_pto_description": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
+      {"_key": "AT1G01010", "node_type": "gene", "transcript": "AT1G01010.1", "gene_symbol": "NTL10", "gene_full_name": "NAC domain containing protein 1", "gene_model_type": "protein_coding", "tair_computational_description": "NAC domain containing protein 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "NAC domain containing protein 1", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.17", "mapman_name": ".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)", "mapman_description": "transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01020", "node_type": "gene", "transcript": "AT1G01020.6", "gene_symbol": "ARV1", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "ARV1 family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "molecular_function", "go_terms": ["GO:0003674"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01030", "node_type": "gene", "transcript": "AT1G01030.2", "gene_symbol": "NGA3", "gene_full_name": "NGATHA3", "gene_model_type": "protein_coding", "tair_computational_description": "AP2/B3-like transcriptional factor family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding", "go_terms": ["GO:0003700", "GO:0003677"], "mapman_bin": "15.5.5.3", "mapman_name": ".RNA biosynthesis.transcriptional regulation.B3  transcription factor superfamily.transcription factor (RAV/NGATHA)", "mapman_description": "transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01040", "node_type": "gene", "transcript": "AT1G01040.2", "gene_symbol": "SUS1", "gene_full_name": "SUSPENSOR 1", "gene_model_type": "protein_coding", "tair_computational_description": "dicer-like 1;(source:Araport11)", "tair_curator_summary": "Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.", "tair_short_description": "dicer-like 1", "go_description": "metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding", "go_terms": ["GO:0046872", "GO:0005515", "GO:0004525", "GO:0008026", "GO:0005524", "GO:0003723", "GO:0004386", "GO:0003725", "GO:0003677"], "mapman_bin": "16.10.2.1.1", "mapman_name": ".RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1", "mapman_description": "endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01050", "node_type": "gene", "transcript": "AT1G01050.2", "gene_symbol": "PPa1", "gene_full_name": "pyrophosphorylase 1", "gene_model_type": "protein_coding", "tair_computational_description": "pyrophosphorylase 1;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "inorganic diphosphatase activity", "go_terms": ["GO:0004427"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01060", "node_type": "gene", "transcript": "AT1G01060.8", "gene_symbol": "LHY1", "gene_full_name": "LATE ELONGATED HYPOCOTYL 1", "gene_model_type": "protein_coding", "tair_computational_description": "Homeodomain-like superfamily protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding", "go_terms": ["GO:0003700", "GO:0003677", "GO:0044212"], "mapman_bin": "27.1.1", "mapman_name": ".Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1)", "mapman_description": "circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01070", "node_type": "gene", "transcript": "AT1G01070.2", "gene_symbol": "UMAMIT28", "gene_full_name": "Usually multiple acids move in and out Transporters 28", "gene_model_type": "protein_coding", "tair_computational_description": "nodulin MtN21 /EamA-like transporter family protein;(source:Araport11)", "tair_curator_summary": "Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.", "tair_short_description": "nodulin MtN21 /EamA-like transporter family protein", "go_description": "L-glutamine transmembrane transporter activity", "go_terms": ["GO:0015186"], "mapman_bin": "24.2.1.5", "mapman_name": ".Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT)", "mapman_description": "solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01080", "node_type": "gene", "transcript": "AT1G01080.3", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "", "go_description": "RNA binding, mRNA binding", "go_terms": ["GO:0003723", "GO:0003729"], "mapman_bin": "35.1", "mapman_name": "not assigned.annotated", "mapman_description": "(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01090", "node_type": "gene", "transcript": "AT1G01090.1", "gene_symbol": "PDH-E1 ALPHA", "gene_full_name": "pyruvate dehydrogenase E1 alpha", "gene_model_type": "protein_coding", "tair_computational_description": "pyruvate dehydrogenase E1 alpha;(source:Araport11)", "tair_curator_summary": "pyruvate dehydrogenase E1 alpha subunit", "tair_short_description": "pyruvate dehydrogenase E1 alpha", "go_description": "pyruvate dehydrogenase (acetyl-transferring) activity, protein binding", "go_terms": ["GO:0004739", "GO:0005515"], "mapman_bin": "5.1.2.2.1.1", "mapman_name": ".Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha", "mapman_description": "subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "AT1G01100", "node_type": "gene", "transcript": "AT1G01100.4", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "protein_coding", "tair_computational_description": "60S acidic ribosomal protein family;(source:Araport11)", "tair_curator_summary": "", "tair_short_description": "60S acidic ribosomal protein family", "go_description": "structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity", "go_terms": ["GO:0003735", "GO:0043021", "GO:0030295"], "mapman_bin": "17.1.2.1.46", "mapman_name": ".Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1", "mapman_description": "component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])", "pheno_aragwas_id": "", "pheno_description": "", "pheno_pto_name": "", "pheno_pto_description": "", "pheno_ref": "", "user_notes": ""},
+      {"_key": "Na23", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:5", "pheno_description": "Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "sodium concentration", "pheno_pto_description": "The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
+      {"_key": "SDV", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:104", "pheno_description": "Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200", "pheno_pto_name": "days to flowering trait", "pheno_pto_description": "A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""}
     ]
   },
   "fetch_all": {
diff --git a/spec/test/djornl/test_data/I2_named.tsv b/spec/test/djornl/test_data/I2_named.tsv
new file mode 100644
index 00000000..3f129851
--- /dev/null
+++ b/spec/test/djornl/test_data/I2_named.tsv
@@ -0,0 +1,9 @@
+cluster	ids
+# data_type: cluster
+# cluster_prefix: markov_i2
+# title: Markov clustering, inflation = 2
+Cluster1	AT1G01010,AT1G01030,AT1G01040
+Cluster2	AT1G01050,AT1G01060,AT1G01070
+Cluster3	AT1G01080,AT1G01090
+Cluster4
+Cluster5	AT1G01020
diff --git a/spec/test/djornl/test_data/I4_named.tsv b/spec/test/djornl/test_data/I4_named.tsv
new file mode 100644
index 00000000..39e46deb
--- /dev/null
+++ b/spec/test/djornl/test_data/I4_named.tsv
@@ -0,0 +1,4 @@
+cluster	ids
+# cluster_prefix: markov_i4
+# title: Markov clustering, inflation = 4
+# data_type: cluster
diff --git a/spec/test/djornl/test_data/I6_named.tsv b/spec/test/djornl/test_data/I6_named.tsv
new file mode 100644
index 00000000..d504f6b4
--- /dev/null
+++ b/spec/test/djornl/test_data/I6_named.tsv
@@ -0,0 +1,8 @@
+cluster	ids
+# data_type: cluster
+# cluster_prefix: markov_i6
+# title: Markov clustering, inflation = 6
+Cluster1	AT1G01040,AT1G01090
+Cluster2	AT1G01070
+Cluster3	AT1G01010,AT1G01020,AT1G01030
+Cluster4
diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
deleted file mode 100644
index 086a9209..00000000
--- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-Cluster1	AT1G01010	AT1G01030	AT1G01040
-Cluster2	AT1G01050	AT1G01060	AT1G01070
-Cluster3	AT1G01080	AT1G01090
-Cluster4
-Cluster5	AT1G01020
diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
deleted file mode 100644
index 8b137891..00000000
--- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
deleted file mode 100644
index 389cae2e..00000000
--- a/spec/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+++ /dev/null
@@ -1,4 +0,0 @@
-Cluster1	AT1G01040	AT1G01090
-Cluster2	AT1G01070
-Cluster3	AT1G01010	AT1G01020	AT1G01030
-Cluster4
diff --git a/spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/test_data/edges.tsv
similarity index 80%
rename from spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv
rename to spec/test/djornl/test_data/edges.tsv
index 44acc6ff..ee443140 100644
--- a/spec/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv
+++ b/spec/test/djornl/test_data/edges.tsv
@@ -2,8 +2,6 @@ node1	node2	edge	edge_descrip	layer_descrip
 As2	AT1G01020	8.4	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
 As2	AT1G01040	5.4	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
 As75	AT1G01020	39.9	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
-AT1G01010	AT1G01020	2.3	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
-AT1G01010	AT1G01030	2.4	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
 AT1G01010	AT1G01040	2.5	AraNetv2_log-likelihood-score	AraNetv2-DC_domain-co-occurrence
 AT1G01010	AT1G01040	170.5	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
 AT1G01030	AT1G01050	2.6	AraNetv2_log-likelihood-score	AraNetv2-CX_pairwise-gene-coexpression
diff --git a/spec/test/djornl/test_data/extra_node.tsv b/spec/test/djornl/test_data/extra_node.tsv
new file mode 100644
index 00000000..664425c5
--- /dev/null
+++ b/spec/test/djornl/test_data/extra_node.tsv
@@ -0,0 +1,3 @@
+# data_type: node
+node_id	node_type	transcript	gene_symbol	gene_full_name	gene_model_type	TAIR_Computational_description	TAIR_Curator_summary	TAIR_short_description	GO_descr	GO_terms	MapMan_bin	MapMan_name	MapMan_descr	pheno_AraGWAS_ID	pheno_descrip1	pheno_descrip2	pheno_descrip3	pheno_ref	User_Notes
+AT1G01100	gene	AT1G01100.4			protein_coding	60S acidic ribosomal protein family;(source:Araport11)		60S acidic ribosomal protein family	structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity	GO:0003735, GO:0043021, GO:0030295	17.1.2.1.46	.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1	component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])						
diff --git a/spec/test/djornl/test_data/hithruput-edges.csv b/spec/test/djornl/test_data/hithruput-edges.csv
new file mode 100644
index 00000000..586476ab
--- /dev/null
+++ b/spec/test/djornl/test_data/hithruput-edges.csv
@@ -0,0 +1,3 @@
+node1,node2,edge,edge_descrip,layer_descrip
+AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi
+AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi
diff --git a/spec/test/djornl/test_data/manifest.yaml b/spec/test/djornl/test_data/manifest.yaml
new file mode 100644
index 00000000..1762a86d
--- /dev/null
+++ b/spec/test/djornl/test_data/manifest.yaml
@@ -0,0 +1,35 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+home_url: "https://github.com/kbase/exascale_data"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+    date: "2020-12-25"
+
+  - data_type: edge
+    path: hithruput-edges.csv
+    date: "2020-12-25"
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: I2_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: I4_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: I6_named.tsv
+
+  - data_type: node
+    path: pheno_nodes.csv
+    date: "2019-01-01"
+
+  - data_type: node
+    path: extra_node.tsv
+    date: "2019-01-01"
diff --git a/spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/test_data/nodes.csv
similarity index 72%
rename from spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
rename to spec/test/djornl/test_data/nodes.csv
index 5bc0e1d8..2245bd59 100644
--- a/spec/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/test_data/nodes.csv
@@ -1,6 +1,5 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
-As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
-As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
+# data_type: node
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
 AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,,
 AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,,
 AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3  transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,,
@@ -10,6 +9,3 @@ AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeod
 AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,,
 AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,,
 AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,,
-AT1G01100,gene,AT1G01100.4,,,protein_coding,60S acidic ribosomal protein family;(source:Araport11),,60S acidic ribosomal protein family,"structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity","GO:0003735, GO:0043021, GO:0030295",17.1.2.1.46,.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1,component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9]),,,,,,
-Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010",
-SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010",
diff --git a/spec/test/djornl/test_data/pheno_nodes.csv b/spec/test/djornl/test_data/pheno_nodes.csv
new file mode 100644
index 00000000..d8bb15a9
--- /dev/null
+++ b/spec/test/djornl/test_data/pheno_nodes.csv
@@ -0,0 +1,5 @@
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
+As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
+As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
+Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010",
+SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010",
diff --git a/spec/test/djornl/valid_manifest/no_file_ext.yaml b/spec/test/djornl/valid_manifest/no_file_ext.yaml
new file mode 100644
index 00000000..3120e553
--- /dev/null
+++ b/spec/test/djornl/valid_manifest/no_file_ext.yaml
@@ -0,0 +1,40 @@
+# nodes does not have a file extension, so file format must be used
+name: Dan Jacobson Exascale data
+release_date: "2020-08-06"
+description: Preliminary Jacobson dataset
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+    file_format: tsv
+    description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer.
+    date_created: "2020-06-08"
+
+  - data_type: node
+    path: nodes
+    file_format: csv
+    description: Merged AraNet AraGWAS gene and phenotype data
+    date_created: "2019-09-13"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 2
+    description: Iterative random forest Markov clustering, inflation set to 2
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 4
+    description: Iterative random forest Markov clustering, inflation set to 4
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 6
+    description: Iterative random forest Markov clustering, inflation set to 6
+    date_created: "2019-08-19"
diff --git a/spec/test/djornl/valid_manifest/no_file_format.yaml b/spec/test/djornl/valid_manifest/no_file_format.yaml
new file mode 100644
index 00000000..d3010deb
--- /dev/null
+++ b/spec/test/djornl/valid_manifest/no_file_format.yaml
@@ -0,0 +1,39 @@
+# node file has no format specified, so the path must be parsed
+name: Dan Jacobson Exascale data
+release_date: "2020-08-06"
+description: Preliminary Jacobson dataset
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+    file_format: tsv
+    description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer.
+    date_created: "2020-06-08"
+
+  - data_type: node
+    path: nodes.csv
+    description: Merged AraNet AraGWAS gene and phenotype data
+    date_created: "2019-09-13"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 2
+    description: Iterative random forest Markov clustering, inflation set to 2
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 4
+    description: Iterative random forest Markov clustering, inflation set to 4
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 6
+    description: Iterative random forest Markov clustering, inflation set to 6
+    date_created: "2019-08-19"
diff --git a/spec/test/djornl/valid_manifest/with_descriptions.yaml b/spec/test/djornl/valid_manifest/with_descriptions.yaml
new file mode 100644
index 00000000..6c39a234
--- /dev/null
+++ b/spec/test/djornl/valid_manifest/with_descriptions.yaml
@@ -0,0 +1,39 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-08-06"
+description: Preliminary Jacobson dataset
+file_list:
+  - data_type: edge
+    path: merged_edges-AMW-060820_AF.tsv
+    file_format: tsv
+    description: Merged edge data. An FDR filter was applied to the AraGWAS phenotype-GWAS layer, removing some of the edges. The Aranetv2 coexpression layer network contains the top (highest log-likelihood scores) 15% coexpression edges to compensate for the decreased network size of the phenotype-GWAS layer.
+    date_created: "2020-06-08"
+
+  - data_type: node
+    path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+    file_format: csv
+    description: Merged AraNet AraGWAS gene and phenotype data
+    date_created: "2019-09-13"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 2
+    description: Iterative random forest Markov clustering, inflation set to 2
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 4
+    description: Iterative random forest Markov clustering, inflation set to 4
+    date_created: "2019-08-19"
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+    file_format: tsv
+    title: Markov clustering, inflation 6
+    description: Iterative random forest Markov clustering, inflation set to 6
+    date_created: "2019-08-19"
diff --git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py
index 8e492fbe..cde4d0c2 100644
--- a/spec/test/stored_queries/test_djornl.py
+++ b/spec/test/stored_queries/test_djornl.py
@@ -40,8 +40,8 @@ def setUpClass(cls):
         root_path = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         with modified_environ(RES_ROOT_DATA_PATH=root_path):
             parser = DJORNL_Parser()
-            node_name = parser.config('_NODE_NAME')
-            edge_name = parser.config('_EDGE_NAME')
+            node_name = parser.config('node_name')
+            edge_name = parser.config('edge_name')
 
             edge_data = parser.load_edges()
             r = create_test_docs(node_name, edge_data['nodes'])
diff --git a/spec/test/test_manifest_schema.py b/spec/test/test_manifest_schema.py
new file mode 100644
index 00000000..4237624d
--- /dev/null
+++ b/spec/test/test_manifest_schema.py
@@ -0,0 +1,82 @@
+"""
+Tests for manifest.schema.json
+
+Ensure that the manifest schema correctly validates data
+"""
+import unittest
+import os.path as os_path
+from relation_engine_server.utils.json_validation import run_validator
+from jsonschema.exceptions import ValidationError
+
+schema_file = os_path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json')
+_TEST_DIR = os_path.join('/app', 'spec', 'test', 'djornl')
+
+
+class Test_Manifest_Schema(unittest.TestCase):
+    """ Validate example manifest files against manifest.schema.json """
+
+    def test_load_invalid_manifest(self):
+        """ each invalid manifest raises a ValidationError matching 'msg'; one subTest per file """
+        invalid_dir = os_path.join(_TEST_DIR, 'invalid_manifest')
+
+        error_list = [
+            {
+                # no file list provided
+                'file': 'no_file_list',
+                'msg': "'file_list' is a required property",
+            },
+            {
+                # a cluster file entry should have a prefix
+                'file': 'cluster_no_prefix',
+                'msg': r"{'data_type': 'cluster', 'path': 'I2_named.tsv'} is not valid under any of the given schemas",
+            },
+            {
+                # each file_list entry has to have a path
+                'file': 'missing_path',
+                'msg': "'path' is a required property",
+            },
+            {
+                # if the date is not quoted, pyyaml will turn it into a date object. Doh!
+                'file': 'date_not_in_quotes',
+                'msg': r"datetime.date\(2020, 12, 25\) is not of type 'string'",
+            },
+            {
+                # file format is invalid
+                'file': 'invalid_format',
+                'msg': r"'txt' is not one of \['tsv', 'csv'\]"
+            },
+            {
+                # there must be an indicator of file format
+                'file': 'no_file_format',
+                'msg': r"{'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'}"
+                        + " is not valid under any of the given schemas",
+            },
+        ]
+
+        for entry in error_list:
+            data_file = os_path.join(invalid_dir, entry['file'] + '.yaml')
+            print('looking at ' + data_file)
+            with self.subTest(data_file=data_file):
+                with self.assertRaisesRegex(ValidationError, entry['msg']):
+                    run_validator(
+                        schema_file=schema_file,
+                        data_file=data_file,
+                        nicer_errors=True
+                    )
+
+    def test_load_valid_manifests(self):
+        """ each valid manifest passes schema validation; one subTest per file """
+        valid_dir = os_path.join(_TEST_DIR, 'valid_manifest')
+        file_list = ['with_descriptions', 'no_file_ext', 'no_file_format']
+
+        for file in file_list:
+            data_file = os_path.join(valid_dir, file + '.yaml')
+            print('looking at ' + data_file)
+            with self.subTest(data_file=data_file):
+                self.assertTrue(
+                    run_validator(
+                        schema_file=schema_file,
+                        data_file=data_file,
+                        nicer_errors=True
+                    )
+                )
diff --git a/spec/views/djornl/djornl_node_view.json b/spec/views/djornl/djornl_node_view.json
index 7330fd3a..2287f615 100644
--- a/spec/views/djornl/djornl_node_view.json
+++ b/spec/views/djornl/djornl_node_view.json
@@ -19,19 +19,19 @@
       ],
       "fields": {
         "transcript": {},
-        "tair_computational_desc": {
+        "tair_computational_description": {
           "analyzers": [
             "text_en"
           ]
         },
-        "tair_short_desc": {
+        "tair_short_description": {
           "analyzers": [
             "text_en"
           ]
         },
         "gene_model_type": {},
         "go_terms": {},
-        "go_desc": {
+        "go_description": {
           "analyzers": [
             "text_en"
           ]
@@ -41,22 +41,22 @@
             "text_en"
           ]
         },
-        "mapman_desc": {
+        "mapman_description": {
           "analyzers": [
             "text_en"
           ]
         },
-        "pheno_desc1": {
+        "pheno_description": {
           "analyzers": [
             "text_en"
           ]
         },
-        "pheno_desc2": {
+        "pheno_pto_name": {
           "analyzers": [
             "text_en"
           ]
         },
-        "pheno_desc3": {
+        "pheno_pto_description": {
           "analyzers": [
             "text_en"
           ]

From b2357f1e580c2b98cfc7242952ed35a62febd66f Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Thu, 20 Aug 2020 16:12:29 -0700
Subject: [PATCH 2/4] Update changelog to use Keep a Changelog format

---
 CHANGELOG.md | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad4fa3fc..ad51ca09 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,23 +1,33 @@
-## 0.0.6
+# Changelog for kbase/relation_engine
 
-### `relation_engine_server`
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.0.6] - 2020-08-20
+
+### Added
 
 - `relation_engine_server/api_versions/api_v1.py`: add and/or document API endpoints:
-  - /api/v1/specs/stored_queries
-  - /api/v1/specs/data_sources
+  - `/api/v1/specs/stored_queries`
+  - `/api/v1/specs/data_sources`
+- `spec/datasets`: new directory for dataset-specific schemas, e.g. DJORNL parser manifest, `spec/datasets/djornl/manifest.schema.json`
+- `spec/test/test_manifest_schema.py`: to test the manifest schema against example input
+- `importers/djornl/parser.py`: use manifest file to specify the files to be parsed and loaded into ArangoDB
 
-- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file
+### Changed
 
-### `importers`
+- `relation_engine_server/utils/spec_loader.py`: refactor to return a schema or the path to a schema file
+- `importers/djornl/parser.py`: refactor parsing code to be more flexible and parse multiple files
+- `spec/collections/djornl/*`, `spec/stored_queries/djornl/*`, `spec/views/djornl/*`, and `spec/test/djornl`: rename DB fields and headers in test files
 
-- `importers/djornl`: use manifest file to specify the list of files to be parsed to create a dataset. Refactor file parsing for more flexibility.
+### Removed
 
-### `spec`
+- `spec/test/djornl`: delete unneeded test files
 
-- add `datasets` folder for dataset-specific schemas
 
-----
 
-## 0.0.5
+## [0.0.5]
 
 Last release with RE components in two repositories, https://github.com/kbase/relation_engine_api and https://github.com/kbase/relation_engine_spec

From 541dd4e08b953ea83022f6380401c38425ee850c Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Wed, 26 Aug 2020 17:01:41 -0700
Subject: [PATCH 3/4] Refactor parsing method to be more generic Refactor spec
 files to use a definitions file Add tests for duplicated data

---
 importers/djornl/parser.py                    | 345 +++++++++++-------
 importers/test/test_djornl_parser.py          | 106 +++++-
 relation_engine_server/utils/bulk_import.py   |   6 +-
 spec/collections/djornl/djornl_edge.yaml      |  36 +-
 spec/collections/djornl/djornl_node.yaml      |   5 +-
 spec/datasets/djornl/csv_cluster.yaml         |  15 +
 spec/datasets/djornl/csv_edge.yaml            |  23 ++
 spec/datasets/djornl/csv_node.yaml            |  52 +++
 spec/datasets/djornl/definitions.yaml         | 129 +++++++
 spec/datasets/djornl/edge_type.yaml           |  25 ++
 spec/datasets/djornl/node_type.yaml           |  10 +
 ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv |   2 +-
 spec/test/djornl/duplicate_data/I2_named.tsv  |   9 +
 spec/test/djornl/duplicate_data/I4_named.tsv  |   8 +
 spec/test/djornl/duplicate_data/I6_copy.csv   |   8 +
 spec/test/djornl/duplicate_data/I6_named.tsv  |   8 +
 spec/test/djornl/duplicate_data/edges.tsv     |  11 +
 .../test/djornl/duplicate_data/extra_node.tsv |   5 +
 .../djornl/duplicate_data/hithruput-edges.csv |   9 +
 spec/test/djornl/duplicate_data/manifest.yaml |  39 ++
 spec/test/djornl/duplicate_data/nodes.csv     |  13 +
 .../djornl/duplicate_data/pheno_nodes.csv     |   5 +
 ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv |   2 +-
 .../empty_files/cluster_data/comment_only.tsv |   1 +
 .../empty_files/cluster_data/headers_only.tsv |   4 +
 ...082919.abc.I2_named.tsv => no_content.tsv} |   0
 ...p10percent_anno_AF_082919.abc.I4_named.tsv |   1 -
 ...p10percent_anno_AF_082919.abc.I6_named.tsv |   1 -
 spec/test/djornl/empty_files/manifest.yaml    |   6 +-
 .../merged_edges-AMW-060820_AF.tsv            |   5 +-
 spec/test/djornl/invalid_types/edges.tsv      |  10 +
 spec/test/djornl/invalid_types/manifest.yaml  |   8 +-
 .../djornl/invalid_types/markov2_named.tsv    |   9 +
 .../merged_edges-AMW-060820_AF.tsv            |  10 -
 ...-AMW-v2_091319_nodeTable.csv => nodes.csv} |   2 +-
 spec/test/djornl/results.json                 | 218 +++++------
 spec/test/djornl/test_data/I2_named.tsv       |   5 +-
 spec/test/djornl/test_data/I4_named.tsv       |   3 +-
 spec/test/djornl/test_data/I6_named.tsv       |   4 +-
 spec/test/djornl/test_data/extra_node.tsv     |   4 +-
 spec/test/djornl/test_data/nodes.csv          |   2 +-
 spec/test/djornl/test_data/pheno_nodes.csv    |  10 +-
 42 files changed, 856 insertions(+), 318 deletions(-)
 create mode 100644 spec/datasets/djornl/csv_cluster.yaml
 create mode 100644 spec/datasets/djornl/csv_edge.yaml
 create mode 100644 spec/datasets/djornl/csv_node.yaml
 create mode 100644 spec/datasets/djornl/definitions.yaml
 create mode 100644 spec/datasets/djornl/edge_type.yaml
 create mode 100644 spec/datasets/djornl/node_type.yaml
 create mode 100644 spec/test/djornl/duplicate_data/I2_named.tsv
 create mode 100644 spec/test/djornl/duplicate_data/I4_named.tsv
 create mode 100644 spec/test/djornl/duplicate_data/I6_copy.csv
 create mode 100644 spec/test/djornl/duplicate_data/I6_named.tsv
 create mode 100644 spec/test/djornl/duplicate_data/edges.tsv
 create mode 100644 spec/test/djornl/duplicate_data/extra_node.tsv
 create mode 100644 spec/test/djornl/duplicate_data/hithruput-edges.csv
 create mode 100644 spec/test/djornl/duplicate_data/manifest.yaml
 create mode 100644 spec/test/djornl/duplicate_data/nodes.csv
 create mode 100644 spec/test/djornl/duplicate_data/pheno_nodes.csv
 create mode 100644 spec/test/djornl/empty_files/cluster_data/comment_only.tsv
 create mode 100644 spec/test/djornl/empty_files/cluster_data/headers_only.tsv
 rename spec/test/djornl/empty_files/cluster_data/{out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv => no_content.tsv} (100%)
 delete mode 100644 spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
 delete mode 100644 spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
 create mode 100644 spec/test/djornl/invalid_types/edges.tsv
 create mode 100644 spec/test/djornl/invalid_types/markov2_named.tsv
 delete mode 100644 spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
 rename spec/test/djornl/invalid_types/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (98%)

diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index 928b55fe..990c745a 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -11,7 +11,7 @@
 import yaml
 
 import importers.utils.config as config
-from relation_engine_server.utils.json_validation import run_validator
+from relation_engine_server.utils.json_validation import run_validator, get_schema_validator
 
 
 class DJORNL_Parser(object):
@@ -62,7 +62,15 @@ def _configure(self):
 
     def _get_manifest_schema_file(self):
 
-        return os.path.join('/app', 'spec', 'datasets', 'djornl', 'manifest.schema.json')
+        return os.path.join(self._get_dataset_schema_dir(), 'manifest.schema.json')
+
+    def _get_dataset_schema_dir(self):
+
+        if not hasattr(self, '_dataset_schema_dir'):
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            self._dataset_schema_dir = os.path.join(dir_path, '../', '../', 'spec', 'datasets', 'djornl')
+
+        return self._dataset_schema_dir
 
     def _get_manifest(self, configuration):
         """
@@ -81,8 +89,8 @@ def _get_manifest(self, configuration):
         except FileNotFoundError:
             raise RuntimeError(
                 f"No manifest file found at {manifest_file}.\n"
-                + "Please ensure that you have created a manifest that lists the files "
-                + "in the release"
+                "Please ensure that you have created a manifest that lists the files "
+                "in the release"
             )
 
         try:
@@ -109,205 +117,282 @@ def _get_file_reader(self, fd, file):
     def parser_gen(self, file):
         """generator function to parse a file"""
         expected_col_count = 0
-        with open(file['file_path']) as fd:
+        with open(file['file_path'], newline='') as fd:
             csv_reader = self._get_file_reader(fd, file)
             line_no = 0
             for row in csv_reader:
                 line_no += 1
-                if len(row) <= 1 or row[0][0] == '#':
+                if not len(row) or row[0][0] == '#':
                     # comment / metadata
                     continue
 
                 cols = [c.strip() for c in row]
 
                 if len(cols) == expected_col_count:
-                    yield (line_no, cols)
+                    yield (line_no, cols, None)
                     continue
 
                 # if we didn't get the expected number of cols:
                 if expected_col_count == 0:
                     # this is the header row; set up the expected column count
                     expected_col_count = len(cols)
-                    yield (line_no, [c.lower() for c in cols])
+                    yield (line_no, [c.lower() for c in cols], None)
                     continue
 
                 # otherwise, this row does not have the correct number of columns
-                n_cols = len(cols)
-                raise RuntimeError(
-                    f"{file['path']} line {line_no}: "
-                    + f"expected {expected_col_count} cols, found {n_cols}"
-                )
+                col_count = len(cols)
+                msg = f"expected {expected_col_count} cols, found {col_count}"
+                yield(line_no, None, f"{file['path']} line {line_no}: {msg}")
+
+    def remap_object(self, raw_data, remap_functions):
+        """ Given a dict, raw_data, create a new dict, remapped_data, using the functions in the
+        dictionary `remap_functions`. """
+        remapped_data = {}
+        for (key, function) in remap_functions.items():
+            # these keys get copied over unchanged to the new object if they exist in the input obj
+            if function is None:
+                if key in raw_data:
+                    remapped_data[key] = raw_data[key]
+            else:
+                remapped_data[key] = function(raw_data)
+
+        return remapped_data
+
+    def process_file(self, file, remap_fn, store_fn, err_list, validator=None):
+        """ process an input file to generate a dataset and possibly an error list
+
+        Each valid line in the file is turned into a dictionary using the header row, and then
+        validated against the csv validation schema in spec/datasets/djornl/csv_<file_type>.yaml.
+        If that completes successfully, it is transformed using the functions in the dictionary
+        `remap_fn`, checked for uniqueness against existing data, and saved to a dictionary. Once
+        all files of a certain type have been processed, results can be saved to Arango.
+
+        Any errors that occur during parsing and processing are accumulated in `err_list`.
+
+        :param file: (dict)             file data
+        :param remap_fn: (dict)         mapping of output param names to functions
+                                        each function should take the row data object as input and
+                                        return the value for the output parameter
+
+        :param store_fn: (func)         function to store the results of the remapping
+
+        :param err_list: (list)         error list
+
+        :param validator: (Validator)   jsonschema validator object
+
+        """
+        file_parser = self.parser_gen(file)
+        try:
+            (line_no, cols, err_str) = next(file_parser)
+        except StopIteration:
+            # no valid lines found in the file
+            err_list.append(f"{file['path']}: no header line found")
+            return
+
+        headers = cols
+        n_stored = 0
+        for (line_no, cols, err_str) in file_parser:
+            # mismatch in number of cols
+            if cols is None:
+                err_list.append(err_str)
+                continue
+
+            # merge headers with cols to create an object
+            row_object = dict(zip(headers, cols))
+
+            if validator is not None:
+                # validate the object
+                if not validator.is_valid(row_object):
+                    err_msg = "".join(
+                        f"{file['path']} line {line_no}: " + e.message
+                        for e in sorted(validator.iter_errors(row_object), key=str)
+                    )
+                    err_list.append(err_msg)
+                    continue
+
+            # transform it using the remap_functions
+            datum = self.remap_object(row_object, remap_fn)
+
+            # and store it
+            storage_error = store_fn(datum)
+            if storage_error is None:
+                n_stored += 1
+            else:
+                err_list.append(f"{file['path']} line {line_no}: " + storage_error)
+
+        if not n_stored:
+            err_list.append(f"{file['path']}: no valid data found")
 
     def load_edges(self):
-        # Headers and sample row:
-        # node1	node2	edge	edge_descrip	layer_descrip
-        # AT1G01370	AT1G57820	4.40001558779779	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
-        edge_type_remap = {
-            'AraGWAS-Phenotype_Associations': 'pheno_assn',
-            'AraNetv2-CX_pairwise-gene-coexpression': 'gene_coexpr',
-            'AraNetv2-DC_domain-co-occurrence': 'domain_co_occur',
-            'AraNetv2-HT_high-throughput-ppi': 'ppi_hithru',
-            'AraNetv2-LC_lit-curated-ppi': 'ppi_liter',
-        }
+        """Load edge data from the set of edge files"""
 
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
-        edges = []
-        node_name = self.config('node_name')
+        # dict of edges, indexed by node1__node2__edge_type
+        edge_ix = {}
+        # error accumulator
+        err_list = []
 
-        def edge_type(row):
-            if row['layer_descrip'] not in edge_type_remap:
-                raise RuntimeError(
-                    f"{file['path']} line {line_no}: invalid edge type: {row['layer_descrip']}"
-                )
-            return edge_type_remap[row['layer_descrip']]
-
-        def _key(row):
-            return '__'.join([
-                row['node1'],
-                row['node2'],
-                edge_type(row),
-                row['edge'],
-            ])
+        schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_edge.yaml')
+        validator = get_schema_validator(schema_file=schema_file)
 
+        node_name = self.config('node_name')
         # these functions remap the values in the columns of the input file to
         # appropriate values to go into Arango
         remap_functions = {
+            # create a unique key for each record
+            '_key': lambda row: '__'.join([row[_] for _ in ['node1', 'node2', 'layer_descrip', 'edge']]),
+            'node1': None,  # this will be deleted in the 'store' step
+            'node2': None,  # as will this
             '_from': lambda row: node_name + '/' + row['node1'],
             '_to': lambda row: node_name + '/' + row['node2'],
             'score': lambda row: float(row['edge']),
-            'edge_type': edge_type,
-            '_key': _key,
+            'edge_type': lambda row: row['layer_descrip'],
         }
 
-        for file in self.config('edge_files'):
-            file_parser = self.parser_gen(file)
-            headers = []
+        # store edge data, checking for potential duplicates
+        def store_edges(datum):
+            # there should only be one value for each node<->node edge of a given type
+            edge_key = "__".join([datum['node1'], datum['node2'], datum['edge_type']])
 
-            while True:
-                try:
-                    (line_no, cols) = next(file_parser)
-                except StopIteration:
-                    break
+            if edge_key in edge_ix:
+                # ignore duplicate lines; report non-matching data
+                if datum['score'] != edge_ix[edge_key]['score']:
+                    return f"duplicate data for edge {edge_key}"
+                return None
 
-                if len(headers) == 0:
-                    headers = cols
-                    continue
+            # keep track of the nodes mentioned in this edge set
+            for node_n in ["1", "2"]:
+                node_ix[datum[f"node{node_n}"]] = 1
+                del datum[f"node{node_n}"]
 
-                # merge headers with cols to create an object
-                row_object = dict(zip(headers, cols))
-                # transform it using the remap_functions
-                datum = {key: func(row_object) for (key, func) in remap_functions.items()}
-                edges.append(datum)
+            edge_ix[edge_key] = datum
+            return None
 
-                # keep track of the nodes mentioned in this edge set
-                for node in ["1", "2"]:
-                    node_ix[row_object[f"node{node}"]] = 1
+        for file in self.config('edge_files'):
+            self.process_file(
+                file=file,
+                remap_fn=remap_functions,
+                store_fn=store_edges,
+                err_list=err_list,
+                validator=validator,
+            )
+
+        if len(err_list):
+            raise RuntimeError('\n'.join(err_list))
 
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
-            'edges': edges,
+            'edges': edge_ix.values(),
         }
 
     def load_node_metadata(self):
         """Load node metadata"""
 
-        nodes = []
-        valid_node_types = ['gene', 'pheno']
+        node_ix = {}
+        err_list = []
+
+        schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_node.yaml')
+        validator = get_schema_validator(schema_file=schema_file)
 
         def go_terms(row):
-            if len(row['go_terms']):
-                return [c.strip() for c in row_object['go_terms'].split(',')]
+            if 'go_terms' in row and len(row['go_terms']):
+                return [c.strip() for c in row['go_terms'].split(',')]
             return []
 
-        def node_type(row):
-            if row['node_type'] not in valid_node_types:
-                raise RuntimeError(
-                    f"{file['path']} line {line_no}: invalid node type: {row['node_type']}"
-                )
-            return row['node_type']
-
         remap_functions = {
             # these pass straight through
-            'transcript': None,
-            'gene_symbol': None,
             'gene_full_name': None,
             'gene_model_type': None,
-            'tair_computational_description': None,
-            'tair_short_description': None,
-            'tair_curator_summary': None,
+            'gene_symbol': None,
+            'go_description': None,
             'mapman_bin': None,
+            'mapman_description': None,
             'mapman_name': None,
+            'node_type': None,
             'pheno_aragwas_id': None,
+            'pheno_description': None,
+            'pheno_pto_description': None,
+            'pheno_pto_name': None,
             'pheno_ref': None,
+            'tair_computational_description': None,
+            'tair_curator_summary': None,
+            'tair_short_description': None,
+            'transcript': None,
             'user_notes': None,
             # rename
             '_key': lambda row: row['node_id'],
-            'go_description': lambda row: row['go_descr'],
-            'mapman_description': lambda row: row['mapman_descr'],
-            'pheno_description': lambda row: row['pheno_descrip1'],
-            'pheno_pto_name': lambda row: row['pheno_descrip2'],
-            'pheno_pto_description': lambda row: row['pheno_descrip3'],
             # see functions above
-            'node_type': node_type,
             'go_terms': go_terms,
         }
 
-        for file in self.config('node_files'):
-            file_parser = self.parser_gen(file)
-            headers = []
-
-            while True:
-                try:
-                    (line_no, cols) = next(file_parser)
-                except StopIteration:
-                    break
-
-                if len(headers) == 0:
-                    headers = cols
-                    continue
+        # store nodes in a dict indexed by _key
+        def store_nodes(datum):
+            # check whether we have this node already
+            if datum['_key'] in node_ix:
+                # report non-matching data
+                if datum != node_ix[datum['_key']]:
+                    return f"duplicate data for node {datum['_key']}"
+                # otherwise, it's a duplicated line: ignore
+                return None
 
-                # merge with headers to form an object, then remap to create Arango-ready data
-                row_object = dict(zip(headers, cols))
+            node_ix[datum['_key']] = datum
+            return None
 
-                datum = {}
-                for (key, func) in remap_functions.items():
-                    if func is None:
-                        datum[key] = row_object[key]
-                    else:
-                        datum[key] = func(row_object)
-                nodes.append(datum)
+        for file in self.config('node_files'):
+            self.process_file(
+                file=file,
+                remap_fn=remap_functions,
+                store_fn=store_nodes,
+                err_list=err_list,
+                validator=validator,
+            )
 
-        return {'nodes': nodes}
+        if len(err_list):
+            raise RuntimeError('\n'.join(err_list))
+        return {'nodes': node_ix.values()}
 
     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
 
         # index of nodes
         node_ix = {}
+        err_list = []
+
+        schema_file = os.path.join(self._get_dataset_schema_dir(), 'csv_cluster.yaml')
+        validator = get_schema_validator(schema_file=schema_file)
+
+        # these functions remap the values in the columns of the input file to
+        # appropriate values to go into Arango
+        remap_functions = {
+            'node_ids': lambda row: [n.strip() for n in row['node_ids'].split(',')]
+        }
+
+        # store clusters in a dictionary with key node_id and value list of cluster IDs to which
+        # the node is assigned
+        def store_clusters(datum):
+            cluster_id = datum['cluster_id']
+            for node_id in datum['node_ids']:
+                if node_id not in node_ix:
+                    node_ix[node_id] = [cluster_id]
+                elif cluster_id not in node_ix[node_id]:
+                    node_ix[node_id].append(cluster_id)
+            return None
+
         for file in self.config('cluster_files'):
-            cluster_label = file['cluster_prefix']
-            headers = []
-            file_parser = self.parser_gen(file)
-
-            while True:
-                try:
-                    (line_no, cols) = next(file_parser)
-                except StopIteration:
-                    break
-
-                if len(headers) == 0:
-                    headers = cols
-                    continue
+            prefix = file['cluster_prefix']
+            remap_functions['cluster_id'] = lambda row: prefix + ':' + row['cluster_id'].replace('Cluster', '')
+
+            self.process_file(
+                file=file,
+                remap_fn=remap_functions,
+                store_fn=store_clusters,
+                err_list=err_list,
+                validator=validator,
+            )
 
-                # remove the 'Cluster' text and replace it with cluster_label
-                cluster_id = cluster_label + ':' + cols[0].replace('Cluster', '')
-                node_keys = [n.strip() for n in cols[1].split(',')]
-                for key in node_keys:
-                    if key not in node_ix:
-                        node_ix[key] = [cluster_id]
-                    elif cluster_id not in node_ix[key]:
-                        node_ix[key].append(cluster_id)
+        if len(err_list):
+            raise RuntimeError('\n'.join(err_list))
 
         # gather a list of cluster IDs for each node
         nodes = [{
@@ -345,6 +430,7 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+        return True
 
     def check_data_delta(self):
         edge_data = self.load_edges()
@@ -374,3 +460,8 @@ def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
         print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
         # count all nodes
         print("Dataset contains " + str(len(all_nodes)) + " nodes")
+
+
+if __name__ == '__main__':
+    parser = DJORNL_Parser()
+    parser.load_data()
diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py
index e2442c8d..a3c8ecfa 100644
--- a/importers/test/test_djornl_parser.py
+++ b/importers/test/test_djornl_parser.py
@@ -64,9 +64,24 @@ def test_load_empty_files(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
-        self.assertEqual(parser.load_node_metadata(), {"nodes": []})
-        self.assertEqual(parser.load_cluster_data(), {"nodes": []})
+        # header only, no content
+        err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_node_metadata()
+
+        # comments only
+        err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_edges()
+
+        # mix of problems
+        err_str = "\n".join([
+            'cluster_data/headers_only.tsv: no valid data found',
+            'cluster_data/no_content.tsv: no header line found',
+            'cluster_data/comment_only.tsv: no header line found',
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            parser.load_cluster_data()
 
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
@@ -77,23 +92,47 @@ def test_load_missing_files(self):
         with self.assertRaisesRegex(RuntimeError, err_str):
             self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-    def test_load_invalid_types(self):
+    def test_load_invalid_edges(self):
         """ test file format errors """
 
         # path: test/djornl/invalid_types
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
+        # invalid edge type, invalid scores
+        edge_err_msg = "\n".join([
+            r"edges.tsv line 3: 'Same-Old-Stuff' is not valid under any of the given schemas",
+            r"edges.tsv line 7: '2.' does not match .*?",
+            r"edges.tsv line 8: 'raNetv2-DC_' is not valid under any of the given schemas",
+            r"edges.tsv line 10: 'score!' does not match .*?"
+        ])
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
+    def test_load_invalid_nodes(self):
+        """ test file format errors """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         # invalid node type
-        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
+        node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
+    def test_load_invalid_clusters(self):
+        """ test file format errors """
+
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        # invalid cluster ID
+        cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
+        with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
+            parser.load_cluster_data()
+
     def test_load_col_count_errors(self):
         """ test files with invalid numbers of columns """
 
@@ -101,13 +140,13 @@ def test_load_col_count_errors(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # invalid edge type
-        edge_err_msg = 'line 6: expected 5 cols, found 3'
+        # not enough cols
+        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 6: expected 5 cols, found 3'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
-        # invalid node type
-        node_err_msg = 'line 3: expected 20 cols, found 22'
+        # too many cols
+        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
@@ -144,10 +183,53 @@ def test_load_valid_cluster_data(self):
 
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
 
-        parser.check_data_delta()
+    def test_duplicate_edge_data(self):
+        """ test files with duplicate edge data, which should throw an error """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        err_msg = "\n".join([
+            "hithruput-edges.csv line 5: duplicate data for edge AT1G01010__AT1G01030__AraNetv2-HT_.*?",
+            "hithruput-edges.csv line 9: duplicate data for edge AT1G01030__AT1G01050__AraNetv2-CX_.*?"
+        ])
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_edges()
+
+    def test_duplicate_node_data(self):
+        """ test files with duplicate node data, which should throw an error """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            parser.load_node_metadata()
+
+    def test_duplicate_cluster_data(self):
+        """ test files with duplicate cluster data, which should be seamlessly merged """
+
+        # path: test/djornl/duplicate_data
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        cluster_data = parser.load_cluster_data()
+        self.assertEqual(
+            cluster_data,
+            self.json_data["load_cluster_data"]
+        )
+
+    def test_the_full_shebang(self):
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        parser.load_data()
+        self.assertEqual(True, parser.load_data())
diff --git a/relation_engine_server/utils/bulk_import.py b/relation_engine_server/utils/bulk_import.py
index 8e520726..fbd1c6c3 100644
--- a/relation_engine_server/utils/bulk_import.py
+++ b/relation_engine_server/utils/bulk_import.py
@@ -6,7 +6,7 @@
 import hashlib
 
 from relation_engine_server.utils.json_validation import get_schema_validator
-from relation_engine_server.utils import spec_loader
+from relation_engine_server.utils.spec_loader import get_collection
 from relation_engine_server.utils.arango_client import import_from_file
 
 
@@ -16,8 +16,8 @@ def bulk_import(query_params):
     schema, then write them into a temporary file that can be passed into the
     arango client.
     """
-    schema = spec_loader.get_collection(query_params['collection'])
-    validator = get_schema_validator(schema=schema['schema'])
+    schema_file = get_collection(query_params['collection'], path_only=True)
+    validator = get_schema_validator(schema_file=schema_file, validate_at='/schema')
     # We can't use a context manager here
     # We need to close the file to have the file contents readable
     #  and we need to prevent deletion of the temp file on close (default behavior of tempfiles)
diff --git a/spec/collections/djornl/djornl_edge.yaml b/spec/collections/djornl/djornl_edge.yaml
index 1c956fdc..a6f93d51 100644
--- a/spec/collections/djornl/djornl_edge.yaml
+++ b/spec/collections/djornl/djornl_edge.yaml
@@ -14,39 +14,15 @@ schema:
   description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
   type: object
   required: [score, edge_type, _from, _to, _key]
+  additionalProperties: false
   properties:
     _key:
-      type: string
-      title: Key
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_key
     _from:
-      type: string
-      title: Gene ID
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_from
     _to:
-      type: string
-      title: Gene or Phenotype ID
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/_to
     score:
-      title: Edge Score (Weight)
-      # (float)
-      type: number
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/score
     edge_type:
-      title: Edge Type
-      type: string
-      oneOf:
-        - const: domain_co_occur
-          title: AraNetv2-DC_domain-co-occurrence
-          description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: gene_coexpr
-          title: AraNetv2-CX_pairwise-gene-coexpression
-          description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were
-            calculated from Pearson correlation coefficients to normalize the data
-            for comparison across studies and different types of data layers (Lee et
-            al, 2015).
-        - const: pheno_assn
-          title: AraGWAS-Phenotype_Associations
-          description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
-        - const: ppi_hithru
-          title: AraNetv2-HT_high-throughput-ppi
-          description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
-        - const: ppi_liter
-          title: AraNetv2-LC_lit-curated-ppi
-          description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
+      $ref: ../../datasets/djornl/definitions.yaml#definitions/djornl_edge/edge_type
diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml
index e0a744a7..62d17dda 100644
--- a/spec/collections/djornl/djornl_node.yaml
+++ b/spec/collections/djornl/djornl_node.yaml
@@ -3,8 +3,8 @@ type: vertex
 delta: false
 
 indexes:
- - type: hash
-   fields: ["clusters[*]"]
+  - type: hash
+    fields: ["clusters[*]"]
 
 schema:
   "$schema": http://json-schema.org/draft-07/schema#
@@ -12,6 +12,7 @@ schema:
   description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
   type: object
   required: [_key]
+  additionalProperties: false
   properties:
     _key:
       type: string
diff --git a/spec/datasets/djornl/csv_cluster.yaml b/spec/datasets/djornl/csv_cluster.yaml
new file mode 100644
index 00000000..c60f1dfd
--- /dev/null
+++ b/spec/datasets/djornl/csv_cluster.yaml
@@ -0,0 +1,15 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_cluster
+title: Cluster data
+description: Cluster ID to node ID mappings
+type: object
+required: [cluster_id, node_ids]
+additionalProperties: false
+properties:
+  cluster_id:
+    type: string
+    format: regex
+    pattern: "^Cluster\\d+"
+  # node_ids as read from the file, before the parser splits it on commas
+  node_ids:
+    type: string
diff --git a/spec/datasets/djornl/csv_edge.yaml b/spec/datasets/djornl/csv_edge.yaml
new file mode 100644
index 00000000..c2416262
--- /dev/null
+++ b/spec/datasets/djornl/csv_edge.yaml
@@ -0,0 +1,23 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_edge
+title: Arabidopsis gene-gene or gene-phenotype edge
+description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
+type: object
+required: [node1, node2, edge, layer_descrip]
+properties:
+  node1:
+    $ref: definitions.yaml#definitions/djornl_edge/_from
+  node2:
+    $ref: definitions.yaml#definitions/djornl_edge/_to
+  edge:
+    type: string
+    format: regex
+    pattern: "^\\d*(\\.\\d+)?$"
+  layer_descrip:
+    type: string
+    oneOf:
+      - const: AraNetv2-DC_domain-co-occurrence
+      - const: AraNetv2-CX_pairwise-gene-coexpression
+      - const: AraGWAS-Phenotype_Associations
+      - const: AraNetv2-HT_high-throughput-ppi
+      - const: AraNetv2-LC_lit-curated-ppi
diff --git a/spec/datasets/djornl/csv_node.yaml b/spec/datasets/djornl/csv_node.yaml
new file mode 100644
index 00000000..c957ff98
--- /dev/null
+++ b/spec/datasets/djornl/csv_node.yaml
@@ -0,0 +1,52 @@
+"$schema": http://json-schema.org/draft-07/schema#
+name: csv_node
+title: CSV node file syntax
+description: Arabidopsis gene and phenotype nodes from the Dan Jacobson Lab
+type: object
+required: [node_id, node_type]
+additionalProperties: false
+properties:
+  node_id:
+    $ref: definitions.yaml#definitions/djornl_node/_key
+  node_type:
+    $ref: definitions.yaml#definitions/djornl_node/node_type
+  clusters:
+    $ref: definitions.yaml#definitions/djornl_node/clusters
+  transcript:
+    $ref: definitions.yaml#definitions/djornl_node/transcript
+  gene_symbol:
+    $ref: definitions.yaml#definitions/djornl_node/gene_symbol
+  gene_full_name:
+    $ref: definitions.yaml#definitions/djornl_node/gene_full_name
+  gene_model_type:
+    $ref: definitions.yaml#definitions/djornl_node/gene_model_type
+  tair_computational_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_computational_description
+  tair_curator_summary:
+    $ref: definitions.yaml#definitions/djornl_node/tair_curator_summary
+  tair_short_description:
+    $ref: definitions.yaml#definitions/djornl_node/tair_short_description
+  go_terms:
+    type: string
+    format: regex
+    pattern: "^(GO:\\d{7}, ?)*(GO:\\d{7})?$"
+  go_description:
+    $ref: definitions.yaml#definitions/djornl_node/go_description
+  mapman_bin:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_bin
+  mapman_name:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_name
+  mapman_description:
+    $ref: definitions.yaml#definitions/djornl_node/mapman_description
+  pheno_aragwas_id:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_aragwas_id
+  pheno_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_description
+  pheno_pto_name:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_name
+  pheno_pto_description:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_pto_description
+  pheno_ref:
+    $ref: definitions.yaml#definitions/djornl_node/pheno_ref
+  user_notes:
+    $ref: definitions.yaml#definitions/djornl_node/user_notes
diff --git a/spec/datasets/djornl/definitions.yaml b/spec/datasets/djornl/definitions.yaml
new file mode 100644
index 00000000..81ee5e29
--- /dev/null
+++ b/spec/datasets/djornl/definitions.yaml
@@ -0,0 +1,129 @@
+"$schema": "http://json-schema.org/draft-07/schema#"
+name: definitions
+title: DJORNL schema definitions
+description: Node and edge metadata definitions for the Dan Jacobson Exascale dataset
+definitions:
+  cluster_id:
+    type: string
+    format: regex
+    pattern: ^\w+:\d+$
+    examples: ["markov_i2:1", "markov_i4:5", "markov_i6:3"]
+  go_term:
+    type: string
+    format: regex
+    pattern: ^GO:\d{7}$
+    examples: ["GO:0003700", "GO:0005515"]
+  djornl_edge:
+    _key:
+      type: string
+      title: Key
+      format: regex
+      pattern: ^(\S+__){3}(\S+)$
+    _from:
+      type: string
+      title: Gene ID
+    _to:
+      type: string
+      title: Gene or Phenotype ID
+    score:
+      title: Edge Score (Weight)
+      # (float)
+      type: number
+    edge_type:
+      $ref: edge_type.yaml
+  djornl_node:
+    _key:
+      type: string
+      title: Key
+      examples: ["AT1G01010"]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        $ref: "#/definitions/cluster_id"
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
+    node_type:
+      type: string
+      title: Node type
+      oneOf:
+        - const: gene
+          title: Gene
+        - const: pheno
+          title: Phenotype
+      examples: ["gene", "pheno"]
+    transcript:
+      type: string
+      title: Transcript
+      examples: ["AT1G01010.1"]
+    gene_symbol:
+      type: string
+      title: Gene symbol
+      examples: ["NTL10"]
+    gene_full_name:
+      type: string
+      title: Gene full name
+      examples: ["NAC domain containing protein 1"]
+    gene_model_type:
+      type: string
+      title: Gene model type
+      examples: ["protein_coding"]
+    tair_computational_description:
+      type: string
+      title: TAIR computational description
+      examples: ["NAC domain containing protein 1;(source:Araport11)"]
+    tair_curator_summary:
+      type: string
+      title: TAIR curator summary
+      examples: ["Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed."]
+    tair_short_description:
+      type: string
+      title: TAIR short description
+      examples: ["NAC domain containing protein 1"]
+    go_description:
+      type: string
+      title: GO descriptions
+      examples: ["DNA-binding transcription factor activity"]
+    go_terms:
+      type: array
+      title: GO term IDs
+      items:
+        $ref: "#/definitions/go_term"
+    mapman_bin:
+      type: string
+      title: Mapman bin
+      examples: ["15.5.17"]
+    mapman_name:
+      type: string
+      title: Mapman name
+      examples: [".RNA biosynthesis.transcriptional regulation.transcription factor (NAC)"]
+    mapman_description:
+      type: string
+      title: Mapman description
+      examples: ["transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96])"]
+    pheno_aragwas_id:
+      type: string
+      title: AraGWAS ID
+      examples: ["10.21958/phenotype:67"]
+    pheno_description:
+      type: string
+      title: Phenotype description
+      examples: ["Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008"]
+    pheno_pto_name:
+      type: string
+      title: PTO name
+      description: Plant Trait Ontology name
+      examples: ["arsenic concentration"]
+    pheno_pto_description:
+      type: string
+      title: PTO description
+      description: Plant Trait Ontology description
+      examples: ["A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]"]
+    pheno_ref:
+      type: string
+      title: Phenotype reference
+      examples: ["Atwell et. al, Nature 2010"]
+    user_notes:
+      type: string
+      title: User Notes
+      examples: ["flowering time related"]
diff --git a/spec/datasets/djornl/edge_type.yaml b/spec/datasets/djornl/edge_type.yaml
new file mode 100644
index 00000000..3ac6d62a
--- /dev/null
+++ b/spec/datasets/djornl/edge_type.yaml
@@ -0,0 +1,25 @@
+$schema: "http://json-schema.org/draft-07/schema#"
+name: edge_type
+title: Edge Type
+description: Edge types in Dan Jacobson Arabidopsis Exascale dataset
+type: string
+oneOf:
+  - const: AraGWAS-Phenotype_Associations
+    title: AraGWAS phenotype associations
+    description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
+
+  - const: AraNetv2-CX_pairwise-gene-coexpression
+    title: AraNetv2 pairwise gene coexpression
+    description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
+
+  - const: AraNetv2-DC_domain-co-occurrence
+    title: AraNetv2 domain co-occurrence
+    description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
+
+  - const: AraNetv2-HT_high-throughput-ppi
+    title: AraNetv2 high-throughput protein-protein interaction
+    description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
+
+  - const: AraNetv2-LC_lit-curated-ppi
+    title: AraNetv2 literature-curated protein-protein interaction
+    description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
diff --git a/spec/datasets/djornl/node_type.yaml b/spec/datasets/djornl/node_type.yaml
new file mode 100644
index 00000000..e839c2d0
--- /dev/null
+++ b/spec/datasets/djornl/node_type.yaml
@@ -0,0 +1,10 @@
+"$schema": "http://json-schema.org/draft-07/schema#"
+name: node_type
+title: Node Type
+description: Node types in Dan Jacobson Exascale dataset
+type: string
+oneOf:
+  - const: gene
+    title: Gene
+  - const: pheno
+    title: Phenotype
diff --git a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
index 7727cd8a..e989f2ca 100644
--- a/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/col_count_errors/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
@@ -1,4 +1,4 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
 AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,,
diff --git a/spec/test/djornl/duplicate_data/I2_named.tsv b/spec/test/djornl/duplicate_data/I2_named.tsv
new file mode 100644
index 00000000..bef48c38
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/I2_named.tsv
@@ -0,0 +1,9 @@
+cluster_id	node_ids
+# data_type: cluster
+# cluster_prefix: markov_i2
+# title: Markov clustering, inflation = 2
+Cluster1	AT1G01010,AT1G01030,AT1G01040
+Cluster2	AT1G01050,AT1G01060,AT1G01070
+Cluster3	AT1G01090
+# Cluster4
+Cluster5	AT1G01020
diff --git a/spec/test/djornl/duplicate_data/I4_named.tsv b/spec/test/djornl/duplicate_data/I4_named.tsv
new file mode 100644
index 00000000..29b6cd36
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/I4_named.tsv
@@ -0,0 +1,8 @@
+cluster_id	node_ids
+# cluster_prefix: markov_i4
+# title: Markov clustering, inflation = 4
+# data_type: cluster
+# Cluster1
+# Cluster2
+Cluster3	AT1G01080
+# Cluster4
diff --git a/spec/test/djornl/duplicate_data/I6_copy.csv b/spec/test/djornl/duplicate_data/I6_copy.csv
new file mode 100644
index 00000000..a21bd2af
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/I6_copy.csv
@@ -0,0 +1,8 @@
+cluster_id,node_ids
+# data_type: cluster
+# cluster_prefix: markov_i6
+# title: Markov clustering, inflation = 6
+Cluster1,"AT1G01040,AT1G01090"
+Cluster2,AT1G01070
+Cluster3,"AT1G01010,AT1G01020,AT1G01030"
+# Cluster4
diff --git a/spec/test/djornl/duplicate_data/I6_named.tsv b/spec/test/djornl/duplicate_data/I6_named.tsv
new file mode 100644
index 00000000..e7688f17
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/I6_named.tsv
@@ -0,0 +1,8 @@
+cluster_id	node_ids
+# data_type: cluster
+# cluster_prefix: markov_i6
+# title: Markov clustering, inflation = 6
+Cluster1	AT1G01040,AT1G01090
+Cluster2	AT1G01070
+Cluster3	AT1G01010,AT1G01020,AT1G01030
+# Cluster4
diff --git a/spec/test/djornl/duplicate_data/edges.tsv b/spec/test/djornl/duplicate_data/edges.tsv
new file mode 100644
index 00000000..432c2deb
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/edges.tsv
@@ -0,0 +1,11 @@
+node1	node2	edge	edge_descrip	layer_descrip
+As2	AT1G01020	8.4	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
+As2	AT1G01040	5.4	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
+As75	AT1G01020	39.9	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
+AT1G01010	AT1G01040	2.5	AraNetv2_log-likelihood-score	AraNetv2-DC_domain-co-occurrence
+AT1G01010	AT1G01040	170.5	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
+AT1G01030	AT1G01050	2.6	AraNetv2_log-likelihood-score	AraNetv2-CX_pairwise-gene-coexpression
+AT1G01050	AT1G01060	2.7	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
+# duplicated line
+AT1G01010	AT1G01040	2.5	AraNetv2_log-likelihood-score	AraNetv2-DC_domain-co-occurrence
+AT1G01080	AT1G01090	2.8	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
diff --git a/spec/test/djornl/duplicate_data/extra_node.tsv b/spec/test/djornl/duplicate_data/extra_node.tsv
new file mode 100644
index 00000000..a1a28b69
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/extra_node.tsv
@@ -0,0 +1,5 @@
+# data_type: node
+node_id	node_type	transcript	gene_symbol	gene_full_name	gene_model_type	TAIR_Computational_description	TAIR_Curator_summary	TAIR_short_description	GO_description	GO_terms	MapMan_bin	MapMan_name	MapMan_description
+AT1G01100	gene	AT1G01100.4			protein_coding	60S acidic ribosomal protein family;(source:Araport11)		60S acidic ribosomal protein family	structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity	GO:0003735, GO:0043021, GO:0030295	17.1.2.1.46	.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1	component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])
+# duplicated line with alterations
+AT1G01080	gene	AT1G01080.3	whatever!		protein_coding	RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11)			"RNA binding, mRNA binding"	"GO:0003723, GO:0003729"	35.1	not assigned.annotated	"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein	 chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)"
diff --git a/spec/test/djornl/duplicate_data/hithruput-edges.csv b/spec/test/djornl/duplicate_data/hithruput-edges.csv
new file mode 100644
index 00000000..c11caa3d
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/hithruput-edges.csv
@@ -0,0 +1,9 @@
+node1,node2,edge,edge_descrip,layer_descrip
+AT1G01010,AT1G01020,2.3,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi
+AT1G01010,AT1G01030,2.4,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi
+# potentially erroneous line
+AT1G01010,AT1G01030,2.7,AraNetv2_log-likelihood-score,AraNetv2-HT_high-throughput-ppi
+# duplicated line from the other file
+AT1G01050,AT1G01060,2.7,AraNetv2_log-likelihood-score,AraNetv2-LC_lit-curated-ppi
+# potentially erroneous duplication from the other file
+AT1G01030,AT1G01050,2.6000001,AraNetv2_log-likelihood-score,AraNetv2-CX_pairwise-gene-coexpression
diff --git a/spec/test/djornl/duplicate_data/manifest.yaml b/spec/test/djornl/duplicate_data/manifest.yaml
new file mode 100644
index 00000000..beffb367
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/manifest.yaml
@@ -0,0 +1,39 @@
+name: Dan Jacobson Exascale data
+release_date: "2020-06-06"
+home_url: "https://github.com/kbase/exascale_data"
+file_list:
+  - data_type: edge
+    path: edges.tsv
+    date: "2020-12-25"
+
+  - data_type: edge
+    path: hithruput-edges.csv
+    date: "2020-12-25"
+
+  - data_type: node
+    path: nodes.csv
+    date: "2019-01-01"
+
+  - data_type: cluster
+    cluster_prefix: markov_i2
+    path: I2_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i4
+    path: I4_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: I6_named.tsv
+
+  - data_type: cluster
+    cluster_prefix: markov_i6
+    path: I6_copy.csv
+
+  - data_type: node
+    path: pheno_nodes.csv
+    date: "2019-01-01"
+
+  - data_type: node
+    path: extra_node.tsv
+    date: "2019-01-01"
diff --git a/spec/test/djornl/duplicate_data/nodes.csv b/spec/test/djornl/duplicate_data/nodes.csv
new file mode 100644
index 00000000..b1938272
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/nodes.csv
@@ -0,0 +1,13 @@
+# data_type: node
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
+AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,,
+AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,,
+AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3  transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,,
+AT1G01040,gene,AT1G01040.2,SUS1,SUSPENSOR 1,protein_coding,dicer-like 1;(source:Araport11),"Encodes a Dicer homolog. Dicer is a RNA helicase involved in microRNA processing. Mutations in this locus can result in embryo lethality. Embryo shape at seed maturity is globular-elongate. Other mutants convert the floral meristems to an indeterminate state, others yet show defects in ovule development. mRNA is expressed in all shoot tissues. DCL1 is able to produce miRNAs and siRNAs. The mRNA is cell-to-cell mobile.",dicer-like 1,"metal ion binding, protein binding, ribonuclease III activity, ATP-dependent helicase activity, ATP binding, RNA binding, helicase activity, double-stranded RNA binding, DNA binding","GO:0046872, GO:0005515, GO:0004525, GO:0008026, GO:0005524, GO:0003723, GO:0004386, GO:0003725, GO:0003677",16.10.2.1.1,.RNA processing.mRNA silencing.miRNA pathway.DCL1-HYL1 miRNA biogenesis complex.endoribonuclease component DCL1,endoribonuclease component DCL1 of DCL1-HYL1 miRNA biogenesis complex (original description: pep chromosome:TAIR10:1:23416:31120:1 gene:AT1G01040 transcript:AT1G01040.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:DCL1 description:Dicer-like 1 [Source:UniProtKB/TrEMBL;Acc:F4HQG6]),,,,,,
+AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,,
+AT1G01060,gene,AT1G01060.8,LHY1,LATE ELONGATED HYPOCOTYL 1,protein_coding,Homeodomain-like superfamily protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding, transcription regulatory region DNA binding","GO:0003700, GO:0003677, GO:0044212",27.1.1,.Multi-process regulation.circadian clock system.core oscillator protein (LHY|CCA1),circadian clock core oscillator protein (LHY|CCA1) (original description: pep chromosome:TAIR10:1:33967:37230:-1 gene:AT1G01060 transcript:AT1G01060.8 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:LHY description:LHY1 [Source:UniProtKB/TrEMBL;Acc:A0A178W761]),,,,,,
+AT1G01070,gene,AT1G01070.2,UMAMIT28,Usually multiple acids move in and out Transporters 28,protein_coding,nodulin MtN21 /EamA-like transporter family protein;(source:Araport11),Encodes a plasma membrane-localized amino acid transporter likely involved in amino acid export in the developing seed.,nodulin MtN21 /EamA-like transporter family protein,L-glutamine transmembrane transporter activity,GO:0015186,24.2.1.5,.Solute transport.carrier-mediated transport.DMT superfamily.solute transporter (UmamiT),solute transporter (UmamiT) (original description: pep chromosome:TAIR10:1:38752:40945:-1 gene:AT1G01070 transcript:AT1G01070.2 gene_biotype:protein_coding transcript_biotype:protein_coding description:WAT1-related protein [Source:UniProtKB/TrEMBL;Acc:A0A178WFU3]),,,,,,
+AT1G01080,gene,AT1G01080.3,,,protein_coding,RNA-binding (RRM/RBD/RNP motifs) family protein;(source:Araport11),,,"RNA binding, mRNA binding","GO:0003723, GO:0003729",35.1,not assigned.annotated,"(original description: pep chromosome:TAIR10:1:44970:47059:-1 gene:AT1G01080 transcript:AT1G01080.3 gene_biotype:protein_coding transcript_biotype:protein_coding description:RNA-binding (RRM/RBD/RNP motifs) family protein [Source:UniProtKB/TrEMBL;Acc:F4HQH8]) & 33 kDa ribonucleoprotein, chloroplastic OS=Nicotiana sylvestris (sp|p19684|roc5_nicsy : 109.0)",,,,,,
+AT1G01090,gene,AT1G01090.1,PDH-E1 ALPHA,pyruvate dehydrogenase E1 alpha,protein_coding,pyruvate dehydrogenase E1 alpha;(source:Araport11),pyruvate dehydrogenase E1 alpha subunit,pyruvate dehydrogenase E1 alpha,"pyruvate dehydrogenase (acetyl-transferring) activity, protein binding","GO:0004739, GO:0005515",5.1.2.2.1.1,.Lipid metabolism.fatty acid biosynthesis.acetyl-CoA generation.plastidial pyruvate dehydrogenase complex.E1 pyruvate dehydrogenase subcomplex.subunit alpha,subunit alpha of E1 pyruvate dehydrogenase component (original description: pep chromosome:TAIR10:1:47234:49304:-1 gene:AT1G01090 transcript:AT1G01090.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PDH-E1 ALPHA description:Pyruvate dehydrogenase E1 component subunit alpha [Source:UniProtKB/TrEMBL;Acc:A0A178W8A7]),,,,,,
+# duplicated line
+AT1G01050,gene,AT1G01050.2,PPa1,pyrophosphorylase 1,protein_coding,pyrophosphorylase 1;(source:Araport11),,,inorganic diphosphatase activity,GO:0004427,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:31382:33009:-1 gene:AT1G01050 transcript:AT1G01050.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:PPA1 description:Soluble inorganic pyrophosphatase 1 [Source:UniProtKB/Swiss-Prot;Acc:Q93V56]) & Soluble inorganic pyrophosphatase 1 OS=Arabidopsis thaliana (sp|q93v56|ipyr1_arath : 419.0),,,,,,
diff --git a/spec/test/djornl/duplicate_data/pheno_nodes.csv b/spec/test/djornl/duplicate_data/pheno_nodes.csv
new file mode 100644
index 00000000..83fbf4be
--- /dev/null
+++ b/spec/test/djornl/duplicate_data/pheno_nodes.csv
@@ -0,0 +1,5 @@
+node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
+As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
+As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
+Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010",
+SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010",
diff --git a/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
index 118cfbcc..ab31e045 100644
--- a/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/empty_files/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
@@ -1 +1 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
diff --git a/spec/test/djornl/empty_files/cluster_data/comment_only.tsv b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv
new file mode 100644
index 00000000..9ce2fbf1
--- /dev/null
+++ b/spec/test/djornl/empty_files/cluster_data/comment_only.tsv
@@ -0,0 +1 @@
+# what?
diff --git a/spec/test/djornl/empty_files/cluster_data/headers_only.tsv b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv
new file mode 100644
index 00000000..3233ca40
--- /dev/null
+++ b/spec/test/djornl/empty_files/cluster_data/headers_only.tsv
@@ -0,0 +1,4 @@
+cluster_id	node_ids
+# comment
+# comment
+# comment
diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/spec/test/djornl/empty_files/cluster_data/no_content.tsv
similarity index 100%
rename from spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
rename to spec/test/djornl/empty_files/cluster_data/no_content.tsv
diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
deleted file mode 100644
index 8b137891..00000000
--- a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
deleted file mode 100644
index 8b137891..00000000
--- a/spec/test/djornl/empty_files/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/spec/test/djornl/empty_files/manifest.yaml b/spec/test/djornl/empty_files/manifest.yaml
index fb231666..ae04f7a4 100644
--- a/spec/test/djornl/empty_files/manifest.yaml
+++ b/spec/test/djornl/empty_files/manifest.yaml
@@ -9,12 +9,12 @@ file_list:
 
   - data_type: cluster
     cluster_prefix: markov_i2
-    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+    path: cluster_data/headers_only.tsv
 
   - data_type: cluster
     cluster_prefix: markov_i4
-    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+    path: cluster_data/no_content.tsv
 
   - data_type: cluster
     cluster_prefix: markov_i6
-    path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+    path: cluster_data/comment_only.tsv
diff --git a/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv
index 8b137891..4b2bca02 100644
--- a/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv
+++ b/spec/test/djornl/empty_files/merged_edges-AMW-060820_AF.tsv
@@ -1 +1,4 @@
-
+# this line is a comment
+# so is this line
+# oh no
+# there's no content in this file!
diff --git a/spec/test/djornl/invalid_types/edges.tsv b/spec/test/djornl/invalid_types/edges.tsv
new file mode 100644
index 00000000..06bbe9fd
--- /dev/null
+++ b/spec/test/djornl/invalid_types/edges.tsv
@@ -0,0 +1,10 @@
+# data_type: edge
+node1	node2	edge	edge_descrip	layer_descrip
+As2	AT1G01020	8.422046084731258	AraGWAS-Association_score	Same-Old-Stuff
+As2	AT1G01040	6	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
+As75	AT1G01020	39.98573324312915	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
+AT1G01010	AT1G01020	2.39322646755088	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
+AT1G01010	AT1G01030	2.	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
+AT1G01010	AT1G01040	"2.39322646755088"	AraNetv2_log-likelihood-score	raNetv2-DC_
+AT1G01030	AT1G01050	25494618241936697	AraNetv2_log-likelihood-score	AraNetv2-CX_pairwise-gene-coexpression
+AT1G01050	AT1G01060	score!	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
diff --git a/spec/test/djornl/invalid_types/manifest.yaml b/spec/test/djornl/invalid_types/manifest.yaml
index 50c5f454..e37ca783 100644
--- a/spec/test/djornl/invalid_types/manifest.yaml
+++ b/spec/test/djornl/invalid_types/manifest.yaml
@@ -2,7 +2,11 @@ name: Dan Jacobson Exascale data
 release_date: "2020-06-06"
 file_list:
   - data_type: edge
-    path: merged_edges-AMW-060820_AF.tsv
+    path: edges.tsv
 
   - data_type: node
-    path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+    path: nodes.csv
+
+  - data_type: cluster
+    path: markov2_named.tsv
+    cluster_prefix: markov_i2
diff --git a/spec/test/djornl/invalid_types/markov2_named.tsv b/spec/test/djornl/invalid_types/markov2_named.tsv
new file mode 100644
index 00000000..f82190fd
--- /dev/null
+++ b/spec/test/djornl/invalid_types/markov2_named.tsv
@@ -0,0 +1,9 @@
+cluster_id	node_ids
+# data_type: cluster
+# cluster_prefix: markov_i2
+# title: Markov clustering, inflation = 2
+Cluster1	AT1G01010,AT1G01030,AT1G01040
+Cluster2	AT1G01050,AT1G01060,AT1G01070
+HoneyNutCluster3	AT1G01080,AT1G01090
+Cluster4
+Cluster5	AT1G01020
diff --git a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
deleted file mode 100644
index a98f49f9..00000000
--- a/spec/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
+++ /dev/null
@@ -1,10 +0,0 @@
-# data_type: edge
-node1	node2	edge	edge_descrip	layer_descrip
-As2	AT1G01020	8.422046084731258	AraGWAS-Association_score	AraGWAS-Some-Old-Rubbish-I-Made-Up
-As2	AT1G01040	5.422046084731258	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
-As75	AT1G01020	39.98573324312915	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
-AT1G01010	AT1G01020	2.39322646755088	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
-AT1G01010	AT1G01030	2.39322646755088	AraNetv2_log-likelihood-score	AraNetv2-HT_high-throughput-ppi
-AT1G01010	AT1G01040	2.39322646755088	AraNetv2_log-likelihood-score	raNetv2-DC_domain-co-occurrence
-AT1G01030	AT1G01050	2.5494618241936697	AraNetv2_log-likelihood-score	AraNetv2-CX_pairwise-gene-coexpression
-AT1G01050	AT1G01060	4.34242054808616	AraNetv2_log-likelihood-score	AraNetv2-LC_lit-curated-ppi
diff --git a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/spec/test/djornl/invalid_types/nodes.csv
similarity index 98%
rename from spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
rename to spec/test/djornl/invalid_types/nodes.csv
index e98310e5..e469f003 100644
--- a/spec/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/spec/test/djornl/invalid_types/nodes.csv
@@ -1,4 +1,4 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
 # data_type: node
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json
index e15408d2..9eefcebc 100644
--- a/spec/test/djornl/results.json
+++ b/spec/test/djornl/results.json
@@ -13,16 +13,16 @@
       {"_key": "AT1G01090"}
     ],
     "edges": [
-      {"_key": "As2__AT1G01020__pheno_assn__8.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01020", "edge_type": "pheno_assn", "score": 8.4},
-      {"_key": "As2__AT1G01040__pheno_assn__5.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01040", "edge_type": "pheno_assn", "score": 5.4},
-      {"_key": "As75__AT1G01020__pheno_assn__39.9", "_from": "djornl_node/As75", "_to": "djornl_node/AT1G01020", "edge_type": "pheno_assn", "score": 39.9},
-      {"_key": "AT1G01010__AT1G01020__ppi_hithru__2.3", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01020", "edge_type": "ppi_hithru", "score": 2.3},
-      {"_key": "AT1G01010__AT1G01030__ppi_hithru__2.4", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01030", "edge_type": "ppi_hithru", "score": 2.4},
-      {"_key": "AT1G01010__AT1G01040__domain_co_occur__2.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "domain_co_occur", "score": 2.5},
-      {"_key": "AT1G01010__AT1G01040__ppi_liter__170.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "ppi_liter", "score": 170.5},
-      {"_key": "AT1G01030__AT1G01050__gene_coexpr__2.6", "_from": "djornl_node/AT1G01030", "_to": "djornl_node/AT1G01050", "edge_type": "gene_coexpr", "score": 2.6},
-      {"_key": "AT1G01050__AT1G01060__ppi_liter__2.7", "_from": "djornl_node/AT1G01050", "_to": "djornl_node/AT1G01060", "edge_type": "ppi_liter", "score": 2.7},
-      {"_key": "AT1G01080__AT1G01090__ppi_liter__2.8", "_from": "djornl_node/AT1G01080", "_to": "djornl_node/AT1G01090", "edge_type": "ppi_liter", "score": 2.8}
+      {"_key": "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01020", "edge_type": "AraGWAS-Phenotype_Associations", "score": 8.4},
+      {"_key": "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4", "_from": "djornl_node/As2", "_to": "djornl_node/AT1G01040", "edge_type": "AraGWAS-Phenotype_Associations", "score": 5.4},
+      {"_key": "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9", "_from": "djornl_node/As75", "_to": "djornl_node/AT1G01020", "edge_type": "AraGWAS-Phenotype_Associations", "score": 39.9},
+      {"_key": "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01020", "edge_type": "AraNetv2-HT_high-throughput-ppi", "score": 2.3},
+      {"_key": "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01030", "edge_type": "AraNetv2-HT_high-throughput-ppi", "score": 2.4},
+      {"_key": "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "AraNetv2-DC_domain-co-occurrence", "score": 2.5},
+      {"_key": "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5", "_from": "djornl_node/AT1G01010", "_to": "djornl_node/AT1G01040", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 170.5},
+      {"_key": "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6", "_from": "djornl_node/AT1G01030", "_to": "djornl_node/AT1G01050", "edge_type": "AraNetv2-CX_pairwise-gene-coexpression", "score": 2.6},
+      {"_key": "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7", "_from": "djornl_node/AT1G01050", "_to": "djornl_node/AT1G01060", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.7},
+      {"_key": "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8", "_from": "djornl_node/AT1G01080", "_to": "djornl_node/AT1G01090", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.8}
     ]
   },
   "load_cluster_data": {
@@ -33,9 +33,9 @@
       {"_key": "AT1G01050", "clusters": ["markov_i2:2"]},
       {"_key": "AT1G01060", "clusters": ["markov_i2:2"]},
       {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]},
-      {"_key": "AT1G01080", "clusters": ["markov_i2:3"]},
       {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]},
-      {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]}
+      {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]},
+      {"_key": "AT1G01080", "clusters": ["markov_i4:3"]}
     ]
   },
   "load_node_metadata": {
@@ -74,16 +74,16 @@
       "SDV"
     ],
     "edges": [
-      "As2__AT1G01020__pheno_assn__8.4",
-      "As2__AT1G01040__pheno_assn__5.4",
-      "As75__AT1G01020__pheno_assn__39.9",
-      "AT1G01010__AT1G01020__ppi_hithru__2.3",
-      "AT1G01010__AT1G01030__ppi_hithru__2.4",
-      "AT1G01010__AT1G01040__domain_co_occur__2.5",
-      "AT1G01010__AT1G01040__ppi_liter__170.5",
-      "AT1G01030__AT1G01050__gene_coexpr__2.6",
-      "AT1G01050__AT1G01060__ppi_liter__2.7",
-      "AT1G01080__AT1G01090__ppi_liter__2.8"
+      "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+      "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+      "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+      "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+      "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+      "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+      "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+      "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+      "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7",
+      "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8"
     ]
   },
   "fetch_genes": {
@@ -109,24 +109,24 @@
               "AT1G01040"
             ],
             "edges": [
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5"
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7"
             ]
           }
         }
@@ -140,23 +140,23 @@
           "1": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7"
             ]
           }
         }
@@ -181,22 +181,22 @@
           "1": {
             "nodes": ["As2", "AT1G01020", "AT1G01040"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7"
             ]
           }
         }
@@ -210,22 +210,22 @@
           "1": {
             "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7"
             ]
           }
         }
@@ -250,25 +250,25 @@
           "1": {
             "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
             "edges": [
-              "As2__AT1G01040__pheno_assn__5.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01080__AT1G01090__ppi_liter__2.8"
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7",
-              "AT1G01080__AT1G01090__ppi_liter__2.8"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7",
+              "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8"
             ]
           }
         }
@@ -294,25 +294,25 @@
           "1": {
             "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
             "edges": [
-              "As2__AT1G01040__pheno_assn__5.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01080__AT1G01090__ppi_liter__2.8"
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7",
-              "AT1G01080__AT1G01090__ppi_liter__2.8"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7",
+              "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8"
             ]
           }
         }
@@ -326,23 +326,23 @@
           "1": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3"
             ]
           },
           "5": {
             "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
             "edges": [
-              "As2__AT1G01020__pheno_assn__8.4",
-              "As2__AT1G01040__pheno_assn__5.4",
-              "As75__AT1G01020__pheno_assn__39.9",
-              "AT1G01010__AT1G01020__ppi_hithru__2.3",
-              "AT1G01010__AT1G01030__ppi_hithru__2.4",
-              "AT1G01010__AT1G01040__domain_co_occur__2.5",
-              "AT1G01010__AT1G01040__ppi_liter__170.5",
-              "AT1G01030__AT1G01050__gene_coexpr__2.6",
-              "AT1G01050__AT1G01060__ppi_liter__2.7"
+              "As2__AT1G01020__AraGWAS-Phenotype_Associations__8.4",
+              "As2__AT1G01040__AraGWAS-Phenotype_Associations__5.4",
+              "As75__AT1G01020__AraGWAS-Phenotype_Associations__39.9",
+              "AT1G01010__AT1G01020__AraNetv2-HT_high-throughput-ppi__2.3",
+              "AT1G01010__AT1G01030__AraNetv2-HT_high-throughput-ppi__2.4",
+              "AT1G01010__AT1G01040__AraNetv2-DC_domain-co-occurrence__2.5",
+              "AT1G01010__AT1G01040__AraNetv2-LC_lit-curated-ppi__170.5",
+              "AT1G01030__AT1G01050__AraNetv2-CX_pairwise-gene-coexpression__2.6",
+              "AT1G01050__AT1G01060__AraNetv2-LC_lit-curated-ppi__2.7"
             ]
           }
         }
diff --git a/spec/test/djornl/test_data/I2_named.tsv b/spec/test/djornl/test_data/I2_named.tsv
index 3f129851..c7a9c200 100644
--- a/spec/test/djornl/test_data/I2_named.tsv
+++ b/spec/test/djornl/test_data/I2_named.tsv
@@ -1,9 +1,8 @@
-cluster	ids
+cluster_id	node_ids
 # data_type: cluster
 # cluster_prefix: markov_i2
 # title: Markov clustering, inflation = 2
 Cluster1	AT1G01010,AT1G01030,AT1G01040
 Cluster2	AT1G01050,AT1G01060,AT1G01070
-Cluster3	AT1G01080,AT1G01090
-Cluster4
+Cluster3	AT1G01090
 Cluster5	AT1G01020
diff --git a/spec/test/djornl/test_data/I4_named.tsv b/spec/test/djornl/test_data/I4_named.tsv
index 39e46deb..6e7d91e4 100644
--- a/spec/test/djornl/test_data/I4_named.tsv
+++ b/spec/test/djornl/test_data/I4_named.tsv
@@ -1,4 +1,5 @@
-cluster	ids
+cluster_id	node_ids
 # cluster_prefix: markov_i4
 # title: Markov clustering, inflation = 4
 # data_type: cluster
+Cluster3	AT1G01080
diff --git a/spec/test/djornl/test_data/I6_named.tsv b/spec/test/djornl/test_data/I6_named.tsv
index d504f6b4..e7688f17 100644
--- a/spec/test/djornl/test_data/I6_named.tsv
+++ b/spec/test/djornl/test_data/I6_named.tsv
@@ -1,8 +1,8 @@
-cluster	ids
+cluster_id	node_ids
 # data_type: cluster
 # cluster_prefix: markov_i6
 # title: Markov clustering, inflation = 6
 Cluster1	AT1G01040,AT1G01090
 Cluster2	AT1G01070
 Cluster3	AT1G01010,AT1G01020,AT1G01030
-Cluster4
+# Cluster4
diff --git a/spec/test/djornl/test_data/extra_node.tsv b/spec/test/djornl/test_data/extra_node.tsv
index 664425c5..de069d70 100644
--- a/spec/test/djornl/test_data/extra_node.tsv
+++ b/spec/test/djornl/test_data/extra_node.tsv
@@ -1,3 +1,3 @@
 # data_type: node
-node_id	node_type	transcript	gene_symbol	gene_full_name	gene_model_type	TAIR_Computational_description	TAIR_Curator_summary	TAIR_short_description	GO_descr	GO_terms	MapMan_bin	MapMan_name	MapMan_descr	pheno_AraGWAS_ID	pheno_descrip1	pheno_descrip2	pheno_descrip3	pheno_ref	User_Notes
-AT1G01100	gene	AT1G01100.4			protein_coding	60S acidic ribosomal protein family;(source:Araport11)		60S acidic ribosomal protein family	structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity	GO:0003735, GO:0043021, GO:0030295	17.1.2.1.46	.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1	component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])						
+node_id	node_type	transcript	gene_symbol	gene_full_name	gene_model_type	TAIR_Computational_description	TAIR_Curator_summary	TAIR_short_description	GO_description	GO_terms	MapMan_bin	MapMan_name	MapMan_description
+AT1G01100	gene	AT1G01100.4			protein_coding	60S acidic ribosomal protein family;(source:Araport11)		60S acidic ribosomal protein family	structural constituent of ribosome, ribonucleoprotein complex binding, protein kinase activator activity	GO:0003735, GO:0043021, GO:0030295	17.1.2.1.46	.Protein biosynthesis.ribosome biogenesis.large ribosomal subunit (LSU).LSU proteome.component RPP1	component RPP1 of LSU proteome component (original description: pep chromosome:TAIR10:1:50090:51187:-1 gene:AT1G01100 transcript:AT1G01100.4 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:RPP1A description:60S acidic ribosomal protein P1-1 [Source:UniProtKB/Swiss-Prot;Acc:Q8LCW9])
diff --git a/spec/test/djornl/test_data/nodes.csv b/spec/test/djornl/test_data/nodes.csv
index 2245bd59..92f60761 100644
--- a/spec/test/djornl/test_data/nodes.csv
+++ b/spec/test/djornl/test_data/nodes.csv
@@ -1,5 +1,5 @@
 # data_type: node
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
+node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_description,GO_terms,MapMan_bin,MapMan_name,MapMan_description,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
 AT1G01010,gene,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,,
 AT1G01020,gene,AT1G01020.6,ARV1,,protein_coding,ARV1 family protein;(source:Araport11),,,molecular_function,GO:0003674,35.1,not assigned.annotated,(original description: pep chromosome:TAIR10:1:6788:8737:-1 gene:AT1G01020 transcript:AT1G01020.6 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:ARV1 description:ARV1 family protein [Source:UniProtKB/TrEMBL;Acc:Q5MK24]) & no description available(sp|q5mk24|arv1_arath : 99.4),,,,,,
 AT1G01030,gene,AT1G01030.2,NGA3,NGATHA3,protein_coding,AP2/B3-like transcriptional factor family protein;(source:Araport11),,,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.5.3,.RNA biosynthesis.transcriptional regulation.B3  transcription factor superfamily.transcription factor (RAV/NGATHA),transcription factor (RAV/NGATHA) (original description: pep chromosome:TAIR10:1:11649:13714:-1 gene:AT1G01030 transcript:AT1G01030.2 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NGA3 description:B3 domain-containing transcription factor NGA3 [Source:UniProtKB/Swiss-Prot;Acc:Q9MAN1]),,,,,,
diff --git a/spec/test/djornl/test_data/pheno_nodes.csv b/spec/test/djornl/test_data/pheno_nodes.csv
index d8bb15a9..83fbf4be 100644
--- a/spec/test/djornl/test_data/pheno_nodes.csv
+++ b/spec/test/djornl/test_data/pheno_nodes.csv
@@ -1,5 +1,5 @@
-node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,User_Notes
-As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
-As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
-Na23,pheno,,,,,,,,,,,,,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010",
-SDV,pheno,,,,,,,,,,,,,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010",
+node_id,node_type,pheno_AraGWAS_ID,pheno_description,pheno_pto_name,pheno_pto_description,pheno_ref,User_Notes
+As2,pheno,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
+As75,pheno,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
+Na23,pheno,10.21958/phenotype:5,"Sodium concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",sodium concentration,The total sodium ion concentration measured in a given volume of a plant or a plant part or plant extract. [GR:pj],"Atwell et. al, Nature 2010",
+SDV,pheno,10.21958/phenotype:104,"Number of days following stratification to opening of first flower. The experiment was stopped at 200 d, and accessions that had not flowered at that point were assigned a value of 200",days to flowering trait,"A flowering time trait (TO:0002616)which is the number of days required for an individual flower (PO:0009046), a whole plant (PO:0000003) or a plant population to reach flowering stage (PO:0007616) from a predetermined time point (e.g. the date of seed sowing, seedling transplant, or seedling emergence). [GR:pj, TO:cooperl]","Atwell et. al, Nature 2010",

From 4875b0528df9dbbe038309d5c68ceef438bf1e2c Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Fri, 28 Aug 2020 07:04:07 -0700
Subject: [PATCH 4/4] Rename some functions and variables for greater
 consistency; add a couple more parser tests

---
 importers/README.md                     |  2 +-
 importers/djornl/main.py                | 11 -----
 importers/djornl/parser.py              | 63 +++++++++++++++----------
 importers/test/test_djornl_parser.py    | 37 ++++++++++-----
 importers/utils/config.py               |  3 +-
 spec/test/djornl/results.json           |  4 +-
 spec/test/stored_queries/test_djornl.py |  6 +--
 7 files changed, 69 insertions(+), 57 deletions(-)
 delete mode 100644 importers/djornl/main.py

diff --git a/importers/README.md b/importers/README.md
index 53df13cc..e54bb379 100644
--- a/importers/README.md
+++ b/importers/README.md
@@ -15,5 +15,5 @@ Global env vars:
 
 ```sh
 RES_ROOT_DATA_PATH=/path/to/djornl_data \
-python -m importers.djornl.main
+python -m importers.djornl.parser
 ```
diff --git a/importers/djornl/main.py b/importers/djornl/main.py
deleted file mode 100644
index ba1bb005..00000000
--- a/importers/djornl/main.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-Loads the Dan Jacobson/ORNL group's gene and phenotype network data into
-arangodb.
-
-Running this requires a set of source files provided by the ORNL group.
-"""
-from importers.djornl.parser import DJORNL_Parser
-
-if __name__ == '__main__':
-    parser = DJORNL_Parser()
-    parser.load_data()
diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index 990c745a..c62c6793 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -252,14 +252,16 @@ def load_edges(self):
 
         # store edge data, checking for potential duplicates
         def store_edges(datum):
-            # there should only be one value for each node<->node edge of a given type
+            # there should only be one value for each node<->node edge of a given type,
+            # so use these values as an index key
             edge_key = "__".join([datum['node1'], datum['node2'], datum['edge_type']])
 
             if edge_key in edge_ix:
-                # ignore duplicate lines; report non-matching data
-                if datum['score'] != edge_ix[edge_key]['score']:
-                    return f"duplicate data for edge {edge_key}"
-                return None
+                # duplicate lines can be ignored
+                if datum['score'] == edge_ix[edge_key]['score']:
+                    return None
+                # report non-matching data
+                return f"duplicate data for edge {edge_key}"
 
             # keep track of the nodes mentioned in this edge set
             for node_n in ["1", "2"]:
@@ -286,7 +288,7 @@ def store_edges(datum):
             'edges': edge_ix.values(),
         }
 
-    def load_node_metadata(self):
+    def load_nodes(self):
         """Load node metadata"""
 
         node_ix = {}
@@ -352,7 +354,7 @@ def store_nodes(datum):
             raise RuntimeError('\n'.join(err_list))
         return {'nodes': node_ix.values()}
 
-    def load_cluster_data(self):
+    def load_clusters(self):
         """Annotate genes with cluster ID fields."""
 
         # index of nodes
@@ -428,33 +430,38 @@ def save_docs(self, coll_name, docs, on_dupe='update'):
 
     def load_data(self):
         self.save_dataset(self.load_edges())
-        self.save_dataset(self.load_node_metadata())
-        self.save_dataset(self.load_cluster_data())
+        self.save_dataset(self.load_nodes())
+        self.save_dataset(self.load_clusters())
         return True
 
     def check_data_delta(self):
         edge_data = self.load_edges()
-        node_metadata = self.load_node_metadata()
-        clusters = self.load_cluster_data()
+        node_data = self.load_nodes()
+        clusters = self.load_clusters()
+
+        self.check_deltas(edge_data=edge_data, node_data=node_data, cluster_data=clusters)
 
-        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+    def check_deltas(self, edge_data={}, node_data={}, cluster_data={}):
 
-    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
+        edges_nodelist = set([e['_key'] for e in edge_data['nodes']])
+        nodes_nodelist = set([e['_key'] for e in node_data['nodes']])
+        clusters_nodelist = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edges_nodelist.union(nodes_nodelist).union(clusters_nodelist)
 
-        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
-        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
-        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
-        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+        # check all nodes in cluster_data have node data
+        cluster_no_node_set = clusters_nodelist.difference(nodes_nodelist)
+        if cluster_no_node_set:
+            print({'clusters with no node metadata': cluster_no_node_set})
 
-        # check all nodes in cluster_data have node_metadata
-        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
-        if clstr_no_node_md_set:
-            print({'clusters with no node metadata': clstr_no_node_md_set})
+        # check all nodes in the edge_data have node data
+        edge_no_node_set = edges_nodelist.difference(nodes_nodelist)
+        if edge_no_node_set:
+            print({'edges with no node metadata': edge_no_node_set})
 
-        # check all nodes in the edge_data have node_metadata
-        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
-        if edge_no_node_md_set:
-            print({'edges with no node metadata': edge_no_node_md_set})
+        # check all nodes are in the edge_data set
+        node_no_edge_set = nodes_nodelist.difference(edges_nodelist)
+        if node_no_edge_set:
+            print({'nodes not in an edge': node_no_edge_set})
 
         # count all edges
         print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
@@ -464,4 +471,8 @@ def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
 
 if __name__ == '__main__':
     parser = DJORNL_Parser()
-    parser.load_data()
+    try:
+        parser.load_data()
+    except Exception as err:
+        print(err)
+        exit(1)
diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py
index a3c8ecfa..daf2b417 100644
--- a/importers/test/test_djornl_parser.py
+++ b/importers/test/test_djornl_parser.py
@@ -33,6 +33,19 @@ def init_parser_with_path(self, root_path):
             parser._configure()
             return parser
 
+    def test_missing_required_env_var(self):
+        '''test that the parser raises a RuntimeError if the RES_ROOT_DATA_PATH env var is not set'''
+        with self.assertRaisesRegex(RuntimeError, 'Missing required env var: RES_ROOT_DATA_PATH'):
+            parser = DJORNL_Parser()
+            parser.load_edges()
+
+    def test_config(self):
+        '''test that the parser raises an error if a config value cannot be found'''
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+        with self.assertRaisesRegex(KeyError, 'No such config value: bananas'):
+            parser.config('bananas')
+
     def test_load_no_manifest(self):
         """ test loading when the manifest does not exist """
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest')
@@ -67,7 +80,7 @@ def test_load_empty_files(self):
         # header only, no content
         err_str = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv: no valid data found'
         with self.assertRaisesRegex(RuntimeError, err_str):
-            parser.load_node_metadata()
+            parser.load_nodes()
 
         # comments only
         err_str = 'merged_edges-AMW-060820_AF.tsv: no header line found'
@@ -81,7 +94,7 @@ def test_load_empty_files(self):
             'cluster_data/comment_only.tsv: no header line found',
         ])
         with self.assertRaisesRegex(RuntimeError, err_str):
-            parser.load_cluster_data()
+            parser.load_clusters()
 
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
@@ -119,7 +132,7 @@ def test_load_invalid_nodes(self):
         # invalid node type
         node_err_msg = "nodes.csv line 5: 'Monkey' is not valid under any of the given schemas"
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
-            parser.load_node_metadata()
+            parser.load_nodes()
 
     def test_load_invalid_clusters(self):
         """ test file format errors """
@@ -131,7 +144,7 @@ def test_load_invalid_clusters(self):
         # invalid node type
         cluster_err_msg = "markov2_named.tsv line 7: 'HoneyNutCluster3' does not match"
         with self.assertRaisesRegex(RuntimeError, cluster_err_msg):
-            parser.load_cluster_data()
+            parser.load_clusters()
 
     def test_load_col_count_errors(self):
         """ test files with invalid numbers of columns """
@@ -148,7 +161,7 @@ def test_load_col_count_errors(self):
         # too many cols
         node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 3: expected 20 cols, found 22'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
-            parser.load_node_metadata()
+            parser.load_nodes()
 
     def test_load_valid_edge_data(self):
 
@@ -169,8 +182,8 @@ def test_load_valid_node_metadata(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        node_metadata = parser.load_node_metadata()
-        expected = self.json_data["load_node_metadata"]
+        node_metadata = parser.load_nodes()
+        expected = self.json_data["load_nodes"]
 
         for data_structure in [node_metadata, expected]:
             for k in data_structure.keys():
@@ -184,10 +197,10 @@ def test_load_valid_cluster_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        cluster_data = parser.load_cluster_data()
+        cluster_data = parser.load_clusters()
         self.assertEqual(
             cluster_data,
-            self.json_data["load_cluster_data"]
+            self.json_data["load_clusters"]
         )
 
     def test_duplicate_edge_data(self):
@@ -211,7 +224,7 @@ def test_duplicate_node_data(self):
 
         err_msg = "extra_node.tsv line 5: duplicate data for node AT1G01080"
         with self.assertRaisesRegex(RuntimeError, err_msg):
-            parser.load_node_metadata()
+            parser.load_nodes()
 
     def test_duplicate_cluster_data(self):
         """ test files with duplicate cluster data, which should be seamlessly merged """
@@ -220,10 +233,10 @@ def test_duplicate_cluster_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'duplicate_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        cluster_data = parser.load_cluster_data()
+        cluster_data = parser.load_clusters()
         self.assertEqual(
             cluster_data,
-            self.json_data["load_cluster_data"]
+            self.json_data["load_clusters"]
         )
 
     def test_the_full_shebang(self):
diff --git a/importers/utils/config.py b/importers/utils/config.py
index 878b9fee..1f30080e 100644
--- a/importers/utils/config.py
+++ b/importers/utils/config.py
@@ -20,8 +20,7 @@ def load_from_env(extra_required=None, extra_optional=None, prefix='RES_'):
     optional = list(OPTIONAL) + (extra_optional or [])
     for field in required:
         if (prefix + field) not in os.environ:
-            print(f"Missing required env var: {prefix + field}")
-            exit(1)
+            raise RuntimeError(f"Missing required env var: {prefix + field}")
     for field in required + optional:
         if (prefix + field) in os.environ:
             conf[field] = os.environ[prefix + field]
diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json
index 9eefcebc..fe9a613c 100644
--- a/spec/test/djornl/results.json
+++ b/spec/test/djornl/results.json
@@ -25,7 +25,7 @@
       {"_key": "AT1G01080__AT1G01090__AraNetv2-LC_lit-curated-ppi__2.8", "_from": "djornl_node/AT1G01080", "_to": "djornl_node/AT1G01090", "edge_type": "AraNetv2-LC_lit-curated-ppi", "score": 2.8}
     ]
   },
-  "load_cluster_data": {
+  "load_clusters": {
     "nodes": [
       {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]},
       {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]},
@@ -38,7 +38,7 @@
       {"_key": "AT1G01080", "clusters": ["markov_i4:3"]}
     ]
   },
-  "load_node_metadata": {
+  "load_nodes": {
     "nodes": [
       {"_key": "As2", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:103", "pheno_description": "", "pheno_pto_name": "bacterial disease resistance", "pheno_pto_description": "The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
       {"_key": "As75", "node_type": "pheno", "transcript": "", "gene_symbol": "", "gene_full_name": "", "gene_model_type": "", "tair_computational_description": "", "tair_curator_summary": "", "tair_short_description": "", "go_description": "", "go_terms": [], "mapman_bin": "", "mapman_name": "", "mapman_description": "", "pheno_aragwas_id": "10.21958/phenotype:67", "pheno_description": "Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008", "pheno_pto_name": "arsenic concentration", "pheno_pto_description": "A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik]", "pheno_ref": "Atwell et. al, Nature 2010", "user_notes": ""},
diff --git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py
index cde4d0c2..befe46cd 100644
--- a/spec/test/stored_queries/test_djornl.py
+++ b/spec/test/stored_queries/test_djornl.py
@@ -49,11 +49,11 @@ def setUpClass(cls):
             r = create_test_docs(edge_name, edge_data['edges'])
             print_db_update(r, edge_name)
 
-            node_metadata = parser.load_node_metadata()
+            node_metadata = parser.load_nodes()
             r = create_test_docs(node_name, node_metadata['nodes'], True)
             print_db_update(r, node_name)
 
-            cluster_data = parser.load_cluster_data()
+            cluster_data = parser.load_clusters()
             r = create_test_docs(node_name, cluster_data['nodes'], True)
             print_db_update(r, node_name)
 
@@ -93,7 +93,7 @@ def test_fetch_all(self):
 
         # ensure that all the cluster data is returned OK
         node_data = response['results'][0]['nodes']
-        expected_node_data = self.json_data['load_cluster_data']['nodes']
+        expected_node_data = self.json_data['load_clusters']['nodes']
         self.assertEqual(
             {n['_key']: n['clusters'] for n in node_data if 'clusters' in n},
             {n['_key']: n['clusters'] for n in expected_node_data if 'clusters' in n},