Skip to content

Commit 5e6b47d

Browse files
committed
move and update hypernets_processor_main to sequence_processor_main
1 parent 75c6883 commit 5e6b47d

File tree

2 files changed

+240
-0
lines changed

2 files changed

+240
-0
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""
2+
Module with main to run sequence file processing chain
3+
"""
4+
5+
from hypernets_processor.version import __version__
6+
from hypernets_processor.utils.config import read_config_file
7+
from hypernets_processor.utils.logging import configure_logging
8+
from hypernets_processor.utils.paths import parse_sequence_path
9+
from hypernets_processor.context import Context
10+
from hypernets_processor.sequence_processor import SequenceProcessor
11+
import os
12+
13+
14+
"""___Authorship___"""
15+
# Module authorship metadata
__author__ = "Sam Hunt"
__created__ = "21/10/2020"
# Re-binds the package version imported at the top of the module
__version__ = __version__
__maintainer__ = "Sam Hunt"
__email__ = "sam.hunt@npl.co.uk"
__status__ = "Development"
21+
22+
23+
def get_target_sequences(context, to_archive):
    """
    Returns paths of sequences to process, checking against previous archived data if
    adding to archive

    :type context: hypernets_processor.context.Context
    :param context: processor context

    :type to_archive: bool
    :param to_archive: switch for if to add processed data to data archive

    :rtype: list
    :return: paths of sequences to process (empty if no sequences are found)
    """

    # Find potential sequence paths to process
    # raw_data_directory may either be a sequence path or directory of sequence paths
    raw_data_directory = context.get_config_value("raw_data_directory")

    if parse_sequence_path(raw_data_directory) is not None:
        raw_paths = [raw_data_directory]
    else:
        raw_paths = [
            os.path.join(raw_data_directory, path)
            for path in os.listdir(raw_data_directory)
            if parse_sequence_path(path) is not None
        ]

    # If adding to archive, remove previously processed paths from list by referencing
    # archive db. Nothing to filter (and nothing to index into) if no paths were found.
    if to_archive and raw_paths:
        processed_products = {
            product["raw_product_name"]
            for product in context.archive_db["products"].find(
                site=context.get_config_value("site")
            )
        }

        # Filter in place rather than via set difference so the order of the
        # remaining paths is preserved
        raw_paths = [
            raw_path
            for raw_path in raw_paths
            if os.path.basename(raw_path) not in processed_products
        ]

    return raw_paths
68+
69+
70+
def main(processor_config_path, job_config_path, to_archive):
    """
    Runs the processing chain over each target sequence

    :type processor_config_path: str
    :param processor_config_path: path of the processor configuration file

    :type job_config_path: str
    :param job_config_path: path of the job configuration file

    :type to_archive: bool
    :param to_archive: if True, processed data is to be added to the data archive

    :return: None
    """

    # Read configuration files (processor first, then job)
    processor_settings = read_config_file(processor_config_path)
    job_settings = read_config_file(job_config_path)

    # Build the context, with logging configured from the job configuration
    processing_context = Context(
        processor_config=processor_settings,
        job_config=job_settings,
        logger=configure_logging(config=job_settings),
    )
    processing_context.set_config_value("to_archive", to_archive)

    # Work out which sequences need processing before creating the processor
    sequence_paths = get_target_sequences(processing_context, to_archive)

    # Process each sequence in turn with a single processor instance
    processor = SequenceProcessor(context=processing_context)
    for sequence_path in sequence_paths:
        processor.process_sequence(sequence_path)
106+
107+
108+
if __name__ == "__main__":
109+
pass
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""
2+
Tests for sequence_processor_main module
3+
"""
4+
5+
import unittest
6+
from unittest.mock import patch
7+
from hypernets_processor.version import __version__
8+
from hypernets_processor.main.sequence_processor_main import main, get_target_sequences
9+
from hypernets_processor.test.test_functions import setup_test_context
10+
import string
11+
import random
12+
import os
13+
import shutil
14+
15+
16+
'''___Authorship___'''
17+
__author__ = "Sam Hunt"
18+
__created__ = "21/10/2020"
19+
__version__ = __version__
20+
__maintainer__ = "Sam Hunt"
21+
__email__ = "sam.hunt@npl.co.uk"
22+
__status__ = "Development"
23+
24+
25+
class TestSequenceProcessorMain(unittest.TestCase):
    """
    Tests for the main and get_target_sequences functions of sequence_processor_main
    """

    def _setup_tmp_context(self):
        """
        Sets up a test context inside a randomly named temporary directory, which is
        removed at the end of the test whether it passes or fails

        :rtype: tuple
        :return: temporary directory name and test context
        """

        tmpdir = "tmp_" + "".join(random.choices(string.ascii_lowercase, k=6))

        # Register removal before set up, so nothing is left behind if set up or a
        # later assertion fails
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)

        context = setup_test_context(
            raw_data_directory=os.path.join(tmpdir, "data"),
            archive_directory=os.path.join(tmpdir, "out"),
            metadata_db_url="sqlite:///" + tmpdir + "/metadata.db",
            anomoly_db_url="sqlite:///" + tmpdir + "/anomoly.db",
            archive_db_url="sqlite:///" + tmpdir + "/archive.db",
            create_directories=True,
            create_dbs=True,
        )

        return tmpdir, context

    # Decorators apply bottom-up, so mocks are passed in the order:
    # Context, get_target_sequences, SequenceProcessor, configure_logging
    @patch('hypernets_processor.main.sequence_processor_main.configure_logging')
    @patch('hypernets_processor.main.sequence_processor_main.SequenceProcessor')
    @patch('hypernets_processor.main.sequence_processor_main.get_target_sequences')
    @patch('hypernets_processor.main.sequence_processor_main.Context')
    def test_main(self, mock_con, mock_gts, mock_sp, mock_cf):
        """
        main creates exactly one SequenceProcessor, from the context it builds
        """

        job_config_path = "jpath"
        processor_config_path = "ppath"
        main(
            job_config_path=job_config_path,
            processor_config_path=processor_config_path,
            to_archive=True,
        )

        self.assertTrue(mock_sp.called)
        mock_sp.assert_called_once_with(context=mock_con.return_value)

    def test_get_target_sequences_toarchive(self):
        """
        When adding to archive, only one sequence from the test data is returned
        """

        tmpdir, context = self._setup_tmp_context()

        expected_sequences = [os.path.join(tmpdir, "data", "SEQ20200311T112530")]
        sequences = get_target_sequences(context, True)

        self.assertCountEqual(expected_sequences, sequences)

    def test_get_target_sequences_nottoarchive(self):
        """
        When not adding to archive, three sequences from the test data are returned
        """

        tmpdir, context = self._setup_tmp_context()

        expected_sequences = [
            os.path.join(tmpdir, "data", "SEQ20200311T112330"),
            os.path.join(tmpdir, "data", "SEQ20200311T112430"),
            os.path.join(tmpdir, "data", "SEQ20200311T112530"),
        ]
        sequences = get_target_sequences(context, False)

        self.assertCountEqual(expected_sequences, sequences)

    def test_get_target_sequences_1sequence(self):
        """
        raw_data_directory that is itself a sequence path is returned on its own
        """

        tmpdir, context = self._setup_tmp_context()

        context.set_config_value(
            "raw_data_directory",
            os.path.join(
                context.get_config_value("raw_data_directory"), "SEQ20200311T112230"
            ),
        )

        expected_sequences = [os.path.join(tmpdir, "data", "SEQ20200311T112230")]
        sequences = get_target_sequences(context, False)

        self.assertCountEqual(expected_sequences, sequences)

    def test_get_target_sequences_0sequence(self):
        """
        Pointing raw_data_directory at the archive directory returns no sequences
        """

        tmpdir, context = self._setup_tmp_context()

        context.set_config_value(
            "raw_data_directory",
            context.get_config_value("archive_directory"),
        )

        expected_sequences = []
        sequences = get_target_sequences(context, False)

        self.assertCountEqual(expected_sequences, sequences)
128+
129+
130+
if __name__ == "__main__":
131+
unittest.main()

0 commit comments

Comments
 (0)