Skip to content

Commit bb0ee5e

Browse files
author
Calvin Nhieu
authored
parallelize read from DSS in workers (#94)
* parallelize read from DSS in workers

* Use thread-safer DCP interface. This incorporates the change in HumanCellAtlas/dcp-cli#202.

* Include thread id in logs. This is helpful in our multithreaded code paths.

* Don't print logs twice. AWS Lambda has a default root logger that captures everything that goes to stdout. If you don't turn `propagate` off, you get every message twice in CloudWatch.

* Poll for locks much less frequently. A failure mode we encounter is using up capacity on the lock table. If we're anticipating smaller numbers of worker lambdas that handle larger blocks of work, we can wait longer between checking for lock availability. This reduces our DynamoDB consumed capacity.

* Distribute 100 bundles to each lambda, and aggressively parallelize DSS I/O.

* Assume that each bundle has a single cell in the mapper. This is a bad assumption, but it's helpful for the demo.

* Address comments
1 parent 5140a72 commit bb0ee5e

File tree

10 files changed

+591
-40
lines changed

10 files changed

+591
-40
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.PHONY: lint test unit-tests
22
MODULES=matrix tests daemons chalice
3-
EXCLUDE=target,vendor,chalicelib
3+
EXCLUDE=target,vendor,chalicelib,target.in
44

55
deploy:
66
$(MAKE) -C chalice $@

daemons/worker/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ build:
1616
mkdir target
1717
pip install -r requirements.txt -t target/ --upgrade
1818
cp target.in/blosc.cpython-36m-x86_64-linux-gnu.so target/numcodecs
19+
cp target.in/hca_util__init__.py target/hca/util/__init__.py
1920
cp -R ../../matrix target/
2021
cp -R *.py target/
2122

daemons/worker/target.in/hca_util__init__.py

Lines changed: 532 additions & 0 deletions
Large diffs are not rendered by default.

matrix/common/dynamo_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
import boto3
1313
import botocore
1414

15+
from matrix.common.logging import Logging
16+
17+
logger = Logging.get_logger(__name__)
18+
1519

1620
class Lock(object):
1721
"""Implement a lock with DynamoDB."""
@@ -115,9 +119,9 @@ def acquire(self):
115119
# Or maybe we hold the lock ourselves, then just return
116120
elif db_response["Item"]["LockHolder"] == self._lock_id:
117121
return
118-
122+
logger.debug(f"Waiting for lock on {self._lock_key}")
119123
# Chill out for a bit
120-
time.sleep(.5)
124+
time.sleep(6)
121125

122126
def release(self):
123127
"""Release the lock.

matrix/common/logging.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@ def get_logger(name: str):
1010
log_level_name = os.environ['LOG_LEVEL'] if 'LOG_LEVEL' in os.environ else 'DEBUG'
1111
log_level = getattr(logging, log_level_name.upper())
1212
ch.setLevel(log_level)
13-
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s',
13+
formatter = logging.Formatter('%(asctime)s %(thread)d %(levelname)s %(name)s %(message)s',
1414
datefmt="%Y-%m-%dT%H:%M:%S%z")
1515
ch.setFormatter(formatter)
1616
logger = logging.getLogger(name)
1717
logger.handlers = []
1818
logger.addHandler(ch)
1919
logger.setLevel(logging.DEBUG)
20+
logger.propagate = False
2021
return logger
2122

2223
@staticmethod

matrix/lambdas/daemons/driver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class Driver:
1313
"""
1414
The first task in a distributed filter merge job.
1515
"""
16-
def __init__(self, request_id: str, bundles_per_worker: int=25):
16+
def __init__(self, request_id: str, bundles_per_worker: int=100):
1717
Logging.set_correlation_id(logger, value=request_id)
1818

1919
self.request_id = request_id

matrix/lambdas/daemons/mapper.py

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
1-
import os
21
import typing
32

4-
import zarr
5-
6-
from matrix.common.dss_zarr_store import DSSZarrStore
73
from matrix.common.dynamo_handler import DynamoHandler, DynamoTable, StateTableField
84
from matrix.common.lambda_handler import LambdaHandler, LambdaName
95
from matrix.common.logging import Logging
@@ -82,19 +78,13 @@ def _get_chunk_specs(bundle_fqids: typing.List[str]) -> typing.List[dict]:
8278
chunk_work_spec = []
8379
for bundle_fqid in bundle_fqids:
8480
bundle_uuid, bundle_version = bundle_fqid.split(".", 1)
85-
zarr_store = DSSZarrStore(bundle_uuid,
86-
bundle_version=bundle_version,
87-
dss_instance=os.environ['DEPLOYMENT_STAGE'])
88-
89-
root = zarr.group(store=zarr_store)
90-
91-
rows_per_chunk = root.expression.chunks[0]
92-
total_chunks = root.expression.nchunks
81+
# TODO: Not this! This is taking advantage of the fact that every
82+
# matrixable bundle in the DSS will at first have data for a single
83+
# cell. That won't be true for too long
9384

9485
chunk_work_spec.extend(
9586
[{"bundle_uuid": bundle_uuid,
9687
"bundle_version": bundle_version,
97-
"start_row": n * rows_per_chunk,
98-
"num_rows": rows_per_chunk}
99-
for n in range(total_chunks)])
88+
"start_row": 0,
89+
"num_rows": 1}])
10090
return chunk_work_spec

matrix/lambdas/daemons/worker.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import concurrent.futures
12
import math
23
import os
34
import typing
@@ -31,6 +32,7 @@ def __init__(self, request_id: str):
3132
self._input_start_rows = []
3233
self._input_end_rows = []
3334
self._num_rows = []
35+
self.zarr_group = None
3436

3537
def run(self, format: str, worker_chunk_spec: typing.List[dict]):
3638
"""Process and write one chunk of dss bundle matrix to s3 and
@@ -43,24 +45,30 @@ def run(self, format: str, worker_chunk_spec: typing.List[dict]):
4345
logger.debug(f"Worker running with parameters: worker_chunk_spec={worker_chunk_spec}, format={format}")
4446
# TO DO pass in the parameters in worker chunk spec flat
4547
self._parse_worker_chunk_spec(worker_chunk_spec)
48+
num_bundles = len(self._bundle_uuids)
4649
exp_dfs = []
4750
qc_dfs = []
48-
num_bundles = len(self._bundle_uuids)
49-
for chunk_idx in range(num_bundles):
50-
dss_zarr_store = DSSZarrStore(bundle_uuid=self._bundle_uuids[chunk_idx],
51-
bundle_version=self._bundle_versions[chunk_idx],
52-
dss_instance=self._deployment_stage)
53-
group = zarr.group(store=dss_zarr_store)
54-
exp_df, qc_df = convert_dss_zarr_root_to_subset_pandas_dfs(
55-
group, self._input_start_rows[chunk_idx], self._input_end_rows[chunk_idx])
56-
exp_dfs.append(exp_df)
57-
qc_dfs.append(qc_df)
58-
59-
# log every tertile of bundles read
60-
if any(chunk_idx == int(math.ceil((num_bundles - 1) * ((i + 1) / 3))) for i in range(2)):
61-
logger.debug(f"{chunk_idx + 1} of {len(self._bundle_uuids)} bundles successfully read from the DSS")
62-
63-
# In some test cases, dataframes aren't actually returned. Don't try to
51+
52+
# Parallelize high latency bundle reads from DSS
53+
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
54+
future_to_chunk_map = {executor.submit(self._parse_chunk_to_dataframe, chunk_idx): chunk_idx
55+
for chunk_idx in range(num_bundles)}
56+
for future in concurrent.futures.as_completed(future_to_chunk_map):
57+
chunk_idx = future_to_chunk_map[future]
58+
try:
59+
exp_df, qc_df = future.result()
60+
except Exception as e:
61+
logger.debug(f"Parsing bundle uuid {self._bundle_uuids[chunk_idx]} from DSS "
62+
f"to pandas.DataFrame caused exception {e}")
63+
raise
64+
exp_dfs.append(exp_df)
65+
qc_dfs.append(qc_df)
66+
67+
# log every tertile of bundles read
68+
if any(chunk_idx + 1 == math.ceil(num_bundles * ((i + 1) / 3)) for i in range(3)):
69+
logger.debug(f"{chunk_idx + 1} of {num_bundles} bundles successfully read from the DSS")
70+
71+
# In some test cases, empty dataframes are actually returned. Don't try to
6472
# pass those to pandas.concat
6573
if any(not df.empty for df in exp_dfs):
6674
exp_df = pandas.concat(exp_dfs, axis=0, copy=False)
@@ -81,13 +89,22 @@ def run(self, format: str, worker_chunk_spec: typing.List[dict]):
8189
if workers_and_mappers_are_complete:
8290
logger.debug("Mappers and workers are complete. Invoking reducer.")
8391

84-
s3_zarr_store.write_column_data(group)
92+
s3_zarr_store.write_column_data(self.zarr_group)
8593
reducer_payload = {
8694
"request_id": self._request_id,
8795
"format": format
8896
}
8997
self.lambda_handler.invoke(LambdaName.REDUCER, reducer_payload)
9098

99+
def _parse_chunk_to_dataframe(self, i: int):
100+
dss_zarr_store = DSSZarrStore(bundle_uuid=self._bundle_uuids[i],
101+
bundle_version=self._bundle_versions[i],
102+
dss_instance=self._deployment_stage)
103+
group = zarr.group(store=dss_zarr_store)
104+
if not self.zarr_group:
105+
self.zarr_group = group
106+
return convert_dss_zarr_root_to_subset_pandas_dfs(group, self._input_start_rows[i], self._input_end_rows[i])
107+
91108
def _parse_worker_chunk_spec(self, worker_chunk_spec: typing.List[dict]):
92109
"""Parse worker chunk spec into Worker instance variables.
93110

terraform/modules/matrix-service/lambdas/worker_lambda.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ resource "aws_lambda_function" "matrix_service_worker_lambda" {
8585
handler = "app.worker_handler"
8686
runtime = "python3.6"
8787
timeout = 900
88-
memory_size = 1500
88+
memory_size = 3000
8989

9090
environment {
9191
variables = {

tests/unit/lambdas/daemons/test_mapper.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import unittest
22
import uuid
3+
4+
import pytest
5+
36
from mock import call
47
from unittest import mock
58

@@ -99,8 +102,8 @@ def test_get_chunk_specs_ok(self, mock_dss_zarr_store, mock_zarr_group):
99102
bundle_uuid = str(uuid.uuid4())
100103
bundle_version = "version"
101104
bundle_fqids = ['.'.join([bundle_uuid, bundle_version])]
102-
chunk_size = 10
103-
nchunks = 5
105+
chunk_size = 1
106+
nchunks = 1
104107

105108
test_zarr_group = mock.Mock()
106109
test_expression_data = mock.Mock()
@@ -118,6 +121,9 @@ def test_get_chunk_specs_ok(self, mock_dss_zarr_store, mock_zarr_group):
118121
self.assertEqual(chunk_spec['start_row'], i * chunk_size)
119122
self.assertEqual(chunk_spec['num_rows'], chunk_size)
120123

124+
# TODO: Turn this back on once #95 is addressed
125+
@pytest.mark.skip(reason="Not needed while one-cell bundle assumption is in place.")
126+
@unittest.skip("Not needed while one-cell bundle assumption is in place.")
121127
@mock.patch("zarr.group")
122128
@mock.patch.object(DSSZarrStore, "__init__")
123129
def test_get_chunk_specs_no_chunks(self, mock_dss_zarr_store, mock_zarr_group):

0 commit comments

Comments
 (0)