Skip to content

Error when a cell-nucleus pair does not exist: IndexError: list index out of range #339

@MikeLippincott

Description

@MikeLippincott

When running this code:

convert(
    "my_sqlite_file.sqlite",
    preset=preset,
    joins=joins,
    chunk_size=500,
    dest_datatype=dest_datatype,
    dest_path="my_parquet_file.parquet"
)

I get the following error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = convert(
      2     "../../data/NF0014/cellprofiler_middle_slice_output/C11-2/gff_extracted_features.sqlite",
      3     preset=preset,
      4     joins=joins,
      5     chunk_size=500,
      6     dest_datatype=dest_datatype,
      7     dest_path="../../data/NF0014/0.converted/C11-2.parquet"
      8 )

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1636, in convert(source_path, dest_path, dest_datatype, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, data_type_cast_map, add_tablenumber, page_keys, sort_output, preset, parsl_config, **kwargs)
   1634 # send sources to be written to parquet if selected
   1635 if dest_datatype == "parquet":
-> 1636     output = _to_parquet(
   1637         source_path=source_path,
   1638         dest_path=dest_path,
   1639         source_datatype=source_datatype,
   1640         metadata=metadata,
   1641         compartments=compartments,
   1642         identifying_columns=identifying_columns,
   1643         concat=concat,
   1644         join=join,
   1645         joins=joins,
   1646         chunk_size=chunk_size,
   1647         infer_common_schema=infer_common_schema,
   1648         drop_null=drop_null,
   1649         data_type_cast_map=data_type_cast_map,
   1650         add_tablenumber=add_tablenumber,
   1651         sort_output=sort_output,
   1652         page_keys=cast(dict, page_keys),
   1653         **kwargs,
   1654     )
   1656 # cleanup Parsl executor and related
   1657 parsl.dfk().cleanup()

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1437, in _to_parquet(source_path, dest_path, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, sort_output, page_keys, data_type_cast_map, add_tablenumber, **kwargs)
   1434         return evaluate_futures(join_sources_result)
   1436 # wrap the final result as a future and return
-> 1437 return evaluate_futures(results)

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/utils.py:627, in evaluate_futures(sources)
    590 def evaluate_futures(
    591     sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str],
    592 ) -> Any:
    593     """
    594     Evaluates any Parsl futures for use within other tasks.
    595     This enables a pattern of Parsl app usage as "tasks" and delayed
   (...)
    607             A data structure which includes evaluated futures where they were found.
    608     """
    610     return (
    611         {
    612             source_group_name: [
    613                 # unwrap sources into future results
    614                 _unwrap_source(source)
    615                 for source in (
    616                     source_group_vals.result()
    617                     # if we have a future, return the result
    618                     if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
    619                     # otherwise return the value
    620                     else source_group_vals
    621                 )
    622             ]
    623             for source_group_name, source_group_vals in sources.items()
    624             # if we have a dict, use the above, otherwise unwrap the value in case of future
    625         }
    626         if isinstance(sources, dict)
--> 627         else _unwrap_value(sources)
    628     )

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/utils.py:553, in _unwrap_value(val)
    551 # if we have a future value, evaluate the result
    552 if isinstance(val, parsl.dataflow.futures.AppFuture):
--> 553     return val.result()
    554 elif isinstance(val, list):
    555     # if we have a list of futures, return the results
    556     if isinstance(val[0], parsl.dataflow.futures.AppFuture):

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/concurrent/futures/_base.py:458, in Future.result(self, timeout)
    456     raise CancelledError()
    457 elif self._state == FINISHED:
--> 458     return self.__get_result()
    459 else:
    460     raise TimeoutError()

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
    401 if self._exception:
    402     try:
--> 403         raise self._exception
    404     finally:
    405         # Break a reference cycle with the exception in self._exception
    406         self = None

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/dataflow/dflow.py:339, in DataFlowKernel.handle_exec_update(self, task_record, future)
    336     raise InternalConsistencyError("done callback called, despite future not reporting itself as done")
    338 try:
--> 339     res = self._unwrap_remote_exception_wrapper(future)
    341 except Exception as e:
    342     logger.info(f"Task {task_id} try {task_record['try_id']} failed with exception of type {type(e).__name__}")

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/dataflow/dflow.py:609, in DataFlowKernel._unwrap_remote_exception_wrapper(future)
    607 result = future.result()
    608 if isinstance(result, RemoteExceptionWrapper):
--> 609     result.reraise()
    610 return result

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/app/errors.py:117, in RemoteExceptionWrapper.reraise(self)
    113 logger.debug("Reraising exception of type {}".format(self.e_type))
    115 v = self.get_exception()
--> 117 raise v

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/app/errors.py:141, in wrapper()
    139 from parsl.app.errors import RemoteExceptionWrapper
    140 try:
--> 141     return func(*args, **kwargs)
    142 except Exception:
    143     return RemoteExceptionWrapper(*sys.exc_info())

File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1021, in _concat_join_sources()
   1016     shutil.rmtree(path=dest_path)
   1018 # build a parquet file writer which will be used to append files
   1019 # as a single concatted parquet file, referencing the first file's schema
   1020 # (all must be the same schema)
-> 1021 writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
   1022     CYTOTABLE_DEFAULT_PARQUET_METADATA
   1023 )
   1024 with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
   1025     for table_path in (
   1026         join_sources
   1027         if not sort_output
   1028         else _natural_sort(list_to_sort=join_sources)
   1029     ):

IndexError: list index out of range

With the help of @jenna-tomkinson we determined that the root cause of this error is a mismatch between cytoplasm compartments and nuclear compartments. This occurs in CellProfiler when I import independently derived segmentation masks: when there is zero overlap, no relationship between compartments exists, and thus no parquet file can be output.

A simple fix on my end is to wrap the call in a try/except block and continue on for these cases, but perhaps CytoTable could catch this error internally and report a more descriptive exception?

Please let me know thoughts about this as more and more of us are using external segmentations.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working), enhancement (New feature or request)

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions