-
Notifications
You must be signed in to change notification settings - Fork 6
Open
Labels
Labels: bug — Something isn't working; enhancement — New feature or request
Description
When running this code:
convert(
"my_sqlite_file.sqlite"
preset=preset,
joins=joins,
chunk_size=500,
dest_datatype=dest_datatype,
dest_path="my_parquet_file.parquet"
)

I get the following error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = convert(
2 "../../data/NF0014/cellprofiler_middle_slice_output/C11-2/gff_extracted_features.sqlite",
3 preset=preset,
4 joins=joins,
5 chunk_size=500,
6 dest_datatype=dest_datatype,
7 dest_path="../../data/NF0014/0.converted/C11-2.parquet"
8 )
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1636, in convert(source_path, dest_path, dest_datatype, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, data_type_cast_map, add_tablenumber, page_keys, sort_output, preset, parsl_config, **kwargs)
1634 # send sources to be written to parquet if selected
1635 if dest_datatype == "parquet":
-> 1636 output = _to_parquet(
1637 source_path=source_path,
1638 dest_path=dest_path,
1639 source_datatype=source_datatype,
1640 metadata=metadata,
1641 compartments=compartments,
1642 identifying_columns=identifying_columns,
1643 concat=concat,
1644 join=join,
1645 joins=joins,
1646 chunk_size=chunk_size,
1647 infer_common_schema=infer_common_schema,
1648 drop_null=drop_null,
1649 data_type_cast_map=data_type_cast_map,
1650 add_tablenumber=add_tablenumber,
1651 sort_output=sort_output,
1652 page_keys=cast(dict, page_keys),
1653 **kwargs,
1654 )
1656 # cleanup Parsl executor and related
1657 parsl.dfk().cleanup()
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1437, in _to_parquet(source_path, dest_path, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, sort_output, page_keys, data_type_cast_map, add_tablenumber, **kwargs)
1434 return evaluate_futures(join_sources_result)
1436 # wrap the final result as a future and return
-> 1437 return evaluate_futures(results)
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/utils.py:627, in evaluate_futures(sources)
590 def evaluate_futures(
591 sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str],
592 ) -> Any:
593 """
594 Evaluates any Parsl futures for use within other tasks.
595 This enables a pattern of Parsl app usage as "tasks" and delayed
(...)
607 A data structure which includes evaluated futures where they were found.
608 """
610 return (
611 {
612 source_group_name: [
613 # unwrap sources into future results
614 _unwrap_source(source)
615 for source in (
616 source_group_vals.result()
617 # if we have a future, return the result
618 if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
619 # otherwise return the value
620 else source_group_vals
621 )
622 ]
623 for source_group_name, source_group_vals in sources.items()
624 # if we have a dict, use the above, otherwise unwrap the value in case of future
625 }
626 if isinstance(sources, dict)
--> 627 else _unwrap_value(sources)
628 )
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/utils.py:553, in _unwrap_value(val)
551 # if we have a future value, evaluate the result
552 if isinstance(val, parsl.dataflow.futures.AppFuture):
--> 553 return val.result()
554 elif isinstance(val, list):
555 # if we have a list of futures, return the results
556 if isinstance(val[0], parsl.dataflow.futures.AppFuture):
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/concurrent/futures/_base.py:458, in Future.result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/dataflow/dflow.py:339, in DataFlowKernel.handle_exec_update(self, task_record, future)
336 raise InternalConsistencyError("done callback called, despite future not reporting itself as done")
338 try:
--> 339 res = self._unwrap_remote_exception_wrapper(future)
341 except Exception as e:
342 logger.info(f"Task {task_id} try {task_record['try_id']} failed with exception of type {type(e).__name__}")
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/dataflow/dflow.py:609, in DataFlowKernel._unwrap_remote_exception_wrapper(future)
607 result = future.result()
608 if isinstance(result, RemoteExceptionWrapper):
--> 609 result.reraise()
610 return result
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/app/errors.py:117, in RemoteExceptionWrapper.reraise(self)
113 logger.debug("Reraising exception of type {}".format(self.e_type))
115 v = self.get_exception()
--> 117 raise v
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/parsl/app/errors.py:141, in wrapper()
139 from parsl.app.errors import RemoteExceptionWrapper
140 try:
--> 141 return func(*args, **kwargs)
142 except Exception:
143 return RemoteExceptionWrapper(*sys.exc_info())
File ~/miniforge3/envs/gff_preprocessing_env/lib/python3.10/site-packages/cytotable/convert.py:1021, in _concat_join_sources()
1016 shutil.rmtree(path=dest_path)
1018 # build a parquet file writer which will be used to append files
1019 # as a single concatted parquet file, referencing the first file's schema
1020 # (all must be the same schema)
-> 1021 writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
1022 CYTOTABLE_DEFAULT_PARQUET_METADATA
1023 )
1024 with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
1025 for table_path in (
1026 join_sources
1027 if not sort_output
1028 else _natural_sort(list_to_sort=join_sources)
1029 ):
IndexError: list index out of range

With the help of @jenna-tomkinson we determined that the root cause of this error is a mismatch between cytoplasm compartments and nuclear compartments. This occurs in CellProfiler when I import independently derived segmentation masks. When there is zero overlap, the compartments have no relation to one another, and thus no parquet file can be output :/.
A simple fix on my end is to wrap the call in a try/except block and skip these cases, but perhaps CytoTable could catch this condition and raise a more informative exception?
Please let me know your thoughts, as more and more of us are using externally derived segmentations.
jenna-tomkinson
Metadata
Metadata
Assignees
Labels
Labels: bug — Something isn't working; enhancement — New feature or request