initial attempt to work with pydra 0.23+

satra · satra · commit fdf2d5efe09b · 2023-10-22T10:29:37.000-04:00
diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py
@@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         messengers=FileMessenger(),
         messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
     )
-    wf.split(["clf_info", "permute"])
+    wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])
     wf.add(
         read_file_pdt(
             name="readcsv",
@@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
             permute=wf.lzin.permute,
         )
     )
-    wf.fit_clf.split("split_index")
+    wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)
     wf.add(
         calc_metric_pdt(
             name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics
diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -1,5 +1,14 @@
 #!/usr/bin/env python
 
+import cloudpickle as cp
+from pydra.utils.hash import Cache, register_serializer
+from sklearn.pipeline import Pipeline
+
+
+@register_serializer
+def bytes_repr_Pipeline(obj: Pipeline, cache: Cache):
+    yield cp.dumps(obj)  # dumps() returns bytes; dump(obj, file) needs a file handle
+
 
 def read_file(filename, x_indices=None, target_vars=None, group=None):
     """Read a CSV data file
@@ -126,7 +135,27 @@ def calc_metric(output, metrics):
     return score, output
 
 
-def get_feature_importance(permute, model, gen_feature_importance=True):
+def get_feature_importance(
+    *,
+    permute: bool,
+    model: tuple[Pipeline, list, list],
+    gen_feature_importance: bool = True,
+):
+    """Compute feature importance for the model
+
+    Parameters
+    ----------
+    permute : bool
+        Whether or not to run the model in permuted mode
+    model : tuple(sklearn.pipeline.Pipeline, list, list)
+        The model to compute feature importance for
+    gen_feature_importance : bool
+        Whether or not to generate the feature importance
+    Returns
+    -------
+    list
+        List of feature importance
+    """
     if permute or not gen_feature_importance:
         return []
     pipeline, train_index, test_index = model
@@ -172,7 +201,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True):
                 pipeline_steps.coefs_
                 pipeline_steps.coef_
 
-                Please add correct method in tasks.py or if inexistent,
+                Please add correct method in tasks.py or if non-existent,
                 set gen_feature_importance to false in the spec file.
 
                 This is the error that was returned by sklearn:\n\t{e}\n
diff --git a/setup.cfg b/setup.cfg
@@ -26,7 +26,7 @@ classifiers =
 [options]
 python_requires = >= 3.8
 install_requires =
-    pydra == 0.22.0
+    pydra >= 0.23.0-alpha
     psutil
     scikit-learn
     seaborn

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):`
`71`	`71`	`messengers=FileMessenger(),`
`72`	`72`	`messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},`
`73`	`73`	`)`
`74`		`- wf.split(["clf_info", "permute"])`
	`74`	`+ wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])`
`75`	`75`	`wf.add(`
`76`	`76`	`read_file_pdt(`
`77`	`77`	`name="readcsv",`
`@@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):`
`102`	`102`	`permute=wf.lzin.permute,`
`103`	`103`	`)`
`104`	`104`	`)`
`105`		`- wf.fit_clf.split("split_index")`
	`105`	`+ wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)`
`106`	`106`	`wf.add(`
`107`	`107`	`calc_metric_pdt(`
`108`	`108`	`name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics`