Skip to content

Commit 4e7900a

Browse files
authored
Merge pull request #38 from LorenzLamm/dataloading_adjustments
Dataloading adjustments
2 parents 7d986f8 + 0ec47e9 commit 4e7900a

7 files changed

Lines changed: 60 additions & 22 deletions

File tree

src/membrain_pick/cli/predict_cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ def predict(
     verbose: bool = Option(
         True, help="Should the prediction progress bar be printed?"
     ), # noqa: B008
+    num_workers: int = Option(
+        None, help="Number of workers for the DataLoader."
+    ), # noqa: B008
 ):
     """Predict the output of the trained model on the given data.

@@ -80,6 +83,7 @@ def predict(
         mean_shift_score_threshold=mean_shift_score_threshold,
         mean_shift_device=mean_shift_device,
         verbose=verbose,
+        num_workers=num_workers,
     )

src/membrain_pick/cli/train_cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def train(
     verbose: bool = Option(
         True, help="Should the training progress bar be printed?"
     ), # noqa: B008
+    num_workers: int = Option(
+        None, help="Number of workers for the DataLoader."
+    ), # noqa: B008
 ):
     """Train a diffusion net model.

@@ -63,6 +66,7 @@ def train(
         mean_shift_output=False,
         max_epochs=max_epochs,
         verbose=verbose,
+        num_workers=num_workers,
     )

@@ -126,6 +130,9 @@ def train_advanced(
     verbose: bool = Option(
         True, help="Should the training progress bar be printed?"
     ), # noqa: B008
+    num_workers: int = Option(
+        None, help="Number of workers for the DataLoader."
+    ), # noqa: B008
 ):
     """Train a diffusion net model.

@@ -163,4 +170,5 @@ def train_advanced(
         mean_shift_margin=mean_shift_margin,
         max_epochs=max_epochs,
         verbose=verbose,
+        num_workers=num_workers,
     )

src/membrain_pick/dataloading/diffusionnet_datamodule.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,32 +9,24 @@
 def custom_collate(batch):
     """Custom collate function to handle a complex data structure.

-    Each sample is a dictionary containing numpy arrays and another dictionary
-    with sparse matrices. Since we're using a batch size of 1, this function
-    simplifies the handling of these structures.
-
     Args:
-        batch: A list of samples, where each sample is the complex data structure
-            described above.
+        batch: A list of samples, where each sample is the complex data structure.

     Returns:
         Processed batch ready for model input.
     """
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # Unpack the single sample from the batch
     sample = batch[0]
-    # Initialize a new dictionary to store the processed sample
     processed_sample = {}

     for key, value in sample.items():
         if isinstance(value, np.ndarray):
             # Convert numpy arrays to tensors
-            processed_sample[key] = torch.tensor(value).to(device)
+            processed_sample[key] = torch.tensor(value)
         elif isinstance(value, dict):
-            # For the nested dictionary, we assume it contains sparse matrices
-            # and pass it through directly without modifications
+            # For the nested dictionary, directly pass it through without GPU operations
             processed_sample[key] = {
-                subkey: subvalue.to(device) for subkey, subvalue in value.items()
+                subkey: subvalue for subkey, subvalue in value.items()
             }
         else:
             # Directly pass through any other types of values

src/membrain_pick/networks/diffusion_net/utils.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,21 @@ def random_rotate_points_y(pts):
 # Numpy things


-# Numpy sparse matrix to pytorch
 def sparse_np_to_torch(A):
-    Acoo = A.tocoo()
-    values = Acoo.data
-    indices = np.vstack((Acoo.row, Acoo.col))
+    """
+    Converts a numpy sparse matrix to a PyTorch sparse tensor.
+
+    Args:
+        A: A scipy sparse matrix (e.g., COO, CSR).
+
+    Returns:
+        PyTorch sparse tensor.
+    """
+    Acoo = A.tocoo()  # Convert to COO format if not already
+    values = torch.tensor(Acoo.data, dtype=torch.float32)
+    indices = torch.tensor(np.vstack((Acoo.row, Acoo.col)), dtype=torch.int64)
     shape = Acoo.shape
-    return torch.sparse.FloatTensor(
-        torch.LongTensor(indices), torch.FloatTensor(values), torch.Size(shape)
-    ).coalesce()
+    return torch.sparse_coo_tensor(indices, values, torch.Size(shape)).coalesce()


 # Pytorch sparse to numpy csc matrix

src/membrain_pick/optimization/diffusion_training_pylit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def forward(self, batch):
         return out

     def configure_optimizers(self):
-        optimizer = Adam(self.parameters(), lr=1e-3)
+        optimizer = Adam(self.parameters(), lr=1e-3 * 5)
         scheduler = {
             "scheduler": LambdaLR(
                 optimizer, lr_lambda=lambda epoch: (1 - epoch / self.max_epochs) ** 0.9

src/membrain_pick/predict.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
 from membrain_pick.dataloading.diffusionnet_datamodule import (
     MemSegDiffusionNetDataModule,
 )
+from membrain_pick.train import get_optimal_num_workers
 from membrain_pick.optimization.diffusion_training_pylit import DiffusionNetModule

 from membrain_pick.dataloading.data_utils import (
@@ -99,6 +100,7 @@ def predict(
     # mean_shift_device: str = "cuda:0",
     mean_shift_device: str = "cpu",
     verbose: bool = True,
+    num_workers: int = None,
 ):
     """Predict the output of the trained model on the given data.

@@ -120,7 +122,9 @@ def predict(
         k_eig=k_eig,
         batch_size=1,
         force_recompute=force_recompute_partitioning,
-        num_workers=0,
+        num_workers=(
+            num_workers if num_workers is not None else get_optimal_num_workers()
+        ),
         pin_memory=False,
         overfit=False,
     )
@@ -164,6 +168,13 @@ def predict(
     outputs = []
     for i in range(all_diffusion_feature.shape[1] - 15):
         batch["diffusion_inputs"]["features"] = all_diffusion_feature[:, i : i + 16]
+        # put the batch on the device
+        for key in batch:
+            if isinstance(batch[key], torch.Tensor):
+                batch[key] = batch[key].to(device)
+            elif isinstance(batch[key], dict):
+                for sub_key in batch[key]:
+                    batch[key][sub_key] = batch[key][sub_key].to(device)
         with torch.no_grad():
             output = model(batch)
         outputs.append(output["mse"].squeeze().detach().cpu().numpy())

src/membrain_pick/train.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@
 from membrain_pick.optimization.diffusion_training_pylit import DiffusionNetModule


+def get_optimal_num_workers():
+    """
+    Dynamically determine an optimal number of DataLoader workers.
+
+    Returns:
+        int: Recommended number of workers.
+    """
+    cpu_count = os.cpu_count()
+    if not cpu_count:
+        return 0  # Fallback if CPU count is unavailable
+    return min(cpu_count // 2, 16)
+
+
 def train(
     data_dir: str,
     training_dir: str = "./training_output",
@@ -43,6 +56,7 @@ def train(
     # Training parameters
     max_epochs: int = 1000,
     verbose: bool = True,
+    num_workers: int = None,
 ):

     train_path = os.path.join(data_dir, "train")
@@ -69,7 +83,9 @@ def train(
         position_tokens=position_tokens,
         k_eig=k_eig,
         batch_size=1,
-        num_workers=0,
+        num_workers=(
+            num_workers if num_workers is not None else get_optimal_num_workers()
+        ),
         pin_memory=False,
     )
     data_module.setup()
@@ -135,6 +151,7 @@ def on_epoch_start(self, trainer, pl_module):
         ],
         max_epochs=max_epochs,
         enable_progress_bar=verbose,
+        accumulate_grad_batches=16,
     )

     # Start the training process

0 commit comments

Comments (0)