add feature extraction

lukagerlach · lukagerlach · commit 8a83da1b4c26 · 2025-07-06T15:21:20.000+02:00
diff --git a/experiments/feature_extraction/extractor.py b/experiments/feature_extraction/extractor.py
@@ -0,0 +1,127 @@
+from enum import Enum
+import pickle
+from typing import Dict, Optional, Tuple
+import torch
+from torch.utils.data import DataLoader
+import numpy as np
+
+from models.resnet_50_base import load_pretrained_model, create_feature_extractor, MODEL_NAMES
+from pybbbc import BBBC021, constants
+
+import os
+
+
+def extract_moa_features(model_name: MODEL_NAMES, device, batch_size = 16, data_root: str = "/scratch/cv-course2025/group8", compounds: Optional[list[str]] = None) -> None:
+    """
+    Extract features for the BBBC021 dataset using a pretrained ResNet50 model.
+
+    Images are grouped by (compound, concentration, moa); each group's features are
+    averaged and pickled as a (key, mean_feature) tuple into
+    <data_root>/bbbc021_features/<model_name.value>/. Images whose MOA is the
+    string 'null' are skipped.
+
+    Args:
+        model_name: Name of the model to use. Is of type MODEL_NAMES.
+        device: Device to run the model on
+        batch_size: Batch size for data loading
+        data_root: Root directory where the BBBC021 dataset is stored.
+        compounds: List of compounds to process. If None or empty, all compounds are processed.
+
+    Raises:
+        ValueError: If `compounds` contains a name that is not in constants.COMPOUNDS.
+    """
+    # Validate the request first so bad input fails before any model is loaded.
+    if not compounds:
+        compounds = constants.COMPOUNDS
+    else:
+        for compound in compounds:
+            if compound not in constants.COMPOUNDS:
+                raise ValueError(f"Compound '{compound}' is not a valid compound. "
+                                 f"Valid compounds are: {constants.COMPOUNDS}")
+
+    # Load the pretrained model, wrap it as a feature extractor and put it in eval mode.
+    pretrained_model = load_pretrained_model(model_name)
+    feature_extractor = create_feature_extractor(pretrained_model)
+    feature_extractor = feature_extractor.to(device)
+    feature_extractor.eval()
+
+    # Create output directory with model name
+    output_dir = os.path.join(data_root, "bbbc021_features", model_name.value)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Process one compound at a time; the groups are rebuilt for every compound.
+    for compound in compounds:
+        data = BBBC021(root_path=data_root, compound=compound)
+        print(f"Processing Compound: {compound} with {len(data.images)} images")
+
+        # Images grouped by (compound, concentration, moa)
+        image_groups: Dict[Tuple[str, float, str], list[torch.Tensor]] = {}
+
+        for image, metadata in data:
+            if metadata.compound.moa == 'null':
+                print(f"Skipping image with null MOA for compound {compound}.")
+                continue
+
+            key = (metadata.compound.compound,
+                   metadata.compound.concentration,
+                   metadata.compound.moa)
+
+            # Convert numpy array to tensor if needed
+            if isinstance(image, np.ndarray):
+                image = torch.from_numpy(image).float()
+            # setdefault creates the list on first use, so a group is never empty.
+            image_groups.setdefault(key, []).append(image)
+
+        # Process each group for this compound immediately
+        for key, images in image_groups.items():
+            compound_name, concentration, moa = key
+            print(f"Extracting features for {compound_name}@{concentration}({moa}) - {len(images)} images")
+
+            # Best-effort: a failing group is reported and skipped, not fatal.
+            try:
+                dataset = torch.utils.data.TensorDataset(torch.stack(images))
+                dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+
+                all_features = []
+                with torch.no_grad():
+                    for batch in dataloader:
+                        batch_images = batch[0].to(device)
+                        features = feature_extractor(batch_images)
+                        # Flatten everything except the batch axis. A bare squeeze()
+                        # would also drop the batch axis of a 1-image batch, which
+                        # breaks the cat below or averages over the wrong dimension.
+                        features = features.flatten(start_dim=1)
+                        all_features.append(features.cpu())
+
+                # Mean over the image axis gives one feature vector per group.
+                all_features = torch.cat(all_features, dim=0)
+                avg_features = torch.mean(all_features, dim=0)
+
+                # Create result as tuple (key, feature)
+                result = (key, avg_features)
+
+                # Create filename: compound_concentration
+                filename = f"{compound_name}_{concentration}.pkl".replace(' ', '_').replace('/', '_')
+                filepath = os.path.join(output_dir, filename)
+
+                # Save to file
+                with open(filepath, 'wb') as f:
+                    pickle.dump(result, f)
+
+                print(f"Saved features to {filepath}")
+
+            except Exception as e:
+                print(f"Error processing group {compound_name}_{concentration}: {e}. Skipping...")
+                continue
+
+
+# main function to run the feature extraction
+if __name__ == "__main__":
+    # Expose only GPU index 1 to CUDA; set before CUDA availability is queried below.
+    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
+    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    extract_moa_features(
+        model_name=MODEL_NAMES.BASE_RESNET,
+        device=run_device,
+        batch_size=16, data_root="/scratch/cv-course2025/group8",
+        compounds=constants.COMPOUNDS)
diff --git a/models/resnet_50_base.py b/models/resnet_50_base.py
@@ -1,7 +1,23 @@
+from enum import Enum
 from torchvision import models
 import torch.nn as nn
 import torch
 
+class MODEL_NAMES(Enum):  # Model identifiers; the feature extractor uses each value as its output sub-directory name.
+    BASE_RESNET = "base_resnet"  # loaded via load_pretrained_resnet50(weights="IMAGENET1K_V2")
+    SIMCLR = "resnet_simclr"  # loaded from custom weights requested under the name "resnet50_simclr"
+    DINO = "resnet_wsdino"  # NOTE(review): load_pretrained_model has no loader branch for this member yet
+
+def load_pretrained_model(model_name: MODEL_NAMES, weight_path='/scratch/cv-course2025/group8/model_weights'):
+    """Load the ResNet50 selected by `model_name`; raise ValueError if it has no loader."""
+    if model_name == MODEL_NAMES.BASE_RESNET:
+        # weight_path is not passed to this loader; only the weights tag is.
+        return load_pretrained_resnet50(weights="IMAGENET1K_V2")
+    if model_name == MODEL_NAMES.SIMCLR:
+        # Custom weights are looked up under weight_path by the helper.
+        return load_pretrained_model_from_weights("resnet50_simclr", weight_path)
+    raise ValueError(f"No loader implemented for model '{model_name}'")  # was an implicit None
+
 def load_pretrained_resnet50(weights: str = "IMAGENET1K_V2") -> object:
     """Load pretrained ResNet50 model.
     
@@ -26,7 +42,30 @@ def load_pretrained_resnet50(weights: str = "IMAGENET1K_V2") -> object:
     pretrained_model.eval()
     return pretrained_model
 
-
+def load_pretrained_model_from_weights(model_name: str, weight_path: str) -> nn.Module:
+    """Load a ResNet50 whose weights come from a custom checkpoint file.
+
+    Args:
+        model_name: Checkpoint base name without ".pth" (a str, or an Enum member whose value is used)
+        weight_path: Directory that contains the "<model_name>.pth" file
+
+    Returns:
+        nn.Module: ResNet50 with the checkpoint loaded, switched to eval mode
+
+    Raises:
+        ValueError: If the checkpoint file does not exist
+    """
+    # TODO: Test this after we trained models
+    # Callers pass a plain str, so `.value` must not be assumed to exist.
+    name = getattr(model_name, "value", model_name)
+    print(f"Loading pretrained ResNet50 from {weight_path}...")
+    pretrained_model = models.resnet50(weights=None)
+    try:
+        pretrained_model.load_state_dict(torch.load(f"{weight_path}/{name}.pth"))
+    except FileNotFoundError as err:
+        raise ValueError(f"Weight file '{name}.pth' not found in '{weight_path}'") from err
+    pretrained_model.eval()
+    return pretrained_model
 
 def create_feature_extractor(pretrained_model: nn.Module) -> nn.Module:
     """Create a feature extractor from a pretrained ResNet50 model.