Skip to content

Commit 7ade9cb

Browse files
committed
enables bagging and parallel training
1 parent a5e64ac commit 7ade9cb

File tree

2 files changed

+94
-36
lines changed

2 files changed

+94
-36
lines changed

chebai/preprocessing/datasets/base.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from chebai.preprocessing import reader as dr
2121

22-
import extras.adamh as f
22+
from extras.adamh import Ensemble_loader
2323

2424

2525
class XYBaseDataModule(LightningDataModule):
@@ -723,6 +723,8 @@ class _DynamicDataset(XYBaseDataModule, ABC):
723723

724724
def __init__(
725725
self,
726+
ensemble: bool,
727+
load_path: str,
726728
**kwargs,
727729
):
728730
super(_DynamicDataset, self).__init__(**kwargs)
@@ -731,6 +733,7 @@ def __init__(
731733
self._dynamic_df_train = None
732734
self._dynamic_df_test = None
733735
self._dynamic_df_val = None
736+
self.loader= Ensemble_loader(ensemble=ensemble,load_path=load_path)
734737
# Path of csv file which contains a list of ids & their assignment to a dataset (either train,
735738
# validation or test).
736739
self.splits_file_path = self._validate_splits_file_path(
@@ -1182,11 +1185,20 @@ def load_processed_data(
11821185
data_df = self.dynamic_split_dfs[kind]
11831186
data = data_df.to_dict(orient="records")
11841187
if kind == "train" :
1185-
# f.init_weights()
1186-
data = f.add_train_weights(data)
1188+
1189+
if self.loader.ensemble:
1190+
data = self.loader.add_val_weights(data)
1191+
1192+
data = self.loader.add_duplicates(data,self.loader.load_path)
1193+
1194+
else:
1195+
data = self.loader.add_train_weights(data,self.loader.load_path)
1196+
exit()
11871197
if kind == "validation" :
1188-
data = f.add_val_weights(data)
1189-
# torch.save(data,"gewicht.pt")
1198+
data = self.loader.add_val_weights(data)
1199+
1200+
1201+
11901202

11911203
return data
11921204

extras/adamh.py

Lines changed: 77 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,68 @@
33
import numpy
44

55

6-
train = 0
76

7+
class Ensemble_loader():
    """Attach per-sample ensemble weights to preprocessed data records.

    ``ensemble=True`` selects bagging (uniform weights plus bootstrap
    duplication of samples); ``ensemble=False`` selects boosting (weights
    loaded from ``load_path``) — inferred from the caller's branching,
    TODO confirm.
    """

    def __init__(
        self,
        # True: bagging, False: boosting
        ensemble: bool,
        load_path: str,
    ):
        self.ensemble = ensemble
        self.load_path = load_path

    def add_train_weights(self, ids, load_path):
        """Attach a stored weight vector to every training record.

        ``load_path`` points to a torch-saved dict mapping ``str(ident)``
        to a weight list. Mutates ``ids`` in place and returns it.
        """
        # weights_only=False: the file stores a plain Python dict, not tensors.
        # NOTE(review): this unpickles arbitrary objects — only load trusted files.
        weights = torch.load(load_path, weights_only=False)
        for record in ids:
            record["weight"] = weights[str(record["ident"])]
        return ids

    def add_val_weights(self, ids):
        """Attach a uniform all-ones weight vector to every record.

        1528 is presumably the number of target classes — TODO confirm.
        Mutates ``ids`` in place and returns it.
        """
        for record in ids:
            record["weight"] = [1] * 1528
        return ids

    # Inverse of the dict created by the bootstrapping method in sample.py.
    def add_duplicates(self, data, load_path):
        """Replicate records according to stored bootstrap draw counts.

        ``load_path`` points to a torch-saved dict mapping ``str(ident)`` to
        how often that sample was drawn; a record drawn r times ends up r
        times in the list. Appended duplicates are the *same* dict objects
        (shared references). Mutates ``data`` in place and returns it.
        """
        counts = torch.load(load_path, weights_only=False)
        # Iterate only over the original length; appends grow the list.
        for i in range(len(data)):
            drawn = counts[str(data[i]["ident"])]
            if drawn > 1:
                data.extend([data[i]] * (drawn - 1))
        return data
54+
55+
56+
def create_data_weights(batchsize: int, dim: int, weights: dict[str, list[float]], idents: tuple[int, ...]) -> torch.Tensor:
    """Stack per-sample weight vectors into one float32 tensor.

    Args:
        batchsize: unused — kept for interface compatibility with callers.
        dim: unused — kept for interface compatibility with callers.
        weights: maps ``str(ident)`` to that sample's weight list.
        idents: sample identifiers, in batch order.

    Returns:
        A ``(len(idents), D)`` float32 tensor on CUDA when available,
        else CPU; ``None`` for empty ``idents`` (matches prior behavior).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if not idents:
        return None
    # Build once instead of repeated torch.cat in a loop (was O(n^2)).
    rows = [weights[str(ident)] for ident in idents]
    # dtype fixed to float32 to match the previous torch.Tensor(...) default.
    return torch.tensor(rows, dtype=torch.float32, device=device)
868

969
def create_weight(path_to_split="/home/programmer/Bachelorarbeit/split/splits.csv"):
1070
weights = {}
@@ -21,7 +81,7 @@ def create_weight(path_to_split="/home/programmer/Bachelorarbeit/split/splits.cs
2181
torch.save(weights,"/home/programmer/Bachelorarbeit/weights/init_mh.pt")
2282

2383

24-
84+
#for 1_ada_no_normal_weights weights =0.0001
2585
def new_create_weight(path_to_split="/home/programmer/Bachelorarbeit/split/splits.csv"):
2686
weights = {}
2787
with open(path_to_split, 'r') as csvfile:
@@ -30,44 +90,30 @@ def new_create_weight(path_to_split="/home/programmer/Bachelorarbeit/split/split
3090
for row in reader:
3191
if (row[1] == "train") and i > 0:
3292
# print(row[0])
33-
weights[row[0]] = [1/(1528 * 160715)]* 1528
93+
weights[row[0]] = [1/(1528*160715)]* 1528
3494
# print(row[0])
3595
i = i + 1
3696
print(len(weights))
3797
torch.save(weights, "/home/programmer/Bachelorarbeit/weights/init_mh.pt")
3898

3999

40-
def add_train_weights(ids):
41-
d = torch.load("/home/programmer/Bachelorarbeit/weights/init_mh.pt",weights_only=False)
42-
it = 0
43-
for i in ids:
44-
if it % 10000 == 0:
45-
print(it)
46-
ident = i["ident"]
47-
i["weight"] = d[str(ident)]
48-
it = it + 1
49-
return ids
50100

51-
def add_val_weights(ids):
52-
for i in ids:
53-
weight = 1
54-
#i["weight"] = torch.full((1,1528),1)
55-
i["weight"] = [1]*1528
56101

57-
return ids
58102

59-
def create_data_weights(batchsize:int,dim:int,weights:dict[str,list[float,...]],idents:tuple[int,...])-> torch.tensor:
60-
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
61-
weight = None
62-
index = 0
63-
for i in idents:
64-
w = torch.Tensor([weights[str(i)],]).to(device)
65-
if weight == None:
66-
weight = w
67-
else:
68-
weight = torch.cat((weight,w),0)
69-
index = index + 1
70-
return weight
103+
104+
105+
106+
107+
108+
109+
110+
111+
112+
113+
114+
115+
116+
71117

72118
#new_create_weight()
73119
#create_weight()

0 commit comments

Comments
 (0)