
Commit 9751079

Lucio Anderlini authored and committed
Calorimeter model should be optional.
1 parent 9e64cfc commit 9751079

9 files changed

Lines changed: 298 additions & 238 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 /dist/
 **.egg-info
 **/__pycache__/
+test_data/**

PyLamarr/LHCb/Photons/PyPhotons.py

Lines changed: 165 additions & 149 deletions
@@ -1,4 +1,7 @@
-import numpy as np
+import os.path
+from typing import Union
+from dataclasses import dataclass
+import numpy as np
 import pandas as pd
 import tensorflow as tf
 
@@ -19,154 +22,167 @@
 
 
 
-def invertColumnTransformer(column_transformer, preprocessed_X):
-    from sklearn.compose import ColumnTransformer
-    assert isinstance(column_transformer, ColumnTransformer)
 
-    iCol = 0
-    postprocessed_split = dict()
-    for name, algo, cols in column_transformer.transformers_:
-        preprocessed_cols = list()
-        for _ in range(len(cols)):
-            preprocessed_cols.append(preprocessed_X[:, iCol][:, None])
-            iCol += 1
-        preprocessed_block = np.concatenate(preprocessed_cols, axis=1)
-        if algo == "passthrough":
-            postprocessed_split[name] = preprocessed_block
+@dataclass(frozen=True)
+class PyPhotons:
+    efficiency_model: Union[str, None]
+    resolution_model: Union[str, None]
+
+    def invertColumnTransformer(self, column_transformer, preprocessed_X):
+        from sklearn.compose import ColumnTransformer
+        assert isinstance(column_transformer, ColumnTransformer)
+
+        iCol = 0
+        postprocessed_split = dict()
+        for name, algo, cols in column_transformer.transformers_:
+            preprocessed_cols = list()
+            for _ in range(len(cols)):
+                preprocessed_cols.append(preprocessed_X[:, iCol][:, None])
+                iCol += 1
+            preprocessed_block = np.concatenate(preprocessed_cols, axis=1)
+            if algo == "passthrough":
+                postprocessed_split[name] = preprocessed_block
+            else:
+                postprocessed_split[name] = algo.inverse_transform(preprocessed_block)
+
+        X = [None] * preprocessed_X.shape[1]
+        for name, _, cols in column_transformer.transformers_:
+            for i, iCol in enumerate(cols):
+                X[iCol] = postprocessed_split[name][:, i][:, None]
+
+        return np.concatenate(X, axis=1)
+
+    def _eval_efficiency(self, X):
+        efficiency_model = tf.keras.models.load_model(self.efficiency_model)
+        with open(os.path.join(self.efficiency_model, 'tX.pkl'), 'rb') as tx_file:
+            tX = pickle.load(tx_file)
+
+        y_hat = efficiency_model.predict(tX.transform(X), batch_size=10000, verbose=0)
+
+        return y_hat
+
+    def _eval_smearing(self, X):
+        smearing_model = tf.keras.models.load_model(self.resolution_model)
+        with open(os.path.join(self.resolution_model, 'tX.pkl'), 'rb') as tx_file:
+            tX = pickle.load(tx_file)
+
+        with open(os.path.join(self.resolution_model, 'tY.pkl'), 'rb') as ty_file:
+            tY = pickle.load(ty_file)
+
+        prep_x = tX.transform(X)
+        n_entries, _ = X.shape
+        prep_y_hat = smearing_model.predict(
+            np.c_[prep_x, np.random.normal(0, 1, (n_entries, 64))],
+            verbose=0,
+            batch_size=10000
+        )
+
+        ret = self.invertColumnTransformer(tY, prep_y_hat)
+        return ret
+
+    @PyLamarr.method
+    def __call__(self, db):
+        gen_photons = pd.read_sql_query("""
+            SELECT gev.datasource_id AS event_id, p.*, v.*
+            FROM MCParticles AS p
+            JOIN MCVertices AS v
+                ON p.production_vertex == v.mcvertex_id
+                AND p.genevent_id == v.genevent_id
+            JOIN GenEvents AS gev
+                ON gev.genevent_id = p.genevent_id
+            WHERE
+                pid = 22
+            AND
+                p.pe > 1000
+            AND
+                abs(v.x) < 200
+            AND
+                abs(v.y) < 200
+            AND
+                abs(v.z) < 2000
+            """, db)
+
+        event_id = gen_photons.event_id.values.astype(np.int64)
+        mcparticle_id = gen_photons.mcparticle_id.values.astype(np.int32)
+        ovx, ovy, ovz = gen_photons[['x', 'y', 'z']].values.astype(np.float64).T
+        e, px, py, pz = gen_photons[['pe', 'px', 'py', 'pz']].values.astype(np.float64).T
+
+        print (e)
+
+        tx = px/pz
+        ty = py/pz
+
+        ecal_z = 12689. # mm
+        ecal_x = ovx + tx * (ecal_z - ovz)
+        ecal_y = ovy + ty * (ecal_z - ovz)
+
+        mask = (
+            (ecal_x > -4e3) & (ecal_x < 4e3) &
+            (ecal_y > -4e3) & (ecal_y < 4e3) &
+            (tx > -0.35) & (tx < 0.35) &
+            (ty > -0.25) & (ty < 0.25) &
+            (pz > 0) & (pz < 200e3)
+        )
+
+        X_eff = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x)]
+
+
+        if self.efficiency_model not in [None, "None", "none", "null"]:
+            efficiency = self._eval_efficiency(X_eff)
+            r = np.random.uniform(0, 1, len(X_eff))
+
+            ceff = np.cumsum(efficiency, 1)
+            eff_as_photon = r < ceff[:,0]
+            eff_as_photon_from_pi0 = (r > ceff[:,0]) & (r < ceff[:,1])
         else:
-            postprocessed_split[name] = algo.inverse_transform(preprocessed_block)
-
-    X = [None] * preprocessed_X.shape[1]
-    for name, _, cols in column_transformer.transformers_:
-        for i, iCol in enumerate(cols):
-            X[iCol] = postprocessed_split[name][:, i][:, None]
-
-    return np.concatenate(X, axis=1)
-
-
-def _eval_efficiency(X):
-    efficiency_model = tf.keras.models.load_model(EFFICIENCY_MODEL/EFFICIENCY_MODEL_VERSION)
-    with open(EFFICIENCY_MODEL / EFFICIENCY_MODEL_VERSION / 'tX.pkl', 'rb') as tx_file:
-        tX = pickle.load(tx_file)
-
-    y_hat = efficiency_model.predict(tX.transform(X), batch_size=10000, verbose=0)
-
-    return y_hat
-
-
-def _eval_smearing(X):
-    smearing_model = tf.keras.models.load_model(SMEARING_MODEL / SMEARING_MODEL_VERSION)
-    with open(SMEARING_MODEL / SMEARING_MODEL_VERSION / 'tX.pkl', 'rb') as tx_file:
-        tX = pickle.load(tx_file)
-
-    with open(SMEARING_MODEL / SMEARING_MODEL_VERSION / 'tY.pkl', 'rb') as ty_file:
-        tY = pickle.load(ty_file)
-
-    prep_x = tX.transform(X)
-    n_entries, _ = X.shape
-    prep_y_hat = smearing_model.predict(
-        np.c_[prep_x, np.random.normal(0, 1, (n_entries, 64))],
-        verbose=0,
-        batch_size=10000
-    )
-
-    ret = invertColumnTransformer(tY, prep_y_hat)
-    return ret
-
-
-@PyLamarr.function
-def PyPhotons(db):
-    gen_photons = pd.read_sql_query("""
-        SELECT gev.datasource_id AS event_id, p.*, v.*
-        FROM MCParticles AS p
-        JOIN MCVertices AS v
-            ON p.production_vertex == v.mcvertex_id
-            AND p.genevent_id == v.genevent_id
-        JOIN GenEvents AS gev
-            ON gev.genevent_id = p.genevent_id
-        WHERE
-            pid = 22
-        AND
-            p.pe > 1000
-        AND
-            abs(v.x) < 200
-        AND
-            abs(v.y) < 200
-        AND
-            abs(v.z) < 2000
-        """, db)
-
-    event_id = gen_photons.event_id.values
-    mcparticle_id = gen_photons.mcparticle_id.values
-    ovx, ovy, ovz = gen_photons[['x', 'y', 'z']].values.T
-    e, px, py, pz = gen_photons[['pe', 'px', 'py', 'pz']].values.T
-
-    tx = px/pz
-    ty = py/pz
-
-    ecal_z = 12689. # mm
-    ecal_x = ovx + tx * (ecal_z - ovz)
-    ecal_y = ovy + ty * (ecal_z - ovz)
-
-    mask = (
-        (ecal_x > -4e3) & (ecal_x < 4e3) &
-        (ecal_y > -4e3) & (ecal_y < 4e3) &
-        (tx > -0.35) & (tx < 0.35) &
-        (ty > -0.25) & (ty < 0.25) &
-        (pz > 0) & (pz < 200e3)
-    )
-
-    X_eff = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x)]
-
-    efficiency = _eval_efficiency(X_eff)
-    r = np.random.uniform(0, 1, len(X_eff))
-
-    ceff = np.cumsum(efficiency, 1)
-    eff_as_photon = r < ceff[:,0]
-    eff_as_photon_from_pi0 = (r > ceff[:,0]) & (r < ceff[:,1])
-
-    X_res = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x), eff_as_photon, eff_as_photon_from_pi0]
-    dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = _eval_smearing(X_res).T
-
-    sigma_x = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
-    sigma_y = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
-    sigma_e = np.sqrt(np.exp(0.89 * np.log(e) + 5.1))
-
-    #IPython.embed()
-
-
-    clusters = pd.DataFrame(dict(
-        mask=mask & eff_as_photon,
-        event_id=event_id,
-        type=np.full_like(mcparticle_id, 4),
-        calocluster_id = mcparticle_id,
-        center_x = ecal_x,# + dx, #np.random.normal(ecal_x, sigma_x),
-        center_y = ecal_y,# + dy, #np.random.normal(ecal_y, sigma_y),
-        z = ecal_z,
-        # energy = e * (1. + de_rel),
-        energy = np.random.normal(e, sigma_e),
-        cov_xx = sigma_x * sigma_x,
-        cov_yy = sigma_y * sigma_y,
-        cov_ee = sigma_e * sigma_e,
-    )).query("mask").drop(columns=['mask'])
-
-    clusters.to_sql("Cluster", db, if_exists='replace')
-
-    cluster_info = pd.concat([
-        pd.DataFrame(dict(
-            mask=eff_as_photon,
-            event_id=event_id,
-            calocluster_id=mcparticle_id,
-            info_key=np.full_like(mcparticle_id, info_key),
-            info_value=info_value
-        )).query("mask").drop(columns=['mask'])
-        for info_key, info_value in [
-            (383, reco_IsNotH),
-            (382, reco_IsNotE),
-            (380, reco_PhotonID),
-            # These magic numbers come from https://lhcb-doxygen.web.cern.ch/lhcb-doxygen/davinci/v50r5/d2/d57/class_l_h_cb_1_1_proto_particle.html
-        ]
-    ]).sort_values('calocluster_id', ignore_index=True)
-
-    cluster_info.to_sql("ClusterInfo", db, if_exists='replace')
+            eff_as_photon = np.ones(len(X_eff), dtype=bool)
+            eff_as_photon_from_pi0 = np.zeros(len(X_eff), dtype=bool)
+
+        if self.resolution_model not in [None, "None", "none", "null"]:
+            X_res = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x), eff_as_photon, eff_as_photon_from_pi0]
+            dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = self._eval_smearing(X_res).T
+
+            sigma_x = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
+            sigma_y = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
+            sigma_e = np.sqrt(np.exp(0.89 * np.log(e) + 5.1))
+        else:
+            dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = np.zeros((6, len(e)))
+            sigma_x, sigma_y, sigma_e = np.zeros((3, len(e)))
+
+        #IPython.embed()
+
+        clusters = pd.DataFrame(dict(
+            mask=mask & eff_as_photon,
+            event_id=event_id,
+            type=np.full_like(mcparticle_id, 4),
+            calocluster_id = mcparticle_id,
+            center_x = ecal_x,# + dx, #np.random.normal(ecal_x, sigma_x),
+            center_y = ecal_y,# + dy, #np.random.normal(ecal_y, sigma_y),
+            z = ecal_z,
+            # energy = e * (1. + de_rel),
+            energy = np.random.normal(e, sigma_e),
+            cov_xx = sigma_x * sigma_x,
+            cov_yy = sigma_y * sigma_y,
+            cov_ee = sigma_e * sigma_e,
+        )).query("mask").drop(columns=['mask'])
+
+        clusters.to_sql("Cluster", db, if_exists='replace')
+
+        cluster_info = pd.concat([
+            pd.DataFrame(dict(
+                mask=eff_as_photon,
+                event_id=event_id,
+                calocluster_id=mcparticle_id,
+                info_key=np.full_like(mcparticle_id, info_key),
+                info_value=info_value
+            )).query("mask").drop(columns=['mask'])
+            for info_key, info_value in [
+                (383, reco_IsNotH),
+                (382, reco_IsNotE),
+                (380, reco_PhotonID),
+                # These magic numbers come from https://lhcb-doxygen.web.cern.ch/lhcb-doxygen/davinci/v50r5/d2/d57/class_l_h_cb_1_1_proto_particle.html
+            ]
+        ]).sort_values('calocluster_id', ignore_index=True)
+
+        cluster_info.to_sql("ClusterInfo", db, if_exists='replace')
 
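For context, the invertColumnTransformer helper (now a method of PyPhotons) undoes the column-wise preprocessing of a fitted scikit-learn ColumnTransformer and restores the original column order. A minimal round-trip sketch, illustrative only and not part of the commit; it assumes PyPhotons can be instantiated with both models set to None, as the new optionality checks suggest:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from PyLamarr.LHCb.Photons import PyPhotons

# Toy data: all three columns are assigned to a transformer,
# so the fitted ColumnTransformer has no 'remainder' entry.
X = np.random.normal(5., 2., (100, 3))
ct = ColumnTransformer([("scaled", StandardScaler(), [0, 2]),
                        ("raw", "passthrough", [1])]).fit(X)

photons = PyPhotons(efficiency_model=None, resolution_model=None)
X_back = photons.invertColumnTransformer(ct, ct.transform(X))
assert np.allclose(X, X_back)  # values and column order recovered

Note also how the efficiency branch samples a category per photon: np.cumsum(efficiency, 1) turns the per-class probabilities into cumulative thresholds, so a single uniform draw r selects exactly one of the reconstruction outcomes.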

PyLamarr/LHCb/Photons/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -1,8 +1,10 @@
+from typing import Union
+
 from .PyPhotons import PyPhotons
 #------------------------------#
 
-def configure_pipeline():
+def configure_pipeline(efficiency_model: Union[str, None], resolution_model: Union[str, None]):
     return [
-        ("MkPhotons", PyPhotons),
+        ("MkPhotons", PyPhotons(efficiency_model=efficiency_model, resolution_model=resolution_model)),
     ]
 
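The new signature makes both calorimeter parametrizations explicit arguments. A hedged usage sketch; the model paths are placeholders, and how the returned pipeline list is consumed downstream is not shown in this commit:

from PyLamarr.LHCb.Photons import configure_pipeline

# With trained Keras model directories (hypothetical paths):
pipeline = configure_pipeline(efficiency_model="models/photon_efficiency",
                              resolution_model="models/photon_resolution")

# Calorimeter models disabled: per the diff above, PyPhotons then falls
# back to perfect efficiency and zero smearing.
pipeline_no_calo = configure_pipeline(efficiency_model=None, resolution_model=None)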

PyLamarr/RemoteResource.py

Lines changed: 5 additions & 2 deletions
@@ -59,7 +59,7 @@ class RemoteResource (BaseModel):
 
    ### Accessing local resources
    A local resourcce can be encapsulated inside `RemoteResource` which is
-   the expected format for most of the parametrization data in `PyLamarr`.
+   the expected format for most of the parametrization test_data in `PyLamarr`.
 
    For example, if testing your local version of `MyPrimaryVertexSmearing.db`,
    you can write
@@ -81,7 +81,7 @@
    ### Implicit conversion from URL
    Most of the parametrizations relying on external dependencies expect an
    instance of `RemoteResource` identifying the file to obtain the parametrization
-   from. An implicit cast from sring to `RemoteResource` enables passing directly
+   from. An implicit cast from string to `RemoteResource` enables passing directly
    a string with a URL (possibly pointing to a local file), which gets
    transparently converted into a `RemoteResource` instance and used in the file.
    """
@@ -154,6 +154,9 @@ def download (self, force: bool = False):
        if os.path.exists(self._file) and not force:
            return self
 
+       if self.remote_url.startswith("file://"):
+           raise FileNotFoundError(f"File {self._file} not found.")
+
        logger = logging.getLogger(self.__class__.__name__)
        logger.info(f"Downloading {self.remote_url} to {self._file}")
 
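The new guard makes a missing file:// resource fail fast instead of being treated as something to fetch. A sketch of the resulting behavior; remote_url and download() are taken from the diff, while constructing the pydantic model by keyword is an assumption:

from PyLamarr.RemoteResource import RemoteResource

res = RemoteResource(remote_url="file:///tmp/does-not-exist.db")
try:
    res.download()
except FileNotFoundError:
    # file:// URLs cannot be downloaded, so a missing local
    # file now raises instead of attempting a network fetch
    print("local resource missing")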

PyLamarr/collectors/PandasCollector.py

Lines changed: 4 additions & 4 deletions
@@ -1,14 +1,14 @@
 import PyLamarr
 
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict
+from typing import Collection, List, Optional, Dict, Union
 import pandas as pd
 import logging
 
 @dataclass
 class PandasCollector:
-    tables: List[str]
-    dataframes: Dict[str,List[pd.DataFrame]] = field(default_factory=lambda: {})
+    tables: Collection[str]
+    dataframes: Dict[str,List[Union[pd.DataFrame, None]]] = field(default_factory=lambda: {})
     batch_ids: Optional[List[int]] = None
 
     @PyLamarr.method
@@ -33,7 +33,7 @@ def dataframe(self):
 
        for table, dfs in self.dataframes.items():
            batch_ids = self.batch_ids if self.batch_ids is not None else list(range(len(dfs)))
-           dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs) if df is not None]
+           dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs) if df is not None and len(df) > 0]
            if len(dataframes):
                ret[table] = pd.concat(dataframes, ignore_index=True)
 
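The added len(df) > 0 guard drops zero-row batches before concatenation, so empty frames cannot distort dtypes in the concatenated result. A toy illustration of the same comprehension, outside the class:

import pandas as pd

dfs = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": []}), None]
batch_ids = list(range(len(dfs)))

# Keep only batches that exist and contain rows, as in PandasCollector.dataframe
dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs)
              if df is not None and len(df) > 0]
print(pd.concat(dataframes, ignore_index=True))  # only batch 0 survives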
