
Commit 9751079

Lucio Anderlini authored and committed
Calorimeter model should be optional.
1 parent 9e64cfc commit 9751079

9 files changed

Lines changed: 298 additions & 238 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 /dist/
 **.egg-info
 **/__pycache__/
+test_data/**

PyLamarr/LHCb/Photons/PyPhotons.py

Lines changed: 165 additions & 149 deletions
@@ -1,4 +1,7 @@
-import numpy as np
+import os.path
+from typing import Union
+from dataclasses import dataclass
+import numpy as np
 import pandas as pd
 import tensorflow as tf
 
@@ -19,154 +22,167 @@
 
 
 
-def invertColumnTransformer(column_transformer, preprocessed_X):
-    from sklearn.compose import ColumnTransformer
-    assert isinstance(column_transformer, ColumnTransformer)
 
-    iCol = 0
-    postprocessed_split = dict()
-    for name, algo, cols in column_transformer.transformers_:
-        preprocessed_cols = list()
-        for _ in range(len(cols)):
-            preprocessed_cols.append(preprocessed_X[:, iCol][:, None])
-            iCol += 1
-        preprocessed_block = np.concatenate(preprocessed_cols, axis=1)
-        if algo == "passthrough":
-            postprocessed_split[name] = preprocessed_block
+@dataclass(frozen=True)
+class PyPhotons:
+    efficiency_model: Union[str, None]
+    resolution_model: Union[str, None]
+
+    def invertColumnTransformer(self, column_transformer, preprocessed_X):
+        from sklearn.compose import ColumnTransformer
+        assert isinstance(column_transformer, ColumnTransformer)
+
+        iCol = 0
+        postprocessed_split = dict()
+        for name, algo, cols in column_transformer.transformers_:
+            preprocessed_cols = list()
+            for _ in range(len(cols)):
+                preprocessed_cols.append(preprocessed_X[:, iCol][:, None])
+                iCol += 1
+            preprocessed_block = np.concatenate(preprocessed_cols, axis=1)
+            if algo == "passthrough":
+                postprocessed_split[name] = preprocessed_block
+            else:
+                postprocessed_split[name] = algo.inverse_transform(preprocessed_block)
+
+        X = [None] * preprocessed_X.shape[1]
+        for name, _, cols in column_transformer.transformers_:
+            for i, iCol in enumerate(cols):
+                X[iCol] = postprocessed_split[name][:, i][:, None]
+
+        return np.concatenate(X, axis=1)
+
+    def _eval_efficiency(self, X):
+        efficiency_model = tf.keras.models.load_model(self.efficiency_model)
+        with open(os.path.join(self.efficiency_model, 'tX.pkl'), 'rb') as tx_file:
+            tX = pickle.load(tx_file)
+
+        y_hat = efficiency_model.predict(tX.transform(X), batch_size=10000, verbose=0)
+
+        return y_hat
+
+    def _eval_smearing(self, X):
+        smearing_model = tf.keras.models.load_model(self.resolution_model)
+        with open(os.path.join(self.resolution_model, 'tX.pkl'), 'rb') as tx_file:
+            tX = pickle.load(tx_file)
+
+        with open(os.path.join(self.resolution_model, 'tY.pkl'), 'rb') as ty_file:
+            tY = pickle.load(ty_file)
+
+        prep_x = tX.transform(X)
+        n_entries, _ = X.shape
+        prep_y_hat = smearing_model.predict(
+            np.c_[prep_x, np.random.normal(0, 1, (n_entries, 64))],
+            verbose=0,
+            batch_size=10000
+        )
+
+        ret = self.invertColumnTransformer(tY, prep_y_hat)
+        return ret
+
+    @PyLamarr.method
+    def __call__(self, db):
+        gen_photons = pd.read_sql_query("""
+            SELECT gev.datasource_id AS event_id, p.*, v.*
+            FROM MCParticles AS p
+            JOIN MCVertices AS v
+                ON p.production_vertex == v.mcvertex_id
+                AND p.genevent_id == v.genevent_id
+            JOIN GenEvents AS gev
+                ON gev.genevent_id = p.genevent_id
+            WHERE
+                pid = 22
+            AND
+                p.pe > 1000
+            AND
+                abs(v.x) < 200
+            AND
+                abs(v.y) < 200
+            AND
+                abs(v.z) < 2000
+            """, db)
+
+        event_id = gen_photons.event_id.values.astype(np.int64)
+        mcparticle_id = gen_photons.mcparticle_id.values.astype(np.int32)
+        ovx, ovy, ovz = gen_photons[['x', 'y', 'z']].values.astype(np.float64).T
+        e, px, py, pz = gen_photons[['pe', 'px', 'py', 'pz']].values.astype(np.float64).T
+
+        print (e)
+
+        tx = px/pz
+        ty = py/pz
+
+        ecal_z = 12689. # mm
+        ecal_x = ovx + tx * (ecal_z - ovz)
+        ecal_y = ovy + ty * (ecal_z - ovz)
+
+        mask = (
+            (ecal_x > -4e3) & (ecal_x < 4e3) &
+            (ecal_y > -4e3) & (ecal_y < 4e3) &
+            (tx > -0.35) & (tx < 0.35) &
+            (ty > -0.25) & (ty < 0.25) &
+            (pz > 0) & (pz < 200e3)
+        )
+
+        X_eff = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x)]
+
+
+        if self.efficiency_model not in [None, "None", "none", "null"]:
+            efficiency = self._eval_efficiency(X_eff)
+            r = np.random.uniform(0, 1, len(X_eff))
+
+            ceff = np.cumsum(efficiency, 1)
+            eff_as_photon = r < ceff[:,0]
+            eff_as_photon_from_pi0 = (r > ceff[:,0]) & (r < ceff[:,1])
         else:
-            postprocessed_split[name] = algo.inverse_transform(preprocessed_block)
-
-    X = [None] * preprocessed_X.shape[1]
-    for name, _, cols in column_transformer.transformers_:
-        for i, iCol in enumerate(cols):
-            X[iCol] = postprocessed_split[name][:, i][:, None]
-
-    return np.concatenate(X, axis=1)
-
-
-def _eval_efficiency(X):
-    efficiency_model = tf.keras.models.load_model(EFFICIENCY_MODEL/EFFICIENCY_MODEL_VERSION)
-    with open(EFFICIENCY_MODEL / EFFICIENCY_MODEL_VERSION / 'tX.pkl', 'rb') as tx_file:
-        tX = pickle.load(tx_file)
-
-    y_hat = efficiency_model.predict(tX.transform(X), batch_size=10000, verbose=0)
-
-    return y_hat
-
-
-def _eval_smearing(X):
-    smearing_model = tf.keras.models.load_model(SMEARING_MODEL / SMEARING_MODEL_VERSION)
-    with open(SMEARING_MODEL / SMEARING_MODEL_VERSION / 'tX.pkl', 'rb') as tx_file:
-        tX = pickle.load(tx_file)
-
-    with open(SMEARING_MODEL / SMEARING_MODEL_VERSION / 'tY.pkl', 'rb') as ty_file:
-        tY = pickle.load(ty_file)
-
-    prep_x = tX.transform(X)
-    n_entries, _ = X.shape
-    prep_y_hat = smearing_model.predict(
-        np.c_[prep_x, np.random.normal(0, 1, (n_entries, 64))],
-        verbose=0,
-        batch_size=10000
-    )
-
-    ret = invertColumnTransformer(tY, prep_y_hat)
-    return ret
-
-
-@PyLamarr.function
-def PyPhotons(db):
-    gen_photons = pd.read_sql_query("""
-        SELECT gev.datasource_id AS event_id, p.*, v.*
-        FROM MCParticles AS p
-        JOIN MCVertices AS v
-            ON p.production_vertex == v.mcvertex_id
-            AND p.genevent_id == v.genevent_id
-        JOIN GenEvents AS gev
-            ON gev.genevent_id = p.genevent_id
-        WHERE
-            pid = 22
-        AND
-            p.pe > 1000
-        AND
-            abs(v.x) < 200
-        AND
-            abs(v.y) < 200
-        AND
-            abs(v.z) < 2000
-        """, db)
-
-    event_id = gen_photons.event_id.values
-    mcparticle_id = gen_photons.mcparticle_id.values
-    ovx, ovy, ovz = gen_photons[['x', 'y', 'z']].values.T
-    e, px, py, pz = gen_photons[['pe', 'px', 'py', 'pz']].values.T
-
-    tx = px/pz
-    ty = py/pz
-
-    ecal_z = 12689. # mm
-    ecal_x = ovx + tx * (ecal_z - ovz)
-    ecal_y = ovy + ty * (ecal_z - ovz)
-
-    mask = (
-        (ecal_x > -4e3) & (ecal_x < 4e3) &
-        (ecal_y > -4e3) & (ecal_y < 4e3) &
-        (tx > -0.35) & (tx < 0.35) &
-        (ty > -0.25) & (ty < 0.25) &
-        (pz > 0) & (pz < 200e3)
-    )
-
-    X_eff = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x)]
-
-    efficiency = _eval_efficiency(X_eff)
-    r = np.random.uniform(0, 1, len(X_eff))
-
-    ceff = np.cumsum(efficiency, 1)
-    eff_as_photon = r < ceff[:,0]
-    eff_as_photon_from_pi0 = (r > ceff[:,0]) & (r < ceff[:,1])
-
-    X_res = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x), eff_as_photon, eff_as_photon_from_pi0]
-    dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = _eval_smearing(X_res).T
-
-    sigma_x = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
-    sigma_y = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
-    sigma_e = np.sqrt(np.exp(0.89 * np.log(e) + 5.1))
-
-    #IPython.embed()
-
-
-    clusters = pd.DataFrame(dict(
-        mask=mask & eff_as_photon,
-        event_id=event_id,
-        type=np.full_like(mcparticle_id, 4),
-        calocluster_id = mcparticle_id,
-        center_x = ecal_x,# + dx, #np.random.normal(ecal_x, sigma_x),
-        center_y = ecal_y,# + dy, #np.random.normal(ecal_y, sigma_y),
-        z = ecal_z,
-        # energy = e * (1. + de_rel),
-        energy = np.random.normal(e, sigma_e),
-        cov_xx = sigma_x * sigma_x,
-        cov_yy = sigma_y * sigma_y,
-        cov_ee = sigma_e * sigma_e,
-    )).query("mask").drop(columns=['mask'])
-
-    clusters.to_sql("Cluster", db, if_exists='replace')
-
-    cluster_info = pd.concat([
-        pd.DataFrame(dict(
-            mask=eff_as_photon,
-            event_id=event_id,
-            calocluster_id=mcparticle_id,
-            info_key=np.full_like(mcparticle_id, info_key),
-            info_value=info_value
-        )).query("mask").drop(columns=['mask'])
-        for info_key, info_value in [
-            (383, reco_IsNotH),
-            (382, reco_IsNotE),
-            (380, reco_PhotonID),
-            # These magic numbers come from https://lhcb-doxygen.web.cern.ch/lhcb-doxygen/davinci/v50r5/d2/d57/class_l_h_cb_1_1_proto_particle.html
-        ]
-    ]).sort_values('calocluster_id', ignore_index=True)
-
-    cluster_info.to_sql("ClusterInfo", db, if_exists='replace')
+            eff_as_photon = np.ones(len(X_eff), dtype=bool)
+            eff_as_photon_from_pi0 = np.zeros(len(X_eff), dtype=bool)
+
+        if self.resolution_model not in [None, "None", "none", "null"]:
+            X_res = np.c_[ecal_x, ecal_y, np.log(e/1e3), tx, ty, ovx, ovy, ovz, np.zeros_like(ecal_x), eff_as_photon, eff_as_photon_from_pi0]
+            dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = self._eval_smearing(X_res).T
+
+            sigma_x = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
+            sigma_y = np.sqrt(np.exp(-1.65 * np.log(e) + 17.0))
+            sigma_e = np.sqrt(np.exp(0.89 * np.log(e) + 5.1))
+        else:
+            dx, dy, de_rel, reco_PhotonID, reco_IsNotE, reco_IsNotH = np.zeros((6, len(e)))
+            sigma_x, sigma_y, sigma_e = np.zeros((3, len(e)))
+
+        #IPython.embed()
+
+        clusters = pd.DataFrame(dict(
+            mask=mask & eff_as_photon,
+            event_id=event_id,
+            type=np.full_like(mcparticle_id, 4),
+            calocluster_id = mcparticle_id,
+            center_x = ecal_x,# + dx, #np.random.normal(ecal_x, sigma_x),
+            center_y = ecal_y,# + dy, #np.random.normal(ecal_y, sigma_y),
+            z = ecal_z,
+            # energy = e * (1. + de_rel),
+            energy = np.random.normal(e, sigma_e),
+            cov_xx = sigma_x * sigma_x,
+            cov_yy = sigma_y * sigma_y,
+            cov_ee = sigma_e * sigma_e,
+        )).query("mask").drop(columns=['mask'])
+
+        clusters.to_sql("Cluster", db, if_exists='replace')
+
+        cluster_info = pd.concat([
+            pd.DataFrame(dict(
+                mask=eff_as_photon,
+                event_id=event_id,
+                calocluster_id=mcparticle_id,
+                info_key=np.full_like(mcparticle_id, info_key),
+                info_value=info_value
+            )).query("mask").drop(columns=['mask'])
+            for info_key, info_value in [
+                (383, reco_IsNotH),
+                (382, reco_IsNotE),
+                (380, reco_PhotonID),
+                # These magic numbers come from https://lhcb-doxygen.web.cern.ch/lhcb-doxygen/davinci/v50r5/d2/d57/class_l_h_cb_1_1_proto_particle.html
+            ]
+        ]).sort_values('calocluster_id', ignore_index=True)
+
+        cluster_info.to_sql("ClusterInfo", db, if_exists='replace')
 
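For context, the invertColumnTransformer helper (now a method of PyPhotons) undoes the column-wise preprocessing of a fitted scikit-learn ColumnTransformer and restores the original column order. A minimal round-trip sketch, illustrative only and not part of the commit; it assumes PyPhotons can be instantiated with both models set to None, as the new optionality checks suggest:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from PyLamarr.LHCb.Photons import PyPhotons

# Toy data: all three columns are assigned to a transformer,
# so the fitted ColumnTransformer has no 'remainder' entry.
X = np.random.normal(5., 2., (100, 3))
ct = ColumnTransformer([("scaled", StandardScaler(), [0, 2]),
                        ("raw", "passthrough", [1])]).fit(X)

photons = PyPhotons(efficiency_model=None, resolution_model=None)
X_back = photons.invertColumnTransformer(ct, ct.transform(X))
assert np.allclose(X, X_back)  # values and column order recovered

Note also how the efficiency branch samples a category per photon: np.cumsum(efficiency, 1) turns the per-class probabilities into cumulative thresholds, so a single uniform draw r selects exactly one of the reconstruction outcomes.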

PyLamarr/LHCb/Photons/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -1,8 +1,10 @@
+from typing import Union
+
 from .PyPhotons import PyPhotons
 #------------------------------#
 
-def configure_pipeline():
+def configure_pipeline(efficiency_model: Union[str, None], resolution_model: Union[str, None]):
     return [
-        ("MkPhotons", PyPhotons),
+        ("MkPhotons", PyPhotons(efficiency_model=efficiency_model, resolution_model=resolution_model)),
     ]
 
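The new signature makes both calorimeter parametrizations explicit arguments. A hedged usage sketch; the model paths are placeholders, and how the returned pipeline list is consumed downstream is not shown in this commit:

from PyLamarr.LHCb.Photons import configure_pipeline

# With trained Keras model directories (hypothetical paths):
pipeline = configure_pipeline(efficiency_model="models/photon_efficiency",
                              resolution_model="models/photon_resolution")

# Calorimeter models disabled: per the diff above, PyPhotons then falls
# back to perfect efficiency and zero smearing.
pipeline_no_calo = configure_pipeline(efficiency_model=None, resolution_model=None)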

PyLamarr/RemoteResource.py

Lines changed: 5 additions & 2 deletions
@@ -59,7 +59,7 @@ class RemoteResource (BaseModel):
 
    ### Accessing local resources
    A local resourcce can be encapsulated inside `RemoteResource` which is
-   the expected format for most of the parametrization data in `PyLamarr`.
+   the expected format for most of the parametrization test_data in `PyLamarr`.
 
    For example, if testing your local version of `MyPrimaryVertexSmearing.db`,
    you can write
@@ -81,7 +81,7 @@
    ### Implicit conversion from URL
    Most of the parametrizations relying on external dependencies expect an
    instance of `RemoteResource` identifying the file to obtain the parametrization
-   from. An implicit cast from sring to `RemoteResource` enables passing directly
+   from. An implicit cast from string to `RemoteResource` enables passing directly
    a string with a URL (possibly pointing to a local file), which gets
    transparently converted into a `RemoteResource` instance and used in the file.
    """
@@ -154,6 +154,9 @@ def download (self, force: bool = False):
        if os.path.exists(self._file) and not force:
            return self
 
+       if self.remote_url.startswith("file://"):
+           raise FileNotFoundError(f"File {self._file} not found.")
+
        logger = logging.getLogger(self.__class__.__name__)
        logger.info(f"Downloading {self.remote_url} to {self._file}")
 
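The new guard makes a missing file:// resource fail fast instead of being treated as something to fetch. A sketch of the resulting behavior; remote_url and download() are taken from the diff, while constructing the pydantic model by keyword is an assumption:

from PyLamarr.RemoteResource import RemoteResource

res = RemoteResource(remote_url="file:///tmp/does-not-exist.db")
try:
    res.download()
except FileNotFoundError:
    # file:// URLs cannot be downloaded, so a missing local
    # file now raises instead of attempting a network fetch
    print("local resource missing")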

PyLamarr/collectors/PandasCollector.py

Lines changed: 4 additions & 4 deletions
@@ -1,14 +1,14 @@
 import PyLamarr
 
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict
+from typing import Collection, List, Optional, Dict, Union
 import pandas as pd
 import logging
 
 @dataclass
 class PandasCollector:
-    tables: List[str]
-    dataframes: Dict[str,List[pd.DataFrame]] = field(default_factory=lambda: {})
+    tables: Collection[str]
+    dataframes: Dict[str,List[Union[pd.DataFrame, None]]] = field(default_factory=lambda: {})
     batch_ids: Optional[List[int]] = None
 
     @PyLamarr.method
@@ -33,7 +33,7 @@ def dataframe(self):
 
        for table, dfs in self.dataframes.items():
            batch_ids = self.batch_ids if self.batch_ids is not None else list(range(len(dfs)))
-           dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs) if df is not None]
+           dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs) if df is not None and len(df) > 0]
            if len(dataframes):
                ret[table] = pd.concat(dataframes, ignore_index=True)
 
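The added len(df) > 0 guard drops zero-row batches before concatenation, so empty frames cannot distort dtypes in the concatenated result. A toy illustration of the same comprehension, outside the class:

import pandas as pd

dfs = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": []}), None]
batch_ids = list(range(len(dfs)))

# Keep only batches that exist and contain rows, as in PandasCollector.dataframe
dataframes = [df.assign(batch_id=bid) for bid, df in zip(batch_ids, dfs)
              if df is not None and len(df) > 0]
print(pd.concat(dataframes, ignore_index=True))  # only batch 0 survives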
