2626import copy
2727import csv
2828from datetime import datetime
29- from func_timeout import func_timeout , FunctionTimedOut
29+ from pm4py . util . timeout import func_timeout , FunctionTimedOut
3030from tqdm import tqdm
3131import numpy
3232import random
3636from pm4py .util import exec_utils , constants
3737from pm4py .util import xes_constants as xes
3838from pm4py .algo .discovery .genetic .algorithm import Parameters
39+ from pm4py .objects .conversion .log import converter as log_converter
3940from pm4py .objects .conversion .genetic_matrix .variants .to_petri_net import apply as matrix2petrinet
4041from pm4py .algo .discovery .genetic .util import get_src_sink_sets_for_wfnet , iset , rand_partition
4142from pm4py .objects .genetic_matrix .obj import GeneticMatrix
4243
4344# typing
44- import typing
45- from typing import Union , TextIO , Self
45+ from typing import Any , Dict , Optional , TextIO , Tuple , Union
4646from pandas .core .frame import DataFrame
47- from pm4py .objects .log .obj import EventLog
47+ from pm4py .objects .log .obj import EventLog , EventStream
4848from pm4py .objects .petri_net .obj import PetriNet , Marking
4949from pm4py .algo .discovery .genetic .util import InputMap , OutputMap , Individual
50+ from pm4py .utils import is_polars_lazyframe
5051
5152
5253def apply (
53- log : EventLog ,
54+ log : Union [ EventLog , EventStream , DataFrame ] ,
5455 parameters : Optional [Dict [Union [str , Parameters ], Any ]] = None ,
5556) -> Tuple [PetriNet , Marking , Marking ]:
5657 """
@@ -89,6 +90,14 @@ def apply(
8990 """
9091 if parameters is None :
9192 parameters = {}
93+ if is_polars_lazyframe (log ):
94+ log = log .collect ().to_pandas ()
95+ if not isinstance (log , DataFrame ):
96+ log = log_converter .apply (
97+ log ,
98+ variant = log_converter .Variants .TO_DATA_FRAME ,
99+ parameters = parameters ,
100+ )
92101
93102 activity_key = exec_utils .get_param_value (
94103 Parameters .ACTIVITY_KEY , parameters , xes .DEFAULT_NAME_KEY
@@ -128,7 +137,7 @@ def apply(
128137 # configure parameters
129138 if population_size < 2 :
130139 raise ValueError ("population_size < 2: You need at least two parents for each next generation, thus at least a population size of 2." )
131- if elitism_min_sample > population_size :
140+ if elitism_min_sample >= population_size :
132141 elitism_min_sample = population_size - 1
133142 if elitism_min_sample < 1 :
134143 raise ValueError ("elitism_min_sample < 1: No empty samples allowed." )
@@ -143,11 +152,25 @@ def apply(
143152 if tournament_timeout < 1 :
144153 tournament_timeout = 1
145154 history = []
155+ best_individual = None
156+ best_fitness = - 1.0
146157 population = individuals (log , population_size , T , { # [(I,O), …]
147158 "activity_key" :activity_key , "timestamp_key" :timestamp_key , "case_id_key" :case_id_key
148159 })
149160 for _ in tqdm (range (generations ), "Genetic generations" ):
150- population , fitness = tournament (tqdm (population , f"└─Tournament { len (history )} " ), log , T , sort = True , timeout = tournament_timeout )
161+ population , fitness = tournament (
162+ tqdm (population , f"└─Tournament { len (history )} " ),
163+ log ,
164+ T ,
165+ sort = True ,
166+ timeout = tournament_timeout ,
167+ activity_key = activity_key ,
168+ timestamp_key = timestamp_key ,
169+ case_id_key = case_id_key ,
170+ )
171+ if fitness [0 ] > best_fitness :
172+ best_fitness = fitness [0 ]
173+ best_individual = copy .deepcopy (population [0 ])
151174 if log_csv :
152175 log_csv .writerow ([datetime .now (), len (history )] + fitness )
153176 if fitness [0 ] == 1 or (history and all (f == fitness [0 ] for f in history [- int (generations / 2 ):])):
@@ -167,27 +190,43 @@ def apply(
167190 offspring = mutate (offspring , mutation_rate )
168191 next_population .append (offspring )
169192 population = next_population
170- return matrix2petrinet (GeneticMatrix (* population [0 ], T ))
193+ if best_individual is None :
194+ population , fitness = tournament (
195+ population ,
196+ log ,
197+ T ,
198+ sort = True ,
199+ timeout = tournament_timeout ,
200+ activity_key = activity_key ,
201+ timestamp_key = timestamp_key ,
202+ case_id_key = case_id_key ,
203+ )
204+ best_individual = copy .deepcopy (population [0 ])
205+ return matrix2petrinet (GeneticMatrix (* best_individual , T ))
171206
172207def individuals (log : Union [DataFrame , EventLog ], sample_size = 1 , T = None , keys : dict [str ,str ] = {"activity_key" :xes .DEFAULT_NAME_KEY , "timestamp_key" :xes .DEFAULT_TIMESTAMP_KEY , "case_id_key" :constants .CASE_CONCEPT_NAME }) -> list [Individual ]:
173208 if not T :
174209 T = tuple (log [keys ['activity_key' ]].unique ())
210+ T_idx = {activity : idx for idx , activity in enumerate (T )}
175211 # @src 6.1. Initial Population; https://doi.org/10.1007/11494744_5
176212 # create matrix C
177213 C = numpy .zeros ((len (T ), len (T )))
178214 for _ ,group in tqdm (log .sort_values (keys ['timestamp_key' ], ascending = True ).groupby (keys ['case_id_key' ]), desc = "Find consecutive activities" ):
179- for row1 ,row2 in itertools .pairwise (map (lambda r : r [1 ], group .iterrows ())):
180- i = T .index (row1 [keys ['activity_key' ]])
181- o = T .index (row2 [keys ['activity_key' ]])
182- C [i ,o ] += 1
183- Cn = C / C .sum (axis = 1 )[:,None ] # normalise row-wise
215+ prev_activity = None
216+ for _ , row in group .iterrows ():
217+ activity = row [keys ['activity_key' ]]
218+ if prev_activity is not None :
219+ C [T_idx [prev_activity ], T_idx [activity ]] += 1
220+ prev_activity = activity
221+ row_sums = C .sum (axis = 1 , keepdims = True )
222+ Cn = numpy .divide (C , row_sums , out = numpy .zeros_like (C , dtype = float ), where = row_sums != 0 )
184223 samples = []
185- for sample in tqdm (range (sample_size ), "Initial population" ):
224+ for _ in tqdm (range (sample_size ), "Initial population" ):
186225 I ,O = defaultdict (list ), defaultdict (list )
187226 for i ,o in numpy .ndindex (C .shape ):
188227 if random .random () < Cn [i ,o ]: # [0,1[ < [0,1]; 0 < 0 = false
189- I [T [i ]].append (T [o ])
190- O [T [o ]].append (T [i ])
228+ O [T [i ]].append (T [o ])
229+ I [T [o ]].append (T [i ])
191230 I ,O = repair (I , O , C , T )
192231 # partitioning already ensures no T in >1 partitions
193232 # s. 4. Causal Matrix, Def. 4; https://doi.org/10.1007/11494744_5
@@ -198,7 +237,17 @@ def individuals(log: Union[DataFrame, EventLog], sample_size=1, T=None, keys: di
198237 samples .append ((I ,O ))
199238 return samples
200239
201- def tournament (population : list [Individual ], log : Union [DataFrame , EventLog ], T , sort = True , timeout = 1 ) -> tuple [list [Individual ],list [float ]]:
240+ def tournament (
241+ population : list [Individual ],
242+ log : Union [DataFrame , EventLog ],
243+ T ,
244+ sort = True ,
245+ timeout = 1 ,
246+ * ,
247+ activity_key : str = xes .DEFAULT_NAME_KEY ,
248+ timestamp_key : str = xes .DEFAULT_TIMESTAMP_KEY ,
249+ case_id_key : str = constants .CASE_CONCEPT_NAME ,
250+ ) -> tuple [list [Individual ],list [float ]]:
202251 """sort=True: sort descending by fitness (i.e. best first)"""
203252 # @src 6.2. Fitness Calculation; https://doi.org/10.1007/11494744_5
204253 fitness = []
@@ -208,7 +257,12 @@ def tournament(population: list[Individual], log: Union[DataFrame, EventLog], T,
208257 metrics = func_timeout (
209258 timeout = timeout ,
210259 func = pm4py .fitness_token_based_replay ,
211- args = (log , * model )
260+ args = (log , * model ),
261+ kwargs = {
262+ "activity_key" : activity_key ,
263+ "timestamp_key" : timestamp_key ,
264+ "case_id_key" : case_id_key ,
265+ },
212266 )
213267 except FunctionTimedOut :
214268 print ("\t Timeout for individual" , i )
@@ -262,63 +316,72 @@ def crossover(parent1: Individual, parent2: Individual, T: list[str]) -> tuple[I
262316 I2 ,O2 = offspring2 = copy .deepcopy (parent2 )
263317 # 3. swap and recombine
264318 if I1 [t ] and I2 [t ]:
319+ old_I1 = iset .flat (I1 [t ])
320+ old_I2 = iset .flat (I2 [t ])
265321 swap_point = random .randrange (min (len (I1 [t ]), len (I2 [t ])))
266322 toI1 , toI2 = I2 [t ][swap_point :], I1 [t ][swap_point :]
267323 # no T can exist twice in I/O[t], s. Def. 4; https://doi.org/10.1007/11494744_5
268324 # COPY of I_i, else not properly removed in opposite I_j
269- I1_flat = iset .flat (I1 [t ][:swap_point - 1 ])
270- toI1_dedup = [ iset (S - I1_flat ) for S in toI1 ]
271- I2_flat = iset .flat (I2 [t ][:swap_point - 1 ])
272- toI2_dedup = [ iset (S - I2_flat ) for S in toI2 ]
325+ I1_flat = iset .flat (I1 [t ][:swap_point ])
326+ toI1_dedup = [iset (S - I1_flat ) for S in toI1 if S - I1_flat ]
327+ I2_flat = iset .flat (I2 [t ][:swap_point ])
328+ toI2_dedup = [iset (S - I2_flat ) for S in toI2 if S - I2_flat ]
273329 # merge
274- I1 [t ], I2 [t ] = I1 [t ][:swap_point - 1 ] + toI1_dedup , I2 [t ][:swap_point - 1 ] + toI2_dedup
275- # @src 6.3. Genetic Operations: Update Related Elements; https://doi.org/10.1007/11494744_5
276- for c in iset .flat (toI1 ) - iset .flat (toI2 ): # no reassign staying T
277- O1 [c ].append (iset ({t }))
278- for i ,p in enumerate (O2 [c ]): # p = only local var
279- if t in p :
280- O2 [c ][i ] = iset (p - {t })
281- if not O2 [c ][i ]:
282- O2 [c ].remove (O2 [c ][i ])
283- break
284- for c in iset .flat (toI2 ) - iset .flat (toI1 ): # no reassign staying T
285- O2 [c ].append (iset ({t }))
286- for i ,p in enumerate (O1 [c ]): # p = only local var
287- if t in p :
288- O1 [c ][i ] = iset (p - {t })
289- if not O1 [c ][i ]:
290- O1 [c ].remove (O1 [c ][i ])
291- break
330+ I1 [t ], I2 [t ] = I1 [t ][:swap_point ] + toI1_dedup , I2 [t ][:swap_point ] + toI2_dedup
331+ new_I1 = iset .flat (I1 [t ])
332+ new_I2 = iset .flat (I2 [t ])
333+ for c in new_I1 - old_I1 :
334+ _add_singleton_partition (O1 , c , t )
335+ for c in old_I1 - new_I1 :
336+ _remove_value_from_partitions (O1 , c , t )
337+ for c in new_I2 - old_I2 :
338+ _add_singleton_partition (O2 , c , t )
339+ for c in old_I2 - new_I2 :
340+ _remove_value_from_partitions (O2 , c , t )
292341 if O1 [t ] and O2 [t ]:
342+ old_O1 = iset .flat (O1 [t ])
343+ old_O2 = iset .flat (O2 [t ])
293344 swap_point = random .randrange (min (len (O1 [t ]), len (O2 [t ])))
294345 toO1 , toO2 = O2 [t ][swap_point :], O1 [t ][swap_point :]
295346 # no T can exist twice in I/O[t], s. Def. 4; https://doi.org/10.1007/11494744_5
296347 # COPY of I_i, else not properly removed in opposite I_j
297- O1_flat = iset .flat (O1 [t ][:swap_point - 1 ])
298- toO1_dedup = [ iset (S - O1_flat ) for S in toO1 ]
299- O2_flat = iset .flat (O2 [t ][:swap_point - 1 ])
300- toO2_dedup = [ iset (S - O2_flat ) for S in toO2 ]
348+ O1_flat = iset .flat (O1 [t ][:swap_point ])
349+ toO1_dedup = [iset (S - O1_flat ) for S in toO1 if S - O1_flat ]
350+ O2_flat = iset .flat (O2 [t ][:swap_point ])
351+ toO2_dedup = [iset (S - O2_flat ) for S in toO2 if S - O2_flat ]
301352 # merge
302- O1 [t ], O2 [t ] = O1 [t ][:swap_point - 1 ] + toO1_dedup , O2 [t ][:swap_point - 1 ] + toO2_dedup
303- # @src 6.3. Genetic Operations: Update Related Elements; https://doi.org/10.1007/11494744_5
304- for c in iset .flat (toO1 ) - iset .flat (toO2 ):
305- I1 [c ].append (iset ({t }))
306- for i ,p in enumerate (I2 [c ]): # p = only local var
307- if t in p :
308- I2 [c ][i ] = iset (p - {t })
309- if not I2 [c ][i ]:
310- I2 [c ].remove (I2 [c ][i ])
311- break
312- for c in iset .flat (toO2 ) - iset .flat (toO1 ):
313- I2 [c ].append (iset ({t }))
314- for i ,p in enumerate (I1 [c ]): # p = only local var
315- if t in p :
316- I1 [c ][i ] = iset (p - {t })
317- if not I1 [c ][i ]:
318- I1 [c ].remove (I1 [c ][i ])
319- break
353+ O1 [t ], O2 [t ] = O1 [t ][:swap_point ] + toO1_dedup , O2 [t ][:swap_point ] + toO2_dedup
354+ new_O1 = iset .flat (O1 [t ])
355+ new_O2 = iset .flat (O2 [t ])
356+ for c in new_O1 - old_O1 :
357+ _add_singleton_partition (I1 , c , t )
358+ for c in old_O1 - new_O1 :
359+ _remove_value_from_partitions (I1 , c , t )
360+ for c in new_O2 - old_O2 :
361+ _add_singleton_partition (I2 , c , t )
362+ for c in old_O2 - new_O2 :
363+ _remove_value_from_partitions (I2 , c , t )
320364 return (offspring1 , offspring2 )
321365
366+
def _add_singleton_partition(mapping: Dict[str, list[iset]], key: str, value: str) -> None:
    """Ensure *value* is present in some partition of ``mapping[key]``.

    If no existing partition already contains *value*, append a new
    singleton partition ``iset({value})``; otherwise leave the mapping
    untouched (apart from ``setdefault`` inserting an empty partition
    list for a previously missing *key*).
    """
    buckets = mapping.setdefault(key, [])
    for bucket in buckets:
        # value already covered by an existing partition — nothing to add
        if value in bucket:
            return
    buckets.append(iset({value}))
372+
373+
def _remove_value_from_partitions(mapping: Dict[str, list[iset]], key: str, value: str) -> None:
    """Remove *value* from the first partition of ``mapping[key]`` containing it.

    Only the first matching partition is handled; a partition left empty
    by the removal is deleted outright. Note that ``setdefault`` inserts
    an empty partition list for a previously missing *key* as a side
    effect. Returns immediately after the first hit, so mutating the
    list during the scan is safe.
    """
    buckets = mapping.setdefault(key, [])
    index = 0
    while index < len(buckets):
        bucket = buckets[index]
        if value in bucket:
            shrunk = iset(bucket - {value})
            if shrunk:
                buckets[index] = shrunk
            else:
                # partition became empty — drop it entirely
                del buckets[index]
            return
        index += 1
384+
322385def mutate (individual : Individual , rate : float = 0.01 ) -> Individual :
323386 # @src 6.3. Genetic Operations: Mutation; https://doi.org/10.1007/11494744_5
324387 I ,O = individual
@@ -348,18 +411,20 @@ def repair(I: list[str], O: list[str], C: numpy.ndarray, T: list[str]) -> tuple[
348411 partition .add (t )
349412 Tn |= (set (I [t ]) | set (O [t ])) & left
350413 partitions .append (partition )
414+ T_idx = {activity : idx for idx , activity in enumerate (T )}
415+
351416 def rand_connect_one (I : InputMap , O : OutputMap , Ti : list [set ], To : list [set ], C : numpy .ndarray ) -> Individual :
352417 comb = tuple (itertools .product (Ti , To ))
353418 try :
354419 ti ,to = random .choices (
355420 comb ,
356- weights = [ C [T . index ( ti ), T . index ( to ) ] for ti ,to in comb ],
421+ weights = [ C [T_idx [ ti ], T_idx [ to ] ] for ti ,to in comb ],
357422 k = 1
358423 )[0 ]
359424 except ValueError :
360425 ti ,to = random .choice (comb )
361- I [ti ].append (to )
362- O [to ].append (ti )
426+ O [ti ].append (to )
427+ I [to ].append (ti )
363428 return I ,O
364429 while len (partitions ) > 1 :
365430 random .shuffle (partitions )
0 commit comments