From 213935235aea7540021831335849b1b83a4414eb Mon Sep 17 00:00:00 2001 From: "mh595@uni-freiburg.de" Date: Mon, 22 Mar 2021 15:34:47 +0100 Subject: [PATCH 1/7] Added a print for the total average time for chemical compounds in the results section --- reconstruct.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/reconstruct.py b/reconstruct.py index 6f2f760..88f11cc 100644 --- a/reconstruct.py +++ b/reconstruct.py @@ -117,6 +117,7 @@ def maketasks(params): # Pareto Option "default": (3*5 best graphs for each category + 15 pareto front) # Pareto Option "random": (No pareto front and no 3*5 best graphs. Just take 30 random graphs total) # Pareto Option "greedy": (Instead of using the pareto front, take graphs with the lowest direct distance to the target) +# Pareto Option "paretogreed": (Greedy approach for pareto front) # Pareto Option "pareto_only": (Instead of using the 3*5 best graphs it takes double the graphs from the pareto front. # Pareto Option: "all": (Takes EVERY graph from the pareto front) parser = argparse.ArgumentParser() @@ -127,11 +128,13 @@ def maketasks(params): parser.add_argument('--cipselector_option', nargs=1, type=int, default=[1], ## Change this back choices=[0, 1, 2], help='1: Take k best from all, 2: Take k best from each current cip') -parser.add_argument('--cipselector_k', nargs=1, type=int, default=[100], +parser.add_argument('--cipselector_k', nargs=1, type=int, default=[10], help='k for Cipselector') parser.add_argument('--pareto_option', nargs=1, type=str, default=['greedy'], - choices=['default', 'random', 'greedy', 'pareto_only', 'all'], + choices=['default', 'random', 'greedy', 'paretogreed', 'pareto_only', 'all'], help='Pareto option for optimization') +parser.add_argument('--keepgraphs', nargs=1, type=int, default=[30], + help='Number of graphs kept from the pareto part') parser.add_argument('--use_normalization', nargs=1, type=int, default=[1], choices=[1,0], help='If 1, normalization will be applied for cipselection') parser.add_argument('--min_count', nargs=1, type=int, default=[2], @@ -339,10 +342,12 @@ def report(folder = '.res', tasklist=None): lsuccess = [int(succ) for data in dat.values() for v in data.values() for succ,steps,times,avg in v] avg_productions = np.array([int(avg) for data in dat.values() for v in data.values() for succ,steps,times,avg in v]) rnd = [int(steps) for data in dat.values() for v in data.values() for succ,steps,times,avg in v] + time = np.array([int(times) for data in dat.values() for v in data.values() for succ,steps,times,avg in v]) print ("nores",nores) print ('nosucc',nosucc) print ("sumsuccess:", sum(lsuccess), lsuccess) print ("Average productions:", avg_productions.mean(), avg_productions) + print ("Average times:", np.average(time), time) # print ("maxrnd:", max([int(b) for c in dat.values() for a,b,_ in c.values()])) print("maxrnd:", max(rnd)) From 2f73c50b2ed3593b6ac8dbb2fd2d4a811cb8e163 Mon Sep 17 00:00:00 2001 From: "mh595@uni-freiburg.de" Date: Wed, 24 Mar 2021 18:54:35 +0100 Subject: [PATCH 2/7] New stuff for run script --- run_in_order.sh | 110 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 25 deletions(-) diff --git a/run_in_order.sh b/run_in_order.sh index 0094a60..3d0a735 100644 --- a/run_in_order.sh +++ b/run_in_order.sh @@ -5,7 +5,8 @@ conda activate binenv REPEATS=50 ### Change to 100 for Normal or 250 for Chem -execute () { #### Make sure to change filename in first sed command to 'chem_runall_binac.sh' or just 'runall_binac.sh' +execute () { ######## Make sure to change filename in first sed command to 'chem_runall_binac.sh' or just 'runall_binac.sh' + sed '/reconstruct.py/s/$/'"$STRING"'/' runall_binac.sh > .run_$RESPREFIX.sh JOBID=$(qsub -q short -t 1-$REPEATS .run_$RESPREFIX.sh | sed -r 's/^[^0-9]*([0-9]+).*$/\1/') echo "Current Task: $JOBID : $STRING" @@ -41,7 +42,7 @@ for PARETO in 'random' 'greedy' 'pareto_only' 'all' 'default'; do for SIZELIMITER in 1; do RESPREFIX="$CIPSELECTOR-$CIPK-$CONTEXTSIZE-$MINCOUNT-$SIZELIMITER-$NORMALIZATION-$DECOMPRADIUS-$PARETO" STRING=" --cipselector_option $CIPSELECTOR --pareto_option $PARETO --cipselector_k $CIPK --context_size $CONTEXTSIZE --min_count $MINCOUNT --graph_size_limiter $SIZELIMITER --use_normalization $NORMALIZATION --max_decompose_radius $DECOMPRADIUS --resprefix $RESPREFIX" - report ## Replace this with report/execute/pass + ##report ## Replace this with report/execute/pass done done done @@ -50,37 +51,64 @@ for PARETO in 'random' 'greedy' 'pareto_only' 'all' 'default'; do done done ## Chemset Comparison -#for CONTEXTSIZE in 1 2; do -# for CIPK in 200 300 400; do -# RESPREFIX="cipK-$CIPK-contextsize-$CONTEXTSIZE" -# STRING=" --context_size $CONTEXTSIZE --cipselector_k $CIPK --resprefix $RESPREFIX" -# pass -# done -#done +for CONTEXTSIZE in 1 2; do + for CIPK in 100 200 300 400; do + RESPREFIX="cipK-$CIPK-contextsize-$CONTEXTSIZE" + STRING=" --pareto_option 'default' --core_sizes 0 1 2 3 --context_size $CONTEXTSIZE --cipselector_k $CIPK --resprefix $RESPREFIX" +# reportchem + done +done ## Artificial Comparison -#for CONTEXTSIZE in 1 2; do -# RESPREFIX="coresizes-012-contextsize-$CONTEXTSIZE" -# STRING=" --core_sizes 0 1 2 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" +CIPSELECTOR=2 +for CONTEXTSIZE in 1 2; do + RESPREFIX="coresizes-012-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 1 2 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report -# RESPREFIX="coresizes-01-contextsize-$CONTEXTSIZE" -# STRING=" --core_sizes 0 1 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" + RESPREFIX="coresizes-01-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 1 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report -# RESPREFIX="coresizes-0-contextsize-$CONTEXTSIZE" -# STRING=" --core_sizes 0 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" + RESPREFIX="coresizes-0-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report -#done +done +## Comparison of Average Productions ## + CIPSELECTOR=0 + CIPK=1000 + RESPREFIX="cipsel0_$CIPK" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" +# report + CIPSELECTOR=1 + CIPK=100 + RESPREFIX="cipsel1_$CIPK" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" +# report + CIPSELECTOR=2 + CIPK=10 + RESPREFIX="cipsel2_$CIPK" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" +# report +########################## ## Cipselector 1: 100 200 400 800 -#for CIPK in 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200; do -for CIPK in 10 30 50 70 90; do +for CIPK in 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200; do +#for CIPK in 10 30 50 70 90; do RESPREFIX="cipsel1_$CIPK" STRING=" --cipselector_option 1 --cipselector_k $CIPK --resprefix $RESPREFIX" -# pass +# report +done + +for KEEP in 12 30 60; do + for CIPK in 50 100 250; do + RESPREFIX="keep-$KEEP-cipk-$CIPK" + STRING=" --keepgraphs $KEEP --cipselector_option 1 --cipselector_k $CIPK --resprefix $RESPREFIX" +# report + done done -## Cipselector 2: (Default) 1 5 10 15 20 #### REMOVED 100 FOR CHEMSETS + +## Cipselector 2: 1 5 10 15 20 #### REMOVED 100 FOR CHEMSETS for CIPK in 1 5 10 15 20; do RESPREFIX="cipsel2_$CIPK" STRING=" --cipselector_option 2 --cipselector_k $CIPK --resprefix $RESPREFIX" @@ -95,10 +123,11 @@ for NORM in 0; do done ## Pareto Options: ###### REMOVED 'all' FOR CHEMSETS -for PARETO in 'random' 'greedy' 'pareto_only' 'all' 'default'; do - RESPREFIX="pareto_$PARETO" - STRING=" --pareto_option $PARETO --resprefix $RESPREFIX" -# pass +for PARETO in 'random' 'greedy' 'paretogreed' 'pareto_only' 'all' 'default'; do + RESPREFIX="res_pareto_$PARETO" +# STRING=" --pareto_option $PARETO --resprefix $RESPREFIX" + STRING=" --core_sizes 0 1 2 3 --pareto_option $PARETO --resprefix $RESPREFIX" +# reportchem done ## Contextsizes/Thickness: @@ -121,3 +150,34 @@ for SIZELIMITER in 0 1; do STRING=" --graph_size_limiter $SIZELIMITER --resprefix $RESPREFIX" # pass done + + +## Cipselector Comparison Graphs +KEEP=60 +#CIP0 +###for CIPK in 300 1500 3000 4500 6000 7500 9000; do # KEEP 30 +###for CIPK in 100 900 2250 3750 5250 6750 8250; do +###for CIPK in 150 750 1500 2250 3000 3750 4500; do # KEEP 60 +for CIPK in 50 450 1125 1875 2625 3375 4125; do + RESPREFIX="res_keep-$KEEP-cipsel0-$CIPK" + STRING=" --keepgraphs $KEEP --cipselector_option 0 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" + report +done +#CIP1 +###for CIPK in 10 50 100 150 200 250 300; do # KEEP 30 +###for CIPK in 50 30 75 125 175 225 275; do +###for CIPK in 5 25 50 75 100 125 150; do # KEEP 60 +for CIPK in 1 15 38 63 88 113 138; do + RESPREFIX="res_keep-$KEEP-cipsel1-$CIPK" + STRING=" --keepgraphs $KEEP --cipselector_option 1 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" + report +done +#CIP2 +###for CIPK in 10 20 30 40 50 60 70 80 90 100; do # KEEP 30 +###for CIPK in 1 2 3 4 5 6 7 8 9; do +###for CIPK in 5 10 15 20 25 30 35 40 45 50; do # KEEP 60 +for CIPK in 1 2 3 4 6 7 8 9; do + RESPREFIX="res_keep-$KEEP-cipsel2-$CIPK" + STRING=" --keepgraphs $KEEP --cipselector_option 2 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" + report +done From fb2d33943a63ebd2d99d8173441df79c2a381d56 Mon Sep 17 00:00:00 2001 From: MatthiasHerrmann Date: Thu, 25 Mar 2021 14:37:49 +0100 Subject: [PATCH 3/7] Cleaned up pareto.py --- exploration/pareto.py | 89 ++++++++++++++--------------------- exploration/pareto_options.py | 46 ++++++++++++++++++ 2 files changed, 81 insertions(+), 54 deletions(-) create mode 100644 exploration/pareto_options.py diff --git a/exploration/pareto.py b/exploration/pareto.py index 0d06c72..e4deb0e 100644 --- a/exploration/pareto.py +++ b/exploration/pareto.py @@ -5,13 +5,12 @@ import heapq import numpy as np from eden.util import timeit -### from graphlearn.local_substitution_graph_grammar import LocalSubstitutionGraphGrammar as lsggold ########## TMP from graphlearn.cipcorevector import LsggCoreVec as lsgg from toolz.curried import compose, map, concat from exploration.pareto_funcs import _manage_int_or_float logger = logging.getLogger(__name__) import structout as so -from exploration import pareto_funcs as paretof, cost_estimator as costs +from exploration import pareto_options, cost_estimator as costs from extensions import lsggscramble as lsggs from sklearn.metrics.pairwise import euclidean_distances @@ -223,33 +222,34 @@ def optimize(self, graphs): def optimize_step(self, graphs): # filter, expand, chk duplicates - costs = self.get_costs(graphs) - status = self.checkstatus(costs, graphs) + step_start_time = time.time() + graphlen_start = len(graphs) + graphs, status = self.filter_by_cost(graphs) if status: return [],True,None - graphs = self.filter_by_cost(costs, graphs) + graphlen_filter = len(graphs) + logger.log(10, f"cost_filter: Got {graphlen_start} graphs, reduced to {graphlen_filter} ({time.time()-step_start_time})") num_graphs = len(graphs) if self.grammar.cipselector == new_cipselector0: ### SPECIAL CASE - logger.log(10, "USING CIPSELECTOR 0") graphs = self._expand_neighbors2(graphs) else: graphs = self._expand_neighbors(graphs) - avg_productions = len(graphs)/num_graphs - logger.log(10, f"Average productions per graph: {avg_productions}") + graphlen_expand = len(graphs) + avg_productions = graphlen_expand/graphlen_filter + logger.log(10, f"graph generation: Got {avg_productions} per graph. ({time.time()-timenow})") graphs = self.duplicate_rm(graphs) + logger.log(10, "duplicate_rm: {graphlen_expand} -> {len(graphs} graphs. ({time.time()-timenow})") return graphs, status, avg_productions - def filter_by_cost(self,costs,graphs): + def filter_by_cost(self,graphs): """expand "keepgraphs" graphs, divided between top graphs in everything and pareto front, discard rest""" - timenow=time.time() - in_count = len(graphs) keepgraphs = self.keepgraphs if in_count <= self.keepgraphs: # Only few graphs remaining so just return all of them. - logger.debug('cost_filter: keep all %d graphs' % in_count) + logger.log(10, "cost_filter: keep all graphs") return graphs elif self.pareto_option == "random": @@ -267,42 +267,35 @@ def filter_by_cost(self,costs,graphs): elif self.pareto_option == 'greedy': # Return graphs with the lowest euclidean distance to the target vector - distances = [] - for g in graphs: - distances.append(euclidean_distances(self.target_graph_vector, vertex_vec(g, self.decomposer).sum(axis=0))[0][0]) - ranked_distances = np.argsort(distances)[:keepgraphs] - res = [graphs[i] for i in ranked_distances] + return pareto_option.greedy(graphs, self.target_graph_vector, self.decomposer, keepgraphs) + + costs = self.get_costs(graphs) + status = self.checkstatus(costs, graphs) + if status: + # Some graph has distance == 0 + return graphs, True + elif self.pareto_option == "default": # Take best graphs from estimators and pareto front - costs_ranked = np.argsort(costs,axis=0)[:int(keepgraphs/6),[0,1,3]] - want , counts = np.unique(costs_ranked,return_counts=True) - res = [graphs[idd] for idd,count in zip( want,counts) if count > 0 ] - dontwant = [i for i in range(len(graphs)) if i not in want] - restgraphs = [graphs[i] for i in dontwant] - restcosts = costs[dontwant][:,[0,1,2]] - paretoselectedgraphs = paretof._pareto_set(restgraphs, restcosts) - random.shuffle(paretoselectedgraphs) - res += paretoselectedgraphs[:int(keepgraphs/2)] + return pareto_option.default(graphs, costs, keepgraphs), False elif self.pareto_option == "paretogreed": # 1. choose pareto graphs - # 2. new score is the average rank over all costs + # 2. new score is the average rank over all costs # 3. choose k best of those - graphs, costs = paretof._pareto_set(graphs, costs,return_costs=True) - costs_ranked = np.argsort(costs,axis=0).sum(axis=1) - choosegr = np.argsort(costs_ranked) - res = [graphs[x] for x in choosegr[:keepgraphs]] + return pareto_option.paretogreed(graphs, costs, keepgraphs), False + + paretoselectedgraphs = paretof._pareto_set(graphs, costs) + random.shuffle(paretoselectedgraphs) + if self.pareto_option == "pareto_only": + # Return only graphs from the pareto front + return paretoselectedgraphs[:keepgraphs], False + + elif self.pareto_option == "all": + # Return ALL graphs from the pareto front + return paretoselectedgraphs, False else: - paretoselectedgraphs = paretof._pareto_set(graphs, costs) - random.shuffle(paretoselectedgraphs) - - if self.pareto_option == "pareto_only": - # Return only graphs from the pareto front - res = paretoselectedgraphs[:keepgraphs] - - elif self.pareto_option == "all": - # Return ALL graphs from the pareto front - res = paretoselectedgraphs + raise ValueError("Invalid Pareto Option") ## # DEBUG TO SHOW THE REAL DISTANCE ## if self.cheat: @@ -320,16 +313,10 @@ def filter_by_cost(self,costs,graphs): ## from util import util ## util.dumpfile(graphs[costs_ranked[stuff][0]],"gr") ## print ("graph dumped") - logger.log(10, f"cost_filter: got {in_count} graphs, reduced to {len(res)} (%.2fs)"%(time.time()-timenow)) - - return res def duplicate_rm(self,graphs): - timenow=time.time() - count = len(graphs) graphs = list(self._duplicate_rm(graphs)) - logger.debug("duplicate_rm: %d -> %d graphs (%.2fs)" % (count, len(graphs), time.time()-timenow)) return graphs def _duplicate_rm(self,graphs): @@ -360,7 +347,7 @@ def get_costs(self, graphs): for i,e in enumerate(costs[:,2]): nucol[i,2] = resdic[e] costs = np.hstack((costs, np.sum(nucol,axis =1).reshape(-1,1))) - logger.debug("costs: best dist: %f (%.2fs)" % (np.min(costs[:,0]) ,time.time()-timenow)) + logger.log(10, f"costs: best dist: {np.min(costs[:,0])} ({time.time()-timenow})") return costs def _get_neighbors(self, graph): @@ -373,17 +360,14 @@ def _get_neighbors(self, graph): return neighs def _expand_neighbors(self, graphs): - timenow = time.time() global _decomposer ##### Stupid hack but I dont know how else to allow lambda functions in multiprocessing _decomposer = self.decomposer ##### if self.multiproc>1: with multiprocessing.Pool(self.multiproc) as p: res = list(concat(p.map(self._get_neighbors,graphs))) - logger.debug("graph generation: %.2fs" % (time.time()-timenow)) return res else: res = list(concat(map(self._get_neighbors,graphs))) - logger.debug("graph generation: %.2fs" % (time.time()-timenow)) return res @@ -398,14 +382,11 @@ def _get_score_substitution(self, graph): # def _expand_neighbors2(self, graphs): # """Only used with Cipselector Option 0. Replaces _expand_neighbors""" - timenow = time.time() if self.multiproc>1: with multiprocessing.Pool(self.multiproc) as p: res = list(concat(p.map(self._get_score_substitution,graphs))) - logger.debug("graph generation: %.2fs" % (time.time()-timenow)) else: res = list(concat(map(self._get_score_substitution,graphs))) - logger.debug("graph generation: %.2fs" % (time.time()-timenow)) res.sort(reverse=True, key=lambda a: a[0]) counter = 0 grlist = [] diff --git a/exploration/pareto_options.py b/exploration/pareto_options.py new file mode 100644 index 0000000..7efc605 --- /dev/null +++ b/exploration/pareto_options.py @@ -0,0 +1,46 @@ +import pareto_funcs as paretof +import numpy as np + +def greedy(graphs, target, decomposer, keepgraphs): + """ + Return graphs with the lowest euclidean distance to the target vector. + Also returns if one of the distances equals 0. + """ + distances = [] + for g in graphs: + distances.append(euclidean_distances(target, vertex_vec(g, decomposer).sum(axis=0))[0][0]) + ranked_distances = np.argsort(distances)[:keepgraphs] + res = [graphs[i] for i in ranked_distances] + if distances[ranked_distances[0]] == 0: + ## => At least 1 distance is 0 => Successful reconstruction + return res, True + return res, False + + +def default(graphs, costs, keepgraphs): + """ + Take best graphs from estimators and pareto front. + """ + costs_ranked = np.argsort(costs,axis=0)[:int(keepgraphs/6),[0,1,3]] + want , counts = np.unique(costs_ranked,return_counts=True) + res = [graphs[idd] for idd,count in zip( want,counts) if count > 0 ] + dontwant = [i for i in range(len(graphs)) if i not in want] + restgraphs = [graphs[i] for i in dontwant] + restcosts = costs[dontwant][:,[0,1,2]] + paretoselectedgraphs = paretof._pareto_set(restgraphs, restcosts) + random.shuffle(paretoselectedgraphs) + res += paretoselectedgraphs[:int(keepgraphs/2)] + return res + + +def paretogreed(graphs, costs, keepgraphs): + """ + 1. choose pareto graphs + 2. new score is the average rank over all costs + 3. choose k best of those + """ + graphs, costs = paretof._pareto_set(graphs, costs,return_costs=True) + costs_ranked = np.argsort(costs,axis=0).sum(axis=1) + choosegr = np.argsort(costs_ranked) + res = [graphs[x] for x in choosegr[:keepgraphs]] + return res From dea95f7c0395d0dea4894e318f1eca3491ec0bf2 Mon Sep 17 00:00:00 2001 From: "mh595@uni-freiburg.de" Date: Thu, 25 Mar 2021 14:53:35 +0100 Subject: [PATCH 4/7] Fix minor issues --- exploration/pareto.py | 12 ++++++------ exploration/pareto_options.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/exploration/pareto.py b/exploration/pareto.py index e4deb0e..573dd14 100644 --- a/exploration/pareto.py +++ b/exploration/pareto.py @@ -235,9 +235,9 @@ def optimize_step(self, graphs): graphs = self._expand_neighbors(graphs) graphlen_expand = len(graphs) avg_productions = graphlen_expand/graphlen_filter - logger.log(10, f"graph generation: Got {avg_productions} per graph. ({time.time()-timenow})") + logger.log(10, f"graph generation: Got {avg_productions} per graph. ({time.time()-step_start_time})") graphs = self.duplicate_rm(graphs) - logger.log(10, "duplicate_rm: {graphlen_expand} -> {len(graphs} graphs. ({time.time()-timenow})") + logger.log(10, f"duplicate_rm: {graphlen_expand} -> {len(graphs)} graphs. ({time.time()-step_start_time})") return graphs, status, avg_productions @@ -247,7 +247,7 @@ def filter_by_cost(self,graphs): and pareto front, discard rest""" keepgraphs = self.keepgraphs - if in_count <= self.keepgraphs: + if len(graphs) <= self.keepgraphs: # Only few graphs remaining so just return all of them. logger.log(10, "cost_filter: keep all graphs") return graphs @@ -267,7 +267,7 @@ def filter_by_cost(self,graphs): elif self.pareto_option == 'greedy': # Return graphs with the lowest euclidean distance to the target vector - return pareto_option.greedy(graphs, self.target_graph_vector, self.decomposer, keepgraphs) + return pareto_options.greedy(graphs, self.target_graph_vector, self.decomposer, keepgraphs) costs = self.get_costs(graphs) status = self.checkstatus(costs, graphs) @@ -277,13 +277,13 @@ def filter_by_cost(self,graphs): elif self.pareto_option == "default": # Take best graphs from estimators and pareto front - return pareto_option.default(graphs, costs, keepgraphs), False + return pareto_options.default(graphs, costs, keepgraphs), False elif self.pareto_option == "paretogreed": # 1. choose pareto graphs # 2. new score is the average rank over all costs # 3. choose k best of those - return pareto_option.paretogreed(graphs, costs, keepgraphs), False + return pareto_options.paretogreed(graphs, costs, keepgraphs), False paretoselectedgraphs = paretof._pareto_set(graphs, costs) random.shuffle(paretoselectedgraphs) diff --git a/exploration/pareto_options.py b/exploration/pareto_options.py index 7efc605..4bcf89a 100644 --- a/exploration/pareto_options.py +++ b/exploration/pareto_options.py @@ -1,5 +1,7 @@ -import pareto_funcs as paretof +import exploration.pareto_funcs as paretof +from graphlearn.cipcorevector import vertex_vec import numpy as np +from sklearn.metrics.pairwise import euclidean_distances def greedy(graphs, target, decomposer, keepgraphs): """ From d770c96743020897cd74d2d38f1e59a8dcefcd36 Mon Sep 17 00:00:00 2001 From: "mh595@uni-freiburg.de" Date: Sat, 27 Mar 2021 14:03:36 +0100 Subject: [PATCH 5/7] Fixed Pareto options. Greedy still problematic --- exploration/pareto.py | 25 +++++++++++++------------ exploration/pareto_options.py | 4 +++- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/exploration/pareto.py b/exploration/pareto.py index 573dd14..214024f 100644 --- a/exploration/pareto.py +++ b/exploration/pareto.py @@ -10,7 +10,7 @@ from exploration.pareto_funcs import _manage_int_or_float logger = logging.getLogger(__name__) import structout as so -from exploration import pareto_options, cost_estimator as costs +from exploration import pareto_options, pareto_funcs as paretof, cost_estimator as costs from extensions import lsggscramble as lsggs from sklearn.metrics.pairwise import euclidean_distances @@ -246,16 +246,18 @@ def filter_by_cost(self,graphs): """expand "keepgraphs" graphs, divided between top graphs in everything and pareto front, discard rest""" keepgraphs = self.keepgraphs + + costs = self.get_costs(graphs) + status = self.checkstatus(costs, graphs) + if status: + # Some graph has distance == 0 + return graphs, True if len(graphs) <= self.keepgraphs: # Only few graphs remaining so just return all of them. logger.log(10, "cost_filter: keep all graphs") - return graphs + return graphs, False - elif self.pareto_option == "random": - # Return randomly selected graphs without any application of pareto. - res = random.sample(graphs, keepgraphs) - ## elif self.prefilter_kick!=0: ## # DELETE THE 25% worst in each category ## assert False @@ -264,16 +266,14 @@ def filter_by_cost(self,graphs): ## keep = [i for i in range(len(graphs)) if i not in trash] ## graphs = [graphs[i] for i in keep] ## costs = costs[keep] - + elif self.pareto_option == 'greedy': # Return graphs with the lowest euclidean distance to the target vector return pareto_options.greedy(graphs, self.target_graph_vector, self.decomposer, keepgraphs) - costs = self.get_costs(graphs) - status = self.checkstatus(costs, graphs) - if status: - # Some graph has distance == 0 - return graphs, True + elif self.pareto_option == "random": + # Return randomly selected graphs without any application of pareto. + return random.sample(graphs, keepgraphs), False elif self.pareto_option == "default": # Take best graphs from estimators and pareto front @@ -287,6 +287,7 @@ def filter_by_cost(self,graphs): paretoselectedgraphs = paretof._pareto_set(graphs, costs) random.shuffle(paretoselectedgraphs) + if self.pareto_option == "pareto_only": # Return only graphs from the pareto front return paretoselectedgraphs[:keepgraphs], False diff --git a/exploration/pareto_options.py b/exploration/pareto_options.py index 4bcf89a..aee7793 100644 --- a/exploration/pareto_options.py +++ b/exploration/pareto_options.py @@ -2,6 +2,8 @@ from graphlearn.cipcorevector import vertex_vec import numpy as np from sklearn.metrics.pairwise import euclidean_distances +import random + def greedy(graphs, target, decomposer, keepgraphs): """ @@ -13,7 +15,7 @@ def greedy(graphs, target, decomposer, keepgraphs): distances.append(euclidean_distances(target, vertex_vec(g, decomposer).sum(axis=0))[0][0]) ranked_distances = np.argsort(distances)[:keepgraphs] res = [graphs[i] for i in ranked_distances] - if distances[ranked_distances[0]] == 0: + if distances[ranked_distances[0]] <= 0: ## => At least 1 distance is 0 => Successful reconstruction return res, True return res, False From 0a9e6e648509f294b0d68e7558a17259d5ab2a42 Mon Sep 17 00:00:00 2001 From: "mh595@uni-freiburg.de" Date: Thu, 1 Apr 2021 14:48:52 +0200 Subject: [PATCH 6/7] Probably last commit from the cluster --- exploration/pareto.py | 11 +-- exploration/pareto_options.py | 5 +- reconstruct.py | 2 +- run_in_order.sh | 134 +++++++++++++--------------------- 4 files changed, 61 insertions(+), 91 deletions(-) diff --git a/exploration/pareto.py b/exploration/pareto.py index 214024f..589eae3 100644 --- a/exploration/pareto.py +++ b/exploration/pareto.py @@ -247,11 +247,12 @@ def filter_by_cost(self,graphs): and pareto front, discard rest""" keepgraphs = self.keepgraphs - costs = self.get_costs(graphs) - status = self.checkstatus(costs, graphs) - if status: - # Some graph has distance == 0 - return graphs, True + if True: # Greedy Cost calculation doesnt work at the moment.. self.pareto_option != 'greedy': + costs = self.get_costs(graphs) + status = self.checkstatus(costs, graphs) + if status: + # Some graph has distance == 0 + return graphs, True if len(graphs) <= self.keepgraphs: # Only few graphs remaining so just return all of them. diff --git a/exploration/pareto_options.py b/exploration/pareto_options.py index aee7793..3acdf5b 100644 --- a/exploration/pareto_options.py +++ b/exploration/pareto_options.py @@ -11,11 +11,14 @@ def greedy(graphs, target, decomposer, keepgraphs): Also returns if one of the distances equals 0. """ distances = [] +#### distances = euclidean_distances(target, vertex_vec(graphs, decomposer)) for g in graphs: distances.append(euclidean_distances(target, vertex_vec(g, decomposer).sum(axis=0))[0][0]) + if min(distances) == 0: + return graphs, True ranked_distances = np.argsort(distances)[:keepgraphs] res = [graphs[i] for i in ranked_distances] - if distances[ranked_distances[0]] <= 0: + if distances[ranked_distances[0]] < 0.000001: ## => At least 1 distance is 0 => Successful reconstruction return res, True return res, False diff --git a/reconstruct.py b/reconstruct.py index 88f11cc..8d01828 100644 --- a/reconstruct.py +++ b/reconstruct.py @@ -45,7 +45,7 @@ def maketasks(params): ## OPTIONS FOR GRAPHS ########################################## -EXPERIMENT_REPEATS = 50 #### CHANGE THIS BACK TO 100! 50 only for chemsets +EXPERIMENT_REPEATS = 20 #### CHANGE THIS BACK TO 100! 50 only for chemsets # 1. param dict params_graphs = { diff --git a/run_in_order.sh b/run_in_order.sh index 3d0a735..64dd189 100644 --- a/run_in_order.sh +++ b/run_in_order.sh @@ -3,11 +3,11 @@ source /beegfs/work/workspace/ws/fr_mh595-conda-0/conda/etc/profile.d/conda.sh conda activate binenv -REPEATS=50 ### Change to 100 for Normal or 250 for Chem +REPEATS=100 ### Change to 100 for Normal or 250 for Chem execute () { ######## Make sure to change filename in first sed command to 'chem_runall_binac.sh' or just 'runall_binac.sh' - sed '/reconstruct.py/s/$/'"$STRING"'/' runall_binac.sh > .run_$RESPREFIX.sh + sed '/reconstruct.py/s/$/'"$STRING"'/' chem_runall_binac.sh > .run_$RESPREFIX.sh JOBID=$(qsub -q short -t 1-$REPEATS .run_$RESPREFIX.sh | sed -r 's/^[^0-9]*([0-9]+).*$/\1/') echo "Current Task: $JOBID : $STRING" echo "$JOBID : $STRING" >> results.txt @@ -32,17 +32,17 @@ pass () { echo "Start: $(date)" ## Parameter Optimization -CIPSELECTOR=1 -for PARETO in 'random' 'greedy' 'pareto_only' 'all' 'default'; do - for NORMALIZATION in 1; do - for CIPK in 500 1000; do +CIPSELECTOR=2 +for PARETO in 'greedy' 'default'; do + for NORMALIZATION in 0 1; do + for CIPK in 10; do for DECOMPRADIUS in 1 2; do for CONTEXTSIZE in 1 2; do - for MINCOUNT in 1; do - for SIZELIMITER in 1; do + for MINCOUNT in 1 2; do + for SIZELIMITER in 0 1; do RESPREFIX="$CIPSELECTOR-$CIPK-$CONTEXTSIZE-$MINCOUNT-$SIZELIMITER-$NORMALIZATION-$DECOMPRADIUS-$PARETO" STRING=" --cipselector_option $CIPSELECTOR --pareto_option $PARETO --cipselector_k $CIPK --context_size $CONTEXTSIZE --min_count $MINCOUNT --graph_size_limiter $SIZELIMITER --use_normalization $NORMALIZATION --max_decompose_radius $DECOMPRADIUS --resprefix $RESPREFIX" - ##report ## Replace this with report/execute/pass +##### report ## Replace this with report/execute... HANDLE WITH CARE done done done @@ -51,53 +51,36 @@ for PARETO in 'random' 'greedy' 'pareto_only' 'all' 'default'; do done done ## Chemset Comparison +CIPSELECTOR=2 +for PARETOOPTION in 'default' 'greedy'; do for CONTEXTSIZE in 1 2; do - for CIPK in 100 200 300 400; do - RESPREFIX="cipK-$CIPK-contextsize-$CONTEXTSIZE" - STRING=" --pareto_option 'default' --core_sizes 0 1 2 3 --context_size $CONTEXTSIZE --cipselector_k $CIPK --resprefix $RESPREFIX" + for CIPK in 5 10 15 20; do + RESPREFIX="res_CHEMCOMPARE_pareto-$PARETOOPTION-cipsel-$CIPSELECTOR-cipK-$CIPK-contextsize-$CONTEXTSIZE" + STRING=" --pareto_option $PARETOOPTION --core_sizes 0 1 2 3 --context_size $CONTEXTSIZE --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" # reportchem done done +done ## Artificial Comparison CIPSELECTOR=2 +CIPK=10 +for PARETOOPTION in 'default' 'greedy'; do for CONTEXTSIZE in 1 2; do - RESPREFIX="coresizes-012-contextsize-$CONTEXTSIZE" - STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 1 2 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" + RESPREFIX="res_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETOOPTION-coresizes-012-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --pareto_option $PARETOOPTION --core_sizes 0 1 2 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report - RESPREFIX="coresizes-01-contextsize-$CONTEXTSIZE" - STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 1 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" + RESPREFIX="res_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETOOPTION-coresizes-01-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --pareto_option $PARETOOPTION --core_sizes 0 1 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report - RESPREFIX="coresizes-0-contextsize-$CONTEXTSIZE" - STRING=" --cipselector_option $CIPSELECTOR --core_sizes 0 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" + RESPREFIX="res_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETOOPTION-coresizes-0-contextsize-$CONTEXTSIZE" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --pareto_option $PARETOOPTION --core_sizes 0 --context_size $CONTEXTSIZE --resprefix $RESPREFIX" # report done +done -## Comparison of Average Productions ## - CIPSELECTOR=0 - CIPK=1000 - RESPREFIX="cipsel0_$CIPK" - STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" -# report - CIPSELECTOR=1 - CIPK=100 - RESPREFIX="cipsel1_$CIPK" - STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" -# report - CIPSELECTOR=2 - CIPK=10 - RESPREFIX="cipsel2_$CIPK" - STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --resprefix $RESPREFIX" -# report ########################## -## Cipselector 1: 100 200 400 800 -for CIPK in 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200; do -#for CIPK in 10 30 50 70 90; do - RESPREFIX="cipsel1_$CIPK" - STRING=" --cipselector_option 1 --cipselector_k $CIPK --resprefix $RESPREFIX" -# report -done for KEEP in 12 30 60; do for CIPK in 50 100 250; do @@ -108,47 +91,15 @@ for KEEP in 12 30 60; do done -## Cipselector 2: 1 5 10 15 20 #### REMOVED 100 FOR CHEMSETS -for CIPK in 1 5 10 15 20; do - RESPREFIX="cipsel2_$CIPK" - STRING=" --cipselector_option 2 --cipselector_k $CIPK --resprefix $RESPREFIX" -# pass -done - -## Normalization: -for NORM in 0; do - RESPREFIX="no_norm" - STRING=" --use_normalization $NORM --resprefix $RESPREFIX" -# pass -done - -## Pareto Options: ###### REMOVED 'all' FOR CHEMSETS +CIPSELECTOR=2 +CIPK=10 +## Pareto Comparison: for PARETO in 'random' 'greedy' 'paretogreed' 'pareto_only' 'all' 'default'; do - RESPREFIX="res_pareto_$PARETO" -# STRING=" --pareto_option $PARETO --resprefix $RESPREFIX" - STRING=" --core_sizes 0 1 2 3 --pareto_option $PARETO --resprefix $RESPREFIX" -# reportchem -done - -## Contextsizes/Thickness: -for CONTEXTSIZE in 1 2; do - RESPREFIX="contextsize_$CONTEXTSIZE" - STRING=" --context_size $CONTEXTSIZE --resprefix $RESPREFIX" -# pass -done - -## Mincount/min_cip: -for MINCOUNT in 1 2; do - RESPREFIX="mincount_$MINCOUNT" - STRING=" --min_count $MINCOUNT --resprefix $RESPREFIX" -# pass -done - -## Graphsizelimiter: -for SIZELIMITER in 0 1; do - RESPREFIX="sizelimiter_$SIZELIMITER" - STRING=" --graph_size_limiter $SIZELIMITER --resprefix $RESPREFIX" -# pass + RESPREFIX="res_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETO" + RESPREFIX="res_CHEM_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETO" + RESPREFIX="res_MINCOUNT1_cipsel-$CIPSELECTOR-cipk-$CIPK-pareto-$PARETO" + STRING=" --cipselector_option $CIPSELECTOR --cipselector_k $CIPK --pareto_option $PARETO --resprefix $RESPREFIX" +### report done @@ -161,7 +112,7 @@ KEEP=60 for CIPK in 50 450 1125 1875 2625 3375 4125; do RESPREFIX="res_keep-$KEEP-cipsel0-$CIPK" STRING=" --keepgraphs $KEEP --cipselector_option 0 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" - report +# report done #CIP1 ###for CIPK in 10 50 100 150 200 250 300; do # KEEP 30 @@ -170,7 +121,7 @@ done for CIPK in 1 15 38 63 88 113 138; do RESPREFIX="res_keep-$KEEP-cipsel1-$CIPK" STRING=" --keepgraphs $KEEP --cipselector_option 1 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" - report +# report done #CIP2 ###for CIPK in 10 20 30 40 50 60 70 80 90 100; do # KEEP 30 @@ -179,5 +130,20 @@ done for CIPK in 1 2 3 4 6 7 8 9; do RESPREFIX="res_keep-$KEEP-cipsel2-$CIPK" STRING=" --keepgraphs $KEEP --cipselector_option 2 --cipselector_k $CIPK --pareto_option 'greedy' --resprefix $RESPREFIX" - report +# report +done + + + +######### TESTING ######### +RESPREFIX="res_TESTING" +STRING=" --cipselector_option 1 --cipselector_k 100 --pareto_option 'greedy' --resprefix $RESPREFIX" +#report # execute + +### 24 Core Test ### +PARETOOPTION='greedy' +for CONTEXTSIZE in 1 2; do + RESPREFIX="res_24core_CHEMCOMPARE_pareto-$PARETOOPTION-cipsel-2-cipK-10-contextsize-$CONTEXTSIZE" + STRING=" --pareto_option $PARETOOPTION --core_sizes 0 1 2 3 --context_size $CONTEXTSIZE --cipselector_option 2 --cipselector_k 10 --resprefix $RESPREFIX" +# reportchem done From 23801e9a7a5195215918b650935f2772b3d3e7e1 Mon Sep 17 00:00:00 2001 From: MatthiasHerrmann Date: Thu, 1 Apr 2021 15:42:07 +0200 Subject: [PATCH 7/7] Fix default params and some minor cleaning --- exploration/pareto.py | 4 ---- reconstruct.py | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/exploration/pareto.py b/exploration/pareto.py index 589eae3..1e4744b 100644 --- a/exploration/pareto.py +++ b/exploration/pareto.py @@ -482,10 +482,6 @@ def __init__( self.output_k_best = output_k_best self.decomposer = decomposer self.grammar = lsgg_size_hack(radii=core_sizes, thickness=context_size, core_vec_decomposer=decomposer, cipselector=cipselector, nodelevel_radius_and_thickness=True) #cip_root_all=False, half_step_distance=True) -### self.grammar = lsggold(nodelevel_radius_and_thickness=True) #cip_root_all=False, half_step_distance=True) -## self.grammar.radii = core_sizes #self.grammar.set_core_size(core_sizes) -## self.grammar.thickness = context_size #self.grammar.decomposition_args['thickness_list'] = [context_size] - #self.grammar.set_min_count(min_count) interfacecount 1 makes no sense self.grammar.filter_min_cip = min_count #self.grammar.filter_args['min_cip_count'] = min_count self.optiopts = kwargs self.cs2cs = cs2cs diff --git a/reconstruct.py b/reconstruct.py index 8d01828..33499e9 100644 --- a/reconstruct.py +++ b/reconstruct.py @@ -45,7 +45,7 @@ def maketasks(params): ## OPTIONS FOR GRAPHS ########################################## -EXPERIMENT_REPEATS = 20 #### CHANGE THIS BACK TO 100! 50 only for chemsets +EXPERIMENT_REPEATS = 50 # 1. param dict params_graphs = { @@ -125,7 +125,7 @@ def maketasks(params): help='Core sizes/Radii') parser.add_argument('--context_size', nargs=1, type=float, default=[1], help='Context sizes/Thickness') -parser.add_argument('--cipselector_option', nargs=1, type=int, default=[1], ## Change this back +parser.add_argument('--cipselector_option', nargs=1, type=int, default=[2], ## Change this back choices=[0, 1, 2], help='1: Take k best from all, 2: Take k best from each current cip') parser.add_argument('--cipselector_k', nargs=1, type=int, default=[10], @@ -137,7 +137,7 @@ def maketasks(params): help='Number of graphs kept from the pareto part') parser.add_argument('--use_normalization', nargs=1, type=int, default=[1], choices=[1,0], help='If 1, normalization will be applied for cipselection') -parser.add_argument('--min_count', nargs=1, type=int, default=[2], +parser.add_argument('--min_count', nargs=1, type=int, default=[1], help='Also called min_cip') parser.add_argument('--graph_size_limiter', nargs=1, type=int, default=[1], choices=[1,0], help='If 0, graph size limiter is only used with a graphs >100')