-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathPXR.yaml
More file actions
113 lines (113 loc) · 12.5 KB
/
PXR.yaml
File metadata and controls
113 lines (113 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Top-level modelling parameters, one per line, written as
#   key : value # [listed options] description
# A key with nothing between the colon and the comment parses as null.
input_type : molecule # ['molecule', 'data', 'model_ensemble'] Type of input data.
quantitative : True # [True, False] Should be true for quantitative endpoints and false for qualitative endpoints
confidential : False # [True, False] Confidentiality of the model. If True an estimator with no trace of the training series will be built
SDFile_activity : pchembl_value # Name of the activity field in the SDF file
SDFile_name : molecule_pref_name # Name of the compound name field in the SDF file
SDFile_id : ID # Name of the compound ID field in the SDF file
SDFile_experimental : # Experimental data field in the SDF file
SDFile_complementary : # Complementary data field in the SDF file
normalize_method : standardize # [None, 'standardize', 'chEMBL'] Selection of a standardization method
ionize_method : # [None, 'Moka (not implemented)'] Selection of an ionization method
convert3D_method : # ['ETKDG', None] Selection of a 3D conversion method
computeMD_method : ['RDKit_properties', 'RDKit_md'] # ['RDKit_properties', 'RDKit_md', 'morganFP', 'rdkFP', 'substructureFP', 'custom'] Selection of molecular descriptors to be used in model generation
model : XGBOOST # ['RF', 'XGBOOST', 'SVM', 'PLSR', 'PLSDA', 'GNB', 'mean', 'median', 'majority', 'logicalOR', 'matrix'] List of available ML algorithms
modelAutoscaling : StandardScaler # ['StandardScaler', 'MinMaxScaler', 'RobustScaler', None] Scaling method. Null means that raw, unscaled data, will be used
tune : False # [True, False] Whether to perform or not algorithm hyperparameter optimization
conformal : True # [True, False] If true, use the conformal variant of the selected modeling method, when available
conformalConfidence : 0.80000 # Conformal estimator confidence (from 0 to 1)
ModelValidationCV : kfold # ['loo', 'kfold'] Selection of cross-validation method
ModelValidationN : 2 # Number of folds
feature_importances_method : internal # ['internal', 'permutation'] Method used to compute the model feature importances.
output_format : JSON # ['JSON', 'TSV'] Output data format
output_md : False # [True, False] Dump descriptors to a TSV file
output_similar : True # [True, False] Show compounds similar to the query compounds present in the training series
TSV_activity : activity # Name of the activity field in the TSV file
TSV_objnames : True # [True, False] True if object names in first column of data matrix
imbalance : # [None, 'simple_subsampling', 'near_miss1', 'near_miss2', 'near_miss3', 'edited_KNN', 'rep_edited_KNN', 'all_KNN', 'iht'] Whether to perform or not sub/over sampling strategies.
feature_selection : # [None, 'Kbest'] Feature selection based on univariate statistics (F-test)
feature_number : auto # ['auto'] Number of features to keep or auto for selecting 10% of objects
mol_batch : series # ['series', 'objects'] How molecules are going to be processed
ensemble_names : # Names of the ensemble models the output of which will be used as input for this model
ensemble_versions : # Versions of the ensemble models the output of which will be used as input for this model
similarity_metric : # ['euclidean', 'tanimoto', 'substructural'] Show compounds similar to the query compounds present in the training series
similarity_cutoff_num : 5 # Maximum number of similar compounds to extract. If not set, all compounds meeting other criteria will be extracted
similarity_cutoff_distance : 0.60000 # Float in the range (0.0 - 1.0) indicating minimum similarity of the compounds to extract. If not set, all compounds meeting other criteria will be extracted
tune_metric_quantitative : r2 # ['r2', 'neg_mean_squared_error'] Metric used for the GridCV optimization search
tune_metric_qualitative : mcc # ['mcc', 'recall', 'precision', 'balanced_accuracy', 'f1', 'f1_weigthed'] Metric used for the GridCV optimization search
tune_cv_fold : 5 # Number of folds in the KFold CV used in GridCV optimization search
numCPUs : 6 # [None] Number of independent threads used to compute MD
verbose_error : True # [True, False] When false, RDKit errors are captured and not shown
modelingToolkit : internal # ['internal', 'R', 'KNIME', 'custom'] List of toolkits usable to build models
endpoint : PXRdemo # NOTE(review): presumably the endpoint/model name; it also appears as a directory in model_path - confirm
model_path : X:\models\quality\COX2\models\PXRdemo\dev # NOTE(review): absolute Windows path on drive X:, so machine-specific; it passes through a COX2 directory although endpoint is PXRdemo - confirm this is intended
version : 0 # NOTE(review): presumably the model version number - confirm
# Options for the molecular-descriptor calculations (mordred, Morgan
# fingerprints, RDKit descriptors).
# The child keys are indented so that they nest under MD_settings; written
# flush-left they would parse as top-level keys and MD_settings would be null.
MD_settings :
  mordred_3D : False # Whether to compute or not mordred 3D descriptors
  morgan_features : True # [True, False] Whether to use or not feature-based invariants.
  morgan_nbits : 2048 # Size of the Morgan fingerprint (e.g. 1024, 2048)
  morgan_radius : 2 # Morgan fingerprint radius (e.g. 2, 4)
  rdkit_black_list : ['Ipc'] # Name of problematic RDKit molecular descriptor to avoid
# Options for conformal prediction, including the aggregated variant (ACP).
# The child keys are indented so that they nest under conformal_settings;
# written flush-left they would parse as top-level keys and conformal_settings
# would be null.
conformal_settings :
  ACP_sampler : BootstrapSampler # ['BootstrapSampler', 'RandomSubSampler', 'CrossSampler'] Sampling strategy to select calibration sets in aggregated conformal predictions (ACP).
  KNN_NN : 15 # Number of nearest neighbors used by KNN normalizing model.
  aggregated : True # [True, False] Use aggregated conformal predictions (ACP).
  aggregation_function : median # ['median', 'mean'] Function used to aggregate p-values in aggregated conformal predictions (ACP).
  conformal_predictors : 10 # Number of models to build aggregated conformal predictions (ACP).
  normalizing_model : KNN # ['KNN', 'Underlying'] Normalizing model used to scale non-conformity scores.
# Random forest ('RF') hyperparameters.
# The child keys are indented so that they nest under RF_parameters. Written
# flush-left they would parse as top-level keys, and names such as max_depth,
# n_estimators, class_weight and random_state would collide with the
# same-named keys of other sections.
RF_parameters :
  class_weight : balanced # [None, 'balanced'] Weights associated with classes. If not given, all classes are supposed to have weight one
  max_depth : # Maximum tree depth. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
  max_features : sqrt # ['auto', 'sqrt', 'log2', None] Maximum features. sqrt = sqrt(n_features). log2 = log2(n_features). None = n_features. Auto idem sqrt.
  min_samples_split : 2 # The minimum number of samples required to split an internal node (must be > 1)
  n_estimators : 200 # [50, 100, 150] The number of trees in the forest
  oob_score : True # [True, False] Whether to use out-of-bag samples to estimate the generalization accuracy
  random_state : 46 # [46, None] Random seed
# Lists of candidate values for the random forest ('RF') hyperparameters.
# NOTE(review): presumably the grid searched when `tune` is True - confirm.
# The child keys are indented so that they nest under RF_optimize. Written
# flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (e.g. max_depth, n_estimators).
RF_optimize :
  class_weight : ['default', 'balanced'] # ['default', 'balanced']
  max_depth : [2, 3] #
  max_features : ['sqrt', 'log2'] # ['auto', 'sqrt', 'log2', None]
  min_samples_split : [2, 3] #
  n_estimators : [50, 100] #
# XGBoost ('XGBOOST') hyperparameters.
# The child keys are indented so that they nest under XGBOOST_parameters.
# Written flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (e.g. max_depth, n_estimators).
XGBOOST_parameters :
  booster : gbtree # ['gbtree', 'gblinear', 'dart'] Specify which booster to use
  learning_rate : 0.30000 # [1, 0.3, 0.1] Boosting learning rate (xgb's "eta")
  max_depth : 4 # Maximum tree depth for base learners.
  n_estimators : 200 # Number of trees to fit.
# Lists of candidate values for the XGBoost ('XGBOOST') hyperparameters.
# NOTE(review): presumably the grid searched when `tune` is True - confirm.
# The child keys are indented so that they nest under XGBOOST_optimize.
# Written flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (e.g. booster, max_depth).
XGBOOST_optimize :
  booster : ['gbtree', 'gblinear', 'dart'] # ['gbtree', 'gblinear', 'dart']
  learning_rate : [1, 0.1, 0.01] # [1, 0.1, 0.01]
  max_depth : [1, 3, 6] # [1, 3, 6]
  n_estimators : [50, 100, 150] # [50, 100, 150]
# Support vector machine ('SVM') hyperparameters.
# The child keys are indented so that they nest under SVM_parameters. Written
# flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (e.g. class_weight, random_state).
SVM_parameters :
  C : 1 # Penalty parameter C of the error term.
  class_weight : # [None, 'balanced'] Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one.
  coef0 : 0 # Independent term in kernel function. It is only significant in poly and sigmoid.
  degree : 3 # Degree of the polynomial kernel function (poly). Ignored by all other kernels.
  gamma : auto # Kernel coefficient for rbf, poly and sigmoid.
  kernel : rbf # [None, 'rbf', 'linear', 'poly'] Specifies the kernel type to be used in the algorithm.
  probability : True # [True, False] Whether to enable probability estimates.
  random_state : 46 # [46, None] Random seed
  shrinking : True # [True, False] Whether to use the shrinking heuristic.
# Lists of candidate values for the support vector machine ('SVM')
# hyperparameters.
# NOTE(review): presumably the grid searched when `tune` is True - confirm.
# The child keys are indented so that they nest under SVM_optimize. Written
# flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (e.g. C, kernel, class_weight).
SVM_optimize :
  # NOTE(review): the options listed for C match those of `degree` and do not
  # include the configured 10 or 100 - confirm which list is intended.
  C : [1, 10, 100] # [None, 1, 3, 5]
  class_weight : ['default', 'balanced'] # ['default', 'balanced']
  coef0 : [0, 0.8, 100] # [None, 0, 0.8, 100]
  degree : [1, 3, 5] # [None, 1, 3, 5]
  gamma : ['auto'] # [None, 'auto']
  kernel : ['rbf'] # [None, 'rbf', 'linear', 'poly']
  probability : [True] # [True, False]
  shrinking : [True] # [True, False]
# Hyperparameters for the 'PLSR' model.
# The child key is indented so that it nests under PLSR_parameters. Written
# flush-left it would parse as a top-level key and collide with the
# n_components keys of the other PLS sections.
PLSR_parameters :
  n_components : 2 # Number of latent variables to extract
# List of candidate values for the 'PLSR' hyperparameter.
# NOTE(review): presumably the grid searched when `tune` is True - confirm.
# The child key is indented so that it nests under PLSR_optimize. Written
# flush-left it would parse as a top-level key and collide with the
# n_components keys of the other PLS sections.
PLSR_optimize :
  n_components : [2, 3, 4, 5] # [2, 3, 4, 5, 6]
# Hyperparameters for the 'PLSDA' model.
# The child keys are indented so that they nest under PLSDA_parameters.
# Written flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (n_components, threshold).
PLSDA_parameters :
  n_components : 2 # Number of latent variables to extract
  threshold : 0.50000 # [0.5] Cutoff value from 0 to 1 which separates class 0 from class 1
# Lists of candidate values for the 'PLSDA' hyperparameters.
# NOTE(review): presumably the grid searched when `tune` is True - confirm.
# The child keys are indented so that they nest under PLSDA_optimize. Written
# flush-left they would parse as top-level keys and collide with the
# same-named keys of other sections (n_components, threshold).
PLSDA_optimize :
  n_components : [2, 3, 4, 5] # [2, 3, 4, 5]
  threshold : [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75] #
# Hyperparameters for the 'GNB' model. All three are left empty, which parses
# as null.
# The child keys are indented so that they nest under GNB_parameters; written
# flush-left they would parse as top-level keys and GNB_parameters would be
# null.
GNB_parameters :
  prior_negative : # prior probability for class 0
  prior_positive : # prior probability for class 1
  var_smoothing : # Lorentzian variance smoothing factor