Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ instruction for the installation on Ubuntu.
-spec smlp_toy_num_resp_mult_free_inps -data_scaler min_max \
-beta "y1>7 and y2>6" -objv_names obj1,objv2,objv3 \
-objv_exprs "(y1+y2)/2;y1/2-y2;y2" -epsilon 0.05 -delta_rel 0.01 \
-save_model_config f -mrmr_pred 0 -plots f -seed 10 -log_time f \
-save_model_config f -plots f -seed 10 -log_time f \
-spec ../specs/smlp_toy_num_resp_mult_free_inps.spec

# then the regression script
Expand Down Expand Up @@ -124,8 +124,8 @@ will produce the SMLP command for the regression test number 1:

../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \
-out_dir ./ -pref Test1 -mode train -resp y1 -feat x,p1,p2 \
-model dt_caret -save_model_config f -mrmr_pred 0 -plots f \
-seed 10 -log_time f
-model dt_caret -save_model_config f -feat_select_model mrmr \
-feat_select_count 0 -plots f -seed 10 -log_time f

For details about those parameters, please refer to the help messages (-h) of
both tools, src/run_smlp.py and regr_smlp/code/smlp_regr.py, as well as the
Expand Down Expand Up @@ -185,12 +185,14 @@ directory, run the following commands:
-mode optimize -pareto f -sat_thresh f -resp o0 -feat \
Byte,CH,RANK,Timing,i0,i1,i2,i3 -model dt_sklearn -dt_sklearn_max_depth 15 \
-data_scaler min_max -epsilon 0.05 -log_time f -plots f \
-feat_select_model mrmr -feat_select_count 15 \
-spec ../specs/smlp_s2_tx

../../src/run_smlp.py -out_dir ./ -pref smlp_s2_tx_nn -data ../data/smlp_s2_tx \
-mode optimize -pareto f -sat_thresh f -resp o0 \
-feat Byte,CH,RANK,Timing,i0,i1,i2,i3 \
-model nn_keras -nn_keras_epochs 20 -data_scaler min_max \
-feat_select_model mrmr -feat_select_count 15 \
-epsilon 0.05 -log_time f -plots f -spec ../specs/smlp_s2_tx

These runs will take longer than the regression tests provided earlier,
Expand Down
154 changes: 129 additions & 25 deletions src/smlp_py/smlp_data.py

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions src/smlp_py/smlp_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,14 @@ def smlp_flow(self):
#self.logger.info('Running SMLP in mode "{}": Start'.format(args.analytics_mode))
self.logger.info('PREPARE DATA FOR MODELING')
X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \
levels_dict, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, True, args.split_test,
levels_dict, pca_equations, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, args.spec, True, args.split_test,
feat_names, resp_names, args.keep_features, args.train_first_n, args.train_random_n, args.train_uniform_n,
args.interactive_plots, args.response_plots, args.data_scaler,
args.scale_features, args.scale_responses, args.impute_responses, args.mrmr_feat_count_for_prediction,
args.scale_features, args.scale_responses, args.impute_responses, args.feature_selection_model,
args.feature_selection_count, args.pca_feat_count_for_prediction,
args.positive_value, args.negative_value, args.response_map, args.response_to_bool, args.save_model, args.use_model)

# sanity check that the order of features in model_features_dict, feat_names, X_train, X_test, X is
# the same; this is mostly important for model exploration modes
self.modelInst.model_features_sanity_check(model_features_dict, feat_names, X_train, X_test, X)
Expand Down Expand Up @@ -388,7 +389,7 @@ def smlp_flow(self):
elif args.analytics_mode == 'optimize':
self.optInst.smlp_optimize(syst_expr_dict, args.model, model,
self.dataInst.unscaled_training_features, self.dataInst.unscaled_training_responses,
model_features_dict, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto,
model_features_dict, pca_equations, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto,
args.optimization_strategy, quer_names, quer_exprs,
delta_dict, args.epsilon, alpha_global_expr, beta_expr, args.eta, theta_radii_dict,
args.solver_logic, args.vacuity_check,
Expand Down
7 changes: 2 additions & 5 deletions src/smlp_py/smlp_mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ def __init__(self):
self._MRMR_FEATURES_PRED = 15
self._MRMR_FEATURES_CORR = 15
self.mrmr_params_dict = {
'mrmr_feat_count_for_prediction': {'abbr':'mrmr_pred', 'default':self._MRMR_FEATURES_PRED, 'type':int,
'help':'Count of features selected by MRMR algorithm for predictive models ' +
'[default: {}]'.format(str(self._MRMR_FEATURES_PRED))},
'mrmr_feat_count_for_correlation': {'abbr':'mrmr_corr', 'default':self._MRMR_FEATURES_CORR, 'type':int,
'help':'Count of features selected by MRMR algorithm for correlation analysis ' +
'[default: {}]'.format(str(self._MRMR_FEATURES_CORR))}
Expand Down Expand Up @@ -63,7 +60,7 @@ def _mrmr_regres(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f', redund
str(y.name) + ' :\n'+ str(mrmr_scores_df))

self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end')
return mrmr_res[0], mrmr_scores_df
return mrmr_res[0], mrmr_scores_df, mrmr_res[1]

# mrmr feature selection using mrmr-feature package, where y is a categorical variable (pandas.Series)
# TODO !!!: not tested
Expand All @@ -87,7 +84,7 @@ def _mrmr_class(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f',
y.name + ' :\n'+ str(mrmr_scores_df))

self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end')
return mrmr_res[0], mrmr_scores_df
return mrmr_res[0], mrmr_scores_df, mrmr_res[1]

def smlp_mrmr(self, X:pd.DataFrame, y:pd.Series, #resp_type:str, #"numeric",
feat_cnt:int):
Expand Down
6 changes: 3 additions & 3 deletions src/smlp_py/smlp_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,8 +1096,8 @@ def check_synthesis_feasibility(self, feasibility:bool, objv_names:list[str], ob
# function (and to smlp_optsyn() instead of passing X,y; The bounds on objectives are not strictly necessary,
# any approximation may be used, but accurate approximation might reduce iterations count needed for
# computing optimal configurations (in optimize and optsyn modes)
def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict,
feat_names:list[str], resp_names:list[str],
def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict,
pca_equations:dict, feat_names:list[str], resp_names:list[str],
objv_names:list[str], objv_exprs, pareto:bool, strategy:str, #asrt_names:list[str], asrt_exprs,
quer_names:list[str], quer_exprs, delta:float, epsilon:float,
alph_expr:str, beta_expr:str, eta_expr:str, theta_radii_dict:dict, solver_logic:str, vacuity:bool,
Expand All @@ -1115,7 +1115,7 @@ def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFram

domain, syst_term_dict, model_full_term_dict, eta, alpha, beta, interface_consistent, model_consistent = \
self._modelTermsInst.create_model_exploration_base_components(
syst_expr_dict, algo, model, model_features_dict, feat_names, resp_names,
syst_expr_dict, algo, model, model_features_dict, pca_equations, feat_names, resp_names,
#delta, epsilon, #objv_names, objv_exprs, None, None, None, None,
alph_expr, beta_expr, eta_expr, data_scaler, scale_feat, scale_resp, #scale_objv,
float_approx, float_precision, data_bounds_json_path)
Expand Down
225 changes: 225 additions & 0 deletions src/smlp_py/smlp_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
# SPDX-License-Identifier: Apache-2.0
# This file is part of smlp.

import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
import json
#from sklearn.preprocessing import StandardScaler

class SmlpPCA:
    """PCA-based feature compression for SMLP.

    Thin wrapper around sklearn.decomposition.PCA that
      * compresses training features into PC1..PCk components (smlp_pca),
      * derives a new .spec file whose knobs are the PCA components, with
        ranges estimated by interval arithmetic (create_pca_based_spec),
      * maps PCA-space points back to the original feature space
        (inverse_transform), and
      * extracts per-feature linear equations in terms of the components
        (get_feature_equations).

    All generated artifacts are written to <project root>/regr_smlp/models.
    """

    def __init__(self):
        # Shared SMLP logger; injected later via set_logger().
        self._pca_logger = None
        # Fitted sklearn PCA instance; populated by smlp_pca().
        self.pca_model = None
        # Default component count; 0 conventionally means "PCA disabled"
        # (callers are expected to skip compression when the count is 0).
        self._DEF_PCA_FEATURES_PRED = 0
        # Command-line option declarations consumed by SMLP's argument parser.
        self.pca_params_dict = {
            'pca_feat_count_for_prediction': {'abbr':'pca_pred', 'default':self._DEF_PCA_FEATURES_PRED, 'type':int,
                'help':'Count of features selected by pca algorithm ' +
                    '[default: {}]'.format(str(self._DEF_PCA_FEATURES_PRED))},
        }

    def set_logger(self, logger):
        # Inject the logger used for all progress / artifact-path messages.
        self._pca_logger = logger

    @staticmethod
    def _models_output_dir():
        """Return the artifact directory <project root>/regr_smlp/models
        (resolved two levels up from this file), creating it if needed."""
        base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
        output_dir = os.path.join(base_dir, "regr_smlp", "models")
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def create_pca_based_spec(self, X: pd.DataFrame, y: pd.DataFrame, original_spec_path: str):
        """Generate a spec file in which the original knobs are replaced by
        the trained PCA components.

        For each component PC_i = sum_j w_ij * (x_j - mean_j), a conservative
        [min, max] range is computed by interval arithmetic over the declared
        ranges of the original knob/input variables. The new spec (PCA knobs +
        unchanged outputs) is saved as pca_generated.spec, and human-readable
        component equations as pca_component_equations.txt.

        :param X: training features the PCA model was fit on (column order
                  must match the fit); used for feature names.
        :param y: responses — currently unused here; kept for interface
                  symmetry with smlp_pca.
        :param original_spec_path: path to the original JSON spec file.
        :return: the string "OK" on success.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")

        self._pca_logger.info('Generating PCA-based spec file: start')

        # Read the original spec; tabs are normalized to spaces because
        # hand-written spec files may contain tabs that break JSON parsing.
        with open(original_spec_path, "r") as f:
            raw_text = f.read().replace('\t', ' ')
        spec_data = json.loads(raw_text)

        output_dir = self._models_output_dir()

        # Collect ranges of the original knob/input variables; only these
        # contribute to the estimated range of each PCA component.
        knob_ranges = {}
        original_variables = spec_data.get('variables', [])
        for item in original_variables:
            if item.get('interface') in ['knob', 'input']:
                knob_ranges[item['label']] = item['range']

        # PCA components and feature information
        components = self.pca_model.components_
        feature_means = self.pca_model.mean_
        feature_names = X.columns.tolist()
        pca_knobs = []
        pca_equations_log = ["PCA Component Equations:"]

        # Create PCA knobs with estimated ranges and log their equations.
        for i, component in enumerate(components):
            pc_name = f"PC{i+1}"

            min_pc, max_pc = 0, 0
            offset = 0.0
            terms = []

            # enumerate() gives the feature's positional index directly,
            # avoiding a redundant O(n) feature_names.index() lookup per term.
            for idx, (weight, feature) in enumerate(zip(component, feature_names)):
                if feature not in knob_ranges:
                    # Features without a declared range (e.g. outputs) are
                    # excluded from both the range estimate and the equation.
                    continue
                feature_min, feature_max = knob_ranges[feature]
                # Interval arithmetic: a positive weight maps the feature's
                # min/max to the component's min/max; a negative one swaps them.
                if weight >= 0:
                    min_pc += weight * feature_min
                    max_pc += weight * feature_max
                else:
                    min_pc += weight * feature_max
                    max_pc += weight * feature_min

                # Accumulate the constant term contributed by mean centering.
                offset -= weight * feature_means[idx]

                # Drop near-zero weights to keep the logged equation readable.
                if abs(weight) > 1e-6:
                    terms.append(f"{weight:.4f} * {feature}")

            # Final min/max shifted by the mean-centering offset.
            estimated_min = min(min_pc, max_pc) + offset
            estimated_max = max(min_pc, max_pc) + offset
            estimated_range = [round(estimated_min, 4), round(estimated_max, 4)]

            # NOTE(review): rad-rel 0.05 is a hard-coded stability radius for
            # the generated knobs — confirm it matches project defaults.
            pca_knobs.append({
                "label": pc_name,
                "interface": "knob",
                "type": "real",
                "range": estimated_range,
                "rad-rel": 0.05
            })

            # Human-readable equation for this component.
            equation = " + ".join(terms)
            equation += f" + {offset:.4f}"
            pca_equations_log.append(f"{pc_name} ≈ {equation}")

        # Output variables are carried over unchanged.
        output_variables = []
        for item in original_variables:
            if item.get('interface') == 'output':
                output_variables.append(item)

        # Combine into the new spec document.
        new_variables = pca_knobs + output_variables
        new_spec = {
            "version": spec_data.get("version", "1.2"),
            "variables": new_variables,
            "objectives": spec_data.get("objectives", {})
        }

        # Save the new PCA spec.
        output_spec_path = os.path.join(output_dir, "pca_generated.spec")
        with open(output_spec_path, "w") as f:
            json.dump(new_spec, f, indent=4)

        self._pca_logger.info(f"PCA-based spec saved to {output_spec_path}")

        # Save the PCA component equations alongside the spec.
        pca_equations_path = os.path.join(output_dir, "pca_component_equations.txt")
        with open(pca_equations_path, "w") as f:
            f.write("\n".join(pca_equations_log))

        self._pca_logger.info(f"PCA component equations saved to {pca_equations_path}")
        self._pca_logger.info('Generating PCA-based spec file: end')

        return "OK"

    def smlp_pca(self, X: pd.DataFrame, y: pd.DataFrame, feat_count: int , spec_path: str):
        """Fit a PCA model on X and return the compressed features.

        The compressed features (columns PC1..PCk) and the combined
        components+responses table are also saved as CSV artifacts.

        :param X: feature frame to compress.
        :param y: response frame; concatenated with the components for the
                  pca_full_data.csv artifact.
        :param feat_count: number of principal components to keep; the caller
                  is expected to pass a value in [1, X.shape[1]].
        :param spec_path: currently unused — kept for caller compatibility.
        :return: (X_pca_df, fitted PCA model), or (X, None) if X has no columns.
        """
        if X.shape[1] == 0:
            # Nothing to compress; signal "no model" with None.
            return X , None

        self._pca_logger.info('PCA feature compression: start')

        self.pca_model = PCA(n_components=feat_count)
        X_pca = self.pca_model.fit_transform(X)

        pca_columns = [f'PC{i+1}' for i in range(feat_count)]
        X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=pca_columns)

        self._pca_logger.info(f'PCA applied: Reduced {X.shape[1]} features to {feat_count} components')
        self._pca_logger.info('PCA feature compression: end')

        output_dir = self._models_output_dir()

        # Save PCA components.
        pca_components_path = os.path.join(output_dir, "pca_components.csv")
        X_pca_df.to_csv(pca_components_path, index=False)
        self._pca_logger.info(f"PCA components saved to {pca_components_path}")

        # Save combined PCA components and responses (aligned on X's index).
        X_pca_and_outputs = pd.concat([X_pca_df, y.loc[X_pca_df.index]], axis=1)
        pca_full_data_path = os.path.join(output_dir, "pca_full_data.csv")
        X_pca_and_outputs.to_csv(pca_full_data_path, index=False)
        self._pca_logger.info(f"PCA components with outputs saved to {pca_full_data_path}")

        return X_pca_df, self.pca_model

    def inverse_transform(self, X_pca , X):
        """Map PCA-space rows back to the original feature space.

        :param X_pca: frame of PCA components (rows to reconstruct).
        :param X: original feature frame; only its column names are used to
                  label the reconstructed columns.
        :return: DataFrame of reconstructed features, indexed like X_pca.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")

        X_reconstructed = pd.DataFrame(self.pca_model.inverse_transform(X_pca), index=X_pca.index)
        X_reconstructed.columns = list(X.columns)

        return X_reconstructed

    def get_feature_equations(self, X_pca, X):
        """Build per-feature linear equations in terms of the PCA components.

        Each original feature is expressed as
            x_j ≈ sum_i c_ij * PC_i + mean_j
        using the pseudo-inverse of the component matrix (for orthonormal PCA
        components pinv(W) == W.T). The equations are logged and saved to
        pca_equations.txt.

        :param X_pca: frame of PCA components (column names used in equations).
        :param X: original feature frame (column names used as keys).
        :return: dict mapping feature name -> {"terms": {PC: coeff}, "offset": mean}.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")
        W = self.pca_model.components_
        W_inv = np.linalg.pinv(W)

        # Mean values of the original features (the reconstruction offsets).
        feature_means = self.pca_model.mean_
        feature_equations = {}
        equations_log = ["PCA Equations for Original Features:"]

        for i, feature_name in enumerate(X.columns):
            # Row i of W_inv holds the PC coefficients reconstructing feature i;
            # the enumerate index already is the feature's positional index, so
            # no extra X.columns.get_loc() lookup is needed.
            coefficients = W_inv[i, :]

            equation = " + ".join(f"{coeff:.4f} * {pc}" for coeff, pc in zip(coefficients, X_pca.columns))
            equation += f" + {feature_means[i]:.4f}"
            equation_terms = {pc: coeff for pc, coeff in zip(X_pca.columns, coefficients)}

            feature_equations[feature_name] = {
                "terms": equation_terms,
                "offset": feature_means[i]
            }

            equations_log.append(f"{feature_name} ≈ {equation}")

        # Logs all equations, can be commented out if not relevant
        self._pca_logger.info("\n".join(equations_log))

        output_dir = self._models_output_dir()

        # Save PCA feature equations.
        pca_equations_path = os.path.join(output_dir, "pca_equations.txt")
        with open(pca_equations_path, "w") as f:
            f.write("\n".join(equations_log))
        self._pca_logger.info(f"PCA feature equations saved to {pca_equations_path}")

        return feature_equations

Loading