Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ instruction for the installation on Ubuntu.
-spec smlp_toy_num_resp_mult_free_inps -data_scaler min_max \
-beta "y1>7 and y2>6" -objv_names obj1,objv2,objv3 \
-objv_exprs "(y1+y2)/2;y1/2-y2;y2" -epsilon 0.05 -delta_rel 0.01 \
-save_model_config f -mrmr_pred 0 -plots f -seed 10 -log_time f \
-save_model_config f -plots f -seed 10 -log_time f \
-spec ../specs/smlp_toy_num_resp_mult_free_inps.spec

# then the regression script
Expand Down Expand Up @@ -124,8 +124,8 @@ will produce the SMLP command for the regression test number 1:

../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \
-out_dir ./ -pref Test1 -mode train -resp y1 -feat x,p1,p2 \
-model dt_caret -save_model_config f -mrmr_pred 0 -plots f \
-seed 10 -log_time f
-model dt_caret -save_model_config f -feat_select_model mrmr \
-feat_select_count 0 -plots f -seed 10 -log_time f

For details about those parameters, please refer to the help messages (-h) of
both tools, src/run_smlp.py and regr_smlp/code/smlp_regr.py, as well as the
Expand Down Expand Up @@ -185,12 +185,14 @@ directory, run the following commands:
-mode optimize -pareto f -sat_thresh f -resp o0 -feat \
Byte,CH,RANK,Timing,i0,i1,i2,i3 -model dt_sklearn -dt_sklearn_max_depth 15 \
-data_scaler min_max -epsilon 0.05 -log_time f -plots f \
-feat_select_model mrmr -feat_select_count 15 \
-spec ../specs/smlp_s2_tx

../../src/run_smlp.py -out_dir ./ -pref smlp_s2_tx_nn -data ../data/smlp_s2_tx \
-mode optimize -pareto f -sat_thresh f -resp o0 \
-feat Byte,CH,RANK,Timing,i0,i1,i2,i3 \
-model nn_keras -nn_keras_epochs 20 -data_scaler min_max \
-feat_select_model mrmr -feat_select_count 15 \
-epsilon 0.05 -log_time f -plots f -spec ../specs/smlp_s2_tx

These runs will take longer than the regression tests provided earlier,
Expand Down
154 changes: 129 additions & 25 deletions src/smlp_py/smlp_data.py

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions src/smlp_py/smlp_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,13 +299,14 @@ def smlp_flow(self):
#self.logger.info('Running SMLP in mode "{}": Start'.format(args.analytics_mode))
self.logger.info('PREPARE DATA FOR MODELING')
X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \
levels_dict, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, True, args.split_test,
levels_dict, pca_equations, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, args.spec, True, args.split_test,
feat_names, resp_names, args.keep_features, args.train_first_n, args.train_random_n, args.train_uniform_n,
args.interactive_plots, args.response_plots, args.data_scaler,
args.scale_features, args.scale_responses, args.impute_responses, args.mrmr_feat_count_for_prediction,
args.scale_features, args.scale_responses, args.impute_responses, args.feature_selection_model,
args.feature_selection_count, args.pca_feat_count_for_prediction,
args.positive_value, args.negative_value, args.response_map, args.response_to_bool, args.save_model, args.use_model)

# sanity check that the order of features in model_features_dict, feat_names, X_train, X_test, X is
# the same; this is mostly important for model exploration modes
self.modelInst.model_features_sanity_check(model_features_dict, feat_names, X_train, X_test, X)
Expand Down Expand Up @@ -388,7 +389,7 @@ def smlp_flow(self):
elif args.analytics_mode == 'optimize':
self.optInst.smlp_optimize(syst_expr_dict, args.model, model,
self.dataInst.unscaled_training_features, self.dataInst.unscaled_training_responses,
model_features_dict, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto,
model_features_dict, pca_equations, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto,
args.optimization_strategy, quer_names, quer_exprs,
delta_dict, args.epsilon, alpha_global_expr, beta_expr, args.eta, theta_radii_dict,
args.solver_logic, args.vacuity_check,
Expand Down
7 changes: 2 additions & 5 deletions src/smlp_py/smlp_mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ def __init__(self):
self._MRMR_FEATURES_PRED = 15
self._MRMR_FEATURES_CORR = 15
self.mrmr_params_dict = {
'mrmr_feat_count_for_prediction': {'abbr':'mrmr_pred', 'default':self._MRMR_FEATURES_PRED, 'type':int,
'help':'Count of features selected by MRMR algorithm for predictive models ' +
'[default: {}]'.format(str(self._MRMR_FEATURES_PRED))},
'mrmr_feat_count_for_correlation': {'abbr':'mrmr_corr', 'default':self._MRMR_FEATURES_CORR, 'type':int,
'help':'Count of features selected by MRMR algorithm for correlation analysis ' +
'[default: {}]'.format(str(self._MRMR_FEATURES_CORR))}
Expand Down Expand Up @@ -63,7 +60,7 @@ def _mrmr_regres(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f', redund
str(y.name) + ' :\n'+ str(mrmr_scores_df))

self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end')
return mrmr_res[0], mrmr_scores_df
return mrmr_res[0], mrmr_scores_df, mrmr_res[1]

# mrmr feature selection using mrmr-feature package, where y is a categorical variable (pandas.Series)
# TODO !!!: not tested
Expand All @@ -87,7 +84,7 @@ def _mrmr_class(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f',
y.name + ' :\n'+ str(mrmr_scores_df))

self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end')
return mrmr_res[0], mrmr_scores_df
return mrmr_res[0], mrmr_scores_df, mrmr_res[1]

def smlp_mrmr(self, X:pd.DataFrame, y:pd.Series, #resp_type:str, #"numeric",
feat_cnt:int):
Expand Down
6 changes: 3 additions & 3 deletions src/smlp_py/smlp_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,8 +1096,8 @@ def check_synthesis_feasibility(self, feasibility:bool, objv_names:list[str], ob
# function (and to smlp_optsyn() instead of passing X,y; The bounds on objectives are not strictly necessary,
# any approximation may be used, but accurate approximation might reduce iterations count needed for
# computing optimal configurations (in optimize and optsyn modes)
def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict,
feat_names:list[str], resp_names:list[str],
def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict,
pca_equations:dict, feat_names:list[str], resp_names:list[str],
objv_names:list[str], objv_exprs, pareto:bool, strategy:str, #asrt_names:list[str], asrt_exprs,
quer_names:list[str], quer_exprs, delta:float, epsilon:float,
alph_expr:str, beta_expr:str, eta_expr:str, theta_radii_dict:dict, solver_logic:str, vacuity:bool,
Expand All @@ -1115,7 +1115,7 @@ def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFram

domain, syst_term_dict, model_full_term_dict, eta, alpha, beta, interface_consistent, model_consistent = \
self._modelTermsInst.create_model_exploration_base_components(
syst_expr_dict, algo, model, model_features_dict, feat_names, resp_names,
syst_expr_dict, algo, model, model_features_dict, pca_equations, feat_names, resp_names,
#delta, epsilon, #objv_names, objv_exprs, None, None, None, None,
alph_expr, beta_expr, eta_expr, data_scaler, scale_feat, scale_resp, #scale_objv,
float_approx, float_precision, data_bounds_json_path)
Expand Down
225 changes: 225 additions & 0 deletions src/smlp_py/smlp_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
# SPDX-License-Identifier: Apache-2.0
# This file is part of smlp.

import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
import json
#from sklearn.preprocessing import StandardScaler

class SmlpPCA:
    """PCA-based feature compression for SMLP.

    Thin wrapper around sklearn.decomposition.PCA that
      * compresses training features into PC1..PCk components (smlp_pca),
      * derives a new .spec file whose knobs are the PCA components, with
        ranges estimated by interval arithmetic (create_pca_based_spec),
      * maps PCA-space points back to the original feature space
        (inverse_transform), and
      * extracts per-feature linear equations in terms of the components
        (get_feature_equations).

    All generated artifacts are written to <project root>/regr_smlp/models.
    """

    def __init__(self):
        # Shared SMLP logger; injected later via set_logger().
        self._pca_logger = None
        # Fitted sklearn PCA instance; populated by smlp_pca().
        self.pca_model = None
        # Default component count; 0 conventionally means "PCA disabled"
        # (callers are expected to skip compression when the count is 0).
        self._DEF_PCA_FEATURES_PRED = 0
        # Command-line option declarations consumed by SMLP's argument parser.
        self.pca_params_dict = {
            'pca_feat_count_for_prediction': {'abbr':'pca_pred', 'default':self._DEF_PCA_FEATURES_PRED, 'type':int,
                'help':'Count of features selected by pca algorithm ' +
                    '[default: {}]'.format(str(self._DEF_PCA_FEATURES_PRED))},
        }

    def set_logger(self, logger):
        # Inject the logger used for all progress / artifact-path messages.
        self._pca_logger = logger

    @staticmethod
    def _models_output_dir():
        """Return the artifact directory <project root>/regr_smlp/models
        (resolved two levels up from this file), creating it if needed."""
        base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
        output_dir = os.path.join(base_dir, "regr_smlp", "models")
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def create_pca_based_spec(self, X: pd.DataFrame, y: pd.DataFrame, original_spec_path: str):
        """Generate a spec file in which the original knobs are replaced by
        the trained PCA components.

        For each component PC_i = sum_j w_ij * (x_j - mean_j), a conservative
        [min, max] range is computed by interval arithmetic over the declared
        ranges of the original knob/input variables. The new spec (PCA knobs +
        unchanged outputs) is saved as pca_generated.spec, and human-readable
        component equations as pca_component_equations.txt.

        :param X: training features the PCA model was fit on (column order
                  must match the fit); used for feature names.
        :param y: responses — currently unused here; kept for interface
                  symmetry with smlp_pca.
        :param original_spec_path: path to the original JSON spec file.
        :return: the string "OK" on success.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")

        self._pca_logger.info('Generating PCA-based spec file: start')

        # Read the original spec; tabs are normalized to spaces because
        # hand-written spec files may contain tabs that break JSON parsing.
        with open(original_spec_path, "r") as f:
            raw_text = f.read().replace('\t', ' ')
        spec_data = json.loads(raw_text)

        output_dir = self._models_output_dir()

        # Collect ranges of the original knob/input variables; only these
        # contribute to the estimated range of each PCA component.
        knob_ranges = {}
        original_variables = spec_data.get('variables', [])
        for item in original_variables:
            if item.get('interface') in ['knob', 'input']:
                knob_ranges[item['label']] = item['range']

        # PCA components and feature information
        components = self.pca_model.components_
        feature_means = self.pca_model.mean_
        feature_names = X.columns.tolist()
        pca_knobs = []
        pca_equations_log = ["PCA Component Equations:"]

        # Create PCA knobs with estimated ranges and log their equations.
        for i, component in enumerate(components):
            pc_name = f"PC{i+1}"

            min_pc, max_pc = 0, 0
            offset = 0.0
            terms = []

            # enumerate() gives the feature's positional index directly,
            # avoiding a redundant O(n) feature_names.index() lookup per term.
            for idx, (weight, feature) in enumerate(zip(component, feature_names)):
                if feature not in knob_ranges:
                    # Features without a declared range (e.g. outputs) are
                    # excluded from both the range estimate and the equation.
                    continue
                feature_min, feature_max = knob_ranges[feature]
                # Interval arithmetic: a positive weight maps the feature's
                # min/max to the component's min/max; a negative one swaps them.
                if weight >= 0:
                    min_pc += weight * feature_min
                    max_pc += weight * feature_max
                else:
                    min_pc += weight * feature_max
                    max_pc += weight * feature_min

                # Accumulate the constant term contributed by mean centering.
                offset -= weight * feature_means[idx]

                # Drop near-zero weights to keep the logged equation readable.
                if abs(weight) > 1e-6:
                    terms.append(f"{weight:.4f} * {feature}")

            # Final min/max shifted by the mean-centering offset.
            estimated_min = min(min_pc, max_pc) + offset
            estimated_max = max(min_pc, max_pc) + offset
            estimated_range = [round(estimated_min, 4), round(estimated_max, 4)]

            # NOTE(review): rad-rel 0.05 is a hard-coded stability radius for
            # the generated knobs — confirm it matches project defaults.
            pca_knobs.append({
                "label": pc_name,
                "interface": "knob",
                "type": "real",
                "range": estimated_range,
                "rad-rel": 0.05
            })

            # Human-readable equation for this component.
            equation = " + ".join(terms)
            equation += f" + {offset:.4f}"
            pca_equations_log.append(f"{pc_name} ≈ {equation}")

        # Output variables are carried over unchanged.
        output_variables = []
        for item in original_variables:
            if item.get('interface') == 'output':
                output_variables.append(item)

        # Combine into the new spec document.
        new_variables = pca_knobs + output_variables
        new_spec = {
            "version": spec_data.get("version", "1.2"),
            "variables": new_variables,
            "objectives": spec_data.get("objectives", {})
        }

        # Save the new PCA spec.
        output_spec_path = os.path.join(output_dir, "pca_generated.spec")
        with open(output_spec_path, "w") as f:
            json.dump(new_spec, f, indent=4)

        self._pca_logger.info(f"PCA-based spec saved to {output_spec_path}")

        # Save the PCA component equations alongside the spec.
        pca_equations_path = os.path.join(output_dir, "pca_component_equations.txt")
        with open(pca_equations_path, "w") as f:
            f.write("\n".join(pca_equations_log))

        self._pca_logger.info(f"PCA component equations saved to {pca_equations_path}")
        self._pca_logger.info('Generating PCA-based spec file: end')

        return "OK"

    def smlp_pca(self, X: pd.DataFrame, y: pd.DataFrame, feat_count: int , spec_path: str):
        """Fit a PCA model on X and return the compressed features.

        The compressed features (columns PC1..PCk) and the combined
        components+responses table are also saved as CSV artifacts.

        :param X: feature frame to compress.
        :param y: response frame; concatenated with the components for the
                  pca_full_data.csv artifact.
        :param feat_count: number of principal components to keep; the caller
                  is expected to pass a value in [1, X.shape[1]].
        :param spec_path: currently unused — kept for caller compatibility.
        :return: (X_pca_df, fitted PCA model), or (X, None) if X has no columns.
        """
        if X.shape[1] == 0:
            # Nothing to compress; signal "no model" with None.
            return X , None

        self._pca_logger.info('PCA feature compression: start')

        self.pca_model = PCA(n_components=feat_count)
        X_pca = self.pca_model.fit_transform(X)

        pca_columns = [f'PC{i+1}' for i in range(feat_count)]
        X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=pca_columns)

        self._pca_logger.info(f'PCA applied: Reduced {X.shape[1]} features to {feat_count} components')
        self._pca_logger.info('PCA feature compression: end')

        output_dir = self._models_output_dir()

        # Save PCA components.
        pca_components_path = os.path.join(output_dir, "pca_components.csv")
        X_pca_df.to_csv(pca_components_path, index=False)
        self._pca_logger.info(f"PCA components saved to {pca_components_path}")

        # Save combined PCA components and responses (aligned on X's index).
        X_pca_and_outputs = pd.concat([X_pca_df, y.loc[X_pca_df.index]], axis=1)
        pca_full_data_path = os.path.join(output_dir, "pca_full_data.csv")
        X_pca_and_outputs.to_csv(pca_full_data_path, index=False)
        self._pca_logger.info(f"PCA components with outputs saved to {pca_full_data_path}")

        return X_pca_df, self.pca_model

    def inverse_transform(self, X_pca , X):
        """Map PCA-space rows back to the original feature space.

        :param X_pca: frame of PCA components (rows to reconstruct).
        :param X: original feature frame; only its column names are used to
                  label the reconstructed columns.
        :return: DataFrame of reconstructed features, indexed like X_pca.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")

        X_reconstructed = pd.DataFrame(self.pca_model.inverse_transform(X_pca), index=X_pca.index)
        X_reconstructed.columns = list(X.columns)

        return X_reconstructed

    def get_feature_equations(self, X_pca, X):
        """Build per-feature linear equations in terms of the PCA components.

        Each original feature is expressed as
            x_j ≈ sum_i c_ij * PC_i + mean_j
        using the pseudo-inverse of the component matrix (for orthonormal PCA
        components pinv(W) == W.T). The equations are logged and saved to
        pca_equations.txt.

        :param X_pca: frame of PCA components (column names used in equations).
        :param X: original feature frame (column names used as keys).
        :return: dict mapping feature name -> {"terms": {PC: coeff}, "offset": mean}.
        :raises ValueError: if smlp_pca() has not been called yet.
        """
        if self.pca_model is None:
            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")
        W = self.pca_model.components_
        W_inv = np.linalg.pinv(W)

        # Mean values of the original features (the reconstruction offsets).
        feature_means = self.pca_model.mean_
        feature_equations = {}
        equations_log = ["PCA Equations for Original Features:"]

        for i, feature_name in enumerate(X.columns):
            # Row i of W_inv holds the PC coefficients reconstructing feature i;
            # the enumerate index already is the feature's positional index, so
            # no extra X.columns.get_loc() lookup is needed.
            coefficients = W_inv[i, :]

            equation = " + ".join(f"{coeff:.4f} * {pc}" for coeff, pc in zip(coefficients, X_pca.columns))
            equation += f" + {feature_means[i]:.4f}"
            equation_terms = {pc: coeff for pc, coeff in zip(X_pca.columns, coefficients)}

            feature_equations[feature_name] = {
                "terms": equation_terms,
                "offset": feature_means[i]
            }

            equations_log.append(f"{feature_name} ≈ {equation}")

        # Logs all equations, can be commented out if not relevant
        self._pca_logger.info("\n".join(equations_log))

        output_dir = self._models_output_dir()

        # Save PCA feature equations.
        pca_equations_path = os.path.join(output_dir, "pca_equations.txt")
        with open(pca_equations_path, "w") as f:
            f.write("\n".join(equations_log))
        self._pca_logger.info(f"PCA feature equations saved to {pca_equations_path}")

        return feature_equations

Loading