Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 34 additions & 17 deletions foqus_lib/framework/surrogate/keras_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from pathlib import Path
from tokenize import String

from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf # pylint: disable=import-error
Expand All @@ -52,6 +53,20 @@
from foqus_lib.framework.surrogate.surrogate import surrogate
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser

from foqus_lib.framework.surrogate.scaling import (
BaseScaler,
LinearScaler,
LogScaler,
LogScaler2,
PowerScaler,
PowerScaler2,
map_name_to_scaler,
scale_dataframe,
)

# mapping between the human-readable name for the scaling variant
# and an instance of the corresponding scaler class


# custom class to define Keras NN layers
@tf.keras.utils.register_keras_serializable()
Expand Down Expand Up @@ -293,6 +308,14 @@ def __init__(self, dat=None):
desc="Name of output file for model, should have file extension: .keras",
hint="Enter a custom file name if desired",
)
# add option for normalization_form, make dropdown option
self.options.add(
name="scaling_function",
default="Linear",
dtype=str,
desc="Scaling/normalization function for input data",
validValues=list(map_name_to_scaler.keys()),
)

def run(self):
"""
Expand All @@ -316,6 +339,9 @@ def run(self):
self.msgQueue.put(f"input data columns: {input_data.columns}")
self.msgQueue.put(f"output data columns: {output_data.columns}")

# extract scaling function option, apply it to the input data
# get scaler object

# np.random.seed(46)
# rn.seed(1342)
# tf.random.set_seed(62)
Expand All @@ -341,22 +367,13 @@ def run(self):
xdata = input_data
zdata = output_data

xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

# normalize data using Linear form
# users can normalize with any allowed form # manually, and then pass the
# appropriate flag to FOQUS from the allowed list:
# ["Linear", "Log", "Power", "Log 2", "Power 2"] - see the documentation for
# details on the scaling formulations
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
xdata, zdata = np.array(xdata), np.array(zdata)
for i in range(len(xdata)):
for j in range(len(xlabels)):
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
for j in range(len(zlabels)):
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
scaling_func_option = self.options["scaling_function"].value

scaler_instance = map_name_to_scaler[scaling_func_option]
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)

print(f"using scaling function: {scaling_func_option}")

# method to create model
def create_model():
Expand All @@ -370,7 +387,7 @@ def create_model():
input_bounds=xdata_bounds,
output_bounds=zdata_bounds,
normalized=True,
normalization_form="Linear",
normalization_form=scaling_func_option,
)

outputs = layers(inputs) # use network as function outputs = f(inputs)
Expand Down
50 changes: 32 additions & 18 deletions foqus_lib/framework/surrogate/pytorch_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@
# from foqus_lib.framework.graph.graph import Graph
from foqus_lib.framework.surrogate.surrogate import surrogate
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
from foqus_lib.framework.surrogate.scaling import (
BaseScaler,
LinearScaler,
LogScaler,
LogScaler2,
PowerScaler,
PowerScaler2,
map_name_to_scaler,
scale_dataframe,
)

# custom class to define PyTorch NN layers
np.random.seed(46)
Expand Down Expand Up @@ -284,6 +294,13 @@ def __init__(self, dat=None):
desc="Name of output file for model, should have file extension: .pt",
hint="Enter a custom file name if desired",
)
self.options.add(
name="scaling_function",
default="Linear",
dtype=str,
desc="Scaling/normalization function for input data",
validValues=["Linear", "Log", "Log2", "Power", "Power2"],
)

def run(self):
"""
Expand Down Expand Up @@ -326,22 +343,16 @@ def run(self):
zlabels = list(output_data.columns)
xdata = input_data
zdata = output_data
xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

# normalize data using Linear form, pass as custom string and parse with SymPy
# users can normalize with any allowed form # manually, and then pass the
# appropriate flag to FOQUS from the allowed list:
# ["Linear", "Log", "Power", "Log 2", "Power 2", "Custom] - see the
# documentation for details on the scaling formulations
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
xdata, zdata = np.array(xdata), np.array(zdata)
for i in range(len(xdata)):
for j in range(len(xlabels)):
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
for j in range(len(zlabels)):
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
# xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
# zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

scaling_func_option = self.options["scaling_function"].value

scaler_instance = map_name_to_scaler[scaling_func_option]
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)

print(f"using scaling function: {scaling_func_option}")

model_data = np.concatenate(
(xdata, zdata), axis=1
Expand All @@ -353,8 +364,11 @@ def run(self):

# raise exception here after BPC position
# create model
x_train = torch.from_numpy(xdata).float().to(device)
z_train = torch.from_numpy(zdata).float().to(device)

# need to convert xdata to a numpy array for the below to work
# otherwise causes TypeError: expected np.ndarray (got DataFrame)
x_train = torch.from_numpy(xdata.to_numpy()).float().to(device)
z_train = torch.from_numpy(zdata.to_numpy()).float().to(device)

# print type at this point
# can also print inside create_model
Expand Down
202 changes: 202 additions & 0 deletions foqus_lib/framework/surrogate/scaling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import copy
import json
import logging
import math
from collections import OrderedDict

import numpy as np
import pandas as pd
from typing import Tuple


def validate_for_scaling(array_in, lo, hi) -> None:
    """Reject inputs that the scaling functions cannot work with.

    The checks run in a fixed order and the first one that fails raises:
    non-finite values, input that is not 1D, fewer than two values,
    bounds that ``np.allclose`` considers equal, and finally the dtype
    limit check done by ``check_under_or_overflow``.

    Args:
        array_in: array-like exposing ``ndim``, ``size`` and ``dtype``.
        lo: lower bound that will be used for scaling.
        hi: upper bound that will be used for scaling.

    Raises:
        ValueError: if any of the checks above fails.
    """
    # Predicates are wrapped in lambdas so that a later check is only
    # evaluated once every earlier check has passed.
    failure_checks = (
        (
            lambda: not np.all(np.isfinite(array_in)),
            "Input data cannot contain NaN or inf values",
        ),
        (lambda: array_in.ndim != 1, "Only 1D arrays supported"),
        (lambda: array_in.size < 2, "Array must have at least 2 values"),
        (lambda: np.allclose(lo, hi), "Array must contain non-identical values"),
        (
            lambda: not check_under_or_overflow(array_in),
            "Array contains under/overflow values for dtype",
        ),
    )
    for has_failed, message in failure_checks:
        if has_failed():
            raise ValueError(message)


def check_under_or_overflow(arr):
    """Tell whether every value lies strictly inside its dtype's range.

    Args:
        arr: array-like with a ``dtype`` attribute (integer or floating).

    Returns:
        A true value when all entries are strictly below the dtype's
        maximum and strictly above its minimum, a false value otherwise.

    Raises:
        ValueError: if the dtype is neither an integer nor a floating type.
    """
    dtype = arr.dtype
    # Pick the numpy limits helper that matches the dtype family.
    for abstract_kind, limits_of in ((np.integer, np.iinfo), (np.floating, np.finfo)):
        if np.issubdtype(dtype, abstract_kind):
            limits = limits_of(dtype)
            break
    else:
        raise ValueError("Unsupported data type")

    # Strict comparisons: a value equal to either limit counts as a failure.
    below_max = np.all(arr < limits.max)
    above_min = np.all(arr > limits.min)
    return below_max & above_min


def scale_linear(array_in, lo=None, hi=None):
    """Scale values linearly: ``(x - lo) / (hi - lo)``.

    Args:
        array_in: 1D array-like of values to scale.
        lo: lower bound; defaults to the minimum of ``array_in``.
        hi: upper bound; defaults to the maximum of ``array_in``.

    Returns:
        The scaled values, same shape as ``array_in``.

    Raises:
        ValueError: if ``validate_for_scaling`` rejects the input.
    """
    if lo is None:
        lo = np.min(array_in)
    if hi is None:
        hi = np.max(array_in)
    # validate_for_scaling raises when lo and hi are (nearly) equal, so the
    # divisor below is never zero and no zero-span fallback is required.
    validate_for_scaling(array_in, lo, hi)
    return (array_in - lo) / (hi - lo)


def scale_log(array_in, lo=None, hi=None):
    """Scale values with the log form.

    Computes ``(log10(x) - log10(lo)) / (log10(hi) - log10(lo))``.

    Args:
        array_in: 1D array-like of values to scale; no value may be
            below ``1e-8``.
        lo: lower bound; defaults to the minimum of ``array_in``.
        hi: upper bound; defaults to the maximum of ``array_in``.

    Returns:
        The scaled values, same shape as ``array_in``.

    Raises:
        ValueError: if a value or a bound falls below ``1e-8``, or if
            ``validate_for_scaling`` rejects the input.
    """
    # log10 is only defined for positive numbers; epsilon guards the domain.
    epsilon = 1e-8
    if np.any(array_in < epsilon):
        raise ValueError(f"All values must be greater than {epsilon}")
    if lo is None:
        lo = np.min(array_in)
    if hi is None:
        hi = np.max(array_in)
    # Explicitly supplied bounds enter log10 as well, so they must respect
    # the same domain; otherwise the result would silently become NaN/inf.
    if lo < epsilon or hi < epsilon:
        raise ValueError(f"Scaling bounds lo and hi must be at least {epsilon}")
    validate_for_scaling(array_in, lo, hi)
    log_lo = np.log10(lo)
    return (np.log10(array_in) - log_lo) / (np.log10(hi) - log_lo)


def scale_log2(array_in, lo=None, hi=None):
    """Scale values with the "Log2" form: ``log10(9 * (x - lo) / (hi - lo) + 1)``.

    Args:
        array_in: 1D array-like of values to scale.
        lo: lower bound; defaults to the minimum of ``array_in``.
        hi: upper bound; defaults to the maximum of ``array_in``.

    Returns:
        The scaled values, same shape as ``array_in``.

    Raises:
        ValueError: if ``validate_for_scaling`` rejects the input.
    """
    lower = np.min(array_in) if lo is None else lo
    upper = np.max(array_in) if hi is None else hi
    validate_for_scaling(array_in, lower, upper)
    stretched = 9 * (array_in - lower)
    return np.log10(stretched / (upper - lower) + 1)


def scale_power(array_in, lo=None, hi=None):
    """Scale values with the power form.

    Computes ``(10**x - 10**lo) / (10**hi - 10**lo)``.

    Args:
        array_in: 1D array-like of values to scale.
        lo: lower bound; defaults to the minimum of ``array_in``.
        hi: upper bound; defaults to the maximum of ``array_in``.

    Returns:
        The scaled values, same shape as ``array_in``.

    Raises:
        ValueError: if ``validate_for_scaling`` rejects the input.
    """
    if lo is None:
        lo = np.min(array_in)
    if hi is None:
        hi = np.max(array_in)
    validate_for_scaling(array_in, lo, hi)
    # The base is a float on purpose: with an integer base, integer data
    # makes numpy do integer exponentiation, which raises for negative
    # exponents and can overflow for large ones.
    pow_lo = np.power(10.0, lo)
    pow_hi = np.power(10.0, hi)
    return (np.power(10.0, array_in) - pow_lo) / (pow_hi - pow_lo)


def scale_power2(array_in, lo=None, hi=None):
    """Scale values with the "Power2" form: ``1/9 * (10**((x - lo) / (hi - lo)) - 1)``.

    Args:
        array_in: 1D array-like of values to scale.
        lo: lower bound; defaults to the minimum of ``array_in``.
        hi: upper bound; defaults to the maximum of ``array_in``.

    Returns:
        The scaled values, same shape as ``array_in``.

    Raises:
        ValueError: if ``validate_for_scaling`` rejects the input.
    """
    lower = np.min(array_in) if lo is None else lo
    upper = np.max(array_in) if hi is None else hi
    validate_for_scaling(array_in, lower, upper)
    exponent = (array_in - lower) / (upper - lower)
    one_ninth = 1 / 9
    return one_ninth * (np.power(10, exponent) - 1)


def unscale_linear(array_in, lo, hi):
    """Invert the linear scaling: ``x * (hi - lo) + lo``.

    Args:
        array_in: scaled values.
        lo: lower bound used when scaling.
        hi: upper bound used when scaling.

    Returns:
        The values mapped back to the original range.
    """
    span = hi - lo
    # Dividing by 1.0 keeps the result floating point for integer input.
    stretched = array_in * span / 1.0
    return stretched + lo


def unscale_log(array_in, lo, hi):
    """Invert the log scaling: ``lo * (hi / lo) ** x``.

    Args:
        array_in: scaled values.
        lo: lower bound used when scaling.
        hi: upper bound used when scaling.

    Returns:
        The values mapped back to the original range.
    """
    ratio = hi / lo
    growth = np.power(ratio, array_in)
    return lo * growth


def unscale_log2(array_in, lo=None, hi=None):
    """Invert the "Log2" scaling: ``(10**x - 1) * (hi - lo) / 9 + lo``.

    Args:
        array_in: scaled values.
        lo: lower bound used when scaling. Required, despite the ``None``
            default kept for signature compatibility.
        hi: upper bound used when scaling. Required, despite the ``None``
            default kept for signature compatibility.

    Returns:
        The values mapped back to the original range.

    Raises:
        TypeError: if ``lo`` or ``hi`` is not supplied.
    """
    # Without this guard a missing bound fails inside the arithmetic with an
    # opaque "unsupported operand" TypeError; raise the same type, clearly.
    if lo is None or hi is None:
        raise TypeError("unscale_log2() requires both 'lo' and 'hi' bounds")
    return (np.power(10, array_in / 1.0) - 1) * (hi - lo) / 9.0 + lo


def unscale_power(array_in, lo, hi):
    """Invert the power scaling.

    Computes ``log10(x * (10**hi - 10**lo) + 10**lo)``.

    Args:
        array_in: scaled values.
        lo: lower bound used when scaling.
        hi: upper bound used when scaling.

    Returns:
        The values mapped back to the original range.
    """
    # The base is a float on purpose: with an integer base, integer bounds
    # make numpy do integer exponentiation, which raises for negative
    # exponents and can overflow for large ones.
    pow_lo = np.power(10.0, lo)
    pow_hi = np.power(10.0, hi)
    return np.log10((array_in / 1.0) * (pow_hi - pow_lo) + pow_lo)


def unscale_power2(array_in, lo, hi):
    """Invert the "Power2" scaling: ``log10(9 * x + 1) * (hi - lo) + lo``.

    Args:
        array_in: scaled values.
        lo: lower bound used when scaling.
        hi: upper bound used when scaling.

    Returns:
        The values mapped back to the original range.
    """
    unit_fraction = np.log10(9.0 * array_in / 1.0 + 1)
    span = hi - lo
    return unit_fraction * span + lo


class BaseScaler:
    """Base class for the scaler classes defined in this module.

    It exposes the transformer interface from scikit-learn and is not
    supposed to be instantiated directly: ``transform()`` and
    ``inverse_transform()`` must be implemented by the derived classes and
    raise ``NotImplementedError`` when called on the base class.
    """

    def fit(self, X: np.ndarray):
        """Record the minimum and maximum of ``X`` as ``lo_`` and ``hi_``.

        Returns:
            This scaler, so calls can be chained.
        """
        self.lo_ = np.min(X)
        self.hi_ = np.max(X)
        return self

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit the bounds on ``X`` and return ``X`` transformed with them."""
        fitted = self.fit(X)
        return fitted.transform(X)

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X``; to be provided by derived classes."""
        raise NotImplementedError

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Undo the scaling of ``X``; to be provided by derived classes."""
        raise NotImplementedError


class LinearScaler(BaseScaler):
    """Scaler backed by ``scale_linear`` / ``unscale_linear``.

    Both directions use the ``lo_`` and ``hi_`` bounds stored by ``fit()``.
    """

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` with the fitted bounds."""
        return scale_linear(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values ``X`` back using the fitted bounds."""
        return unscale_linear(X, lo=self.lo_, hi=self.hi_)


class LogScaler(BaseScaler):
    """Scaler backed by ``scale_log`` / ``unscale_log``.

    Both directions use the ``lo_`` and ``hi_`` bounds stored by ``fit()``.
    """

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` with the fitted bounds."""
        return scale_log(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values ``X`` back using the fitted bounds."""
        return unscale_log(X, lo=self.lo_, hi=self.hi_)


class LogScaler2(BaseScaler):
    """Scaler backed by ``scale_log2`` / ``unscale_log2``.

    Both directions use the ``lo_`` and ``hi_`` bounds stored by ``fit()``.
    """

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` with the fitted bounds."""
        return scale_log2(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values ``X`` back using the fitted bounds."""
        return unscale_log2(X, lo=self.lo_, hi=self.hi_)


class PowerScaler(BaseScaler):
    """Scaler backed by ``scale_power`` / ``unscale_power``.

    Both directions use the ``lo_`` and ``hi_`` bounds stored by ``fit()``.
    """

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` with the fitted bounds."""
        return scale_power(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values ``X`` back using the fitted bounds."""
        return unscale_power(X, lo=self.lo_, hi=self.hi_)


class PowerScaler2(BaseScaler):
    """Scaler backed by ``scale_power2`` / ``unscale_power2``.

    Both directions use the ``lo_`` and ``hi_`` bounds stored by ``fit()``.
    """

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` with the fitted bounds."""
        return scale_power2(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values ``X`` back using the fitted bounds."""
        return unscale_power2(X, lo=self.lo_, hi=self.hi_)


# Lookup from the human-readable scaling name to a scaler instance.
# NOTE: every value is one module-level instance, and fit() stores the
# bounds on that instance, so the fitted state is shared by all users.
map_name_to_scaler = {
    label: scaler_class()
    for label, scaler_class in (
        ("Linear", LinearScaler),
        ("Log", LogScaler),
        ("Log2", LogScaler2),
        ("Power", PowerScaler),
        ("Power2", PowerScaler2),
    )
}


def scale_dataframe(df: pd.DataFrame, scaler: BaseScaler) -> Tuple[pd.DataFrame, dict]:
    """Scale every column of ``df`` independently with ``scaler``.

    The scaler is re-fitted on each column in turn, so after the call it
    holds the bounds of the last column processed.

    Args:
        df: data to scale, one variable per column.
        scaler: object providing ``fit_transform`` and, once fitted, the
            ``lo_`` / ``hi_`` attributes.

    Returns:
        A tuple ``(scaled, bounds)``: ``scaled`` has the same columns and
        index as ``df``; ``bounds`` maps each column name to ``(lo, hi)``.
    """
    # Start from an all-NaN frame with the input's shape and labels, then
    # fill it in column by column.
    result = pd.DataFrame(np.nan, columns=df.columns, index=df.index)
    column_bounds = {}

    for label in df.columns:
        result.loc[:, label] = scaler.fit_transform(df[label])
        column_bounds[label] = (scaler.lo_, scaler.hi_)

    return result, column_bounds
Loading