37 changes: 36 additions & 1 deletion tests/test_downsampling.py
@@ -1,13 +1,15 @@
import numpy
import os

from uips.options import UIPSOptions, stepOptions
from uips import UIPS_INPUT_DIR
from uips.wrapper import (
downsample_dataset_from_input,
downsample_dataset_from_input_file,
)


def test_nf_input():
def test_nf_input_file():
input_file = os.path.join(UIPS_INPUT_DIR, "input_test")
downsample_dataset_from_input_file(input_file)

@@ -17,6 +19,39 @@ def test_bins_input():
downsample_dataset_from_input_file(input_file)


def test_nf_input():
    step1_options = stepOptions(
        nEpochs=1,
        batch_size_train=2048,
        nWorkingData=10_000,
        learning_rate=2e-4,
        nCouplingLayer=2,
        num_bins=3,
        hidden_features=12,
        num_blocks=3,
    )
    step2_options = step1_options
    downsampler_options = UIPSOptions(
        num_pdf_iter=2,
        stepOptionsList=[step1_options, step2_options],
        pdfMethod="NormalizingFlow",
        use_gpu=False,
        printTiming=True,
        nDatReduced=100_000,
        preShuffled=True,
        scalerFile="scaler.npz",
        nSamples_list=[1_000],
        computeDistanceCriterion=True,
        prefixDownsampledData="downSampledData",
        data_freq_adjustment=1,
        nWorkingDataAdjustment=-1,
    )

    dataFile = "../data/combustion2DToDownsampleSmall.npy"
    dataset = numpy.load(dataFile)
    downsample_dataset_from_input(downsampler_options, dataset)


if __name__ == "__main__":
test_nf_input()
test_bins_input()
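
For reference, a minimal, hypothetical call to the new in-memory entry point with synthetic data; the Gaussian toy array and the option values below are illustrative assumptions, not part of the repository or this PR.

import numpy
from uips.options import UIPSOptions
from uips.wrapper import downsample_dataset_from_input

# Toy 2D dataset standing in for a real phase-space dataset (assumed shape (N, d))
toy_data = numpy.random.default_rng(0).normal(size=(10_000, 2))
toy_options = UIPSOptions(use_gpu=False, nSamples_list=[100])
downsample_dataset_from_input(toy_options, toy_data)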
202 changes: 202 additions & 0 deletions tutorials/gaussian_2d_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tutorials/uips_normal.py
@@ -19,7 +19,7 @@
inpt["nEpochs"] = f"5 20"
inpt["nSamples"] = f"{nSampl}"

best_files = downsample_dataset_from_input(inpt)
best_files = downsample_dataset_from_input(inpt)


if par.irank == par.iroot:
27 changes: 27 additions & 0 deletions uips/inputs/downsampler.py
@@ -0,0 +1,27 @@
import os

import numpy
import torch

from ..options import UIPSOptions


class DownSampler:
    def __init__(self, options: UIPSOptions):
        os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
        self.options = options

        self.configure_torch()
        self.configure_numpy()

    def configure_torch(self):
        """Configure torch (device, default dtype, seed) based on UIPSOptions."""
        if self.options.use_gpu:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        torch.set_default_dtype(torch.float32)
        torch.manual_seed(self.options.seed)

    def configure_numpy(self):
        """Seed numpy's global RNG based on UIPSOptions."""
        numpy.random.seed(self.options.seed)
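
Below is a minimal, hypothetical usage sketch of the DownSampler helper added above; the CPU-only option values are illustrative assumptions, not values prescribed by this PR.

from uips.options import UIPSOptions
from uips.inputs.downsampler import DownSampler

# Assumed example values; any UIPSOptions instance is handled the same way
opts = UIPSOptions(use_gpu=False, seed=7)
sampler = DownSampler(opts)  # seeds torch and numpy, records the torch device
print(sampler.device)  # -> cpu, because use_gpu=False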

66 changes: 66 additions & 0 deletions uips/options.py
@@ -0,0 +1,66 @@
from dataclasses import dataclass, field, asdict
import os
import typing as t
import enum

class PDFMethodEnum(enum.Enum):
NORMALIZINGFLOW = enum.auto()
BINS = enum.auto()

    @classmethod
    def lookup(cls, name):
        for member in cls:
            if member.name.lower() == name.lower():
                return member
        raise ValueError(f"PDFMethodEnum has no member {name}")

@dataclass
class stepOptions:
# Number of epochs for each normalizing flow training
nEpochs: int = 5
# Batch size for normalizing flow training
batch_size_train: int = 25_000
# Subset of data on which training is done
nWorkingData: int = 10_000
data_freq_adjustment: int = 1
# Learning rate during normalizing flow training
learning_rate: float = 2e-4
num_bins: int = 4
hidden_features: int = 12
nCouplingLayer: int = 2
num_blocks: int = 3

@dataclass
class UIPSOptions:
num_pdf_iter: int = 2
    stepOptionsList: t.List[stepOptions] = field(
        default_factory=lambda: [stepOptions(), stepOptions()]
    )
pdfMethod: str = "NormalizingFlow"
    batch_size_eval: int = 25_000
seed: int = 42
printTiming: bool = False
nDatReduced: int = -1
nDimReduced: int = -1
    # Indices of the dimensions (columns) to keep; None keeps all dimensions
    dimList: t.Optional[t.List[int]] = None
dataFile: "os.PathLike[str]" = "./datafile.npy"
preShuffled: bool = True
scalerFile: "os.PathLike[str]" = "scaler.npz"
    nSamples_list: t.List[int] = field(default_factory=lambda: [1_000, 10_000, 100_000])
    nSampleCriterionLimit: int = 100_000
computeDistanceCriterion: bool = True
prefixDownsampledData: str = "downSampledData"
use_gpu: bool = True
num_pdf_bins: int = 20
data_freq_adjustment: int = 1
# Subset of data used to adjust the sampling probability to the desired number of samples
# If nWorkingDataAdjustment < 0, all data with the prescribed frequency is used
nWorkingDataAdjustment: int = -1

@property
def nSamples(self) -> t.List[int]:
return [int(n) for n in self.nSamples_list]

    def __post_init__(self):
        if len(self.stepOptionsList) != self.num_pdf_iter:
            raise ValueError("Length of stepOptionsList must match num_pdf_iter")

    def stepOptions_as_list(self, field_name):
        return [asdict(step_option)[field_name] for step_option in self.stepOptionsList]
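
As a sanity check on how these dataclasses compose, here is a short sketch under the definitions above (the per-step epoch counts are illustrative); the sampler pulls per-iteration settings out of stepOptionsList via stepOptions_as_list:

from uips.options import UIPSOptions, stepOptions

opts = UIPSOptions(
    num_pdf_iter=2,
    stepOptionsList=[stepOptions(nEpochs=5), stepOptions(nEpochs=20)],
)
print(opts.stepOptions_as_list("nEpochs"))  # -> [5, 20]
print(opts.nSamples)  # -> [1000, 10000, 100000] (nSamples_list cast to int)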
52 changes: 21 additions & 31 deletions uips/sampler.py
@@ -39,7 +39,7 @@ def checkParamListLength(listToCheck, targetLength, name):


def makeParamList(strEntry, fun, inpt, pdf_iter):
num_pdf_iter = int(inpt["num_pdf_iter"])
num_pdf_iter = inpt.num_pdf_iter
param_list_inpt = [fun(n) for n in inpt[strEntry].split()]
checkParamListLength(param_list_inpt, num_pdf_iter, strEntry)
if len(param_list_inpt) == 1 and num_pdf_iter > 1:
@@ -66,7 +66,7 @@ def computeDistanceToClosestNeighbor(data):


def rescaleData(np_data, inpt):
scaler = np.load(inpt["scalerFile"])
scaler = np.load(inpt.scalerFile)
np_data_rescaled = np_data.copy()
np_data_rescaled = (np_data_rescaled - scaler["minVal"]) / (
0.125 * (scaler["maxVal"] - scaler["minVal"])
@@ -79,14 +79,11 @@ def createFlow(dim, pdf_iter, inpt):
base_transform_type = "spline"
grad_norm_clip_value = float(5)

hidden_features_list = makeParamList(
"hidden_features", str2int, inpt, pdf_iter
)
num_blocks_list = makeParamList("num_blocks", str2int, inpt, pdf_iter)
num_bins_list = makeParamList("num_bins", str2int, inpt, pdf_iter)
num_coupling_layer_list = makeParamList(
"nCouplingLayer", str2int, inpt, pdf_iter
)
hidden_features_list = inpt.stepOptions_as_list("hidden_features")

num_blocks_list = inpt.stepOptions_as_list("num_blocks")
num_bins_list = inpt.stepOptions_as_list("num_bins")
num_coupling_layer_list = inpt.stepOptions_as_list("nCouplingLayer")

transform = transforms.CompositeTransform(
[
@@ -147,16 +144,12 @@ def trainFlow(np_data, flow, pdf_iter, inpt):
# Timer
times = time.time()

learning_rate_list = makeParamList(
"learning_rate", str2float, inpt, pdf_iter
)
num_epochs_list = makeParamList("nEpochs", str2int, inpt, pdf_iter)
batch_size_list = makeParamList(
"batch_size_train", str2int, inpt, pdf_iter
)
learning_rate_list = inpt.stepOptions_as_list("learning_rate")
num_epochs_list = inpt.stepOptions_as_list("nEpochs")
batch_size_list = inpt.stepOptions_as_list("batch_size_train")

BATCH_SIZE = batch_size_list[pdf_iter]
EPOCHS = num_epochs_list[pdf_iter]
    BATCH_SIZE = batch_size_list[pdf_iter]
    EPOCHS = num_epochs_list[pdf_iter]

grad_norm_clip_value = float(5)
# create optimizer
@@ -168,7 +161,7 @@ def trainFlow(np_data, flow, pdf_iter, inpt):
return last_loss

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
use_gpu = (inpt["use_gpu"] == "True") and (torch.cuda.is_available())
    use_gpu = inpt.use_gpu and torch.cuda.is_available()

# Perform training on GPU if possible
if use_gpu:
@@ -232,7 +225,7 @@ def trainFlow(np_data, flow, pdf_iter, inpt):

# Timer
timee = time.time()
printTiming = inpt["printTiming"] == "True"
printTiming = inpt.printTiming
if printTiming:
par.printRoot(f"Time Train : {timee - times:.2f}s")

@@ -248,7 +241,7 @@ def trainBinPDF(np_data, pdf_iter, inpt):
return None, None
# Train
np_data_rescaled = rescaleData(np_data, inpt)
H, edges = np.histogramdd(np_data_rescaled, bins=int(inpt["num_pdf_bins"]))
H, edges = np.histogramdd(np_data_rescaled, bins=inpt.num_pdf_bins)
logProb = np.log(1e-16 + H / np.sum(H))

os.makedirs("TrainingLog", exist_ok=True)
@@ -260,7 +253,7 @@ def trainBinPDF(np_data, pdf_iter, inpt):

# Timer
timee = time.time()
printTiming = inpt["printTiming"] == "True"
printTiming = inpt.printTiming
if printTiming:
par.printRoot(f"Time Train : {timee - times:.2f}s")

@@ -359,7 +352,7 @@ def evalLogProbNF(flow, np_data_to_downsample, pdf_iter, inpt):
# Evaluation
flow.eval()
log_density_np = []
BATCH_SIZE = int(float(inpt["batch_size_eval"]))
BATCH_SIZE = inpt.batch_size_eval
to_downsample_loader = makePytorchData(
np_data_to_downsample, BATCH_SIZE, inpt, shuffle=False
)
@@ -389,7 +382,7 @@ def evalLogProbNF(flow, np_data_to_downsample, pdf_iter, inpt):

# Timer
timee = time.time()
printTiming = inpt["printTiming"] == "True"
printTiming = inpt.printTiming
if printTiming:
par.printRoot(f"Time Eval : {par.allmaxScalar(timee - times):.2f}s")

@@ -429,7 +422,7 @@ def evalLogProbBIN(np_data_to_downsample, pdf_iter, inpt):

# Timer
timee = time.time()
printTiming = inpt["printTiming"] == "True"
printTiming = inpt.printTiming
if printTiming:
par.printRoot(f"Time Eval : {par.allmaxScalar(timee - times):.2f}s")

@@ -476,13 +469,10 @@ def downSample(
inpt,
):
# Mode of adjustment of the sampling probability
nWorkingDataAdjustment = int(float(inpt["nWorkingDataAdjustment"]))
nWorkingDataAdjustment = inpt.nWorkingDataAdjustment
if nWorkingDataAdjustment < 0:
use_serial_adjustment = False
try:
data_freq_adjustment = int(inpt["data_freq_adjustment"])
except KeyError:
data_freq_adjustment = 1
data_freq_adjustment = inpt.data_freq_adjustment
else:
use_serial_adjustment = True

29 changes: 15 additions & 14 deletions uips/utils/dataUtils.py
@@ -33,33 +33,34 @@ def checkData(shape, N, d, nWorkingData, nWorkingDataAdjustment, useNF):


# @profile
def prepareData(inpt):
def prepareData(inpt, dataset=None):
# Set parameters from input
dataFile = find_data(inpt["dataFile"])
preShuffled = inpt["preShuffled"] == "True"
scalerFile = inpt["scalerFile"]
nWorkingDatas = [int(float(n)) for n in inpt["nWorkingData"].split()]
if dataset is None:
# Load the dataset but don't read it just yet
dataFile = find_data(inpt.dataFile)
dataset = np.load(dataFile, mmap_mode="r")

preShuffled = inpt.preShuffled
scalerFile = inpt.scalerFile
nWorkingDatas = inpt.stepOptions_as_list("nWorkingData")
if len(nWorkingDatas) == 1:
nWorkingDatas = nWorkingDatas * int(inpt["num_pdf_iter"])
nWorkingDataAdjustment = int(float(inpt["nWorkingDataAdjustment"]))
nWorkingDatas = nWorkingDatas * inpt.num_pdf_iter
nWorkingDataAdjustment = int(inpt.nWorkingDataAdjustment)

# Reduce Data
try:
dimList = [int(n) for n in inpt["dimList"].split()]
        dimList = [int(n) for n in inpt.dimList]
except:
dimList = None
try:
nDimReduced = int(float(inpt["nDimReduced"]))
nDimReduced = inpt.nDimReduced
except:
nDimReduced = -1
try:
nDatReduced = int(float(inpt["nDatReduced"]))
nDatReduced = inpt.nDatReduced
except:
nDatReduced = -1

# Load the dataset but don't read it just yet
dataset = np.load(dataFile, mmap_mode="r")

# Check that dataset shape make sense
if nDatReduced > 0:
nFullData = min(dataset.shape[0], nDatReduced)
@@ -72,7 +73,7 @@ def prepareData(inpt):
else:
nDim = dataset.shape[1]
if par.irank == par.iroot:
useNF = inpt["pdf_method"].lower() == "normalizingflow"
useNF = inpt.pdfMethod.lower() == "normalizingflow"
checkData(
dataset.shape,
nFullData,