diff --git a/client_cmd.py b/client_cmd.py
index b783761..73297a7 100644
--- a/client_cmd.py
+++ b/client_cmd.py
@@ -34,12 +34,9 @@
 parser.add_argument("--smooth_method", type=str, default=None, help="Weight smoothing")
 parser.add_argument("--seed", type=int, default=42, help="Seed")
 parser.add_argument("--local_port", type=int, default=8081, help="Local port")
-parser.add_argument("--data_path", type=str, default=None, help="Data path")
 parser.add_argument("--production_mode", type=str, default="True", help="Production mode")
-parser.add_argument("--certs_path", type=str, default="./", help="Certificates path")
 parser.add_argument("--node_name", type=str, default="./", help="Node name for certificates")
-parser.add_argument("--sandbox_path", type=str, default="./", help="Sandbox path to use")
 parser.add_argument("--experiment", type=json.loads, default={"name": "experiment_1", "log_path": "logs", "debug": "true"}, help="experiment logs")
 parser.add_argument("--smoothWeights", type=json.loads, default= {"smoothing_strenght": 0.5}, help="Smoothing parameters")
 parser.add_argument("--linear_models", type=json.loads, default={"n_features": 9}, help="Linear model parameters")
@@ -48,9 +45,16 @@
 parser.add_argument("--weighted_random_forest", type=json.loads, default={"balanced_rf": "true", "levelOfDetail": "DecisionTree"}, help="Weighted random forest parameters")
 parser.add_argument("--xgb", type=json.loads, default={"batch_size": 32,"num_iterations": 100,"task_type": "BINARY","tree_num": 500}, help="XGB parameters")
+# Variables hardcoded
+parser.add_argument("--sandbox_path", type=str, default="./sandbox", help="Sandbox path to use")
+parser.add_argument("--certs_path", type=str, default="./certs_path", help="Certificates path")
+parser.add_argument("--data_path", type=str, default="./data", help="Data path")
 
 args = parser.parse_args()
 config = vars(args)
+# config["sandbox_path"] = "./sandbox"
+# config["certs_path"] = "/app/config/certificates"
+# config["data_path"] = "./data"
 
 if config["model"] in ("logistic_regression", "elastic_net", "lsvc"):
     config["linear_models"] = {}
diff --git a/flcore/datasets.py b/flcore/datasets.py
index 3ecebd1..8fb4935 100644
--- a/flcore/datasets.py
+++ b/flcore/datasets.py
@@ -564,19 +564,34 @@ def load_dt4h(config,id):
     dat_len = len(dat)
 
-    # Numerical variables
     numeric_columns_non_zero = {}
-    for feat in metadata["entries"][0]["featureSet"]["features"]:
-        if feat["dataType"] == "NUMERIC" and feat["statistics"]["numOfNotNull"] != 0:
+    for feat in metadata["entity"]["features"]:
+        if (feat["dataType"] == "NUMERIC"
+            and feat["name"] in train_labels
+            and metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["numOfNotNull"] != 0):
             # statistic keys = ['Q1', 'avg', 'min', 'Q2', 'max', 'Q3', 'numOfNotNull']
             numeric_columns_non_zero[feat["name"]] = (
-                feat["statistics"]["Q1"],
-                feat["statistics"]["avg"],
-                feat["statistics"]["min"],
-                feat["statistics"]["Q2"],
-                feat["statistics"]["max"],
-                feat["statistics"]["Q3"],
-                feat["statistics"]["numOfNotNull"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["q1"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["avg"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["min"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["q2"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["max"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["q3"],
+                metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["numOfNotNull"],
+            )
+    for feat in metadata["entity"]["outcomes"]:
+        if (feat["dataType"] == "NUMERIC"
+            and feat["name"] in target_labels
+            and metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["numOfNotNull"] != 0):
+            # statistic keys = ['Q1', 'avg', 'min', 'Q2', 'max', 'Q3', 'numOfNotNull']
+            numeric_columns_non_zero[feat["name"]] = (
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["q1"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["avg"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["min"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["q2"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["max"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["q3"],
+                metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["numOfNotNull"],
             )
 
     for col, (q1,avg,mini,q2,maxi,q3,numOfNotNull) in numeric_columns_non_zero.items():
@@ -587,28 +602,59 @@ def load_dt4h(config,id):
             pass # no std found in data set
         elif config["normalization_method"] == "MIN_MAX":
             dat[col] = min_max_normalize(col, mini, maxi)
-    tipos=[]
+    #tipos=[]
     map_variables = {}
-    for feat in metadata["entries"][0]["featureSet"]["features"]:
-        tipos.append(feat["dataType"])
-        if feat["dataType"] == "NOMINAL" and feat["statistics"]["numOfNotNull"] != 0:
-            num_cat = len(feat["statistics"]["valueset"])
+    for feat in metadata["entity"]["features"]:
+        if (feat["dataType"] == "NOMINAL"
+            and feat["name"] in train_labels
+            and metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["numOfNotNull"] != 0):
+            #print("FEAT", feat["name"])
+            map_cat = {}
+            if "valueSet" in feat.keys():
+                for ind, cat_ in enumerate(feat["valueSet"]["concept"]):
+                    #print(ind,cat_["code"])
+                    cat = cat_["code"]
+                    map_cat[cat] = ind
+            else:
+                pass
+                #print("NO",feat["name"])
+            map_variables[feat["name"]] = map_cat
+
+    for feat in metadata["entity"]["outcomes"]:
+        if (feat["dataType"] == "NOMINAL"
+            and feat["name"] in target_labels
+            and metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["numOfNotNull"] != 0):
+            #print("FEAT", feat["name"])
             map_cat = {}
-            for ind, cat in enumerate(feat["statistics"]["valueset"]):
-                map_cat[cat] = ind
+            if "valueSet" in feat.keys():
+                for ind, cat_ in enumerate(feat["valueSet"]["concept"]):
+                    #print(ind,cat_["code"])
+                    cat = cat_["code"]
+                    map_cat[cat] = ind
+            else:
+                pass # temporal
+                #print("NO",feat["name"])
             map_variables[feat["name"]] = map_cat
+
     for col,mapa in map_variables.items():
         dat[col] = dat[col].map(mapa)
     dat[map_variables.keys()].dropna()
-
-    tipos=[]
+
     map_variables = {}
     boolean_map = {np.bool_(False) :0, np.bool_(True):1, "False":0,"True":1}
-    for feat in metadata["entries"][0]["featureSet"]["features"]:
-        tipos.append(feat["dataType"])
-        if feat["dataType"] == "BOOLEAN" and feat["statistics"]["numOfNotNull"] != 0:
+    for feat in metadata["entity"]["features"]:
+        if (feat["dataType"] == "BOOLEAN"
+            and feat["name"] in train_labels
+            and metadata["entity"]["datasetStats"]["featureStats"][feat["name"]]["numOfNotNull"] != 0):
             map_variables[feat["name"]] = boolean_map
+
+    for feat in metadata["entity"]["outcomes"]:
+        if (feat["dataType"] == "BOOLEAN"
+            and feat["name"] in target_labels
+            and metadata["entity"]["datasetStats"]["outcomeStats"][feat["name"]]["numOfNotNull"] != 0):
+            map_variables[feat["name"]] = boolean_map
+
     for col,mapa in map_variables.items():
         dat[col] = dat[col].map(boolean_map)