hpcc-systems · vzeufack · Aug 24, 2020 · Jan 28, 2021 · Jan 28, 2021 · lilyclemson
diff --git a/Preprocessing/LabelEncoder.ecl b/Preprocessing/LabelEncoder.ecl
@@ -0,0 +1,206 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems®.  All rights reserved.
+############################################################################## */
+
+/**
+ * Convert categorical values into discrete numbers
+ * in the range [0 .. (n - 1)] where n is the number of categories of a feature.
+ */
+EXPORT LabelEncoder := MODULE
+  /**
+   * Builds a mapping between feature names and categories.
+   * 
+   * @param dataForUndefinedCategories: any record-oriented dataset.
+   *   <p>The data from which the categories are extracted 
+   *   if not predefined in the list of categorical features.
+   *
+   * @param partialKey: same record structure as the key (see below).                
+   *   <p> Mapping between feature names and categories. 
+   *   Some names are mapped to empty categories such that 
+   *   their categories could be extracted from dataForUndefinedCategories.
+   *   Names which are mapped to non-empty categories will be assigned the same categories.
+   *
+   * @return key: DATASET(KeyLayout)
+   *   <p>The full mapping between categorical feature names and their categories.
+   *   Its record structure has the following format:
+   *   <p>
+   *   <pre>
+   *   KeyLayout := RECORD
+   *     SET OF STRING <name of categorical feature 1>;
+   *     SET OF STRING <name of categorical feature 2>;
+   *     ...
+   *     SET OF STRING <name of categorical feature n>;
+   *   END;
+   *   </pre>
+   */
+  EXPORT GetKey(dataForUndefinedCategories, partialKey) := FUNCTIONMACRO
+    IMPORT ML_Core;
+
+    // Expands to a PROJECT over partialKey in which every field holding an
+    // empty set is replaced by Utl.GetCategories(<data argument>, <field>).
+    Utl := ML_Core.Preprocessing.Utils;
+
+    // Source text of the data argument; it is spliced into the generated
+    // GetCategories calls below.
+    sourceText := #TEXT(dataForUndefinedCategories);
+
+    FullKeyLayout := RECORDOF(partialKey);
+    #EXPORTXML(partialKeyInfo, partialKey)
+
+    // One generated assignment per field of the key record: keep the supplied
+    // set when it is non-empty, otherwise fall back to Utl.GetCategories.
+    FullKeyLayout fillEmptySets(FullKeyLayout L) := TRANSFORM
+      #FOR(partialKeyInfo)
+        #FOR(field)
+          #EXPAND('SELF.' + %'@label'% + ' := IF(EXISTS(L.' + %'@label'% + '), '
+                  + 'L.' + %'@label'% + ','
+                  + 'Utl.GetCategories(' + sourceText + ',' + %'@label'% + '))');
+        #END
+      #END
+    END;
+
+    RETURN PROJECT(partialKey, fillEmptySets(LEFT));
+  ENDMACRO;
+
+/**
+  * Builds a lookup table that maps each category of a feature to a unique number.
+  * Each category is assigned its index in the category set.
+  *
+  * @param key: DATASET(KeyLayout).
+  *   <p> Mapping between feature names and categories.
+  *
+  * @return categoriesMapping: DATASET(MappingLayout).
+  *   <p> A table with each feature name mapped to its categories and each category
+  *   mapped to its value.
+  *
+  *   <pre>
+  *   //record mapping a category to its value.
+  *   Category := RECORD
+  *     STRING categoryName;
+  *     INTEGER value;
+  *   END;
+  *   
+  *   //record mapping feature names to their categories.
+  *   MappingLayout := RECORD
+  *     STRING featureName;
+  *     DATASET(Category) categories;
+  *   END;
+  *   </pre>
+  */
+  EXPORT GetMapping(key) := FUNCTIONMACRO
+    IMPORT ML_Core;
+
+    // Delegate to the helper that Encode and Decode also call, reached through
+    // ML_Core.Preprocessing.Utils, so that all three agree on the mapping.
+    RETURN ML_Core.Preprocessing.Utils.LabelEncoder.MapCategoriesToValues(key);
+  ENDMACRO;
+
+  /**
+    * Replaces each categorical value in the data with its index in the key.
+    * Every unknown category (not in the key) is replaced by -1.
+    *
+    * @param dataToEncode: any dataset.
+    *   <p> The data to encode.
+    *
+    * @param key: DATASET(KeyLayout).
+    *   <p> Mapping between feature names and their categories.
+    *
+    * @return encodedData: same record structure as dataToEncode 
+    *   with the datatype of all categorical features changed to INTEGER.
+    *   <p> Data with categorical values replaced by numbers.
+    */
+  EXPORT Encode(dataToEncode, key) := FUNCTIONMACRO
+    IMPORT ML_Core;
+
+    PrepUtils := ML_Core.Preprocessing.Utils;
+
+    // Template symbols: mapping names the lookup table; categories and
+    // category are overwritten with generated expression text for each field.
+    #UNIQUENAME(mapping)
+    #UNIQUENAME(categories)
+    #UNIQUENAME(category)
+
+    // Lookup table queried below by featureName, then by categoryName.
+    %mapping% := PrepUtils.LabelEncoder.MapCategoriesToValues(key);
+
+    // Output layout: fields of dataToEncode named in the key become INTEGER,
+    // every other field is re-declared from its exported type.
+    // NOTE(review): @type is documented as the type name without its size, so
+    // fixed-width fields may lose their width here - confirm.
+    categoricalNames := PrepUtils.GetFeatureNames(key);
+    #EXPORTXML(dataMetaInfo, RECORDOF(dataToEncode))
+    EncodedLayout := RECORD
+      #FOR(dataMetaInfo)
+        #FOR(field)
+          #IF(%'@label'% IN categoricalNames)
+            #EXPAND('INTEGER ' + %'@label'%);
+          #ELSE
+            #EXPAND(%'@type'% + ' ' + %'@label'%);
+          #END
+        #END
+      #END
+    END;
+
+    // For every key field, look the record's value up among that feature's
+    // categories; -1 marks a value that is absent from the key.
+    #EXPORTXML(keyMetaInfo, RECORDOF(key))
+    EncodedLayout encodeRow(RECORDOF(dataToEncode) L) := TRANSFORM
+      #FOR(keyMetaInfo)
+        #FOR(field)
+          #SET(categories, %'mapping'% + '(featureName = \'' + %'@label'% + '\')[1].categories')
+          #SET(category, %'categories'% + '(categoryName = (STRING)L.' + %'@label'% + ')')
+          SELF.%@label% := IF(EXISTS(%category%), %category%[1].value, -1);
+        #END
+      #END
+      SELF := L;   // fields not assigned above are copied unchanged
+    END;
+
+    RETURN PROJECT(dataToEncode, encodeRow(LEFT));
+  ENDMACRO;
+
+  /**
+    * Converts back the categorical values into their original labels.
+    * Every -1 is replaced by an empty string.
+    *
+    * @param dataToDecode: any dataset.
+    *   <p> The data to decode.
+    *
+    * @param encoderKey: DATASET(KeyLayout).
+    *   <p> Mapping between feature names and their categories.
+    *
+    * @return decodedData: same record structure as dataToDecode 
+    *   with the datatype of all categorical features changed to STRING.
+    *   <p> Data with categorical values replaced by their original labels.
+    */
+  EXPORT Decode(dataToDecode, encoderKey) := FUNCTIONMACRO
+    IMPORT ML_Core;
+
+    utils := ML_Core.Preprocessing.Utils;
+
+    // The macro parameter is named encoderKey, so every reference below uses
+    // that name; any other identifier would not be replaced by the caller's
+    // argument when the macro is expanded.
+
+    //build mapping between categories and values
+    #UNIQUENAME(mapping)
+    %mapping% := Utils.LabelEncoder.MapCategoriesToValues(encoderKey);
+
+    //build final record structure: fields named in the key become STRING,
+    //every other field is re-declared from its exported type
+    featureNameSET := Utils.GetFeatureNames(encoderKey);
+
+    #EXPORTXML(dataMetaInfo, RECORDOF(dataToDecode))
+    DecodedDataLayout := RECORD
+      #FOR(dataMetaInfo)
+        #FOR(field)
+          #IF(%'@label'% IN featureNameSET)
+            #EXPAND('STRING ' + %'@label'%);
+          #ELSE
+            #EXPAND(%'@type'% + ' ' + %'@label'%);
+          #END
+        #END
+      #END
+    END;
+
+    //replace values by original labels
+    #EXPORTXML(keyMetaInfo, RECORDOF(encoderKey))
+    #UNIQUENAME(categories)
+    #UNIQUENAME(category)
+    DecodedDataLayout replace (RECORDOF(dataToDecode) L):= TRANSFORM
+      #FOR(keyMetaInfo)
+        #FOR(field)
+          // categories: generated text selecting this feature's category list
+          // category:   generated text selecting the entry whose value matches
+          #SET(categories, %'mapping'% + '(featureName = \'' + %'@label'% + '\')[1].categories')
+          #SET(category, %'categories'% + '(value = L.' + %'@label'% + ')')
+          // Takes the first match without an EXISTS guard (unlike Encode)
+          SELF.%@label% := %category%[1].categoryName;
+        #END
+      #END
+      SELF := L;   // fields not assigned above are copied unchanged
+    END;
+
+    result := PROJECT(dataToDecode, replace(LEFT));
+    RETURN result;
+  ENDMACRO;
+END;
diff --git a/Preprocessing/MinMaxScaler.ecl b/Preprocessing/MinMaxScaler.ecl
@@ -0,0 +1,123 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems.  All rights reserved.
+############################################################################## */
+
+IMPORT $.^ as ML_Core;
+
+Types := ML_Core.Preprocessing.Types;
+KeyLayout := Types.MinMaxScaler.KeyLayout;
+FeatureMinMax := Types.MinMaxScaler.FeatureMinMax;
+NumericField := ML_Core.types.NumericField;
+t_FieldReal := ML_Core.types.t_FieldReal;
+
+/**
+ * Scales the values of each feature into the range [lowBound, highBound].
+ *
+ * @param baseData: DATASET(NumericField), Default = DATASET([], NumericField).           
+ *   <p> The data from which the minimums and maximums are determined.
+ *
+ * @param lowBound: t_FieldReal, Default = 0.0
+ *   <p> The minimum value of the normalized data.
+ *
+ * @param highBound: t_FieldReal, Default = 1.0
+ *   <p> The maximum value of the normalized data.
+ *
+ * @param key: DATASET(KeyLayout), Default = DATASET([], KeyLayout).
+ *   <p> The key to be reused for scaling/unscaling.
+ */
+EXPORT MinMaxScaler (DATASET(NumericField) baseData = DATASET([], NumericField),
+                     t_FieldReal lowBound = 0.0, t_FieldReal highBound = 1.0, 
+                     DATASET(KeyLayout) key = DATASET([], KeyLayout)) := MODULE
+
+  /**
+   * Get mins and maxs for each feature in baseData.
+   *
+   * @return minAndMaxByFeature: DATASET(KeyLayout).
+   */
+  SHARED ComputeKey() := FUNCTION
+    // One aggregate row per feature number, computed over every record of
+    // baseData. Features are discovered from the whole dataset rather than
+    // from the record with id = 1 only, so a missing or sparse first record
+    // no longer drops features from the key, and the data is aggregated once
+    // instead of being re-filtered for each feature.
+    perFeature := TABLE(baseData,
+                        {number,
+                         lowest := MIN(GROUP, value),
+                         highest := MAX(GROUP, value)},
+                        number);
+
+    FeatureMinMax ToMinMax(RECORDOF(perFeature) L) := TRANSFORM
+      SELF.featureId := L.number;
+      SELF.minValue := L.lowest;
+      SELF.maxValue := L.highest;
+    END;
+
+    // Sorted so that the key lists the features in a deterministic order
+    minsAndMaxs := SORT(PROJECT(perFeature, ToMinMax(LEFT)), featureId);
+
+    //add lowBound and highBound to key
+    Result := DATASET([{lowBound, highBound, minsAndMaxs}], KeyLayout);
+
+    // The target range must be non-empty; otherwise fail with error code 2
+    boundariesErrorMsg := 'lowBound must be strictly smaller than high bound';
+    RETURN IF(lowBound < highBound, Result, ERROR(KeyLayout, 2, boundariesErrorMsg));
+  END;
+
+  // Key used by Scale and unscale: the caller's key when one was supplied,
+  // otherwise a key computed from baseData; fails when neither was passed.
+  SHARED errorMsg := 'MinMaxScaler: must pass either baseData or key!';
+  SHARED innerKey := MAP(EXISTS(key)      => key,
+                         EXISTS(baseData) => ComputeKey(),
+                         ERROR(KeyLayout, 1, errorMsg));
+
+
+  /**
+   * Computes the key or reuses it if already given.
+   *
+   * @return the key: DATASET(KeyLayout).
+   */
+  // Exposes the module's internal key (supplied by the caller or computed).
+  EXPORT GetKey() := innerKey;
+
+
+  /**
+    * scale the data using the following formula:
+    * x' = min + ([(x - x_min)(max - min)]/(x_max - x_min))
+    *
+    * @param dataToScale: DATASET(NumericField)  .         
+    *   <p> The data to scale.
+    *
+    * @return the scaled data: DATASET(NumericField)
+    */
+  EXPORT Scale (DATASET(NumericField) dataToScale) := FUNCTION
+    // Target range, read from the first row of the key
+    low := innerKey[1].lowBound;
+    high := innerKey[1].highBound;
+
+    NumericField XF(NumericField L) := TRANSFORM
+      // Min and max recorded in the key for this record's feature; looked up
+      // once, through the same first key row that supplies low and high.
+      featureRange := innerKey[1].minsMaxs(featureId = L.number)[1];
+      minValue := featureRange.minValue;
+      maxValue := featureRange.maxValue;
+      featureSpread := maxValue - minValue;
+
+      // A feature whose min equals its max has no spread to rescale; map it to
+      // the lower bound explicitly instead of relying on the platform's
+      // division-by-zero setting.
+      SELF.value := IF(featureSpread = 0,
+                       low,
+                       low + (((L.value - minValue) * (high - low)) / featureSpread));
+      SELF := L;   // every other field is copied unchanged
+    END;
+
+    RETURN PROJECT(dataToScale, XF(LEFT));
+  END;
+
+  /**
+   * unscale the data using the following formula
+   * x = x_min + ((x' - min)(x_max - x_min))/(max-min)
+   *
+   * @param dataToUnscale: DATASET(NumericField)         
+   *  <p> The data to unscale.
+   *
+   * @return the unscaled data: DATASET(NumericField).
+   */
+  EXPORT unscale(DATASET(NumericField) dataToUnscale) := FUNCTION
+    // Range the data was scaled into, read from the first row of the key
+    low := innerKey[1].lowBound;
+    high := innerKey[1].highBound;
+    boundSpread := high - low;
+
+    NumericField XF(NumericField L) := TRANSFORM
+      // Min and max recorded in the key for this record's feature; looked up
+      // once, through the same first key row that supplies low and high.
+      featureRange := innerKey[1].minsMaxs(featureId = L.number)[1];
+      minValue := featureRange.minValue;
+      maxValue := featureRange.maxValue;
+
+      // A key whose bounds coincide cannot be inverted; return the feature
+      // minimum explicitly instead of relying on the platform's
+      // division-by-zero setting.
+      SELF.value := IF(boundSpread = 0,
+                       minValue,
+                       minValue + (((L.value - low) * (maxValue - minValue)) / boundSpread));
+      SELF := L;   // every other field is copied unchanged
+    END;
+
+    RETURN PROJECT(dataToUnscale, XF(LEFT));
+  END;
+END;