added tests for FairGBM openml interface

andre.cruz · andre.cruz · commit 23ca1561ae16 · 2022-06-01T17:30:17.000+01:00
diff --git a/openml-lightgbm/lightgbm-provider/src/test/java/com/feedzai/openml/provider/lightgbm/FairGBMBinaryClassificationModelTrainerTest.java b/openml-lightgbm/lightgbm-provider/src/test/java/com/feedzai/openml/provider/lightgbm/FairGBMBinaryClassificationModelTrainerTest.java
@@ -5,14 +5,21 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import com.feedzai.openml.data.Dataset;
+import com.feedzai.openml.data.Instance;
 import com.feedzai.openml.data.schema.DatasetSchema;
+import com.feedzai.openml.mocks.MockDataset;
 import com.feedzai.openml.provider.exception.ModelLoadingException;
 
 import static com.feedzai.openml.provider.lightgbm.FairGBMDescriptorUtil.CONSTRAINT_GROUP_COLUMN_PARAMETER_NAME;
+import static com.feedzai.openml.provider.lightgbm.LightGBMBinaryClassificationModelTrainerTest.average;
+import static com.feedzai.openml.provider.lightgbm.LightGBMBinaryClassificationModelTrainerTest.ensureFeatureContributions;
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
 
 /**
  * Tests for using the LightGBMBinaryClassificationModelTrainer class with FairGBM.
@@ -71,6 +78,73 @@ public static void setupFixture() {
 //        MODEL_PARAMS.replace(NUM_ITERATIONS_PARAMETER_NAME, NUM_ITERATIONS_FOR_FAST_TESTS);
     }
 
+    /**
+     * Asserts that a model trained with numericals+categoricals and evaluated on the same datasource
+     * has on average higher scores for the positive class (1) than for the negative one (0).
+     *
+     * @throws URISyntaxException    In case of error retrieving the data resource path.
+     * @throws IOException           In case of error reading data.
+     * @throws ModelLoadingException In case of error training the model.
+     */
+    @Test
+    public void fitWithNumericalsAndCategoricals() throws URISyntaxException, IOException, ModelLoadingException {
+
+        final ArrayList<List<Double>> scoresPerClass = fitModelAndGetFirstScoresPerClass(
+                DATASET_RESOURCE_NAME,
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_START,
+                MAX_NUMBER_OF_INSTANCES_TO_TRAIN,
+                MAX_NUMBER_OF_INSTANCES_TO_SCORE,
+                SMALL_TRAIN_DATA_CHUNK_INSTANCES_SIZE
+        );
+
+        assertThat(average(scoresPerClass.get(0))).as("score average per class")
+                                                  .isLessThan(average(scoresPerClass.get(1)));
+    }
+
+    /**
+     * Asserts that, in general, training+scoring a model on schemas where the position of
+     * the label changes yields exactly the same scores.
+     * <p>
+     * This guards against regressions in the data-copying code used during train, which at the start
+     * of development resulted in broken scores (mostly constant) that were very hard to diagnose.
+     *
+     * @throws URISyntaxException    In case of error retrieving the data resource path.
+     * @throws IOException           In case of error reading data.
+     * @throws ModelLoadingException In case of error training the model.
+     */
+    @Test
+    public void fitCategoricalsWithLabelInStartMiddleOrEndHasSameResults()
+            throws URISyntaxException, IOException, ModelLoadingException {
+
+        final ArrayList<List<Double>> scoresPerClassForLabelAtStart = fitModelAndGetFirstScoresPerClass(
+                DATASET_RESOURCE_NAME,
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_START,
+                MAX_NUMBER_OF_INSTANCES_TO_TRAIN,
+                MAX_NUMBER_OF_INSTANCES_TO_SCORE,
+                SMALL_TRAIN_DATA_CHUNK_INSTANCES_SIZE
+        );
+
+        final ArrayList<List<Double>> scoresPerClassForLabelInMiddle = fitModelAndGetFirstScoresPerClass(
+                DATASET_RESOURCE_NAME,
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_IN_MIDDLE,
+                MAX_NUMBER_OF_INSTANCES_TO_TRAIN,
+                MAX_NUMBER_OF_INSTANCES_TO_SCORE,
+                SMALL_TRAIN_DATA_CHUNK_INSTANCES_SIZE
+        );
+
+        final ArrayList<List<Double>> scoresPerClassForLabelAtEnd = fitModelAndGetFirstScoresPerClass(
+                DATASET_RESOURCE_NAME,
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_END,
+                MAX_NUMBER_OF_INSTANCES_TO_TRAIN,
+                MAX_NUMBER_OF_INSTANCES_TO_SCORE,
+                SMALL_TRAIN_DATA_CHUNK_INSTANCES_SIZE
+        );
+
+        assertThat(scoresPerClassForLabelAtStart).as("scores")
+                                                 .isEqualTo(scoresPerClassForLabelInMiddle) // start vs. middle
+                                                 .isEqualTo(scoresPerClassForLabelAtEnd);   // start vs. end
+    }
+
     @Test
     public void fitResultsAreIndependentOfTrainChunkSizes()
             throws URISyntaxException, IOException, ModelLoadingException {
@@ -104,6 +178,77 @@ public void fitResultsAreIndependentOfTrainChunkSizes()
                                          .isEqualTo(scoresWithSingleChunk);
     }
 
+    /**
+     * Asserts that training with no instances fails with a {@link RuntimeException}.
+     */
+    @Test
+    public void fitWithNoInstances() {
+
+        final List<Instance> noInstances = new ArrayList<>();
+        final Dataset emptyDataset = new MockDataset(TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_START, noInstances);
+
+        assertThatThrownBy(() ->
+                new LightGBMModelCreator().fit(
+                        emptyDataset,
+                    new Random(),
+                    MODEL_PARAMS
+                )
+        )
+        .isInstanceOf(RuntimeException.class);
+    }
+
+    /**
+     * Tests feature contributions with the target at the end of the schema.
+     *
+     * @throws URISyntaxException For errors when loading the dataset resource.
+     * @throws IOException        For errors when reading the dataset.
+     * @since 1.3.0
+     */
+    @Test
+    public void testFeatureContributionsTargetEnd() throws URISyntaxException, IOException {
+        final Dataset dataset = CSVUtils.getDatasetWithSchema(
+                TestResources.getResourcePath(DATASET_RESOURCE_NAME),
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_END,
+                10000
+        );
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
+    }
+
+    /**
+     * Tests feature contributions with the target in the middle of the schema.
+     *
+     * @throws URISyntaxException For errors when loading the dataset resource.
+     * @throws IOException        For errors when reading the dataset.
+     * @since 1.3.0
+     */
+    @Test
+    public void testFeatureContributionsTargetMiddle() throws URISyntaxException, IOException {
+        final Dataset dataset = CSVUtils.getDatasetWithSchema(
+                TestResources.getResourcePath(DATASET_RESOURCE_NAME),
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_IN_MIDDLE,
+                10000
+        );
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
+    }
+
+    /**
+     * Tests feature contributions with the target at the beginning of the schema.
+     *
+     * @throws URISyntaxException For errors when loading the dataset resource.
+     * @throws IOException        For errors when reading the dataset.
+     * @since 1.3.0
+     */
+    @Test
+    public void testFeatureContributionsTargetBeginning() throws URISyntaxException, IOException {
+        final Dataset dataset = CSVUtils.getDatasetWithSchema(
+                TestResources.getResourcePath(DATASET_RESOURCE_NAME),
+                TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_START,
+                10000
+        );
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
+    }
+
+
     static ArrayList<List<Double>> fitModelAndGetFirstScoresPerClass(
             final String datasetResourceName,
             final DatasetSchema schema,
diff --git a/openml-lightgbm/lightgbm-provider/src/test/java/com/feedzai/openml/provider/lightgbm/LightGBMBinaryClassificationModelTrainerTest.java b/openml-lightgbm/lightgbm-provider/src/test/java/com/feedzai/openml/provider/lightgbm/LightGBMBinaryClassificationModelTrainerTest.java
@@ -317,7 +317,7 @@ public void testFeatureContributionsTargetEnd() throws URISyntaxException, IOExc
                 TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_END,
                 10000
         );
-        ensureFeatureContributions(dataset);
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
     }
 
     /**
@@ -334,7 +334,7 @@ public void testFeatureContributionsTargetMiddle() throws URISyntaxException, IO
                 TestSchemas.CATEGORICALS_SCHEMA_LABEL_IN_MIDDLE,
                 10000
         );
-        ensureFeatureContributions(dataset);
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
     }
 
     /**
@@ -351,7 +351,7 @@ public void testFeatureContributionsTargetBeginning() throws URISyntaxException,
                 TestSchemas.CATEGORICALS_SCHEMA_LABEL_AT_START,
                 10000
         );
-        ensureFeatureContributions(dataset);
+        ensureFeatureContributions(dataset, MODEL_PARAMS);
     }
 
     /**
@@ -360,12 +360,12 @@ public void testFeatureContributionsTargetBeginning() throws URISyntaxException,
      * @param dataset The {@link Dataset}.
      * @since 1.3.0
      */
-    private void ensureFeatureContributions(final Dataset dataset) {
+    static void ensureFeatureContributions(final Dataset dataset, final Map<String, String> modelParams) {
         final int targetIndex = dataset.getSchema().getTargetIndex().get();
         final int num1Index = 1;
         final int cat1Index = 4;
 
-        final Map<String, String> trainParams = new HashMap<>(MODEL_PARAMS);
+        final Map<String, String> trainParams = new HashMap<>(modelParams);
         trainParams.replace(NUM_ITERATIONS_PARAMETER_NAME, "100");
 
         final LightGBMBinaryClassificationModel model = new LightGBMModelCreator().fit(
@@ -484,7 +484,7 @@ static ArrayList<List<Double>> getClassScores(final Dataset dataset,
      * @param inputArray Input array from which to compute the average.
      * @return Average
      */
-    double average(final List<Double> inputArray) {
+    static double average(final List<Double> inputArray) {
 
         double sum = 0.0;
         for (final Double x : inputArray) {