Skip to content

Commit f2907f6

Browse files
committed
add dataset operation with add datapoint feature
1 parent 1144e16 commit f2907f6

File tree

6 files changed

+129
-17
lines changed

6 files changed

+129
-17
lines changed

docs/commands.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ There are five hpcadvisor commands:
55

66
- `deploy`: operations related to a deployment
77
- `collect`: operation(s) related to data collection (i.e. execution of tasks)
8+
- `dataset`: operation(s) related to manipulation of the dataset (e.g. filter,
9+
add)
810
- `plot`: operation(s) related to generation of plots
911
- `advice`: operations(s) related to generation of advice
1012
- `gui`: trigger of the graphical user interface (gui) via browser
@@ -68,6 +70,24 @@ options:
6870
Clear tasks
6971
```
7072

73+
## dataset
74+
75+
Manipulate the dataset — the file containing data generated by job executions. For now,
76+
one can (i) filter out data, based on a filter and output file; and (ii) add
77+
a new data point to the dataset file. The dataset file is located at
78+
`$HOME/.hpcadvisor/dataset.json`
79+
80+
```
81+
usage: hpcadvisor dataset [-h] [-i INPUT] [-o OUTPUT] operation
82+
83+
options:
84+
-h, --help show this help message and exit
85+
-i INPUT, --input INPUT
86+
Data filter | New datapoints
87+
-o OUTPUT, --output OUTPUT
88+
Filtered file
89+
```
90+
7191

7292
## plot
7393

docs/howto.md

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# How-To
2+
3+
4+
## Add datapoint to dataset
5+
6+
7+
Assume you have your own way of running a job, with your own way of setting up
8+
the environment. You've learnt how the job behaved and would like to add this
9+
datapoint to the HPCAdvisor dataset file. Here is how you can do it.
10+
11+
First, the dataset file is located at `$HOME/.hpcadvisor/dataset.json`. You
12+
don't need to create this file; HPCAdvisor will create it for you.
13+
14+
Now you need a json file with your data point created by yourself. For instance:
15+
16+
```json
17+
{
18+
"timestamp": "2024-07-28-10-46",
19+
"sku": "standard_hc44rs",
20+
"nnodes": 4,
21+
"total_cores": 176,
22+
"ppr_perc": 100,
23+
"cpu_usage": [
24+
33.079166666666666,
25+
33.547,
26+
33.17916666666667,
27+
40.756428571428565
28+
],
29+
"exec_time": 432,
30+
"appinputs": {
31+
"BLOCKMESH_DIMENSIONS": "60 18 18"
32+
},
33+
"deployment": "myenvironment",
34+
"region": "southcentralus",
35+
"appname": "openfoam",
36+
"tags": {
37+
"appname": "openfoam",
38+
"version": "v8",
39+
"resolution": "60_18_18",
40+
"poolid": "pool-2407280944qsb",
41+
"taskid": "task-compute-2407281039pak"
42+
},
43+
"appexectime": 165,
44+
"appmetrics": {
45+
"FOAMRUNCLOCKTIME": "165",
46+
"FOAMMESHCELLS": "8257533",
47+
"APPEXECTIME": "165"
48+
}
49+
}
50+
```
51+
52+
Once HPCAdvisor is installed or enabled via `poetry` (see installation guide), just run:
53+
54+
```bash
55+
./bin/hpcadvisor dataset add -i mydatapoint.json
56+
```
57+
58+

src/hpcadvisor/__main__.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,18 +109,18 @@ def selecttask_handler(args):
109109
main_cli.main_selecttask(operation, userinput, taskfile, policy, numtasks)
110110

111111

112-
def datafilter_handler(args):
112+
def dataset_handler(args):
113113

114114
operation = args.operation
115-
datafilter = args.datafilter
116-
exportfile = args.exportfile
115+
input = args.input
116+
output = args.output
117117

118118
from hpcadvisor import main_cli
119119

120-
if operation == "export":
121-
main_cli.main_datafilter(operation, datafilter, exportfile)
120+
if operation == "filter" or operation == "add":
121+
main_cli.main_dataset(operation, input, output)
122122
else:
123-
print(f"Invalid operation: {operation}. Supported operations: export")
123+
print(f"Invalid operation: {operation}. Supported operations: filter | add")
124124
sys.exit(1)
125125

126126

@@ -216,11 +216,13 @@ def _process_arguments():
216216
selecttask.add_argument("-n", "--numtasks", help="Number of tasks", required=False)
217217
selecttask.set_defaults(func=selecttask_handler)
218218

219-
datafilter = subparsers.add_parser("datafilter", help="Datafilter help")
220-
datafilter.add_argument("operation", type=str)
221-
datafilter.add_argument("-df", "--datafilter", help="Data filter", required=True)
222-
datafilter.add_argument("-e", "--exportfile", help="Exported file", required=True)
223-
datafilter.set_defaults(func=datafilter_handler)
219+
dataset = subparsers.add_parser("dataset", help="Dataset handler help")
220+
dataset.add_argument("operation", type=str)
221+
dataset.add_argument(
222+
"-i", "--input", help="Data filter | New datapoints", required=False
223+
)
224+
dataset.add_argument("-o", "--output", help="Filtered file", required=False)
225+
dataset.set_defaults(func=dataset_handler)
224226

225227
args = parser.parse_args()
226228

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,19 @@
22

33
import itertools
44

5-
from hpcadvisor import dataset_handler, logger, price_puller, taskset_handler
5+
from hpcadvisor import dataset_handler, logger, price_puller, taskset_handler, utils
66

77
log = logger.logger
88

99

10-
def export(datafilter, exportfile):
10+
def filter(datafilter, exportfile):
1111

1212
datapoints = dataset_handler.get_datapoints(datafilter)
1313
dataset_handler.store_datapoints(exportfile, datapoints)
14+
15+
16+
def add(newdatapoints_file):
17+
18+
datasetfile = utils.get_dataset_filename()
19+
20+
dataset_handler.add_datapoints_fromfile(datasetfile, newdatapoints_file)

src/hpcadvisor/dataset_handler.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,24 @@ def store_datapoints(dataset_file, datapoints):
1616
json.dump(datapoints, outfile, indent=2)
1717

1818

19+
def add_datapoints_fromfile(dataset_file, newdatapoints_file):
20+
newdatapoints = utils.get_data_from_file(newdatapoints_file)
21+
existing_data = {}
22+
23+
if os.path.exists(dataset_file):
24+
with open(dataset_file, "r") as file:
25+
existing_data = json.load(file)
26+
27+
if not datapoints_label in existing_data:
28+
existing_data[datapoints_label] = []
29+
30+
# for datapoint in newdatapoints[datapoints_label]:
31+
existing_data[datapoints_label].append(newdatapoints)
32+
33+
with open(dataset_file, "w") as outfile:
34+
json.dump(existing_data, outfile, indent=2)
35+
36+
1937
def add_datapoint(dataset_file, datapoint):
2038
existing_data = {}
2139

src/hpcadvisor/main_cli.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from hpcadvisor import (
66
batch_handler,
77
cli_advice_generator,
8-
cli_data_filter,
8+
cli_dataset,
99
cli_plot_generator,
1010
cli_task_selector,
1111
data_collector,
@@ -83,9 +83,16 @@ def main_advice(datafilter, appexectime):
8383
cli_advice_generator.generate_advice(datafilter, appexectime)
8484

8585

86-
def main_datafilter(operation, datafilter, exportfile):
87-
log.info("Data filtering ...")
88-
cli_data_filter.export(datafilter, exportfile)
86+
def main_dataset(operation, input, output):
87+
if operation == "filter":
88+
log.info("Filtering data...")
89+
if not input or not output:
90+
log.error("Input and output files are required for filtering.")
91+
return
92+
cli_dataset.filter(input, output)
93+
elif operation == "add":
94+
log.info("Adding data...")
95+
cli_dataset.add(input)
8996

9097

9198
def main_selecttask(operation, userinput, taskfile, policy_name, num_tasks):

0 commit comments

Comments
 (0)