Skip to content

Commit f2907f6

Browse files
committed
add dataset operation with add datapoint feature
1 parent 1144e16 commit f2907f6

File tree

6 files changed

+129
-17
lines changed

6 files changed

+129
-17
lines changed

docs/commands.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ There are five hpcadvisor commands:
55

66
- `deploy`: operations related to a deployment
77
- `collect`: operation(s) related to data collection (i.e. execution of tasks)
8+
- `dataset`: operation(s) related to manipulation of the dataset (e.g. filter,
9+
add)
810
- `plot`: operation(s) related to generation of plots
911
- `advice`: operations(s) related to generation of advice
1012
- `gui`: trigger of the graphical user interface (gui) via browser
@@ -68,6 +70,24 @@ options:
6870
Clear tasks
6971
```
7072

73+
## dataset
74+
75+
Manipulate the dataset — the file containing data generated by job executions. For now,
76+
one can (i) filter out data, based on a filter and output file; and (ii) add
77+
a new data point to the dataset file. The dataset file is located at
78+
`$HOME/.hpcadvisor/dataset.json`
79+
80+
```
81+
usage: hpcadvisor dataset [-h] [-i INPUT] [-o OUTPUT] operation
82+
83+
options:
84+
-h, --help show this help message and exit
85+
-i INPUT, --input INPUT
86+
Data filter | New datapoints
87+
-o OUTPUT, --output OUTPUT
88+
Filtered file
89+
```
90+
7191

7292
## plot
7393

docs/howto.md

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# How-To
2+
3+
4+
## Add datapoint to dataset
5+
6+
7+
Assume you have your own way of running a job, with your own way of setting up
8+
the environment. You've learnt how the job behaved and would like to add this
9+
datapoint to the HPCAdvisor dataset file. Here is how you can do it.
10+
11+
First, the dataset file is located at `$HOME/.hpcadvisor/dataset.json`. You
12+
don't need to create this file; HPCAdvisor will create it for you.
13+
14+
Now you need a json file with your data point created by yourself. For instance:
15+
16+
```json
17+
{
18+
"timestamp": "2024-07-28-10-46",
19+
"sku": "standard_hc44rs",
20+
"nnodes": 4,
21+
"total_cores": 176,
22+
"ppr_perc": 100,
23+
"cpu_usage": [
24+
33.079166666666666,
25+
33.547,
26+
33.17916666666667,
27+
40.756428571428565
28+
],
29+
"exec_time": 432,
30+
"appinputs": {
31+
"BLOCKMESH_DIMENSIONS": "60 18 18"
32+
},
33+
"deployment": "myenvironment",
34+
"region": "southcentralus",
35+
"appname": "openfoam",
36+
"tags": {
37+
"appname": "openfoam",
38+
"version": "v8",
39+
"resolution": "60_18_18",
40+
"poolid": "pool-2407280944qsb",
41+
"taskid": "task-compute-2407281039pak"
42+
},
43+
"appexectime": 165,
44+
"appmetrics": {
45+
"FOAMRUNCLOCKTIME": "165",
46+
"FOAMMESHCELLS": "8257533",
47+
"APPEXECTIME": "165"
48+
}
49+
}
50+
```
51+
52+
Once HPCAdvisor is installed or enabled via `poetry` (see installation guide), just run:
53+
54+
```bash
55+
./bin/hpcadvisor dataset add -i mydatapoint.json
56+
```
57+
58+

src/hpcadvisor/__main__.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -109,18 +109,18 @@ def selecttask_handler(args):
109109
main_cli.main_selecttask(operation, userinput, taskfile, policy, numtasks)
110110

111111

112-
def datafilter_handler(args):
112+
def dataset_handler(args):
113113

114114
operation = args.operation
115-
datafilter = args.datafilter
116-
exportfile = args.exportfile
115+
input = args.input
116+
output = args.output
117117

118118
from hpcadvisor import main_cli
119119

120-
if operation == "export":
121-
main_cli.main_datafilter(operation, datafilter, exportfile)
120+
if operation == "filter" or operation == "add":
121+
main_cli.main_dataset(operation, input, output)
122122
else:
123-
print(f"Invalid operation: {operation}. Supported operations: export")
123+
print(f"Invalid operation: {operation}. Supported operations: filter | add")
124124
sys.exit(1)
125125

126126

@@ -216,11 +216,13 @@ def _process_arguments():
216216
selecttask.add_argument("-n", "--numtasks", help="Number of tasks", required=False)
217217
selecttask.set_defaults(func=selecttask_handler)
218218

219-
datafilter = subparsers.add_parser("datafilter", help="Datafilter help")
220-
datafilter.add_argument("operation", type=str)
221-
datafilter.add_argument("-df", "--datafilter", help="Data filter", required=True)
222-
datafilter.add_argument("-e", "--exportfile", help="Exported file", required=True)
223-
datafilter.set_defaults(func=datafilter_handler)
219+
dataset = subparsers.add_parser("dataset", help="Dataset handler help")
220+
dataset.add_argument("operation", type=str)
221+
dataset.add_argument(
222+
"-i", "--input", help="Data filter | New datapoints", required=False
223+
)
224+
dataset.add_argument("-o", "--output", help="Filtered file", required=False)
225+
dataset.set_defaults(func=dataset_handler)
224226

225227
args = parser.parse_args()
226228

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,19 @@
22

33
import itertools
44

5-
from hpcadvisor import dataset_handler, logger, price_puller, taskset_handler
5+
from hpcadvisor import dataset_handler, logger, price_puller, taskset_handler, utils
66

77
log = logger.logger
88

99

10-
def export(datafilter, exportfile):
10+
def filter(datafilter, exportfile):
1111

1212
datapoints = dataset_handler.get_datapoints(datafilter)
1313
dataset_handler.store_datapoints(exportfile, datapoints)
14+
15+
16+
def add(newdatapoints_file):
17+
18+
datasetfile = utils.get_dataset_filename()
19+
20+
dataset_handler.add_datapoints_fromfile(datasetfile, newdatapoints_file)

src/hpcadvisor/dataset_handler.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,24 @@ def store_datapoints(dataset_file, datapoints):
1616
json.dump(datapoints, outfile, indent=2)
1717

1818

19+
def add_datapoints_fromfile(dataset_file, newdatapoints_file):
20+
newdatapoints = utils.get_data_from_file(newdatapoints_file)
21+
existing_data = {}
22+
23+
if os.path.exists(dataset_file):
24+
with open(dataset_file, "r") as file:
25+
existing_data = json.load(file)
26+
27+
if not datapoints_label in existing_data:
28+
existing_data[datapoints_label] = []
29+
30+
# for datapoint in newdatapoints[datapoints_label]:
31+
existing_data[datapoints_label].append(newdatapoints)
32+
33+
with open(dataset_file, "w") as outfile:
34+
json.dump(existing_data, outfile, indent=2)
35+
36+
1937
def add_datapoint(dataset_file, datapoint):
2038
existing_data = {}
2139

src/hpcadvisor/main_cli.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from hpcadvisor import (
66
batch_handler,
77
cli_advice_generator,
8-
cli_data_filter,
8+
cli_dataset,
99
cli_plot_generator,
1010
cli_task_selector,
1111
data_collector,
@@ -83,9 +83,16 @@ def main_advice(datafilter, appexectime):
8383
cli_advice_generator.generate_advice(datafilter, appexectime)
8484

8585

86-
def main_datafilter(operation, datafilter, exportfile):
87-
log.info("Data filtering ...")
88-
cli_data_filter.export(datafilter, exportfile)
86+
def main_dataset(operation, input, output):
87+
if operation == "filter":
88+
log.info("Filtering data...")
89+
if not input or not output:
90+
log.error("Input and output files are required for filtering.")
91+
return
92+
cli_dataset.filter(input, output)
93+
elif operation == "add":
94+
log.info("Adding data...")
95+
cli_dataset.add(input)
8996

9097

9198
def main_selecttask(operation, userinput, taskfile, policy_name, num_tasks):

0 commit comments

Comments
 (0)