diff --git a/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json new file mode 100644 index 00000000..b700ecb7 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/diff_OMNI_SDM_3_11.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.diff_thresh", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.diff_thresh#1": { + "thresh": 0.1, + "op": ">" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json new file mode 100644 index 00000000..c221a4bc --- /dev/null +++ b/orion/pipelines/sandbox/SOL/movstd_OMNI_SDM_3_11.json @@ -0,0 +1,17 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.rolling_std_thresh", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.rolling_std_thresh#1": { + "thresh": 10, + "op": ">", + "window_size": 5 + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": false + } + } +} diff --git a/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json new file mode 100644 index 00000000..84a52ce8 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/movstd_numenta_AISD.json @@ -0,0 +1,17 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.rolling_std_thresh", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.rolling_std_thresh#1": { + "thresh": 10, + "op": ">", + "window_size": 5 + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json b/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json new file mode 100644 index 00000000..9c97bcac --- /dev/null +++ b/orion/pipelines/sandbox/SOL/thres_OMNI_SDM_3_11.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.thresholding", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.thresholding#1": { + "thres": 0.01, + "op": "<" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json b/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json new file mode 100644 index 00000000..42dabbc2 --- /dev/null +++ b/orion/pipelines/sandbox/SOL/thres_yahoo_real1.json @@ -0,0 +1,16 @@ +{ + "primitives": [ + "orion.primitives.timeseries_postprocessing.extract_dimension", + "orion.primitives.timeseries_postprocessing.thresholding", + "orion.primitives.timeseries_postprocessing.get_intervals" + ], + "init_params": { + "orion.primitives.timeseries_postprocessing.thresholding#1": { + "thres": 0.45, + "op": ">" + }, + "orion.primitives.intervals.get_intervals#1": { + "severity": true + } + } +} diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json new file mode 100644 index 00000000..3fa33da3 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.diff_thresh.json @@ -0,0 +1,32 @@ +{ + "name": "diff_thresh", + "primitive": "orion.primitives.timeseries_postprocessing.diff_thresh", + "description": "Detects anomalies by calculating n-th order discrete difference", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thresh": { + "type": "float", + "default": "0.1" + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json new file mode 100644 index 00000000..414dc417 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.extract_dimension.json @@ -0,0 +1,27 @@ +{ + "name": "extract_dimension", + "primitive": "orion.primitives.timeseries_postprocessing.extract_dimension", + "description": "Dataset checks and slicing dimension", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "dim": { + "type": "int", + "default": null + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json new file mode 100644 index 00000000..1d2e2742 --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.get_intervals.json @@ -0,0 +1,27 @@ +{ + "name": "get_intervals", + "primitive": "orion.primitives.timeseries_postprocessing.get_intervals", + "description": "Builds anomaly intervals", + "produce": { + "args": [ + { + "name": "y", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "severity": { + "type": "bool", + "default": true + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json new file mode 100644 index 00000000..a1621b4a --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.rolling_std_thresh.json @@ -0,0 +1,37 @@ +{ + "name": "rolling_std_thresh", + "primitive": "orion.primitives.timeseries_postprocessing.rolling_std_thresh", + "description": "Detects anomalies through moving standard deviation thresholding", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thresh": { + "type": "float", + "default": 10 + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + }, + "window_size": { + "type": "int", + "default": "5" + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json new file mode 100644 index 00000000..b8880a1c --- /dev/null +++ b/orion/primitives/jsons/orion.primitives.timeseries_postprocessing.thresholding.json @@ -0,0 +1,33 @@ +{ + "name": "thresholding", + "primitive": "orion.primitives.timeseries_postprocessing.thresholding", + "description": "Detects anomalies through simple thresholding", + "produce": { + "args": [ + { + "name": "X", + "type": "ndarray" + } + + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "tunable": { + "thres": { + "type": "float", + "default": 10 + }, + "op": { + "type": "str", + "default": ">", + "options": ["<", ">", "<=", ">=", "=="] + } + } + } +} \ No newline at end of file diff --git a/orion/primitives/timeseries_postprocessing.py b/orion/primitives/timeseries_postprocessing.py new file mode 100644 index 00000000..dc1acc83 --- /dev/null +++ b/orion/primitives/timeseries_postprocessing.py @@ -0,0 +1,149 @@ +import operator + +import numpy as np +import pandas as pd + +ops = {'>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le, + '==': operator.eq} + + +def extract_dimension(X, dim=None): + """Validate data dimension. + + The function checks if the dataset being used is valid i.e has a length + greater than 0 and contains the dimension required. + + Args: + X (pd.DataFrame): + Data to validate and extract dimension from. + dim (str): + Column indicating the dimension number for a multi-dimensional dataset + + Returns: + pd.DataFrame: + Returns a dataframe that contains a dataset with 2 columns ['timestamp', 'value'] + """ + if len(X) == 0: + return [] + + columns = X.columns.values + + if 'timestamp' not in columns: + X['timestamp'] = X.index.values + X = X.reset_index(drop=True) + + if dim is not None and dim in columns: + X['value'] = X[dim] + X = pd.DataFrame().assign(timestamp=X['timestamp'], value=X[dim]) + + if 'value' not in columns: + return [] + + return X[['timestamp', 'value']] + + +def rolling_std_thresh(X, thresh, op=">", window_size=5): + """Apply moving standard deviation thesholding. + + The function flags anomalies based on moving standard deviation thresholding. + + Args: + X (pd.DataFrame): + N-dimensional value sequence to iterate over. + thresh (float): + Float used to indicate the threshold of the function. + op (str): + String indicating the operator used to compare with the threshold. + Possible values are '<', '>', '<=', '>=', '=='. + window_size (int): + Integer indicating the number of observations used for each window. + + Returns: + list: + List of indices indicating the timestamps that were flagged. + """ + a = X['value'].rolling(window=window_size).std().values + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)] + return X.iloc[idx_arr] + + +def diff_thresh(X, thresh=0.1, op=">"): + """Apply discrete difference thresholding. + + The function flags anomalies based on n-th discrete difference thresholding. + + Args: + X (ndarray): + N-dimensional value sequence to iterate over. + thresh (float): + Integer used to indicate the threshold of the function. + op (str): + String indicating the operator used to compare with the threshold. + Possible values are '<', '>', '<=', '>=', '=='. + + Returns: + list: + List of indices indicating the timestamps that were flagged. + """ + a = np.diff(X['value']) + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)] + return X.iloc[idx_arr] + + +def thresholding(X, thresh, op): + """Apply simple thresholding. + + The function flags anomalies based on simple thresholding + + Args: + X (ndarray): + N-dimensional value sequence to iterate over. + thresh (float): + Integer used to indicate the threshold of the function. + op (str): + String indicating the operator used to compare with the threshold. + Possible values are '<', '>', '<=', '>=', '=='. + + Returns: + list: + List of indices indicating the timestamps that were flagged. + """ + a = X['value'] + idx_arr = [idx for idx in range(len(a)) if ops[op](a[idx], thresh)] + return X.iloc[idx_arr] + + +def get_intervals(y, severity=True): + """Group together consecutive anomalies in anomaly internals. + + The function groups together samples that have been consecutively flagged + as an anomaly and returns the lower and upper bound of the boundary. + Optionally, it computes the average severity of each interval. + + Args: + y (ndarray): + N-dimensional array containing the flagged anomalies of the dataset. + severity (bool): + Optional. Indicates whether the average severity of each interval + should be calculated. + + Returns: + ndarray: + Array containing the anomaly intervals + """ + intervals = np.split(y, np.where(np.diff(y.index.values) > 1)[0] + 1) + + anomalies = list() + for interval in intervals: + timestamp = interval['timestamp'].values + + if severity: + anomalies.append((timestamp[0], timestamp[-1], np.mean(interval['value']))) + + else: + anomalies.append((timestamp[0], timestamp[-1])) + + return anomalies diff --git a/tests/primitives/test_timeseries_postprocessing.py b/tests/primitives/test_timeseries_postprocessing.py new file mode 100644 index 00000000..5f3e64e8 --- /dev/null +++ b/tests/primitives/test_timeseries_postprocessing.py @@ -0,0 +1,186 @@ +import pandas as pd +import pytest + +from orion.primitives.timeseries_postprocessing import ( + diff_thresh, extract_dimension, get_intervals, rolling_std_thresh, thresholding) + + +@pytest.fixture +def data(): + return pd.DataFrame({ + "timestamp": list(range(1, 11)), + "value": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + "test": [1, 2] * 5, + }) + + +@pytest.fixture +def signal(): + return pd.DataFrame({ + "timestamp": list(range(1, 11)), + "value": [0, 0, 1, 1, 5, 5, 10, 10, 2, 1], + }) + + +def test_extract_dimension(data): + # Setup + expected = data[['timestamp', 'value']] + + # Run + output = extract_dimension(data) + + # Assert + pd.testing.assert_frame_equal(output, expected) + + +def test_extract_dimension_dim(data): + # Setup + expected = data[['timestamp', 'test']] + expected.columns = ['timestamp', 'value'] + + # Run + output = extract_dimension(data, dim='test') + + # Assert + pd.testing.assert_frame_equal(output, expected) + + +def test_extract_dimension_empty(): + # Setup + data = pd.DataFrame() + + # Run + output = extract_dimension(data) + + # Assert + output == [] + + +def test_extract_dimension_timestamp_index(data): + # Setup + expected = data[['timestamp', 'value']] + data = data.set_index('timestamp') + + # Run + output = extract_dimension(data) + + # Assert + pd.testing.assert_frame_equal(output, expected) + + +def test_extract_dimension_no_value(data): + # Setup + data = data[['timestamp', 'test']] + + # Run + output = extract_dimension(data) + + # Assert + output == [] + + +def test_rolling_std_thresh(signal): + # Setup + expected = signal.iloc[[6, 7, 8, 9]] + + # Run + output = rolling_std_thresh(signal, 3) + + # Assert + output == expected + + +def test_rolling_std_thresh_one(signal): + # Setup + expected = signal.iloc[[9]] + + # Run + output = rolling_std_thresh(signal, 4) + + # Assert + output == expected + + +def test_rolling_std_thresh_empty(signal): + # Run + output = rolling_std_thresh(signal, 10) + + # Assert + len(output) == 0 + + +def test_diff_thresh(signal): + # Setup + expected = signal.iloc[[3, 5]] + + # Run + output = diff_thresh(signal, 1) + + # Assert + output == expected + + +def test_diff_thresh_equal(signal): + # Setup + expected = signal.iloc[[1, 3, 5]] + + # Run + output = diff_thresh(signal, 1, ">=") + + # Assert + output == expected + + +def test_thresholding(signal): + # Setup + expected = signal.iloc[[6, 7]] + + # Run + output = thresholding(signal, 5, ">") + + # Assert + output == expected + + +def test_thresholding_less(signal): + # Setup + expected = signal.iloc[[0, 1, 2, 3, 8, 9]] + + # Run + output = thresholding(signal, 5, "<") + + # Assert + output == expected + + +def test_get_intervals(signal): + # Setup + indicies = signal.iloc[[6, 7]] + expected = (7, 8) + + # Run + output = get_intervals(indicies, False) + + # Assert + assert isinstance(output, list) + assert len(output[0]) == 2 + + assert output[0][0] == expected[0] + assert output[0][1] == expected[1] + + +def test_get_intervals_severity(signal): + # Setup + indicies = signal.iloc[[6, 7]] + expected = (7, 8, 10) + + # Run + output = get_intervals(indicies) + + # Assert + assert isinstance(output, list) + assert len(output[0]) == 3 + + assert output[0][0] == expected[0] + assert output[0][1] == expected[1] + assert output[0][2] == expected[2]