Skip to content
68 changes: 68 additions & 0 deletions sigllm/pipelines/prompter/mistral_prompter_multivariate_0shot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"primitives": [
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
"sklearn.impute.SimpleImputer",
"sigllm.primitives.transformation.Float2Scalar",
"sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences",
"sigllm.primitives.transformation.format_as_string",
"sigllm.primitives.prompting.huggingface.HF",
"sigllm.primitives.prompting.anomalies.parse_anomaly_response",
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",
"sigllm.primitives.prompting.anomalies.merge_anomalous_sequences",
"sigllm.primitives.prompting.anomalies.format_anomalies"
],
"init_params": {
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
"time_column": "timestamp",
"interval": 21600,
"method": "mean"
},
"sigllm.primitives.transformation.Float2Scalar#1": {
"decimal": 2,
"rescale": true
},
"sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": {
"window_size": 100,
"step_size": 40
},
"sigllm.primitives.transformation.format_as_string#1": {
"space": false
},
"sigllm.primitives.prompting.huggingface.HF#1": {
"name": "mistralai/Mistral-7B-Instruct-v0.2",
"samples": 1,
"temp": 0.01
},
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
"interval": true
},
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
"alpha": 0.4
},
"sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": {
"beta": 0.5
}
},
"input_names": {
"sigllm.primitives.prompting.huggingface.HF#1": {
"X": "X_str"
},
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
"X": "y_hat"
}
},
"output_names": {
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
"index": "timestamp"
},
"sigllm.primitives.transformation.format_as_string#1": {
"X": "X_str"
},
"sigllm.primitives.prompting.huggingface.HF#1": {
"y": "y_hat"
},
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
"X": "y"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
"name": "X",
"type": "ndarray"
},
{
"name": "dim",
"type": "int",
"default": 1
},
{
"name": "normal",
"type": "ndarray",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
{
"name": "step_size",
"type": "int"
},
{
"name": "dim",
"type": "int"
}
]
},
Expand Down
4 changes: 4 additions & 0 deletions sigllm/primitives/prompting/anomalies.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def find_anomalies_in_windows(y, alpha=0.5):
ndarray:
A 2-dimensional array containing final anomalous indices of each windows.
"""
if isinstance(y, list):
y = np.array(y, dtype=object)

idx_list = []
for samples in y:
min_vote = np.ceil(alpha * len(samples))
Expand Down Expand Up @@ -203,6 +206,7 @@ def format_anomalies(y, timestamp, padding_size=50):
if len(y) == 0:
return []

y = y[y < len(timestamp)].astype(int)
y = timestamp[y] # Convert list of indices into list of timestamps
start, end = timestamp[0], timestamp[-1]
interval = timestamp[1] - timestamp[0]
Expand Down
34 changes: 29 additions & 5 deletions sigllm/primitives/prompting/huggingface.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-

import json
import logging
import os
import re

import torch
from tqdm import tqdm
Expand All @@ -24,6 +24,23 @@
DEFAULT_MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'


def get_examples(text, k=3):
    """Collect the contents of the leading parenthesized groups in a string.

    A group is an opening parenthesis, followed by any run of characters
    other than a closing parenthesis, followed by a closing parenthesis.
    Only the text between the two parentheses is kept.

    Args:
        text (str):
            String to search.
        k (int):
            How many groups to keep, counted from the start of ``text``.

    Returns:
        list:
            The inner text of at most ``k`` groups, in order of appearance.
    """
    inner = [found.group(1) for found in re.finditer(r'\(([^)]*)\)', text)]
    return inner[:k]


class HF:
"""Prompt Pretrained models on HuggingFace to detect anomalies in a time series.

Expand Down Expand Up @@ -72,7 +89,6 @@ def __init__(
self.samples = samples
self.padding = padding
self.restrict_tokens = restrict_tokens

self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)

# special tokens
Expand Down Expand Up @@ -111,12 +127,14 @@ def __init__(

self.model.eval()

def detect(self, X, normal=None, **kwargs):
def detect(self, X, dim=1, normal=None, **kwargs):
"""Use HF to detect anomalies of a signal.

Args:
X (ndarray):
Input sequences of strings containing signal values
Input sequences of strings containing signal values.
dim (int, optional):
Number of dimensions of the time series. Default to 1.
normal (str, optional):
A normal reference sequence for one-shot prompting. If None,
zero-shot prompting is used. Default to None.
Expand All @@ -140,7 +158,13 @@ def detect(self, X, normal=None, **kwargs):
if self.restrict_tokens:
user_message = PROMPTS['user_message']
else:
user_message = PROMPTS['user_message_2']
user_message = PROMPTS['user_message_unrestricted']

if dim > 1:
examples = get_examples(text)
examples = ' '.join([f'({ex})' for ex in examples])

user_message = PROMPTS['user_message_multivariate'].format(dim, examples)

# Combine messages with one-shot example if provided
message = ' '.join([
Expand Down
3 changes: 2 additions & 1 deletion sigllm/primitives/prompting/huggingface_messages.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.",
"user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]",
"user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
"user_message_unrestricted": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
"user_message_multivariate": "Below is a multivariate time series of {} dimensions with the following format \"{}\".\nYour job is to detect ranges of anomalies in this time series, in terms of the x-axis coordinates. List one by one in a list.\nFor example, if ranges (including two endpoints) [5, 12], [71, 74], and [105, 131], are anomalies, then output \"[[5,12], [71, 74], [105, 131]]\".\nIf no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.\n[SEQUENCE]\n",
"one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]"
}
7 changes: 6 additions & 1 deletion sigllm/primitives/prompting/timeseries_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
* rolling window sequences.
* first index value of each input sequence.
"""
if X.ndim == 1:
dim = 1
else:
dim = X.shape[1]

index = range(len(X))
out_X = list()
X_index = list()
Expand All @@ -37,4 +42,4 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
out_X.append(X[start:end])
X_index.append(index[start])
start = start + step_size
return np.asarray(out_X), np.asarray(X_index), window_size, step_size
return np.asarray(out_X), np.asarray(X_index), window_size, step_size, dim
24 changes: 24 additions & 0 deletions tests/primitives/prompting/test_timeseries_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,30 @@ def test_rolling_window_sequences(values, window_size, step_size):
np.array([0, 1, 2, 3, 4]),
3,
1,
1,
)

result = rolling_window_sequences(values, window_size, step_size)

if len(result) != len(expected):
raise AssertionError('Tuples has different length')

for arr1, arr2 in zip(result, expected):
np.testing.assert_equal(arr1, arr2)


def test_rolling_window_sequences_multivariate(window_size, step_size):
values = np.array([[0.555, 2.345], [1.501, 5.903], [9.116, 3.068], [7.432, 4.532]])

expected = (
np.array([
[[0.555, 2.345], [1.501, 5.903], [9.116, 3.068]],
[[1.501, 5.903], [9.116, 3.068], [7.432, 4.532]],
]),
np.array([0, 1]),
3,
1,
2,
)

result = rolling_window_sequences(values, window_size, step_size)
Expand Down
1,186 changes: 1,186 additions & 0 deletions tutorials/pipelines/multivariate-mistral-prompter-pipeline.ipynb

Large diffs are not rendered by default.

Loading