diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index 57bd56f15f02..51f384738a88 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -780,7 +780,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -844,7 +844,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -905,7 +905,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -968,7 +968,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1064,7 +1064,7 @@ jobs: # fi # docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v # - name: ${{ matrix.test.name }} -# uses: 
apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 +# uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 # with: # e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} # - if: ${{ failure() }} diff --git a/dist-material/alarm-settings.yml b/dist-material/alarm-settings.yml index 5dd6a9d2abf4..261f2714aea9 100644 --- a/dist-material/alarm-settings.yml +++ b/dist-material/alarm-settings.yml @@ -23,6 +23,8 @@ rules: expression: sum(service_resp_time > 1000) >= 3 period: 10 silence-period: 5 + # Number of periods to wait before considering the alarm recovered,default as 0. + recovery-observation-period: 3 message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes. # service_resp_time_rule: # expression: avg(service_resp_time) > 1000 @@ -35,16 +37,20 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 3 + # Number of periods to wait before considering the alarm recovered,default as 0. 
+ recovery-observation-period: 2 message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes service_resp_time_percentile_rule: expression: sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3 period: 10 silence-period: 5 + recovery-observation-period: 3 message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000 service_instance_resp_time_rule: expression: sum(service_instance_resp_time > 1000) >= 2 period: 10 silence-period: 5 + recovery-observation-period: 2 message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes database_access_resp_time_rule: expression: sum(database_access_resp_time > 1000) >= 2 @@ -63,11 +69,36 @@ rules: # silence-period: 5 # message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes + #hooks: # webhook: # default: # is-default: true # urls: -# - http://127.0.0.1/notify/ -# - http://127.0.0.1/go-wechat/ - +# - http://127.0.0.1/default/alarm +# recovery-urls: +# - http://127.0.0.1/default/alarm-recovery +# custom1: +# urls: +# - http://127.0.0.1/custom1/alarm +# recovery-urls: +# - http://127.0.0.1/custom1/alarm-recovery +# wechat: +# default: +# is-default: true +# text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm: \n %s." +# } +# } +# recovery-text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm Recovered: \n %s." 
+# } +# } +# webhooks: +# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key diff --git a/dist-material/config-examples/alarm-settings.yml b/dist-material/config-examples/alarm-settings.yml index afd68583ba26..969e860f9186 100644 --- a/dist-material/config-examples/alarm-settings.yml +++ b/dist-material/config-examples/alarm-settings.yml @@ -23,6 +23,8 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 10 + # Number of periods to wait before considering the alarm recovered,default as 0. + recovery-observation-period: 3 message: Successful rate of endpoint {name} is lower than 75% tags: level: WARNING @@ -43,7 +45,35 @@ rules: silence-period: 5 message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes -#webhooks: -# - http://127.0.0.1/notify/ -# - http://127.0.0.1/go-wechat/ - +#hooks: +# webhook: +# default: +# is-default: true +# urls: +# - http://127.0.0.1/default/alarm +# recovery-urls: +# - http://127.0.0.1/default/alarm-recovery +# custom1: +# urls: +# - http://127.0.0.1/custom1/alarm +# recovery-urls: +# - http://127.0.0.1/custom1/alarm-recovery +# wechat: +# default: +# is-default: true +# text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm: \n %s." +# } +# } +# recovery-text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm Recovered: \n %s." +# } +# } +# webhooks: +# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index b31cc331e418..e06dad626901 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -5,8 +5,11 @@ #### OAP Server * KubernetesCoordinator: make self instance return real pod IP address instead of `127.0.0.1`. 
+* Enhance the alarm kernel with recovered status notification capability #### UI +* Fix the missing icon in new native trace view. +* Enhance the alert page to show the recovery time of resolved alerts. #### Documentation diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index f1a2045e34e4..60a60eaae8a2 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -39,6 +39,9 @@ The metrics names in the expression could be found in the [list of all potential If the hook id is not specified, the global hook will be used. - **Silence period**. After the alarm is triggered at Time-N (TN), there will be silence during the **TN -> TN + period**. By default, it works in the same manner as **period**. The same Alarm (having the same ID in the same metrics name) may only be triggered once within a period. +- **Recovery observation period**. Defines the number of consecutive periods that the alarm condition must remain false before the alarm is considered recovered. When the alarm condition becomes false, the system enters an observation period. If the condition remains false for the specified number of periods, a recovery notification is sent. If the condition becomes true again during the observation period, the alarm returns to the FIRING state. +The default value is 0, which means immediate recovery notification when the condition becomes false. + Such as for a metric, there is a shifting window as following at T7. @@ -52,6 +55,7 @@ Such as for a metric, there is a shifting window as following at T7. For example, expression `avg(service_resp_time) > 1000`, if the value are `1001, 1001, 1001, 1001, 1001, 1001, 1001`, the calculation is `((1001 + 10001 + ... + 1001) / 7) > 1000` and the result would be `1`(true). Then the alarm would be triggered. * In every minute, the window would shift automatically. At T8, Value8 would be cached, and T1/Value1 would be removed from the window. 
+* If Value8 is 890, the expression will be calculated based on the metric values from T2 to T8, which are `1001, 1001, 1001, 1001, 1001, 1001, 890`. The calculation becomes `((1001 + 1001 + ... + 890) / 7) < 1000`, and the result would be `0`(false). Consequently, the alarm enters an observation period for recovery. If the `Recovery observation period` is not set or is set to `0`, the alarm is considered recovered immediately, and a recovery notification is sent. Otherwise, the system will wait and observe the condition over the specified number of subsequent periods before declaring recovery. **NOTE**: * If the expression include labeled metrics and result has multiple labeled value(e.g. `sum(service_percentile{p='50,75'} > 1000) >= 3`), the alarm will be triggered if any of the labeled value result matches 3 times of the condition(P50 > 1000 or P75 > 1000). @@ -69,6 +73,8 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 10 + # Number of periods to wait before considering the alarm recovered,default as 0. + recovery-observation-period: 2 message: Successful rate of endpoint {name} is lower than 75% tags: level: WARNING @@ -163,6 +169,14 @@ hooks: "text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**." } } + recovery-text-template: |- + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**." + } + } webhooks: - https://hooks.slack.com/services/x/y/zssss custom1: @@ -192,12 +206,16 @@ webhook: custom1: urls: - http://127.0.0.1/custom1 + recovery-urls: + - http://127.0.0.1/custom1 # headers config is provided to add custom configurations or authentications that are required from the server side.
headers: Authorization: Bearer bearer_token custom2: urls: - http://127.0.0.1/custom2 + recovery-urls: + - http://127.0.0.1/custom2 # headers config is provided to add custom configurations or authentications that are required from the server headers: Authorization: Basic basic_token @@ -213,11 +231,13 @@ webhook: The JSON format is based on `List` with the following key information: - **scopeId**, **scope**. All scopes are defined in `org.apache.skywalking.oap.server.core.source.DefaultScopeDefine`. - **name**. Target scope entity name. Please follow the [entity name definitions](#entity-name). +- **uuid**. The unique identifier (UUID) of the alarm, which is consistent between the trigger and recovery messages. - **id0**. The ID of the scope entity that matches with the name. When using the relation scope, it is the source entity ID. - **id1**. When using the relation scope, it is the destination entity ID. Otherwise, it is empty. - **ruleName**. The rule name configured in `alarm-settings.yml`. - **alarmMessage**. The alarm text message. -- **startTime**. The alarm time measured in milliseconds, which occurs between the current time and the midnight of January 1, 1970 UTC. +- **startTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was triggered. +- **recoveryTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was recovered. This value is `null` if the alarm has not been recovered. - **tags**. The tags configured in `alarm-settings.yml`.
See the following example: @@ -226,12 +246,14 @@ See the following example: "scopeId": 1, "scope": "SERVICE", "name": "serviceA", + "uuid": "uuid1", "id0": "12", "id1": "", - "ruleName": "service_resp_time_rule", + "ruleName": "service_resp_time_rule", "alarmMessage": "alarmMessage xxxx", "startTime": 1560524171000, - "tags": [{ + "recoveryTime": 1560524351000, + "tags": [{ "key": "level", "value": "WARNING" }] @@ -239,9 +261,10 @@ See the following example: "scopeId": 1, "scope": "SERVICE", "name": "serviceB", + "uuid": "uuid2", "id0": "23", "id1": "", - "ruleName": "service_resp_time_rule", + "ruleName": "service_resp_time_rule", "alarmMessage": "alarmMessage yyy", "startTime": 1560524171000, "tags": [{ @@ -275,6 +298,21 @@ message AlarmMessage { string alarmMessage = 7; int64 startTime = 8; AlarmTags tags = 9; + string uuid = 10; +} + +message AlarmRecoveryMessage { + int64 scopeId = 1; + string scope = 2; + string name = 3; + string id0 = 4; + string id1 = 5; + string ruleName = 6; + string alarmMessage = 7; + int64 startTime = 8; + AlarmTags tags = 9; + string uuid = 10; + int64 recoveryTime = 11; } message AlarmTags { @@ -304,6 +342,14 @@ slack: "text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**." } } + recovery-text-template: |- + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**." + } + } webhooks: - https://hooks.slack.com/services/x/y/z ``` @@ -322,6 +368,13 @@ wechat: "content": "Apache SkyWalking Alarm: \n %s." } } + recovery-text-template: |- + { + "msgtype": "text", + "text": { + "content": "Apache SkyWalking Alarm Recovered: \n %s." + } + } webhooks: - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key ``` @@ -341,6 +394,13 @@ dingtalk: "content": "Apache SkyWalking Alarm: \n %s." } } + recovery-text-template: |- + { + "msgtype": "text", + "text": { + "content": "Apache SkyWalking Alarm Recovered: \n %s." 
+ } + } webhooks: - url: https://oapi.dingtalk.com/robot/send?access_token=dummy_token secret: dummysecret @@ -363,6 +423,14 @@ feishu: }, "ats":"feishu_user_id_1,feishu_user_id_2" } + recovery-text-template: |- + { + "msg_type": "text", + "content": { + "text": "Apache SkyWalking Alarm Recovered: \n %s." + }, + "ats":"feishu_user_id_1,feishu_user_id_2" + } webhooks: - url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token secret: dummysecret @@ -376,6 +444,7 @@ welink: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." webhooks: # you may find your own client_id and client_secret in your app, below are dummy, need to change. - client-id: "dummy_client_id" @@ -400,6 +469,7 @@ pagerduty: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." integration-keys: - 5c6d805c9dcf4e03d09dfa81e8789ba1 ``` @@ -415,6 +485,7 @@ discord: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." 
webhooks: - url: https://discordapp.com/api/webhooks/1008166889777414645/8e0Am4Zb-YGbBqqbiiq0jSHPTEEaHa4j1vIC-zSSm231T8ewGxgY0_XUYpY-k1nN4HBl username: robot @@ -430,15 +501,38 @@ the sliding window will be destroyed and re-created, causing the Alarm of this s ### Keys with data types of alerting rule configuration file -| Alerting element | Configuration property key | Type | Description | -|----------------------|----------------------------|----------------|--------------------| -| Expression | expression | string | MQE expression | -| Include names | include-names | string array | | -| Exclude names | exclude-names | string array | | -| Include names regex | include-names-regex | string | Java regex Pattern | -| Exclude names regex | exclude-names-regex | string | Java regex Pattern | -| Tags | tags | key-value pair | | -| Period | Period | int | | -| Silence period | silence-period | int | | -| Message | message | string | | -| Hooks | hooks | string array | | +| Alerting element | Configuration property key | Type | Description | +| --------------------------- | --------------------------- | -------------- | ------------------ | +| Expression | expression | string | MQE expression | +| Include names | include-names | string array | | +| Exclude names | exclude-names | string array | | +| Include names regex | include-names-regex | string | Java regex Pattern | +| Exclude names regex | exclude-names-regex | string | Java regex Pattern | +| Tags | tags | key-value pair | | +| Period | period | int | | +| Silence period | silence-period | int | | +| Recovery observation period | recovery-observation-period | int | | +| Message | message | string | | +| Hooks | hooks | string array | | + +## Alarm state transition +The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.4.0 is as follows: + +```mermaid +stateDiagram-v2 + [*] --> NORMAL + NORMAL --> FIRING: Expression true<br/>not in silence period + + FIRING --> SILENCED: Expression true<br/>in silence period + FIRING --> OBSERVING_RECOVERY: Expression false<br/>in recovery window + FIRING --> RECOVERED: Expression false<br/>not in recovery window + + OBSERVING_RECOVERY --> FIRING: Expression true<br/>not in silence period + OBSERVING_RECOVERY --> RECOVERED: Expression false<br/>not in recovery window + + SILENCED --> RECOVERED: Expression false<br/>not in recovery window + SILENCED --> OBSERVING_RECOVERY: Expression false<br/>in recovery window + + RECOVERED --> FIRING: Expression true<br/>not in silence period + RECOVERED --> NORMAL: Expression false +``` \ No newline at end of file diff --git a/docs/en/status/query_alarm_runtime_status.md b/docs/en/status/query_alarm_runtime_status.md index 4dc68c57d0ea..3568b2a49586 100644 --- a/docs/en/status/query_alarm_runtime_status.md +++ b/docs/en/status/query_alarm_runtime_status.md @@ -63,6 +63,7 @@ Return the detailed information of the alarm running rule. "expression": "sum(service_resp_time > 1000) >= 1", "period": 10, "silencePeriod": 10, + "recoveryObservationPeriod": 2, "additionalPeriod": 0, "includeEntityNames": [], "excludeEntityNames": [], @@ -97,6 +98,7 @@ Return the detailed information of the alarm running rule. "expression": "sum(service_resp_time > 1000) >= 1", "period": 10, "silencePeriod": 10, + "recoveryObservationPeriod": 2, "additionalPeriod": 0, "includeEntityNames": [], "excludeEntityNames": [], @@ -157,6 +159,7 @@ Return the running context of the alarm rule. "additionalPeriod": 0, "size": 10, "silenceCountdown": 10, + "recoveryObservationCountdown": 2, "entityName": "mock_b_service", "windowValues": [ { @@ -220,6 +223,7 @@ Return the running context of the alarm rule. "additionalPeriod": 0, "size": 0, "silenceCountdown": 0, + "recoveryObservationCountdown": 0, "windowValues": [] } } @@ -228,6 +232,7 @@ Return the running context of the alarm rule. ``` `size` is the window size. Equal to the `period + additionalPeriod`. `silenceCountdown` is the countdown of the silence period. -1 means silence countdown is not running. +`recoveryObservationCountdown` is the countdown of the recovery observation period. `windowValues` is the original metrics data. The `index` is the index of the window, starting from 0. `mqeMetricsSnapshot` is the metrics data in the MQE format. When checking conditions, these data will be calculated according to the expression.
diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java index 1bb61d6d6185..610ca3f30a46 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java @@ -18,10 +18,9 @@ package org.apache.skywalking.oap.server.core.alarm.provider; -import java.util.Map; -import java.util.Set; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.joda.time.LocalDateTime; import org.joda.time.Minutes; import org.slf4j.Logger; @@ -29,8 +28,11 @@ import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; /** * Alarm core includes metrics values in certain time windows based on alarm settings. 
By using its internal timer @@ -92,9 +94,16 @@ public void start(List allCallbacks) { } if (!alarmMessageList.isEmpty()) { + List alarmFiringMessageList = getAlarmFiringMessageList(alarmMessageList); + List alarmRecoveryMessageList = getAlarmRecoveryMessageList(alarmMessageList); for (AlarmCallback callback : allCallbacks) { try { - callback.doAlarm(alarmMessageList); + if (!alarmFiringMessageList.isEmpty()) { + callback.doAlarm(alarmFiringMessageList); + } + if (!alarmRecoveryMessageList.isEmpty()) { + callback.doAlarmRecovery(alarmRecoveryMessageList); + } } catch (Exception e) { LOGGER.error(e.getMessage(), e); } @@ -102,7 +111,27 @@ public void start(List allCallbacks) { } } catch (Exception e) { LOGGER.error(e.getMessage(), e); + } catch (Throwable e) { + LOGGER.error(e.getMessage(), e); + } finally { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("move to new time and check"); + } } }, 10, 10, TimeUnit.SECONDS); } + + public static List getAlarmFiringMessageList(List alarmMessageList) { + return alarmMessageList + .stream() + .filter(msg -> !(msg instanceof AlarmRecoveryMessage)) + .collect(Collectors.toList()); + } + + public static List getAlarmRecoveryMessageList(List alarmMessageList) { + return alarmMessageList + .stream() + .filter(msg -> msg instanceof AlarmRecoveryMessage) + .collect(Collectors.toList()); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index 633ea5c6465a..7aa35522d1ac 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -62,6 +62,7 @@ public class AlarmRule { private String excludeNamesRegex; private int period; private int silencePeriod; + 
private int recoveryObservationPeriod; private String message; private Map tags; private Set hooks; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java index 1b476d038aff..1cda9bb0050c 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java @@ -92,6 +92,7 @@ public String getAlarmRuleById(final String ruleId) { ruleDetail.setExpression(rule.getExpression()); ruleDetail.setPeriod(rule.getPeriod()); ruleDetail.setSilencePeriod(rule.getSilencePeriod()); + ruleDetail.setRecoveryObservationPeriod(rule.getRecoveryObservationPeriod()); ruleDetail.setAdditionalPeriod(rule.getAdditionalPeriod()); ruleDetail.setIncludeEntityNames(rule.getIncludeNames()); ruleDetail.setExcludeEntityNames(rule.getExcludeNames()); @@ -135,7 +136,8 @@ public String getAlarmRuleContext(final String ruleName, final String entityName runningContext.setEndTime(window.getEndTime().toString()); runningContext.setAdditionalPeriod(window.getAdditionalPeriod()); runningContext.setSize(window.getSize()); - runningContext.setSilenceCountdown(window.getSilenceCountdown()); + runningContext.setSilenceCountdown(window.getStateMachine().getSilenceCountdown()); + runningContext.setRecoveryObservationCountdown(window.getStateMachine().getRecoveryObservationCountdown()); window.scanWindowValues(values -> { for (int i = 0; i < values.size(); i++) { AlarmRunningContext.WindowValue windowValue = new AlarmRunningContext.WindowValue(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java 
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java index acb14d2764f3..5018a3fb2315 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import java.io.IOException; + import org.apache.skywalking.apm.network.event.v3.Event; import org.apache.skywalking.apm.network.event.v3.Source; import org.apache.skywalking.apm.network.event.v3.Type; @@ -27,6 +28,7 @@ import org.apache.skywalking.oap.server.core.CoreModule; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.analysis.IDManager; import org.apache.skywalking.oap.server.core.analysis.Layer; import org.apache.skywalking.oap.server.core.query.MetadataQueryService; @@ -40,7 +42,6 @@ /** * EventCallBack: When an alert is present, an event is generated for each alert message. These events are then sent to the internal event analyzer. 
- * */ public class EventHookCallback implements AlarmCallback { @@ -50,8 +51,8 @@ public class EventHookCallback implements AlarmCallback { private MetadataQueryService getMetadataQueryService() { if (metadataQueryService == null) { this.metadataQueryService = manager.find(CoreModule.NAME) - .provider() - .getService(MetadataQueryService.class); + .provider() + .getService(MetadataQueryService.class); } return metadataQueryService; } @@ -60,11 +61,18 @@ public EventHookCallback(ModuleManager manager) { this.manager = manager; } - @Override public void doAlarm(List alarmMessage) throws Exception { + doAlarmCallback(alarmMessage, false); + } + + public void doAlarmRecovery(List alarmRecoveryMessages) throws Exception { + doAlarmCallback(alarmRecoveryMessages, true); + } + + private void doAlarmCallback(List alarmMessage, boolean isRecovery) throws Exception { EventAnalyzerService analyzerService = manager.find(EventAnalyzerModule.NAME).provider().getService(EventAnalyzerService.class); for (AlarmMessage a : alarmMessage) { - for (Event event : constructCurrentEvent(a)) { + for (Event event : constructCurrentEvent(a, isRecovery)) { analyzerService.analyze(event); } } @@ -79,33 +87,33 @@ private String getLayer(String serviceId) throws IOException { } } - private List constructCurrentEvent(AlarmMessage msg) throws IOException { + private List constructCurrentEvent(AlarmMessage msg, boolean isRecovery) throws IOException { List events = new ArrayList<>(2); long now = System.currentTimeMillis(); Event.Builder builder = Event.newBuilder() .setUuid(UUID.randomUUID().toString()) - .setName("Alarm") - .setStartTime(now - (msg.getPeriod() * 60 * 1000)) + .setName(isRecovery ? "AlarmRecovery" : "Alarm") + .setStartTime(isRecovery ? ((AlarmRecoveryMessage) msg).getRecoveryTime() : now - (msg.getPeriod() * 60 * 1000)) .setMessage(msg.getAlarmMessage()) - .setType(Type.Error) - .setEndTime(now); + .setType(isRecovery ? Type.Normal : Type.Error) + .setEndTime(isRecovery ? 
((AlarmRecoveryMessage) msg).getRecoveryTime() : now); switch (msg.getScopeId()) { - case DefaultScopeDefine.SERVICE : + case DefaultScopeDefine.SERVICE: IDManager.ServiceID.ServiceIDDefinition serviceIdDef = IDManager.ServiceID.analysisId(msg.getId0()); builder.setSource( - Source.newBuilder() - .setService(serviceIdDef.getName()) - .build() + Source.newBuilder() + .setService(serviceIdDef.getName()) + .build() ); builder.setLayer(getLayer(msg.getId0())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_RELATION : + case DefaultScopeDefine.SERVICE_RELATION: IDManager.ServiceID.ServiceIDDefinition sourceServiceIdDef = IDManager.ServiceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() - .setService(sourceServiceIdDef.getName()) - .build() + .setService(sourceServiceIdDef.getName()) + .build() ); builder.setLayer(getLayer(msg.getId0())); events.add(builder.build()); @@ -118,7 +126,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(msg.getId1())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_INSTANCE : + case DefaultScopeDefine.SERVICE_INSTANCE: IDManager.ServiceInstanceID.InstanceIDDefinition instanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -129,7 +137,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(instanceIdDef.getServiceId())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_INSTANCE_RELATION : + case DefaultScopeDefine.SERVICE_INSTANCE_RELATION: IDManager.ServiceInstanceID.InstanceIDDefinition sourceInstanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -149,7 +157,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(destInstanceIdDef.getServiceId())); events.add(builder.build()); break; - case 
DefaultScopeDefine.ENDPOINT : + case DefaultScopeDefine.ENDPOINT: IDManager.EndpointID.EndpointIDDefinition endpointIDDef = IDManager.EndpointID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -160,7 +168,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(endpointIDDef.getServiceId())); events.add(builder.build()); break; - case DefaultScopeDefine.ENDPOINT_RELATION : + case DefaultScopeDefine.ENDPOINT_RELATION: IDManager.EndpointID.EndpointIDDefinition sourceEndpointIDDef = IDManager.EndpointID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index b48c94b1c2f5..36331230a636 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -29,6 +29,7 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; + import org.apache.skywalking.mqe.rt.exception.IllegalExpressionException; import org.apache.skywalking.oap.server.core.alarm.provider.discord.DiscordSettings; import org.apache.skywalking.oap.server.core.alarm.provider.pagerduty.PagerDutySettings; @@ -111,6 +112,8 @@ private void readRulesConfig(Rules rules) { alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1)); // How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. 
alarmRule.setSilencePeriod((Integer) settings.getOrDefault("silence-period", alarmRule.getPeriod())); + alarmRule.setRecoveryObservationPeriod((Integer) settings.getOrDefault("recovery-observation-period", + 0)); alarmRule.setMessage( (String) settings.getOrDefault("message", "Alarm caused by Rule " + alarmRule .getAlarmRuleName())); @@ -156,11 +159,15 @@ private void readWebHookConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; WebhookSettings settings = new WebhookSettings( - k.toString(), AlarmHooksType.webhook, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.webhook, (Boolean) config.getOrDefault("is-default", false)); List urls = (List) config.get("urls"); if (urls != null) { settings.getUrls().addAll(urls); } + List recoveryUrls = (List) config.get("recovery-urls"); + if (recoveryUrls != null) { + settings.getRecoveryUrls().addAll(recoveryUrls); + } Map headers = (Map) config.getOrDefault("headers", new HashMap<>()); settings.setHeaders(headers); rules.getWebhookSettingsMap().put(settings.getFormattedName(), settings); @@ -183,7 +190,7 @@ private void readGrpcConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; GRPCAlarmSetting setting = new GRPCAlarmSetting( - k.toString(), AlarmHooksType.gRPC, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.gRPC, (Boolean) config.getOrDefault("is-default", false)); Object targetHost = config.get("target-host"); if (targetHost != null) { @@ -216,10 +223,12 @@ private void readSlackConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; SlackSettings settings = new SlackSettings( - k.toString(), AlarmHooksType.slack, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.slack, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) 
textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List webhooks = (List) config.get("webhooks"); if (webhooks != null) { @@ -245,11 +254,14 @@ private void readWechatConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; WechatSettings settings = new WechatSettings( - k.toString(), AlarmHooksType.wechat, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.wechat, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List webhooks = (List) config.get("webhooks"); if (webhooks != null) { settings.getWebhooks().addAll(webhooks); @@ -274,11 +286,14 @@ private void readDingtalkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; DingtalkSettings settings = new DingtalkSettings( - k.toString(), AlarmHooksType.dingtalk, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.dingtalk, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List> webhooks = (List>) config.get("webhooks"); if (webhooks != null) { webhooks.forEach(webhook -> { @@ -307,11 +322,14 @@ private void readFeishuConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; FeishuSettings settings = new FeishuSettings( - k.toString(), AlarmHooksType.feishu, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), 
AlarmHooksType.feishu, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List> webhooks = (List>) config.get("webhooks"); if (webhooks != null) { webhooks.forEach(webhook -> { @@ -340,17 +358,19 @@ private void readWeLinkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", ""); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; } List webHookUrls = webhooks.stream().map( - WeLinkSettings.WebHookUrl::generateFromMap + WeLinkSettings.WebHookUrl::generateFromMap ).collect(Collectors.toList()); WeLinkSettings settings = new WeLinkSettings( - k.toString(), AlarmHooksType.welink, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.welink, (Boolean) config.getOrDefault("is-default", false)); settings.setTextTemplate(textTemplate); + settings.setRecoveryTextTemplate(recoveryTextTemplate); settings.setWebhooks(webHookUrls); rules.getWeLinkSettingsMap().put(settings.getFormattedName(), settings); @@ -373,9 +393,11 @@ private void readPagerDutyConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; PagerDutySettings settings = new PagerDutySettings( - k.toString(), AlarmHooksType.pagerduty, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.pagerduty, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = 
config.getOrDefault("recovery-text-template", ""); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List integrationKeys = (List) config.get("integration-keys"); if (integrationKeys != null) { @@ -402,17 +424,19 @@ private void readDiscordConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", ""); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; } List webHookUrls = webhooks.stream().map( - DiscordSettings.WebHookUrl::generateFromMap + DiscordSettings.WebHookUrl::generateFromMap ).collect(Collectors.toList()); DiscordSettings settings = new DiscordSettings( - k.toString(), AlarmHooksType.discord, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.discord, (Boolean) config.getOrDefault("is-default", false)); settings.setTextTemplate(textTemplate); + settings.setRecoveryTextTemplate(recoveryTextTemplate); settings.setWebhooks(webHookUrls); rules.getDiscordSettingsMap().put(settings.getFormattedName(), settings); @@ -426,7 +450,7 @@ private void readDiscordConfig(Map hooks, Rules rules) { private void checkSpecificHooks(String ruleName, Set hooks) { if (!this.allHooks.containsAll(hooks)) { throw new IllegalArgumentException("rule: [" + ruleName + "] contains invalid hooks." 
+ - " Please check the hook is exist and name format is {hookType}.{hookName}"); + " Please check the hook is exist and name format is {hookType}.{hookName}"); } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index d270611b88ef..e226b950dd84 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -19,19 +19,6 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import com.google.gson.JsonObject; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.locks.ReentrantLock; -import java.util.function.Consumer; -import java.util.regex.Pattern; -import java.util.stream.Collectors; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.ToString; @@ -42,15 +29,10 @@ import org.apache.skywalking.mqe.rt.exception.ParseErrorListener; import org.apache.skywalking.mqe.rt.grammar.MQELexer; import org.apache.skywalking.mqe.rt.grammar.MQEParser; -import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResult; -import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResultType; -import org.apache.skywalking.oap.server.core.query.mqe.MQEValues; -import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVisitor; -import org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext; -import org.apache.skywalking.oap.server.library.module.ModuleManager; -import 
org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; +import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVisitor; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable; import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder; @@ -58,12 +40,32 @@ import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; +import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResult; +import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResultType; +import org.apache.skywalking.oap.server.core.query.mqe.MQEValues; +import org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext; +import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; import org.joda.time.LocalDateTime; import org.joda.time.Minutes; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + import static 
org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext.TRACE_CONTEXT; /** @@ -78,6 +80,7 @@ public class RunningRule { private final int period; private final String expression; private final int silencePeriod; + private final int recoveryObservationPeriod; private final Map windows; private final List includeNames; private final List excludeNames; @@ -100,18 +103,19 @@ public RunningRule(AlarmRule alarmRule, ModuleManager moduleManager) { windows = new ConcurrentHashMap<>(); period = alarmRule.getPeriod(); this.silencePeriod = alarmRule.getSilencePeriod(); + this.recoveryObservationPeriod = alarmRule.getRecoveryObservationPeriod(); this.includeNames = alarmRule.getIncludeNames(); this.excludeNames = alarmRule.getExcludeNames(); this.includeNamesRegex = StringUtil.isNotEmpty(alarmRule.getIncludeNamesRegex()) ? - Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; + Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ? 
- Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; + Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); this.tags = alarmRule.getTags() - .entrySet() - .stream() - .map(e -> new Tag(e.getKey(), e.getValue())) - .collect(Collectors.toList()); + .entrySet() + .stream() + .map(e -> new Tag(e.getKey(), e.getValue())) + .collect(Collectors.toList()); this.hooks = alarmRule.getHooks(); MQELexer lexer = new MQELexer(CharStreams.fromString(alarmRule.getExpression())); MQEParser parser = new MQEParser(new CommonTokenStream(lexer)); @@ -143,9 +147,10 @@ public void in(MetaInAlarm meta, Metrics metrics) { } AlarmEntity entity = new AlarmEntity( - meta.getScope(), meta.getScopeId(), meta.getName(), meta.getId0(), meta.getId1()); + meta.getScope(), meta.getScopeId(), meta.getName(), meta.getId0(), meta.getId1()); - Window window = windows.computeIfAbsent(entity, ignored -> new Window(entity, this.period, this.additionalPeriod)); + Window window = windows.computeIfAbsent(entity, ignored -> new Window(entity, this.period, + this.silencePeriod, this.recoveryObservationPeriod, this.additionalPeriod)); window.add(meta.getMetricsName(), metrics); } @@ -214,38 +219,35 @@ public List check() { windows.forEach((alarmEntity, window) -> { if (window.isExpired()) { expiredEntityList.add(alarmEntity); + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} expired", ruleName, alarmEntity.getName(), + alarmEntity.getId0(), alarmEntity.getId1()); + } return; } Optional alarmMessageOptional = window.checkAlarm(); - if (alarmMessageOptional.isPresent()) { - AlarmMessage alarmMessage = alarmMessageOptional.get(); - alarmMessage.setScopeId(alarmEntity.getScopeId()); - alarmMessage.setScope(alarmEntity.getScope()); - alarmMessage.setName(alarmEntity.getName()); - alarmMessage.setId0(alarmEntity.getId0()); - alarmMessage.setId1(alarmEntity.getId1()); - alarmMessage.setRuleName(this.ruleName); - 
alarmMessage.setAlarmMessage(formatter.format(alarmEntity)); - alarmMessage.setStartTime(System.currentTimeMillis()); - alarmMessage.setPeriod(this.period); - alarmMessage.setTags(this.tags); - alarmMessage.setHooks(this.hooks); - alarmMessage.setExpression(expression); - alarmMessage.setMqeMetricsSnapshot(window.mqeMetricsSnapshot); - alarmMessageList.add(alarmMessage); - } + alarmMessageOptional.ifPresent(alarmMessageList::add); }); expiredEntityList.forEach(windows::remove); return alarmMessageList; } + public enum State { + NORMAL, + FIRING, + SILENCED, + OBSERVING_RECOVERY, + RECOVERED + } + /** * A metrics window, based on AlarmRule#period. This window slides with time, just keeps the recent N(period) * buckets. */ public class Window { + @Getter private LocalDateTime endTime; @Getter @@ -253,20 +255,24 @@ public class Window { @Getter private final int size; @Getter - private int silenceCountdown; + private final int period; + @Getter + private final AlarmStateMachine stateMachine; private LinkedList> values; private ReentrantLock lock = new ReentrantLock(); + private AlarmMessage lastAlarmMessage; @Getter private JsonObject mqeMetricsSnapshot; private AlarmEntity entity; - public Window(AlarmEntity entity, int period, int additionalPeriod) { + public Window(AlarmEntity entity, int period, int silencePeriod, int recoveryObservationPeriod, + int additionalPeriod) { this.entity = entity; this.additionalPeriod = additionalPeriod; this.size = period + additionalPeriod; - // -1 means silence countdown is not running. - silenceCountdown = -1; - init(); + this.period = period; + this.stateMachine = new AlarmStateMachine(silencePeriod, recoveryObservationPeriod); + this.init(); } public void moveTo(LocalDateTime current) { @@ -321,8 +327,8 @@ public void add(String metricsName, Metrics metrics) { // also should happen, but maybe if agent/probe mechanism time is not right. 
if (log.isTraceEnabled()) { log.trace( - "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, - values.size() + "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, + values.size() ); } return; @@ -345,24 +351,47 @@ public void add(String metricsName, Metrics metrics) { } public Optional checkAlarm() { - if (isMatch()) { - /* - * When - * 1. Alarm trigger conditions are satisfied. - * 2. Isn't in silence stage, judged by SilenceCountdown(!=0). - */ - if (silenceCountdown < 1) { - silenceCountdown = silencePeriod; - return Optional.of(new AlarmMessage()); - } else { - silenceCountdown--; - } + boolean match = isMatch(); + if (log.isTraceEnabled()) { + log.trace("RuleName {} AlarmEntity {} {} {} isMatch:{}", ruleName, entity.getName(), entity.getId0(), + entity.getId1(), match); + } + if (match) { + stateMachine.onMatch(); } else { - silenceCountdown--; + stateMachine.onMismatch(); + } + if (stateMachine.getCurrentState() == State.FIRING) { + AlarmMessage alarmMessage = buildAlarmMessage(); + lastAlarmMessage = alarmMessage; + return Optional.of(alarmMessage); + } + if (stateMachine.getCurrentState() == State.RECOVERED) { + AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(lastAlarmMessage); + lastAlarmMessage = null; + return Optional.of(alarmRecoveryMessage); } return Optional.empty(); } + private AlarmMessage buildAlarmMessage() { + AlarmMessage alarmMessage = new AlarmMessage(); + alarmMessage.setScopeId(entity.getScopeId()); + alarmMessage.setScope(entity.getScope()); + alarmMessage.setName(entity.getName()); + alarmMessage.setId0(entity.getId0()); + alarmMessage.setId1(entity.getId1()); + alarmMessage.setRuleName(ruleName); + alarmMessage.setAlarmMessage(formatter.format(entity)); + alarmMessage.setStartTime(System.currentTimeMillis()); + alarmMessage.setPeriod(period); + alarmMessage.setTags(tags); + alarmMessage.setHooks(hooks); + alarmMessage.setExpression(expression); + 
alarmMessage.setMqeMetricsSnapshot(mqeMetricsSnapshot); + return alarmMessage; + } + private boolean isMatch() { this.lock.lock(); int isMatch = 0; @@ -375,15 +404,15 @@ private boolean isMatch() { return false; } if (!parseResult.isBoolResult() || - ExpressionResultType.SINGLE_VALUE != parseResult.getType() || - CollectionUtils.isEmpty(parseResult.getResults())) { + ExpressionResultType.SINGLE_VALUE != parseResult.getType() || + CollectionUtils.isEmpty(parseResult.getResults())) { return false; } if (!parseResult.isLabeledResult()) { MQEValues mqeValues = parseResult.getResults().get(0); if (mqeValues != null && - CollectionUtils.isNotEmpty(mqeValues.getValues()) && - mqeValues.getValues().get(0) != null) { + CollectionUtils.isNotEmpty(mqeValues.getValues()) && + mqeValues.getValues().get(0) != null) { isMatch = (int) mqeValues.getValues().get(0).getDoubleValue(); } } else { @@ -401,8 +430,8 @@ private boolean isMatch() { // then the isMatch is 1 for (MQEValues mqeValues : parseResult.getResults()) { if (mqeValues != null && - CollectionUtils.isNotEmpty(mqeValues.getValues()) && - mqeValues.getValues().get(0) != null) { + CollectionUtils.isNotEmpty(mqeValues.getValues()) && + mqeValues.getValues().get(0) != null) { isMatch = (int) mqeValues.getValues().get(0).getDoubleValue(); if (isMatch == 1) { break; @@ -447,6 +476,113 @@ private void init() { values.add(null); } } + + public class AlarmStateMachine { + @Getter + private int silenceCountdown; + @Getter + private int recoveryObservationCountdown; + private final int silencePeriod; + private final int recoveryObservationPeriod; + @Getter + private State currentState; + + public AlarmStateMachine(int silencePeriod, int recoveryObservationPeriod) { + this.currentState = State.NORMAL; + this.silencePeriod = silencePeriod; + this.recoveryObservationPeriod = recoveryObservationPeriod; + this.silenceCountdown = -1; + this.recoveryObservationCountdown = recoveryObservationPeriod; + } + + public void onMatch() { + if 
(log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} onMatch silenceCountdown:{} currentState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, currentState); + } + silenceCountdown--; + switch (currentState) { + case NORMAL: + case SILENCED: + case OBSERVING_RECOVERY: + case RECOVERED: + if (silenceCountdown < 0) { + transitionTo(State.FIRING); + } + break; + case FIRING: + if (silenceCountdown >= 0) { + transitionTo(State.SILENCED); + } + break; + default: + break; + } + } + + public void onMismatch() { + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} onMismatch silenceCountdown:{} " + + "recoveryObservationCountdown:{} currentState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, + recoveryObservationCountdown, currentState); + } + recoveryObservationCountdown--; + silenceCountdown--; + switch (currentState) { + case FIRING: + case SILENCED: + if (this.recoveryObservationCountdown < 0) { + transitionTo(State.RECOVERED); + } else { + transitionTo(State.OBSERVING_RECOVERY); + } + break; + case OBSERVING_RECOVERY: + if (recoveryObservationCountdown < 0) { + transitionTo(State.RECOVERED); + } + break; + case RECOVERED: + transitionTo(State.NORMAL); + break; + case NORMAL: + default: + break; + } + } + + private void transitionTo(State newState) { + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} transitionTo newState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), newState); + } + this.currentState = newState; + switch (newState) { + case NORMAL: + resetCountdowns(); + break; + case FIRING: + this.silenceCountdown = this.silencePeriod; + this.recoveryObservationCountdown = recoveryObservationPeriod; + break; + case SILENCED: + break; + case OBSERVING_RECOVERY: + this.recoveryObservationCountdown = this.recoveryObservationPeriod - 1; + break; + case RECOVERED: + this.recoveryObservationCountdown = 
this.recoveryObservationPeriod; + break; + } + } + + private void resetCountdowns() { + recoveryObservationCountdown = this.recoveryObservationPeriod; + } + + } + } private LinkedList> transformValues(LinkedList> values) { @@ -460,16 +596,16 @@ private LinkedList> transformValues(LinkedList r = new HashMap<>(); result.add(r); if (m instanceof LongValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((LongValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[]{((LongValueHolder) m).getValue()})); } else if (m instanceof IntValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((IntValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[]{((IntValueHolder) m).getValue()})); } else if (m instanceof DoubleValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((DoubleValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[]{((DoubleValueHolder) m).getValue()})); } else if (m instanceof LabeledValueHolder) { DataTable dt = ((LabeledValueHolder) m).getValue(); TraceLogMetric l = new TraceLogMetric( - m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()) - .toArray(new Number[0])); + m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()) + .toArray(new Number[0])); l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]); r.put(name, l); } else { diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java index ab2c7efb3297..de179a7ca15b 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java +++ 
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java @@ -46,11 +46,7 @@ public class DingtalkHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getDingtalkSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -61,21 +57,26 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { final var url = getUrl(webHookUrl); for (final var alarmMessage : messages) { - final var requestBody = String.format( - setting.getTextTemplate(), alarmMessage.getAlarmMessage() - ); - post(URI.create(url), requestBody, Map.of()); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = String.format(template, alarmMessage.getAlarmMessage()); + post(URI.create(url), requestBody, Map.of()); + } } } } } + private String getTemplate(DingtalkSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Get webhook url, sign the url when secret is not empty. 
*/ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java index 475c44329c52..d842a150a0fe 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java @@ -34,6 +34,7 @@ public class DingtalkSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public DingtalkSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java index 44d53dcc6a5d..e18f9b84f661 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java @@ -29,7 +29,9 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm Discord webhook API. 
@@ -39,11 +41,7 @@ public class DiscordHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getDiscordSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -55,21 +53,25 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var content = String.format( - setting.getTextTemplate(), - alarmMessage.getAlarmMessage() - ); - sendAlarmMessage(webHookUrl, content); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var content = String.format(template, alarmMessage.getAlarmMessage()); + sendAlarmMessage(webHookUrl, content); + } } } } } + private String getTemplate(DiscordSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Send alarm message to remote endpoint */ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java index ddaea517045d..6e7251219e5c 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java @@ -35,6 +35,7 @@ public class DiscordSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public DiscordSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java index 82004e77426c..8ede5dca487a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java @@ -48,11 +48,7 @@ public class FeishuHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getFeishuSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -63,22 +59,29 @@ 
public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var requestBody = getRequestBody(webHookUrl, alarmMessage, setting.getTextTemplate()); - try { - post(URI.create(webHookUrl.getUrl()), requestBody, Map.of()); - } catch (Exception e) { - log.error("Failed to send alarm message to Feishu: {}", webHookUrl.getUrl(), e); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = getRequestBody(webHookUrl, alarmMessage, template); + try { + post(URI.create(webHookUrl.getUrl()), requestBody, Map.of()); + } catch (Exception e) { + log.error("Failed to send alarm message to Feishu: {}", webHookUrl.getUrl(), e); + } } } } } } + private String getTemplate(FeishuSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * deal requestBody,if it has sign set the sign */ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java index b4a3cef55888..2d578e6a58b9 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java @@ -34,6 +34,7 @@ public class FeishuSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; @Builder.Default private List webhooks = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java index 2ae872f2f13c..68b913a75917 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java @@ -19,13 +19,16 @@ package org.apache.skywalking.oap.server.core.alarm.provider.grpc; import io.grpc.stub.StreamObserver; + import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; + import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmServiceGrpc; import 
org.apache.skywalking.oap.server.core.alarm.grpc.AlarmTags; import org.apache.skywalking.oap.server.core.alarm.grpc.KeyStringValuePair; @@ -51,9 +54,9 @@ public class GRPCCallback implements AlarmCallback { public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { this.alarmRulesWatcher = alarmRulesWatcher; - this.alarmSettingMap = new HashMap<>(); this.alarmServiceStubMap = new HashMap<>(); this.grpcClientMap = new HashMap<>(); + this.alarmSettingMap = new HashMap<>(); Map alarmSettingMap = alarmRulesWatcher.getGrpchookSetting(); if (CollectionUtils.isNotEmpty(alarmSettingMap)) { alarmSettingMap.forEach((name, alarmSetting) -> { @@ -64,11 +67,21 @@ public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { alarmServiceStubMap.put(name, AlarmServiceGrpc.newStub(grpcClient.getChannel())); } }); + this.alarmSettingMap = alarmSettingMap; } } @Override public void doAlarm(List alarmMessages) { + doAlarmCallback(alarmMessages, false); + } + + @Override + public void doAlarmRecovery(List alarmRecoveryMessages) { + doAlarmCallback(alarmRecoveryMessages, true); + } + + private void doAlarmCallback(List alarmMessages, boolean isRecovery) { // recreate gRPC client and stub if host and port configuration changed. 
Map settinsMap = alarmRulesWatcher.getGrpchookSetting(); onGRPCAlarmSettingUpdated(settinsMap); @@ -76,11 +89,15 @@ public void doAlarm(List alarmMessages) { if (settinsMap == null || settinsMap.isEmpty()) { return; } - Map> groupedMessages = groupMessagesByHook(alarmMessages); + Map> groupedMessages = groupMessagesByHook(alarmMessages); groupedMessages.forEach((hook, messages) -> { if (alarmServiceStubMap.containsKey(hook)) { - sendAlarmMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + if (!isRecovery) { + sendAlarmMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + } else { + sendAlarmRecoveryMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + } } }); @@ -92,32 +109,30 @@ private void sendAlarmMessages(AlarmServiceGrpc.AlarmServiceStub alarmServiceStu GRPCStreamStatus status = new GRPCStreamStatus(); StreamObserver streamObserver = - alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarm(new StreamObserver() { - @Override - public void onNext(Response response) { - // ignore empty response - } + alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarm(new StreamObserver() { + @Override + public void onNext(Response response) { + // ignore empty response + } - @Override - public void onError(Throwable throwable) { - status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message failed: {}", throwable.getMessage()); + @Override + public void onError(Throwable throwable) { + status.done(); + log.warn("Send alarm message failed: {}", throwable.getMessage()); } - } - @Override - public void onCompleted() { - status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message successful."); + @Override + public void onCompleted() { + status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm message successful."); + } } - } - }); + }); alarmMessages.forEach(message -> { org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.Builder builder = 
- org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.newBuilder(); + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.newBuilder(); builder.setScopeId(message.getScopeId()); builder.setScope(message.getScope()); @@ -127,6 +142,7 @@ public void onCompleted() { builder.setRuleName(message.getRuleName()); builder.setAlarmMessage(message.getAlarmMessage()); builder.setStartTime(message.getStartTime()); + builder.setUuid(message.getUuid()); AlarmTags.Builder alarmTagsBuilder = AlarmTags.newBuilder(); message.getTags().forEach(m -> alarmTagsBuilder.addData(KeyStringValuePair.newBuilder().setKey(m.getKey()).setValue(m.getValue()).build())); builder.setTags(alarmTagsBuilder.build()); @@ -148,18 +164,93 @@ public void onCompleted() { if (log.isDebugEnabled()) { log.debug("Send {} alarm message to {}:{}.", alarmMessages.size(), - alarmSetting.getTargetHost(), alarmSetting.getTargetPort() + alarmSetting.getTargetHost(), alarmSetting.getTargetPort() ); } if (sleepTime > 2000L) { log.warn("Send {} alarm message to {}:{}, wait {} milliseconds.", alarmMessages.size(), - alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime + alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime ); cycle = 2000L; } } -} + } + + private void sendAlarmRecoveryMessages(AlarmServiceGrpc.AlarmServiceStub alarmServiceStub, + List alarmMessages, + GRPCAlarmSetting alarmSetting) { + GRPCStreamStatus status = new GRPCStreamStatus(); + + StreamObserver streamObserver = + alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarmRecovery(new StreamObserver() { + @Override + public void onNext(Response response) { + // ignore empty response + } + + @Override + public void onError(Throwable throwable) { + status.done(); + log.warn("Send alarm recovery message failed: {}", throwable.getMessage()); + } + + @Override + public void onCompleted() { + status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm recovery message successful."); + } 
+ } + }); + + alarmMessages.forEach(message -> { + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage.Builder builder = + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage.newBuilder(); + AlarmRecoveryMessage recoveryMessage = (AlarmRecoveryMessage) message; + builder.setScopeId(recoveryMessage.getScopeId()); + builder.setScope(recoveryMessage.getScope()); + builder.setName(recoveryMessage.getName()); + builder.setId0(recoveryMessage.getId0()); + builder.setId1(recoveryMessage.getId1()); + builder.setRuleName(recoveryMessage.getRuleName()); + builder.setAlarmMessage(recoveryMessage.getAlarmMessage()); + builder.setStartTime(recoveryMessage.getStartTime()); + builder.setUuid(recoveryMessage.getUuid()); + builder.setRecoveryTime(recoveryMessage.getRecoveryTime()); + AlarmTags.Builder alarmTagsBuilder = AlarmTags.newBuilder(); + message.getTags().forEach(m -> alarmTagsBuilder.addData(KeyStringValuePair.newBuilder().setKey(m.getKey()).setValue(m.getValue()).build())); + builder.setTags(alarmTagsBuilder.build()); + streamObserver.onNext(builder.build()); + }); + + streamObserver.onCompleted(); + + long sleepTime = 0; + long cycle = 100L; + + // For memory safe of oap, we must wait for the peer confirmation. 
+ while (!status.isDone()) { + try { + sleepTime += cycle; + Thread.sleep(cycle); + } catch (InterruptedException ignored) { + } + + if (log.isDebugEnabled()) { + log.debug("Send {} alarm recovery message to {}:{}.", alarmMessages.size(), + alarmSetting.getTargetHost(), alarmSetting.getTargetPort() + ); + } + + if (sleepTime > 2000L) { + log.warn("Send {} alarm recovery message to {}:{}, wait {} milliseconds.", alarmMessages.size(), + alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime + ); + cycle = 2000L; + } + } + } private void onGRPCAlarmSettingUpdated(Map newAlarmSettingMap) { if (newAlarmSettingMap == null || newAlarmSettingMap.isEmpty()) { @@ -193,5 +284,6 @@ private void onGRPCAlarmSettingUpdated(Map newAlarmSet alarmServiceStubMap.put(name, AlarmServiceGrpc.newStub(grpcClient.getChannel())); } }); + alarmSettingMap = newAlarmSettingMap; } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java index a7d6e9c1b6db..0f9d91696236 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java @@ -31,7 +31,9 @@ import java.util.List; import java.util.Map; import java.util.UUID; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; @Slf4j @RequiredArgsConstructor @@ -42,7 +44,7 @@ public class PagerDutyHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, 
boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getPagerDutySettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -54,16 +56,19 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getIntegrationKeys()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var integrationKey : setting.getIntegrationKeys()) { for (final var alarmMessage : messages) { try { - post( - URI.create(PAGER_DUTY_EVENTS_API_V2_URL), - getMessageBody(alarmMessage, integrationKey, setting.getTextTemplate()), Map.of() - ); + String template = getTemplate(isRecovery, setting); + if (StringUtil.isNotBlank(template)) { + post( + URI.create(PAGER_DUTY_EVENTS_API_V2_URL), + getMessageBody(alarmMessage, integrationKey, template), Map.of() + ); + } } catch (Exception e) { log.error("Failed to send alarm message to PagerDuty: {}", integrationKey, e); } @@ -72,6 +77,10 @@ public void doAlarm(List alarmMessages) throws Exception { } } + private String getTemplate(boolean isRecovery, PagerDutySettings setting) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + private String getMessageBody(AlarmMessage alarmMessage, String integrationKey, String textTemplate) { final var body = new JsonObject(); final var payload = new JsonObject(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java index d36f0577e26c..700e118ee863 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java @@ -33,6 +33,7 @@ public class PagerDutySettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List integrationKeys = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java index c87d60d37887..30e622dec20b 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java @@ -32,6 +32,7 @@ public class SlackSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java 
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java index d1c6edb3a706..c8dc031747f1 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java @@ -30,7 +30,9 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm slack webhook API calls a remote endpoints. @@ -43,7 +45,7 @@ public class SlackhookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getSlackSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -55,7 +57,7 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } @@ -63,10 +65,10 @@ public void doAlarm(List alarmMessages) throws Exception { final var jsonObject = new JsonObject(); final var jsonElements = new JsonArray(); for (AlarmMessage item : messages) { - jsonElements.add(GSON.fromJson( - String.format( - setting.getTextTemplate(), item.getAlarmMessage() - ), JsonObject.class)); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + jsonElements.add(GSON.fromJson(String.format(template, item.getAlarmMessage()), JsonObject.class)); + } } jsonObject.add("blocks", jsonElements); final 
var body = GSON.toJson(jsonObject); @@ -78,4 +80,8 @@ public void doAlarm(List alarmMessages) throws Exception { } } } + + private String getTemplate(SlackSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java index 9978da65e6e7..a745f44062c7 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java @@ -31,6 +31,7 @@ public class AlarmRuleDetail { private String expression; private int period; private int silencePeriod; + private int recoveryObservationPeriod; private int additionalPeriod; private List includeEntityNames = new ArrayList<>(); private List excludeEntityNames = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java index d0ee7e52fe5e..8d98d8960fa9 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java @@ -31,6 +31,7 @@ public class AlarmRunningContext { private int additionalPeriod; private int size; private int silenceCountdown; + private int recoveryObservationCountdown; private String entityName; private List windowValues = new ArrayList<>(); 
private JsonObject mqeMetricsSnapshot; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java index 0fd0dbe30453..0217ddb46967 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java @@ -23,9 +23,11 @@ import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.HttpAlarmCallback; + import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.library.util.CollectionUtils; @@ -39,7 +41,7 @@ public class WebhookCallback extends HttpAlarmCallback { private final Gson gson = new Gson(); @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWebHooks(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -50,11 +52,12 @@ public void doAlarm(List alarmMessages) throws Exception { var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); - if (setting == null || CollectionUtils.isEmpty(setting.getUrls()) || CollectionUtils.isEmpty( - messages)) { + List urls = getUrls(setting, isRecovery); + if (setting == null || CollectionUtils.isEmpty(urls) || CollectionUtils.isEmpty( + messages)) { continue; } - for (final var url : setting.getUrls()) { + for (final var url : urls) { try { post(URI.create(url), gson.toJson(messages), 
setting.getHeaders()); } catch (Exception e) { @@ -63,4 +66,8 @@ public void doAlarm(List alarmMessages) throws Exception { } } } + + private static List getUrls(WebhookSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryUrls() : setting.getUrls(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java index 813bbf4c5739..a1ce830cd6a8 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java @@ -34,6 +34,7 @@ @ToString public class WebhookSettings extends AlarmHookSettings { private List urls = new ArrayList<>(); + private List recoveryUrls = new ArrayList<>(); private Map headers = new HashMap<>(); public WebhookSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java index 9fc3e6e0be9e..618c2192ab4a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java @@ -27,7 +27,9 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm wechat webhook API. 
@@ -38,7 +40,7 @@ public class WechatHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWechatSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -49,22 +51,26 @@ public void doAlarm(List alarmMessages) throws Exception { var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); - if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty(messages)) { continue; } for (final var url : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var requestBody = String.format( - setting.getTextTemplate(), alarmMessage.getAlarmMessage() - ); - try { - post(URI.create(url), requestBody, Map.of()); - } catch (Exception e) { - log.error("Failed to send alarm message to Wechat webhook: {}", url, e); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = String.format(template, alarmMessage.getAlarmMessage()); + try { + post(URI.create(url), requestBody, Map.of()); + } catch (Exception e) { + log.error("Failed to send alarm message to Wechat webhook: {}", url, e); + } } } } } } + + private String getTemplate(WechatSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java index f29b897d41e1..9dc6beb6c52f 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java @@ -33,6 +33,8 @@ public class WechatSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; + private List webhooks = new ArrayList<>(); public WechatSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java index 5505a05b95e4..ef0237787703 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java @@ -22,7 +22,9 @@ import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; + import java.util.Map; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; @@ -37,7 +39,9 @@ import java.util.Locale; import java.util.Optional; import java.util.UUID; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm WeLink 
webhook API. @@ -51,33 +55,37 @@ public class WeLinkHookCallback extends HttpAlarmCallback { * Send alarm message if the settings not empty */ @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWeLinkSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; } - Map> groupedMessages = groupMessagesByHook(alarmMessages); + Map> groupedMessages = groupMessagesByHook(alarmMessages); for (Map.Entry> entry : groupedMessages.entrySet()) { var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { final var accessToken = getAccessToken(webHookUrl); for (final var alarmMessage : messages) { - final var content = String.format( - setting.getTextTemplate(), - alarmMessage.getAlarmMessage() - ); - sendAlarmMessage(webHookUrl, accessToken, content); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var content = String.format(template, alarmMessage.getAlarmMessage()); + sendAlarmMessage(webHookUrl, accessToken, content); + } } } } } + private String getTemplate(WeLinkSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Send alarm message to remote endpoint */ @@ -92,9 +100,9 @@ private void sendAlarmMessage(WeLinkSettings.WebHookUrl webHookUrl, String acces body.addProperty("app_msg_id", UUID.randomUUID().toString()); body.add("group_id", groupIds); body.addProperty("content", String.format( - Locale.US, "0<imbody><imagelist/>" + - "<html><![CDATA[<DIV>%s</DIV>]]></html><content><![CDATA[%s]]></content></imbody>", - content, content + Locale.US, "0<imbody><imagelist/>" + + "<html><![CDATA[<DIV>%s</DIV>]]></html><content><![CDATA[%s]]></content></imbody>", + content, content )); body.addProperty("content_type", 0); body.addProperty("client_app_id", "1"); @@ -110,16 +118,16 @@ private String getAccessToken(WeLinkSettings.WebHookUrl webHookUrl) throws IOExc final var clientId = webHookUrl.getClientId(); final var clientSecret = webHookUrl.getClientSecret(); final var response = post( - URI.create(accessTokenUrl), - String.format(Locale.US, "{\"client_id\":%s,\"client_secret\":%s}", clientId, clientSecret), - Collections.emptyMap() + URI.create(accessTokenUrl), + String.format(Locale.US, "{\"client_id\":%s,\"client_secret\":%s}", clientId, clientSecret), + Collections.emptyMap() ); final var gson = new Gson(); final var responseJson = gson.fromJson(response, JsonObject.class); return Optional.ofNullable(responseJson) - .map(r -> r.get("access_token")) - .map(JsonElement::getAsString) - .orElse(""); + .map(r -> r.get("access_token")) + .map(JsonElement::getAsString) + .orElse(""); } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java index 5c4609319921..e4d93e660699 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java @@ -34,6 +34,7 @@ public class WeLinkSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public WeLinkSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto b/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto index 75b9c31ece04..fc180ab757fa 100644 --- a/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto +++ b/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto @@ -24,6 +24,8 @@ option java_package = "org.apache.skywalking.oap.server.core.alarm.grpc"; service AlarmService { rpc doAlarm (stream AlarmMessage) returns (Response) { } + rpc doAlarmRecovery (stream AlarmRecoveryMessage) returns (Response) { + } } message AlarmMessage { @@ -36,8 +38,24 @@ message AlarmMessage { string alarmMessage = 7; int64 startTime = 8; AlarmTags tags = 9; + string uuid = 10; } +message AlarmRecoveryMessage { + int64 scopeId = 1; + string scope = 2; + string name = 3; + string id0 = 4; + string id1 = 5; + string ruleName = 6; + string alarmMessage = 7; + int64 startTime = 8; + AlarmTags tags = 9; + string uuid = 10; + int64 recoveryTime = 11; +} + + message AlarmTags { // String key, String value pair. 
repeated KeyStringValuePair data = 1; diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java index 75252274270e..0e3583a1ecd5 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import com.google.common.collect.Lists; +import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.EndpointMetaInAlarm; import org.apache.skywalking.oap.server.core.alarm.EndpointRelationMetaInAlarm; @@ -44,6 +45,8 @@ import org.mockito.quality.Strictness; import org.powermock.reflect.Whitebox; +import java.util.List; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -261,11 +264,22 @@ public void setUp() { notifyHandler = new NotifyHandler(new AlarmRulesWatcher(rules, null, moduleManager), moduleManager); - notifyHandler.init(alarmMessageList -> { - for (AlarmMessage message : alarmMessageList) { - assertNotNull(message); + notifyHandler.init(new AlarmCallback() { + @Override + public void doAlarm(List alarmMessages) throws Exception { + for (AlarmMessage message : alarmMessages) { + assertNotNull(message); + } + } + + @Override + public void doAlarmRecovery(List alarmResolvedMessages) throws Exception { + for (AlarmMessage message : alarmResolvedMessages) { + assertNotNull(message); + } } - }); + } + ); AlarmCore core = mock(AlarmCore.class); diff 
--git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index 656babfc834c..e6030177f069 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -50,6 +50,9 @@ import java.util.Map; import java.util.Objects; +import static org.apache.skywalking.oap.server.core.alarm.provider.AlarmCore.getAlarmFiringMessageList; +import static org.apache.skywalking.oap.server.core.alarm.provider.AlarmCore.getAlarmRecoveryMessageList; + /** * Running rule is the core of how does alarm work. *

@@ -59,11 +62,11 @@ public class RunningRuleTest { @BeforeEach public void setup() { ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_multiple_values", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_multiple_values", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_cpm", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_cpm", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); } @Test @@ -119,12 +122,12 @@ public void testAlarm() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); // check at startTime - 4 - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); // check at startTime - 2 runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(1, alarmMessages.size()); } @@ -152,14 +155,14 @@ public void testAlarmMetricsOutOfDate() throws IllegalExpressionException { // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); } @Test public void testLabeledAlarm() throws IllegalExpressionException { ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_labeled", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, 
Scope.Endpoint.getScopeId()); + "endpoint_labeled", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); AlarmRule alarmRule = new AlarmRule(null); alarmRule.setExpression("sum(endpoint_labeled{p='95,99'} > 10) >= 3"); alarmRule.getIncludeMetrics().add("endpoint_labeled"); @@ -202,13 +205,13 @@ private void multipleMetricsAlarm(String expression, int alarmMsgSize) throws Il runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod1, 50)); runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod2, 99)); - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod3, 60)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(alarmMsgSize, alarmMessages.size()); } @@ -231,6 +234,11 @@ public void testNoAlarm() throws IllegalExpressionException { public void doAlarm(List alarmMessage) { isAlarm[0] = true; } + + @Override + public void doAlarmRecovery(List alarmResolvedMessages) { + isAlarm[0] = false; + } }; LinkedList callbackList = new LinkedList<>(); callbackList.add(assertCallback); @@ -249,15 +257,15 @@ public void doAlarm(List alarmMessage) { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod5, 95)); // check at startTime - 1 - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - 
Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -282,21 +290,78 @@ public void testSilence() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); // check at startTime - 4 - Assertions.assertEquals(0, runningRule.check().size()); //check matches, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //check matches, no alarm // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); - Assertions.assertEquals(1, runningRule.check().size()); //alarm + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm // check at starTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertNotEquals(0, runningRule.check().size()); //alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertNotEquals(0, runningRule.check().size()); //alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertNotEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertNotEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + } + + @Test + public void 
testRecoverObservation() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 3"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(15); + alarmRule.setRecoveryObservationPeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeInPeriod1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(6).getMillis()); + long timeInPeriod2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(4).getMillis()); + long timeInPeriod3 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + runningRule.moveTo(startTime.toLocalDateTime()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + runningRule.moveTo(startTime.plusMinutes(8).toLocalDateTime()); + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //no recovery + runningRule.moveTo(startTime.plusMinutes(9).toLocalDateTime()); + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //recoverObserving + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //recoverObserving + Assertions.assertEquals(1, getAlarmRecoveryMessageList(runningRule.check()).size()); //recovered + } + + @Test + public void testRecover() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 3"); + 
alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(15); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeInPeriod1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(6).getMillis()); + long timeInPeriod2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(4).getMillis()); + long timeInPeriod3 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + runningRule.moveTo(startTime.toLocalDateTime()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + runningRule.moveTo(startTime.plusMinutes(9).toLocalDateTime()); + Assertions.assertEquals(1, getAlarmRecoveryMessageList(runningRule.check()).size()); //recovery } @Test @@ -323,15 +388,15 @@ public void testExclude() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); // check at startTime - 2 - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -342,7 +407,7 @@ public void testIncludeNamesRegex() throws IllegalExpressionException { alarmRule.getIncludeMetrics().add("endpoint_percent"); 
alarmRule.setPeriod(10); alarmRule.setMessage( - "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -359,15 +424,15 @@ public void testIncludeNamesRegex() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); // check at startTime - 1 - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 6 runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -378,7 +443,7 @@ public void testExcludeNamesRegex() throws IllegalExpressionException { alarmRule.getIncludeMetrics().add("endpoint_percent"); alarmRule.setPeriod(10); alarmRule.setMessage( - "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -395,15 +460,15 @@ public void testExcludeNamesRegex() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); // check at startTime - 1 - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime 
runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 6 runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } private MetaInAlarm getMetaInAlarm(int id) { @@ -472,7 +537,7 @@ private Metrics getLabeledValueMetrics(long timeBucket, String values) { private AlarmEntity getAlarmEntity(int id) { MetaInAlarm metaInAlarm = getMetaInAlarm(id); return new AlarmEntity(metaInAlarm.getScope(), metaInAlarm.getScopeId(), metaInAlarm.getName(), - metaInAlarm.getId0(), metaInAlarm.getId1() + metaInAlarm.getId0(), metaInAlarm.getId1() ); } @@ -594,13 +659,303 @@ private void assertLabeled(AlarmRule alarmRule, String value1, String value2, St runningRule.in(getMetaInAlarm(123, "endpoint_labeled"), getLabeledValueMetrics(timeInPeriod2, value2)); // check at startTime - 4 - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); runningRule.in(getMetaInAlarm(123, "endpoint_labeled"), getLabeledValueMetrics(timeInPeriod3, value3)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(alarmMsgSize, alarmMessages.size()); } + + @Test + public void testAlarmStateMachine_NoSilenceNoRecoveryObservation() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_no_silence_no_recovery"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 2"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setTags(new HashMap() {{ + 
put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeBucket1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + long timeBucket2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(1).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket2, 71)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + long timeBucket3 = TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket3, 80)); + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover immediately"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + long timeBucket4 = TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket4, 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + List messages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, messages.size(), "Should be empty"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_OnlySilencePeriod() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + 
alarmRule.setAlarmRuleName("test_only_silence"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setSilencePeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()), 72)); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 72)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(3).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + alarmMessages = 
getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm after silence"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(4).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(4).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(5).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(5).toLocalDateTime()); + alarmMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should recover immediately"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(6).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_OnlyRecoveryObservationPeriod() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_only_recovery_observation"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setRecoveryObservationPeriod(1); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule 
runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(1).getMillis()), 72)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover yet"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after observation"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), 
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(3).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should be normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_SilenceGreaterThanRecovery() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_silence_gt_recovery"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(5); + alarmRule.setSilencePeriod(3); + alarmRule.setRecoveryObservationPeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size()); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + for (int i = 0; i <= 3; i++) { + runningRule.moveTo(startTime.plusMinutes(i).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(i).getMillis()), 72)); + + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + if (i < 3) { + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced at minute " + i); + Assertions.assertEquals(RunningRule.State.SILENCED, 
stateMachine.getCurrentState()); + } else { + Assertions.assertEquals(1, alarmMessages.size(), "Should fire after silence period"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + } + } + for (int i = 0; i <= 2; i++) { + runningRule.moveTo(startTime.plusMinutes(8 + i).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(8 + i).getMillis()), 80)); + if (i < 2) { + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover immediately"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + } else { + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after observation period"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + } + } + runningRule.moveTo(startTime.plusMinutes(11).toLocalDateTime()); + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should recover after observation period"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_RecoveryGreaterThanSilence() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_recovery_gt_silence"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setSilencePeriod(2); + alarmRule.setRecoveryObservationPeriod(3); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), 
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size()); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()), 72)); + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should fire after silence period"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(4).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(4).getMillis()), 80)); + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover immediately"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + + 
runningRule.moveTo(startTime.plusMinutes(5).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should still in observation"); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(6).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should still in observation"); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(7).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(7).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after full observation period"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(8).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should be normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + private RunningRule.Window getWindow(RunningRule runningRule, int entityId) { + Map windows = runningRule.getWindows(); + AlarmEntity entity = getAlarmEntity(entityId); + return windows.get(entity); + } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java index edd87ce9110d..80b0f1b44b1a 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java +++ 
b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java @@ -21,6 +21,7 @@ import io.grpc.stub.StreamObserver; import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmServiceGrpc; import org.apache.skywalking.oap.server.core.alarm.grpc.Response; import org.apache.skywalking.oap.server.library.server.ServerException; @@ -67,5 +68,31 @@ public void onCompleted() { } }; } + + @Override public StreamObserver doAlarmRecovery(StreamObserver responseObserver) { + return new StreamObserver() { + @Override + public void onNext(AlarmRecoveryMessage value) { + log.info("received alarm recovery message: {}", value.toString()); + } + + @Override + public void onError(Throwable throwable) { + responseObserver.onError(throwable); + if (log.isDebugEnabled()) { + log.debug("received alarm recovery message error."); + } + } + + @Override + public void onCompleted() { + responseObserver.onNext(Response.newBuilder().build()); + responseObserver.onCompleted(); + if (log.isDebugEnabled()) { + log.debug("received alarm recovery message completed."); + } + } + }; + } } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java index 9c2934108b49..ec78e2de61c2 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.List; import 
org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -38,6 +39,7 @@ public class GRPChookCallbackTest { private AlarmRulesWatcher alarmRulesWatcher; private List alarmMessageList; + private List alarmRecoveryMessageList; @BeforeEach public void init() throws Exception { @@ -54,11 +56,13 @@ public void init() throws Exception { alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); grpcCallback = new GRPCCallback(alarmRulesWatcher); mockAlarmMessage(setting1.getFormattedName(), setting2.getFormattedName()); + mockAlarmRecoveryMessage(setting1.getFormattedName(), setting2.getFormattedName()); } @Test public void doAlarm() { grpcCallback.doAlarm(alarmMessageList); + grpcCallback.doAlarmRecovery(alarmRecoveryMessageList); } @Test @@ -71,6 +75,7 @@ public void testGauchoSettingClean() { alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); grpcCallback = new GRPCCallback(alarmRulesWatcher); grpcCallback.doAlarm(alarmMessageList); + grpcCallback.doAlarmRecovery(alarmRecoveryMessageList); } private void mockAlarmMessage(String hook1, String hook2) { @@ -96,4 +101,11 @@ private void mockAlarmMessage(String hook1, String hook2) { alarmMessage2.getHooks().add(hook1); alarmMessageList = Lists.newArrayList(alarmMessage, alarmMessage2); } + + private void mockAlarmRecoveryMessage(String hook1, String hook2) { + AlarmRecoveryMessage alarmRecoveryMessage0 = new AlarmRecoveryMessage(alarmMessageList.get(0)); + AlarmRecoveryMessage alarmRecoveryMessage1 = new AlarmRecoveryMessage(alarmMessageList.get(1)); + + alarmRecoveryMessageList = Lists.newArrayList(alarmRecoveryMessage0, alarmRecoveryMessage1); + } } diff --git 
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java index a9d40e2a255e..b9770ef3c972 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java @@ -26,6 +26,7 @@ import com.linecorp.armeria.server.ServerBuilder; import com.linecorp.armeria.testing.junit5.server.ServerExtension; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -73,6 +74,25 @@ protected void configure(ServerBuilder sb) { IS_SUCCESS.set(false); return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); }))); + + sb.service("/webhook/receiveAlarmRecovery", (ctx, req) -> HttpResponse.from(req.aggregate().thenApply(r -> { + final String content = r.content().toStringUtf8(); + List alarmMessages = new Gson().fromJson(content, new TypeToken>() { + }.getType()); + if (alarmMessages.size() != 1) { + IS_SUCCESS.set(false); + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + } + if (Objects.equals(alarmMessages.get(0).getId0(), "1")) { + if (alarmMessages.get(0).getRecoveryTime() > 0) { + IS_SUCCESS.set(true); + COUNTER.incrementAndGet(); + return HttpResponse.of(HttpStatus.OK); + } + } + IS_SUCCESS.set(false); + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + }))); } }; @@ -80,9 +100,12 @@ protected void configure(ServerBuilder sb) { public void 
testWebhook() throws Exception { List remoteEndpoints = new ArrayList<>(); remoteEndpoints.add("http://127.0.0.1:" + SERVER.httpPort() + "/webhook/receiveAlarm"); + List remoteEndpointsForRecovery = new ArrayList<>(); + remoteEndpointsForRecovery.add("http://127.0.0.1:" + SERVER.httpPort() + "/webhook/receiveAlarmRecovery"); Rules rules = new Rules(); WebhookSettings setting1 = new WebhookSettings("setting1", AlarmHooksType.webhook, true); setting1.setUrls(remoteEndpoints); + setting1.setRecoveryUrls(remoteEndpointsForRecovery); WebhookSettings setting2 = new WebhookSettings("setting2", AlarmHooksType.webhook, false); setting2.setUrls(remoteEndpoints); rules.getWebhookSettingsMap().put(setting1.getFormattedName(), setting1); @@ -106,8 +129,10 @@ public void testWebhook() throws Exception { anotherAlarmMessage.getHooks().add(setting2.getFormattedName()); alarmMessages.add(anotherAlarmMessage); webhookCallback.doAlarm(alarmMessages); - + List alarmRecoveryMessages = new ArrayList<>(1); + alarmRecoveryMessages.add(new AlarmRecoveryMessage(alarmMessage)); + webhookCallback.doAlarmRecovery(alarmRecoveryMessages); Assertions.assertTrue(IS_SUCCESS.get()); - Assertions.assertEquals(2, COUNTER.get()); + Assertions.assertEquals(3, COUNTER.get()); } } \ No newline at end of file diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java index 5f8c5a058bfb..c70b7763657e 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java @@ -25,6 +25,7 @@ import com.linecorp.armeria.server.ServerBuilder; import 
com.linecorp.armeria.testing.junit5.server.ServerExtension; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -39,28 +40,34 @@ import java.util.concurrent.atomic.AtomicInteger; public class WechatHookCallbackTest { + public static final String RECOVERED = "[Recovered]"; private static final AtomicBoolean IS_SUCCESS = new AtomicBoolean(); private static final AtomicInteger COUNT = new AtomicInteger(); + private static final AtomicInteger RECOVERY_COUNT = new AtomicInteger(); @RegisterExtension public static final ServerExtension SERVER = new ServerExtension() { @Override protected void configure(ServerBuilder sb) { sb.service("/wechathook/receiveAlarm", (ctx, req) -> HttpResponse.from( - req.aggregate().thenApply(r -> { - final String content = r.content().toStringUtf8(); - final JsonObject jsonObject = new Gson().fromJson(content, JsonObject.class); - final String type = jsonObject.get("msgtype").getAsString(); - if (type.equalsIgnoreCase("text")) { - COUNT.incrementAndGet(); - if (COUNT.get() == 2) { - IS_SUCCESS.set(true); + req.aggregate().thenApply(r -> { + final String content = r.content().toStringUtf8(); + final JsonObject jsonObject = new Gson().fromJson(content, JsonObject.class); + final String type = jsonObject.get("msgtype").getAsString(); + if (type.equalsIgnoreCase("text")) { + COUNT.incrementAndGet(); + final String textContent = ((JsonObject) jsonObject.get("text")).get("content").getAsString(); + if (textContent.startsWith(RECOVERED)) { + RECOVERY_COUNT.incrementAndGet(); + } + if (COUNT.get() == 3 && RECOVERY_COUNT.get() == 1) { + IS_SUCCESS.set(true); + } + return HttpResponse.of(HttpStatus.OK); } - return HttpResponse.of(HttpStatus.OK); 
- } - return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); - }) + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + }) )); } }; @@ -71,17 +78,21 @@ public void testWechatWebhook() throws Exception { remoteEndpoints.add("http://127.0.0.1:" + SERVER.httpPort() + "/wechathook/receiveAlarm"); Rules rules = new Rules(); String template = "{\"msgtype\":\"text\",\"text\":{\"content\":\"Skywaling alarm: %s\"}}"; + String recoveryTemplate = "{\"msgtype\":\"text\",\"text\":{\"content\":\"" + RECOVERED + "Skywaling alarm: %s\"}}"; WechatSettings setting1 = new WechatSettings("setting1", AlarmHooksType.wechat, true); setting1.setWebhooks(remoteEndpoints); setting1.setTextTemplate(template); + setting1.setRecoveryTextTemplate(recoveryTemplate); WechatSettings setting2 = new WechatSettings("setting2", AlarmHooksType.wechat, false); setting2.setWebhooks(remoteEndpoints); setting2.setTextTemplate(template); + setting2.setRecoveryTextTemplate(recoveryTemplate); rules.getWechatSettingsMap().put(setting1.getFormattedName(), setting1); rules.getWechatSettingsMap().put(setting2.getFormattedName(), setting2); AlarmRulesWatcher alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); WechatHookCallback wechatHookCallback = new WechatHookCallback(alarmRulesWatcher); List alarmMessages = new ArrayList<>(2); + List alarmRecoveryMessages = new ArrayList<>(1); AlarmMessage alarmMessage = new AlarmMessage(); alarmMessage.setScopeId(DefaultScopeDefine.SERVICE); alarmMessage.setRuleName("service_resp_time_rule"); @@ -95,6 +106,9 @@ public void testWechatWebhook() throws Exception { anotherAlarmMessage.getHooks().add(setting2.getFormattedName()); alarmMessages.add(anotherAlarmMessage); wechatHookCallback.doAlarm(alarmMessages); + AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(anotherAlarmMessage); + alarmRecoveryMessages.add(alarmRecoveryMessage); + wechatHookCallback.doAlarmRecovery(alarmRecoveryMessages); Assertions.assertTrue(IS_SUCCESS.get()); } } 
diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java index b6e3a4d3985e..c7d3019c52fa 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java @@ -41,4 +41,6 @@ default Map> groupMessagesByHook(List a } void doAlarm(List alarmMessages) throws Exception; + + void doAlarmRecovery(List alarmRecoveryMessages) throws Exception; } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java index e644eb93062b..2177d6c3c920 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java @@ -19,12 +19,16 @@ package org.apache.skywalking.oap.server.core.alarm; import com.google.gson.JsonObject; + import java.util.HashSet; import java.util.Set; + import lombok.Getter; import lombok.Setter; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; + import java.util.List; +import java.util.UUID; /** * Alarm message represents the details of each alarm. 
@@ -45,4 +49,13 @@ public class AlarmMessage { private Set hooks = new HashSet<>(); private String expression; private JsonObject mqeMetricsSnapshot; + private String uuid; + + public AlarmMessage(String uuid) { + this.uuid = uuid; + } + + public AlarmMessage() { + this.uuid = UUID.randomUUID().toString(); + } } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java index 4d2b1256e3a9..64b73d78ff90 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java @@ -48,6 +48,7 @@ public class AlarmRecord extends Record { public static final String INDEX_NAME = "alarm_record"; public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag"; + public static final String UUID = "uuid"; public static final String SCOPE = "scope"; public static final String NAME = "name"; public static final String ID0 = "id0"; @@ -93,6 +94,8 @@ public StorageID id() { private byte[] tagsRawData; @Column(name = SNAPSHOT, storageOnly = true, length = 50000) private String snapshot; + @Column(name = UUID) + private String uuid; public static class Builder implements StorageBuilder { @Override @@ -100,6 +103,7 @@ public AlarmRecord storage2Entity(final Convert2Entity converter) { AlarmRecord record = new AlarmRecord(); record.setScope(((Number) converter.get(SCOPE)).intValue()); record.setName((String) converter.get(NAME)); + record.setUuid((String) converter.get(UUID)); record.setId0((String) converter.get(ID0)); record.setId1((String) converter.get(ID1)); record.setAlarmMessage((String) converter.get(ALARM_MESSAGE)); @@ -116,6 +120,7 @@ public AlarmRecord storage2Entity(final Convert2Entity converter) { public void entity2Storage(final AlarmRecord storageData, final Convert2Storage 
converter) { converter.accept(SCOPE, storageData.getScope()); converter.accept(NAME, storageData.getName()); + converter.accept(UUID, storageData.getUuid()); converter.accept(ID0, storageData.getId0()); converter.accept(ID1, storageData.getId1()); converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage()); diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java new file mode 100644 index 000000000000..257717c201e8 --- /dev/null +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.skywalking.oap.server.core.alarm; + +import lombok.Getter; +import lombok.Setter; + +/** + * Alarm message represents the details of each alarm. 
+ */ +@Setter +@Getter +public class AlarmRecoveryMessage extends AlarmMessage { + private long recoveryTime; + + public AlarmRecoveryMessage(AlarmMessage alarmMessage) { + this.setScopeId(alarmMessage.getScopeId()); + this.setScope(alarmMessage.getScope()); + this.setName(alarmMessage.getName()); + this.setId0(alarmMessage.getId0()); + this.setId1(alarmMessage.getId1()); + this.setRuleName(alarmMessage.getRuleName()); + this.setAlarmMessage(alarmMessage.getAlarmMessage()); + this.setTags(alarmMessage.getTags()); + this.setStartTime(alarmMessage.getStartTime()); + this.setPeriod(alarmMessage.getPeriod()); + this.setHooks(alarmMessage.getHooks()); + this.setExpression(alarmMessage.getExpression()); + this.setMqeMetricsSnapshot(alarmMessage.getMqeMetricsSnapshot()); + this.setUuid(alarmMessage.getUuid()); + this.setRecoveryTime(System.currentTimeMillis()); + } +} diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java new file mode 100644 index 000000000000..d569a900ab3e --- /dev/null +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.skywalking.oap.server.core.alarm; + +import lombok.Getter; +import lombok.Setter; +import org.apache.skywalking.oap.server.core.analysis.Stream; +import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; +import org.apache.skywalking.oap.server.core.analysis.record.Record; +import org.apache.skywalking.oap.server.core.analysis.worker.RecordStreamProcessor; +import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine; +import org.apache.skywalking.oap.server.core.source.ScopeDeclaration; +import org.apache.skywalking.oap.server.core.storage.StorageID; +import org.apache.skywalking.oap.server.core.storage.annotation.BanyanDB; +import org.apache.skywalking.oap.server.core.storage.annotation.Column; +import org.apache.skywalking.oap.server.core.storage.annotation.ElasticSearch; +import org.apache.skywalking.oap.server.core.storage.annotation.SQLDatabase; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Storage; +import org.apache.skywalking.oap.server.core.storage.type.StorageBuilder; + +import java.util.List; + +import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.ALARM_RECOVERY; +import static org.apache.skywalking.oap.server.core.storage.StorageData.TIME_BUCKET; + +@Getter +@Setter +@ScopeDeclaration(id = ALARM_RECOVERY, name = "AlarmRecovery") +@Stream(name = AlarmRecoveryRecord.INDEX_NAME, scopeId = DefaultScopeDefine.ALARM_RECOVERY, builder = AlarmRecoveryRecord.Builder.class, processor = RecordStreamProcessor.class) +@SQLDatabase.ExtraColumn4AdditionalEntity(additionalTable = AlarmRecoveryRecord.ADDITIONAL_TAG_TABLE, parentColumn = TIME_BUCKET) +@BanyanDB.TimestampColumn(AlarmRecoveryRecord.START_TIME) +@BanyanDB.Group(streamGroup = BanyanDB.StreamGroup.RECORDS) +public class 
AlarmRecoveryRecord extends Record { + public static final String INDEX_NAME = "alarm_recovery_record"; + public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag"; + public static final String UUID = "uuid"; + public static final String SCOPE = "scope"; + public static final String NAME = "name"; + public static final String ID0 = "id0"; + public static final String ID1 = "id1"; + public static final String START_TIME = "start_time"; + public static final String RECOVERY_TIME = "recovery_time"; + public static final String ALARM_MESSAGE = "alarm_message"; + public static final String RULE_NAME = "rule_name"; + public static final String TAGS = "tags"; + public static final String TAGS_RAW_DATA = "tags_raw_data"; + public static final String SNAPSHOT = "snapshot"; + + @Override + public StorageID id() { + return new StorageID() + .append(TIME_BUCKET, getTimeBucket()) + .append(RULE_NAME, ruleName) + .append(ID0, id0) + .append(ID1, id1); + } + + @Column(name = SCOPE) + private int scope; + @Column(name = NAME, storageOnly = true, length = 512) + private String name; + @Column(name = ID0, storageOnly = true, length = 512) + @BanyanDB.SeriesID(index = 0) + private String id0; + @Column(name = ID1, storageOnly = true) + private String id1; + @ElasticSearch.EnableDocValues + @Column(name = START_TIME) + private long startTime; + @ElasticSearch.EnableDocValues + @Column(name = RECOVERY_TIME) + private long recoveryTime; + @Column(name = ALARM_MESSAGE, length = 512) + @ElasticSearch.MatchQuery + @BanyanDB.MatchQuery(analyzer = BanyanDB.MatchQuery.AnalyzerType.SIMPLE) + private String alarmMessage; + @Column(name = RULE_NAME) + private String ruleName; + @Column(name = UUID) + private String uuid; + @Column(name = TAGS, indexOnly = true) + @SQLDatabase.AdditionalEntity(additionalTables = {ADDITIONAL_TAG_TABLE}) + private List tagsInString; + @Column(name = TAGS_RAW_DATA, storageOnly = true, length = Tag.TAG_LENGTH) + private byte[] tagsRawData; + @Column(name = 
SNAPSHOT, storageOnly = true, length = 50000) + private String snapshot; + + public static class Builder implements StorageBuilder { + @Override + public AlarmRecoveryRecord storage2Entity(final Convert2Entity converter) { + AlarmRecoveryRecord record = new AlarmRecoveryRecord(); + record.setScope(((Number) converter.get(SCOPE)).intValue()); + record.setName((String) converter.get(NAME)); + record.setUuid((String) converter.get(UUID)); + record.setId0((String) converter.get(ID0)); + record.setId1((String) converter.get(ID1)); + record.setAlarmMessage((String) converter.get(ALARM_MESSAGE)); + record.setStartTime(((Number) converter.get(START_TIME)).longValue()); + record.setRecoveryTime(((Number) converter.get(RECOVERY_TIME)).longValue()); + record.setTimeBucket(((Number) converter.get(TIME_BUCKET)).longValue()); + record.setRuleName((String) converter.get(RULE_NAME)); + record.setTagsRawData(converter.getBytes(TAGS_RAW_DATA)); + record.setSnapshot((String) converter.get(SNAPSHOT)); + // Don't read the TAGS as they are only for query. 
+ return record; + } + + @Override + public void entity2Storage(final AlarmRecoveryRecord storageData, final Convert2Storage converter) { + converter.accept(SCOPE, storageData.getScope()); + converter.accept(NAME, storageData.getName()); + converter.accept(UUID, storageData.getUuid()); + converter.accept(ID0, storageData.getId0()); + converter.accept(ID1, storageData.getId1()); + converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage()); + converter.accept(START_TIME, storageData.getStartTime()); + converter.accept(RECOVERY_TIME, storageData.getRecoveryTime()); + converter.accept(TIME_BUCKET, storageData.getTimeBucket()); + converter.accept(RULE_NAME, storageData.getRuleName()); + converter.accept(TAGS_RAW_DATA, storageData.getTagsRawData()); + converter.accept(TAGS, storageData.getTagsInString()); + converter.accept(SNAPSHOT, storageData.getSnapshot()); + } + } +} diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java index 3851ba81a0ed..c4acfa8a00dd 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java @@ -31,6 +31,7 @@ import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -58,12 +59,43 @@ public void doAlarm(List alarmMessage) { } AlarmRecord record = new AlarmRecord(); + record.setUuid(message.getUuid()); + record.setScope(message.getScopeId()); + record.setId0(message.getId0()); + record.setId1(message.getId1()); + record.setName(message.getName()); + record.setAlarmMessage(message.getAlarmMessage()); + 
record.setStartTime(message.getStartTime()); + record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime())); + record.setRuleName(message.getRuleName()); + Collection tags = appendSearchableTags(message.getTags()); + addAutocompleteTags(tags, TimeBucket.getMinuteTimeBucket(message.getStartTime())); + record.setTagsRawData(gson.toJson(message.getTags()).getBytes(Charsets.UTF_8)); + record.setTagsInString(Tag.Util.toStringList(new ArrayList<>(tags))); + AlarmSnapshotRecord snapshot = new AlarmSnapshotRecord(); + snapshot.setExpression(message.getExpression()); + snapshot.setMetrics(message.getMqeMetricsSnapshot()); + record.setSnapshot(gson.toJson(snapshot)); + RecordStreamProcessor.getInstance().in(record); + }); + } + + @Override + public void doAlarmRecovery(List alarmMessage) { + alarmMessage.forEach(message -> { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Alarm recovery message: {}", message.getAlarmMessage()); + } + AlarmRecoveryMessage alarmRecoveryMessage = (AlarmRecoveryMessage) message; + AlarmRecoveryRecord record = new AlarmRecoveryRecord(); + record.setUuid(message.getUuid()); record.setScope(message.getScopeId()); record.setId0(message.getId0()); record.setId1(message.getId1()); record.setName(message.getName()); record.setAlarmMessage(message.getAlarmMessage()); record.setStartTime(message.getStartTime()); + record.setRecoveryTime(alarmRecoveryMessage.getRecoveryTime()); record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime())); record.setRuleName(message.getRuleName()); Collection tags = appendSearchableTags(message.getTags()); @@ -94,7 +126,7 @@ private Collection appendSearchableTags(List tags) { if (configService.getSearchableAlarmTags().contains(tag.getKey())) { final Tag alarmTag = new Tag(tag.getKey(), tag.getValue()); - if (tag.getValue().length() > Tag.TAG_LENGTH || alarmTag.toString().length() > Tag.TAG_LENGTH) { + if (tag.getValue().length() > Tag.TAG_LENGTH || alarmTag.toString().length() > 
Tag.TAG_LENGTH) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Alarm tag : {} length > : {}, dropped", alarmTag, Tag.TAG_LENGTH); } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java index 6b9ce0f00963..6414613edcfa 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java @@ -26,6 +26,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.time.Duration; +import java.util.List; import java.util.Map; public abstract class HttpAlarmCallback implements AlarmCallback { @@ -58,4 +59,21 @@ protected String post( } return response.body(); } + + /** + * Send alarm message if the settings not empty + */ + public void doAlarm(List alarmMessages) throws Exception { + doAlarmCallback(alarmMessages, false); + } + + /** + * Send alarm recovery message if the settings not empty + */ + public void doAlarmRecovery(List alarmRecoveryMessages) throws Exception { + doAlarmCallback(alarmRecoveryMessages, true); + } + + protected abstract void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception ; + } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java index 7ff8e5306ccd..ba6a3ee4ec7c 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java @@ -33,8 +33,10 @@ public class AlarmMessage { private int scopeId; private String id; private String name; + private String 
uuid; private String message; private Long startTime; + private Long recoveryTime; private transient String id1; private final List tags; private List events = new ArrayList<>(2); diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java index 359910b05a89..2650babbb125 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java @@ -155,6 +155,7 @@ public class DefaultScopeDefine { public static final int PPROF_TASK = 92; public static final int PPROF_PROFILING_DATA = 93; public static final int PPROF_TASK_LOG = 94; + public static final int ALARM_RECOVERY = 95; /** * Catalog of scope, the metrics processor could use this to group all generated metrics by oal rt. 
diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java index f59fb1cd1564..179c74a87d74 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.storage.query; import com.google.gson.JsonObject; + import java.io.IOException; import java.util.Base64; import java.util.List; @@ -71,6 +72,7 @@ default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord) { AlarmMessage message = new AlarmMessage(); message.setId(String.valueOf(alarmRecord.getId0())); message.setId1(String.valueOf(alarmRecord.getId1())); + message.setUuid(alarmRecord.getUuid()); message.setName(alarmRecord.getName()); message.setMessage(alarmRecord.getAlarmMessage()); message.setStartTime(alarmRecord.getStartTime()); @@ -89,8 +91,8 @@ default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord) { MQEMetric metrics = new MQEMetric(); metrics.setName(name); List values = GSON.fromJson( - obj.getValue().getAsString(), new TypeToken>() { - }.getType()); + obj.getValue().getAsString(), new TypeToken>() { + }.getType()); metrics.setResults(values); alarmSnapshot.getMetrics().add(metrics); } diff --git a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol index 003664676984..4fc10625ba72 160000 --- a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol +++ b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol @@ -1 +1 @@ -Subproject commit 0036646769842e915e2828fde0b6c1da0179a1e5 +Subproject commit 
4fc10625ba72ef4788972b4f7991a535065d609b diff --git a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java index c50a5d1f708a..a2908bdcd83c 100644 --- a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java +++ b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java @@ -24,6 +24,7 @@ import org.apache.skywalking.banyandb.v1.client.StreamQuery; import org.apache.skywalking.banyandb.v1.client.StreamQueryResponse; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; @@ -36,9 +37,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; /** * {@link org.apache.skywalking.oap.server.core.alarm.AlarmRecord} is a stream, @@ -46,8 +50,12 @@ */ public class BanyanDBAlarmQueryDAO extends AbstractBanyanDBDAO implements IAlarmQueryDAO { private static final Set TAGS = ImmutableSet.of(AlarmRecord.SCOPE, - AlarmRecord.NAME, AlarmRecord.ID0, AlarmRecord.ID1, AlarmRecord.ALARM_MESSAGE, AlarmRecord.START_TIME, - AlarmRecord.RULE_NAME, AlarmRecord.TAGS, AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT); + AlarmRecord.NAME, AlarmRecord.ID0, 
AlarmRecord.ID1, AlarmRecord.UUID, AlarmRecord.ALARM_MESSAGE, + AlarmRecord.START_TIME, AlarmRecord.RULE_NAME, AlarmRecord.TAGS, AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT); + private static final Set RECOVERY_TAGS = ImmutableSet.of(AlarmRecoveryRecord.SCOPE, + AlarmRecoveryRecord.NAME, AlarmRecord.ID0, AlarmRecoveryRecord.ID1, AlarmRecoveryRecord.UUID, + AlarmRecoveryRecord.ALARM_MESSAGE, AlarmRecoveryRecord.START_TIME, AlarmRecoveryRecord.RECOVERY_TIME, + AlarmRecoveryRecord.RULE_NAME, AlarmRecoveryRecord.TAGS, AlarmRecoveryRecord.TAGS_RAW_DATA, AlarmRecoveryRecord.SNAPSHOT); public BanyanDBAlarmQueryDAO(BanyanDBStorageClient client) { super(client); @@ -58,7 +66,7 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, Dur final boolean isColdStage = duration != null && duration.isColdStage(); StreamQueryResponse resp = query(isColdStage, AlarmRecord.INDEX_NAME, TAGS, getTimestampRange(duration), - new QueryBuilder() { + new QueryBuilder<>() { @Override public void apply(StreamQuery query) { if (Objects.nonNull(scopeId)) { @@ -94,6 +102,45 @@ public void apply(StreamQuery query) { } alarms.getMsgs().add(alarmMessage); } + updateAlarmRecoveryTime(alarms, duration); return alarms; } + + private void updateAlarmRecoveryTime(Alarms alarms, Duration duration) throws IOException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if (alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws IOException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; + } + final boolean isColdStage = duration != null && duration.isColdStage(); + List uuids = 
msgs.stream().map(AlarmMessage::getUuid).collect(Collectors.toList()); + StreamQueryResponse resp = query(isColdStage, AlarmRecoveryRecord.INDEX_NAME, RECOVERY_TAGS, + getTimestampRange(duration), + new QueryBuilder<>() { + @Override + public void apply(StreamQuery query) { + query.and(in(AlarmRecoveryRecord.UUID, uuids)); + } + }); + + for (final RowEntity rowEntity : resp.getElements()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity( + new BanyanDBConverter.StorageToStream(AlarmRecoveryRecord.INDEX_NAME, rowEntity) + ); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); + } + return result; + } } diff --git a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java index 03b2f9eb6087..513d79899e5d 100644 --- a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java +++ b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java @@ -19,9 +19,6 @@ package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.query; import com.google.common.base.Strings; -import java.io.IOException; -import java.util.List; -import java.util.Objects; import org.apache.skywalking.library.elasticsearch.requests.search.BoolQueryBuilder; import org.apache.skywalking.library.elasticsearch.requests.search.Query; import org.apache.skywalking.library.elasticsearch.requests.search.Search; @@ -30,6 +27,7 @@ import org.apache.skywalking.library.elasticsearch.response.search.SearchHit; import 
org.apache.skywalking.library.elasticsearch.response.search.SearchResponse; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; @@ -42,6 +40,13 @@ import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.IndexController; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.MatchCNameBuilder; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + public class AlarmQueryEsDAO extends EsDAO implements IAlarmQueryDAO { public AlarmQueryEsDAO(ElasticSearchClient client) { @@ -53,11 +58,11 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li final int from, final Duration duration, final List tags) - throws IOException { + throws IOException { long startTB = duration.getStartTimeBucketInSec(); long endTB = duration.getEndTimeBucketInSec(); final String index = - IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecord.INDEX_NAME); + IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecord.INDEX_NAME); final BoolQueryBuilder query = Query.bool(); if (IndexController.LogicIndicesRegister.isMergedTable(AlarmRecord.INDEX_NAME)) { query.must(Query.term(IndexController.LogicIndicesRegister.RECORD_TABLE_NAME, AlarmRecord.INDEX_NAME)); @@ -81,9 +86,9 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li } final SearchBuilder search = - Search.builder().query(query) - .size(limit).from(from) - .sort(AlarmRecord.START_TIME, Sort.Order.DESC); + Search.builder().query(query) + .size(limit).from(from) + .sort(AlarmRecord.START_TIME, Sort.Order.DESC); 
SearchResponse response = getClient().search(index, search.build()); @@ -98,6 +103,48 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li } alarms.getMsgs().add(alarmMessage); } + updateAlarmRecoveryTime(alarms, duration); return alarms; } + + private void updateAlarmRecoveryTime(Alarms alarms, Duration duration) throws IOException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if (alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws IOException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; + } + List uuids = msgs.stream().map(AlarmMessage::getUuid).collect(Collectors.toList()); + long startTB = duration.getStartTimeBucketInSec(); + long endTB = duration.getEndTimeBucketInSec(); + final String index = + IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecoveryRecord.INDEX_NAME); + final BoolQueryBuilder query = Query.bool(); + if (IndexController.LogicIndicesRegister.isMergedTable(AlarmRecoveryRecord.INDEX_NAME)) { + query.must(Query.term(IndexController.LogicIndicesRegister.RECORD_TABLE_NAME, AlarmRecoveryRecord.INDEX_NAME)); + } + if (startTB != 0 && endTB != 0) { + query.must(Query.range(AlarmRecord.TIME_BUCKET).gte(startTB).lte(endTB)); + } + query.must(Query.terms(AlarmRecoveryRecord.UUID, uuids)); + final SearchBuilder search = + Search.builder().query(query); + SearchResponse response = getClient().search(index, search.build()); + for (SearchHit searchHit : response.getHits().getHits()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + AlarmRecoveryRecord alarmRecoveryRecord = 
builder.storage2Entity(new ElasticSearchConverter.ToEntity(AlarmRecoveryRecord.INDEX_NAME, searchHit.getSource())); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); + } + return result; + } } diff --git a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java index 42aa3622aea4..ff109be82b21 100644 --- a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java +++ b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java @@ -26,12 +26,14 @@ import org.apache.skywalking.oap.server.core.Const; import org.apache.skywalking.oap.server.core.CoreModule; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.config.ConfigService; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; import org.apache.skywalking.oap.server.core.query.type.Alarms; import org.apache.skywalking.oap.server.core.storage.query.IAlarmQueryDAO; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity; import org.apache.skywalking.oap.server.library.client.jdbc.hikaricp.JDBCClient; import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.apache.skywalking.oap.server.library.util.CollectionUtils; @@ -40,16 +42,20 @@ import 
org.apache.skywalking.oap.server.storage.plugin.jdbc.common.SQLAndParameters; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.TableHelper; +import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; import static java.util.Comparator.comparing; import static java.util.Objects.nonNull; import static java.util.function.Predicate.not; +import static java.util.stream.Collectors.joining; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toSet; @@ -68,20 +74,20 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, Duration duration, final List tags) { if (searchableTagKeys == null) { final ConfigService configService = manager.find(CoreModule.NAME) - .provider() - .getService(ConfigService.class); + .provider() + .getService(ConfigService.class); searchableTagKeys = new HashSet<>(Arrays.asList(configService.getSearchableAlarmTags().split(Const.COMMA))); } // If the tag is not searchable, but is required, then we don't need to run the real query. 
if (tags != null && !searchableTagKeys.containsAll(tags.stream().map(Tag::getKey).collect(toSet()))) { log.warn( - "Searching tags that are not searchable: {}", - tags.stream().map(Tag::getKey).filter(not(searchableTagKeys::contains)).collect(toSet())); + "Searching tags that are not searchable: {}", + tags.stream().map(Tag::getKey).filter(not(searchableTagKeys::contains)).collect(toSet())); return new Alarms(); } final var tables = tableHelper.getTablesForRead( - AlarmRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() + AlarmRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() ); final var alarmMsgs = new ArrayList(); @@ -90,25 +96,64 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { while (resultSet.next()) { AlarmRecord.Builder builder = new AlarmRecord.Builder(); - AlarmRecord alarmRecord = builder.storage2Entity(JDBCEntityConverters.toEntity(resultSet)); + Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); + AlarmRecord alarmRecord = builder.storage2Entity(convert2Entity); AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinaryBase64( - new String(alarmRecord.getTagsRawData(), Charsets.UTF_8), alarmMessage.getTags()); + new String(alarmRecord.getTagsRawData(), Charsets.UTF_8), alarmMessage.getTags()); } alarmMsgs.add(alarmMessage); } return null; }, sqlAndParameters.parameters()); } - return new Alarms( - alarmMsgs - .stream() - .sorted(comparing(AlarmMessage::getStartTime).reversed()) - .skip(from) - .limit(limit) - .collect(toList()) + Alarms alarms = new Alarms( + alarmMsgs + .stream() + .sorted(comparing(AlarmMessage::getStartTime).reversed()) + .skip(from) + .limit(limit) + .collect(toList()) ); + updateAlarmRecoveryTime(alarms, duration); + return alarms; + } + + private void updateAlarmRecoveryTime(Alarms 
alarms, Duration duration) throws SQLException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if (alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws SQLException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; + } + List uuids = msgs.stream().map(AlarmMessage::getUuid).collect(toList()); + final var tables = tableHelper.getTablesForRead( + AlarmRecoveryRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() + ); + for (final var table : tables) { + final var sqlAndParameters = buildSQL4Recovery(uuids, table); + jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { + while (resultSet.next()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); + AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity(convert2Entity); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); + } + return null; + }, sqlAndParameters.parameters()); + } + return result; } protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, int from, @@ -129,6 +174,7 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, */ final var timeBucket = TableHelper.getTimeBucket(table); final var tagTable = TableHelper.getTable(AlarmRecord.ADDITIONAL_TAG_TABLE, timeBucket); + if (!CollectionUtils.isEmpty(tags)) { for (int i = 0; i < tags.size(); i++) { sql.append(" inner join ").append(tagTable).append(" "); @@ -138,7 +184,7 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, } } 
sql.append(" where ") - .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); + .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); parameters.add(AlarmRecord.INDEX_NAME); if (Objects.nonNull(scopeId)) { sql.append(" and ").append(AlarmRecord.SCOPE).append(" = ?"); @@ -167,4 +213,17 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, return new SQLAndParameters(sql.toString(), parameters); } + + private SQLAndParameters buildSQL4Recovery(List uuids, String table) { + final var sql = new StringBuilder(); + final var parameters = new ArrayList<>(); + sql.append("select * from ").append(table); + sql.append(" where ") + .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); + parameters.add(AlarmRecoveryRecord.INDEX_NAME); + sql.append(" and ").append(AlarmRecoveryRecord.UUID).append(" in ") + .append(uuids.stream().map(it -> "?").collect(joining(", ", "(", ")"))); + parameters.addAll(uuids); + return new SQLAndParameters(sql.toString(), parameters); + } } diff --git a/skywalking-ui b/skywalking-ui index 30927258d669..6eaf7fe26da7 160000 --- a/skywalking-ui +++ b/skywalking-ui @@ -1 +1 @@ -Subproject commit 30927258d66934278a401b2defa0d9592e7d1974 +Subproject commit 6eaf7fe26da704cf54d1371ac489b3c8f458fbb8 diff --git a/test/e2e-v2/cases/alarm/alarm-cases.yaml b/test/e2e-v2/cases/alarm/alarm-cases.yaml index aa07203bf8c2..dc49ba84e6c7 100644 --- a/test/e2e-v2/cases/alarm/alarm-cases.yaml +++ b/test/e2e-v2/cases/alarm/alarm-cases.yaml @@ -31,7 +31,9 @@ - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql alarm autocomplete-values --key=level expected: expected/tag-values.yml # before silence webhook - - query: curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + - query: | + sleep 30; + curl -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/silence-before-webhook.yml # after 
silence alarm list WARNING,receivers=lisi - query: | @@ -44,3 +46,11 @@ # after silence webhook - query: curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/silence-after-webhook.yml + - query: | + sleep 60; + curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + expected: expected/recovery-webhook.yml + - query: | + sleep 60; + curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + expected: expected/recovery-after-observation-webhook.yml diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml b/test/e2e-v2/cases/alarm/alarm-settings.yml index 04ddbc61cd3f..8679261c4441 100755 --- a/test/e2e-v2/cases/alarm/alarm-settings.yml +++ b/test/e2e-v2/cases/alarm/alarm-settings.yml @@ -27,19 +27,20 @@ rules: - webhook.custom # service_percentile > 10ms service_percentile_rule: - expression: sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 period: 10 silence-period: 1 - message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. tags: level: WARNING receivers: lisi hooks: - - webhook.none + - webhook.custom comp_rule: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 period: 10 - message: Service {name} response time is more than 10ms and sla is more than 1%. + recovery-observation-period: 3 + message: Service {name} response time is more than 100ms and sla is more than 1%. 
tags: level: CRITICAL receivers: zhangsan @@ -50,9 +51,15 @@ hooks: is-default: true urls: - http://provider:9090/alarm/receive + recovery-urls: + - http://provider:9090/alarm/receive custom: urls: - http://provider:9090/alarm/receive + recovery-urls: + - http://provider:9090/alarm/receive none: urls: - http://none:9090/alarm/receive + recovery-urls: + - http://none:9090/alarm/receive \ No newline at end of file diff --git a/test/e2e-v2/cases/alarm/banyandb/e2e.yaml b/test/e2e-v2/cases/alarm/banyandb/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/alarm/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/es/e2e.yaml b/test/e2e-v2/cases/alarm/es/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/es/e2e.yaml +++ b/test/e2e-v2/cases/alarm/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml index a1fa0e36b91d..0ab7c649dded 100644 --- a/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml new file mode 100644 index 000000000000..607e3c3eb501 --- /dev/null +++ b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml @@ 
-0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +messages: + {{- contains .messages }} + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. 
+ startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + {{- end }} diff --git a/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml new file mode 100644 index 000000000000..84faee28ed9e --- /dev/null +++ b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +messages: + {{- contains .messages }} + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
+ startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + {{- end }} diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml index 0631c162b88d..a4b608c285c6 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + message: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. tags: - key: level value: CRITICAL @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 metrics: {{- contains .snapshot.metrics }} - name: service_resp_time diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml index e54e4bd6abfd..50b81cb282d4 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
tags: - key: level value: WARNING @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 metrics: {{- contains .snapshot.metrics }} - name: service_percentile{p='50,75,90,95,99'} diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml index 3e9ad6d25bac..443889614a94 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml @@ -23,6 +23,21 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING @@ -34,8 +49,9 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL @@ -49,19 +65,105 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. 
startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING - key: receivers value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. 
+ startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan - scopeId: 1 scope: SERVICE name: e2e-service-provider id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml index 0631c162b88d..a4b608c285c6 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. 
+ message: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. tags: - key: level value: CRITICAL @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 metrics: {{- contains .snapshot.metrics }} - name: service_resp_time diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml index e25f5cf61417..6d19c8185deb 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
tags: - key: level value: WARNING @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 metrics: {{- contains .snapshot.metrics }} - name: service_percentile{p='50,75,90,95,99'} diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml index d34f856f282f..294d7de7de87 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml @@ -23,6 +23,21 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING @@ -34,8 +49,9 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. 
startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/mysql/docker-compose.yml b/test/e2e-v2/cases/alarm/mysql/docker-compose.yml index 0de65f5e04f1..340dac7dff1b 100644 --- a/test/e2e-v2/cases/alarm/mysql/docker-compose.yml +++ b/test/e2e-v2/cases/alarm/mysql/docker-compose.yml @@ -22,6 +22,8 @@ services: - e2e expose: - 3306 + ports: + - 3306 environment: - MYSQL_ROOT_PASSWORD=root@1234 - MYSQL_DATABASE=swtest diff --git a/test/e2e-v2/cases/alarm/mysql/e2e.yaml b/test/e2e-v2/cases/alarm/mysql/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/mysql/e2e.yaml +++ b/test/e2e-v2/cases/alarm/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/postgres/e2e.yaml b/test/e2e-v2/cases/alarm/postgres/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/postgres/e2e.yaml +++ b/test/e2e-v2/cases/alarm/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml b/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml index 283c4d3e111a..4dd6328dc443 100644 --- a/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml +++ b/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9089/info/ method: GET diff --git a/test/e2e-v2/cases/aws/api-gateway/e2e.yaml b/test/e2e-v2/cases/aws/api-gateway/e2e.yaml index 2b221d493b9f..29039e270ebb 100644 --- a/test/e2e-v2/cases/aws/api-gateway/e2e.yaml +++ b/test/e2e-v2/cases/aws/api-gateway/e2e.yaml @@ -31,7 +31,7 @@ 
setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/dynamodb/e2e.yaml b/test/e2e-v2/cases/aws/dynamodb/e2e.yaml index ef3a9b14802c..72c8217a8524 100644 --- a/test/e2e-v2/cases/aws/dynamodb/e2e.yaml +++ b/test/e2e-v2/cases/aws/dynamodb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/eks/e2e.yaml b/test/e2e-v2/cases/aws/eks/e2e.yaml index 938e46a1a3f5..f60b668f3450 100644 --- a/test/e2e-v2/cases/aws/eks/e2e.yaml +++ b/test/e2e-v2/cases/aws/eks/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/s3/e2e.yaml b/test/e2e-v2/cases/aws/s3/e2e.yaml index 7ae7e0d4e22b..cf4e43a45f0f 100644 --- a/test/e2e-v2/cases/aws/s3/e2e.yaml +++ b/test/e2e-v2/cases/aws/s3/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/baseline/banyandb/e2e.yaml b/test/e2e-v2/cases/baseline/banyandb/e2e.yaml index 4dcb90bf0f7d..da9bbc39e26a 100644 --- a/test/e2e-v2/cases/baseline/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/baseline/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/baseline/es/e2e.yaml b/test/e2e-v2/cases/baseline/es/e2e.yaml index 4dcb90bf0f7d..da9bbc39e26a 100644 --- a/test/e2e-v2/cases/baseline/es/e2e.yaml +++ b/test/e2e-v2/cases/baseline/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: 
'{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml index a1fa0e36b91d..272a070c744c 100644 --- a/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/exporter/kafka/e2e.yaml b/test/e2e-v2/cases/exporter/kafka/e2e.yaml index d7220c4cfac2..6de4c0bb443d 100644 --- a/test/e2e-v2/cases/exporter/kafka/e2e.yaml +++ b/test/e2e-v2/cases/exporter/kafka/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/flink/e2e.yaml b/test/e2e-v2/cases/flink/e2e.yaml index 2c523e2d9bf3..06e9a080e6d5 100644 --- a/test/e2e-v2/cases/flink/e2e.yaml +++ b/test/e2e-v2/cases/flink/e2e.yaml @@ -32,7 +32,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${jobmanager_host}:${jobmanager_9260}/metrics method: GET diff --git a/test/e2e-v2/cases/gateway/e2e.yaml b/test/e2e-v2/cases/gateway/e2e.yaml index e3270e2dfa67..177f1ec5832c 100644 --- a/test/e2e-v2/cases/gateway/e2e.yaml +++ b/test/e2e-v2/cases/gateway/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/go/e2e.yaml b/test/e2e-v2/cases/go/e2e.yaml index 21b33d4e2519..b5c7aa39e1a7 100644 --- a/test/e2e-v2/cases/go/e2e.yaml +++ b/test/e2e-v2/cases/go/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/correlation 
method: POST diff --git a/test/e2e-v2/cases/kafka/log/e2e.yaml b/test/e2e-v2/cases/kafka/log/e2e.yaml index 1c80a57c29dc..037b6cad789a 100644 --- a/test/e2e-v2/cases/kafka/log/e2e.yaml +++ b/test/e2e-v2/cases/kafka/log/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/kafka/meter/e2e.yaml b/test/e2e-v2/cases/kafka/meter/e2e.yaml index ca7a520eaada..c7f363d6d18c 100644 --- a/test/e2e-v2/cases/kafka/meter/e2e.yaml +++ b/test/e2e-v2/cases/kafka/meter/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml b/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml index a91249e4e4f8..aefc8568d335 100644 --- a/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml +++ b/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/log/banyandb/e2e.yaml b/test/e2e-v2/cases/log/banyandb/e2e.yaml index eb1725bc750a..bc8437238c97 100644 --- a/test/e2e-v2/cases/log/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/log/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/es/e2e.yaml b/test/e2e-v2/cases/log/es/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/es/e2e.yaml +++ b/test/e2e-v2/cases/log/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git 
a/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml index 592267acd079..b9b08e0fe888 100644 --- a/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/fluent-bit/e2e.yaml b/test/e2e-v2/cases/log/fluent-bit/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/fluent-bit/e2e.yaml +++ b/test/e2e-v2/cases/log/fluent-bit/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/mysql/e2e.yaml b/test/e2e-v2/cases/log/mysql/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/mysql/e2e.yaml +++ b/test/e2e-v2/cases/log/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/postgres/e2e.yaml b/test/e2e-v2/cases/log/postgres/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/postgres/e2e.yaml +++ b/test/e2e-v2/cases/log/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/logql/e2e.yaml b/test/e2e-v2/cases/logql/e2e.yaml index 5a4519cc738b..df8f3585e131 100644 --- a/test/e2e-v2/cases/logql/e2e.yaml +++ b/test/e2e-v2/cases/logql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/lua/e2e.yaml 
b/test/e2e-v2/cases/lua/e2e.yaml index 3a4a2340b77f..53433d47cb5e 100644 --- a/test/e2e-v2/cases/lua/e2e.yaml +++ b/test/e2e-v2/cases/lua/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider-entry_host}:${provider-entry_9090}/nginx/entry/info method: POST diff --git a/test/e2e-v2/cases/menu/banyandb/e2e.yaml b/test/e2e-v2/cases/menu/banyandb/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/menu/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/es/e2e.yaml b/test/e2e-v2/cases/menu/es/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/es/e2e.yaml +++ b/test/e2e-v2/cases/menu/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml index 0e765d63bf9f..3a53dd4fe4b3 100644 --- a/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/mysql/e2e.yaml b/test/e2e-v2/cases/menu/mysql/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/mysql/e2e.yaml +++ b/test/e2e-v2/cases/menu/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff 
--git a/test/e2e-v2/cases/menu/opensearch/e2e.yaml b/test/e2e-v2/cases/menu/opensearch/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/menu/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/postgres/e2e.yaml b/test/e2e-v2/cases/menu/postgres/e2e.yaml index 84bc9fa55523..7784f0b2f70e 100644 --- a/test/e2e-v2/cases/menu/postgres/e2e.yaml +++ b/test/e2e-v2/cases/menu/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/meter/e2e.yaml b/test/e2e-v2/cases/meter/e2e.yaml index 0dd74fc67b08..cc4bd72ccb44 100644 --- a/test/e2e-v2/cases/meter/e2e.yaml +++ b/test/e2e-v2/cases/meter/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/mqe/e2e.yaml b/test/e2e-v2/cases/mqe/e2e.yaml index de8031630149..3c0a91b4901a 100644 --- a/test/e2e-v2/cases/mqe/e2e.yaml +++ b/test/e2e-v2/cases/mqe/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/nginx/e2e.yaml b/test/e2e-v2/cases/nginx/e2e.yaml index 1c55aee1de6e..56b3983cba62 100644 --- a/test/e2e-v2/cases/nginx/e2e.yaml +++ b/test/e2e-v2/cases/nginx/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${nginx_host}:${nginx_8080}/test method: POST diff --git 
a/test/e2e-v2/cases/nodejs/e2e.yaml b/test/e2e-v2/cases/nodejs/e2e.yaml index bccedb26fd23..c26660e3d652 100644 --- a/test/e2e-v2/cases/nodejs/e2e.yaml +++ b/test/e2e-v2/cases/nodejs/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_5001}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/otlp-traces/e2e.yaml b/test/e2e-v2/cases/otlp-traces/e2e.yaml index 1a1e3151873a..f40e58628167 100644 --- a/test/e2e-v2/cases/otlp-traces/e2e.yaml +++ b/test/e2e-v2/cases/otlp-traces/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8080}/api/products method: GET diff --git a/test/e2e-v2/cases/php/e2e.yaml b/test/e2e-v2/cases/php/e2e.yaml index 83aeb3c1b2eb..3b559c6f5887 100644 --- a/test/e2e-v2/cases/php/e2e.yaml +++ b/test/e2e-v2/cases/php/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 20 + times: -1 url: http://${php_host}:${php_8080}/php/info method: POST diff --git a/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml index 06eae493c5bf..9f9befaa8ea1 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml @@ -83,7 +83,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: GET diff --git a/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml index 1d7f85a49227..4f05c0d2aca3 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml @@ -82,7 +82,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: 
GET diff --git a/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml index 51425ac79f7a..f3cff405949e 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml @@ -82,7 +82,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: GET diff --git a/test/e2e-v2/cases/promql/e2e.yaml b/test/e2e-v2/cases/promql/e2e.yaml index 560e4ac6e690..18b3eca522c2 100644 --- a/test/e2e-v2/cases/promql/e2e.yaml +++ b/test/e2e-v2/cases/promql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/python/e2e.yaml b/test/e2e-v2/cases/python/e2e.yaml index baf4fe6139fe..d1c3e537837f 100644 --- a/test/e2e-v2/cases/python/e2e.yaml +++ b/test/e2e-v2/cases/python/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer-py_host}:${consumer-py_9090}/test method: POST diff --git a/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml b/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml index 457cb5303bf6..2461b74e3f25 100644 --- a/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml +++ b/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml @@ -33,7 +33,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/info method: POST diff --git a/test/e2e-v2/cases/simple/auth/e2e.yaml b/test/e2e-v2/cases/simple/auth/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/auth/e2e.yaml +++ b/test/e2e-v2/cases/simple/auth/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: 
http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/simple/jdk/e2e.yaml b/test/e2e-v2/cases/simple/jdk/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/jdk/e2e.yaml +++ b/test/e2e-v2/cases/simple/jdk/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/simple/mtls/e2e.yaml b/test/e2e-v2/cases/simple/mtls/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/mtls/e2e.yaml +++ b/test/e2e-v2/cases/simple/mtls/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/simple/ssl/e2e.yaml b/test/e2e-v2/cases/simple/ssl/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/ssl/e2e.yaml +++ b/test/e2e-v2/cases/simple/ssl/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/so11y/e2e.yaml b/test/e2e-v2/cases/so11y/e2e.yaml index fae4ea729616..8f22574e06cc 100644 --- a/test/e2e-v2/cases/so11y/e2e.yaml +++ b/test/e2e-v2/cases/so11y/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/e2e.yaml index 840ac9937222..a6d4d8489b5d 100644 --- a/test/e2e-v2/cases/storage/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: 
action: http interval: 5s - times: 40 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml index 86f249bb5420..22d2548b0134 100644 --- a/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 5s - times: 40 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml index 93dd16f86739..c93bbedcebe4 100644 --- a/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 5s - times: 40 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/es/e2e.yaml b/test/e2e-v2/cases/storage/es/e2e.yaml index 6c1c49117ed4..35566af71a28 100644 --- a/test/e2e-v2/cases/storage/es/e2e.yaml +++ b/test/e2e-v2/cases/storage/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml index 13dbfb34e1a6..d9126d4002d7 100644 --- a/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/mysql/e2e.yaml 
b/test/e2e-v2/cases/storage/mysql/e2e.yaml index 9c7aebe5f5be..774dc86c914d 100644 --- a/test/e2e-v2/cases/storage/mysql/e2e.yaml +++ b/test/e2e-v2/cases/storage/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/opensearch/e2e.yaml b/test/e2e-v2/cases/storage/opensearch/e2e.yaml index 6c1c49117ed4..35566af71a28 100644 --- a/test/e2e-v2/cases/storage/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/storage/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/postgres/e2e.yaml b/test/e2e-v2/cases/storage/postgres/e2e.yaml index 9c7aebe5f5be..774dc86c914d 100644 --- a/test/e2e-v2/cases/storage/postgres/e2e.yaml +++ b/test/e2e-v2/cases/storage/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/virtual-mq/e2e.yaml b/test/e2e-v2/cases/virtual-mq/e2e.yaml index d15976319248..7256e86f57d2 100644 --- a/test/e2e-v2/cases/virtual-mq/e2e.yaml +++ b/test/e2e-v2/cases/virtual-mq/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/kafka/send method: GET diff --git a/test/e2e-v2/cases/win/e2e.yaml b/test/e2e-v2/cases/win/e2e.yaml index 275eb17eb051..5b884fb7fec3 100644 --- a/test/e2e-v2/cases/win/e2e.yaml +++ b/test/e2e-v2/cases/win/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${sender_host}:${sender_9093}/otel-metrics/send method: GET diff --git 
a/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml b/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml index 8749cd70d7dc..b77ab08e18d7 100644 --- a/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/es/e2e.yaml b/test/e2e-v2/cases/zipkin/es/e2e.yaml index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/es/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml index 5cf2bc49ef05..054913ec43e0 100644 --- a/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/kafka/e2e.yaml b/test/e2e-v2/cases/zipkin/kafka/e2e.yaml index 90d877db1405..e6b46033097c 100644 --- a/test/e2e-v2/cases/zipkin/kafka/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/kafka/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${sender_host}:${sender_9093}/sendZipkinTrace2Kafka method: POST diff --git a/test/e2e-v2/cases/zipkin/mysql/e2e.yaml b/test/e2e-v2/cases/zipkin/mysql/e2e.yaml index 8749cd70d7dc..b77ab08e18d7 100644 --- a/test/e2e-v2/cases/zipkin/mysql/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml b/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml 
index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/postgres/e2e.yaml b/test/e2e-v2/cases/zipkin/postgres/e2e.yaml index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/postgres/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java b/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java index fd446702894e..e8b596e33b77 100644 --- a/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java +++ b/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java @@ -66,6 +66,7 @@ public static class AlarmMessage { private String alarmMessage; private long startTime; private List tags; + private long recoveryTime; } /**