From 7481ca8d6d216fcd602acd9906dd583db9b137e5 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 11:06:35 +0800 Subject: [PATCH 01/21] enhance the alarm kernel with recovered status notification capability #13492 --- .github/workflows/skywalking.yaml | 10 +- .../server/core/alarm/provider/AlarmCore.java | 35 +- .../server/core/alarm/provider/AlarmRule.java | 1 + .../alarm/provider/EventHookCallback.java | 50 +-- .../core/alarm/provider/RulesReader.java | 45 ++- .../core/alarm/provider/RunningRule.java | 302 +++++++++++++----- .../dingtalk/DingtalkHookCallback.java | 17 +- .../provider/dingtalk/DingtalkSettings.java | 1 + .../provider/discord/DiscordHookCallback.java | 17 +- .../provider/discord/DiscordSettings.java | 1 + .../provider/feishu/FeishuHookCallback.java | 14 +- .../alarm/provider/feishu/FeishuSettings.java | 1 + .../alarm/provider/grpc/GRPCCallback.java | 142 ++++++-- .../pagerduty/PagerDutyHookCallback.java | 13 +- .../provider/pagerduty/PagerDutySettings.java | 1 + .../alarm/provider/slack/SlackSettings.java | 1 + .../provider/slack/SlackhookCallback.java | 15 +- .../provider/webhook/WebhookCallback.java | 6 +- .../provider/wechat/WechatHookCallback.java | 11 +- .../alarm/provider/wechat/WechatSettings.java | 2 + .../provider/welink/WeLinkHookCallback.java | 34 +- .../alarm/provider/welink/WeLinkSettings.java | 1 + .../src/main/proto/alarm-hook.proto | 18 ++ .../alarm/provider/NotifyHandlerTest.java | 22 +- .../core/alarm/provider/RunningRuleTest.java | 133 ++++++-- .../oap/server/core/alarm/AlarmCallback.java | 2 + .../oap/server/core/alarm/AlarmMessage.java | 13 + .../oap/server/core/alarm/AlarmRecord.java | 5 + .../core/alarm/AlarmRecoveryMessage.java | 49 +++ .../core/alarm/AlarmRecoveryRecord.java | 142 ++++++++ .../core/alarm/AlarmStandardPersistence.java | 34 +- .../server/core/alarm/HttpAlarmCallback.java | 18 ++ .../server/core/query/type/AlarmMessage.java | 1 + .../core/source/DefaultScopeDefine.java | 1 + 
.../core/storage/query/IAlarmQueryDAO.java | 8 +- .../src/main/resources/query-protocol | 2 +- .../query/debug/AlarmStatusQueryHandler.java | 2 +- .../stream/BanyanDBAlarmQueryDAO.java | 34 +- .../elasticsearch/query/AlarmQueryEsDAO.java | 45 ++- .../jdbc/common/dao/JDBCAlarmQueryDAO.java | 71 +++- skywalking-ui | 2 +- test/e2e-v2/cases/alarm/alarm-cases.yaml | 12 +- test/e2e-v2/cases/alarm/alarm-settings.yml | 7 +- .../recovery-after-observation-webhook.yml | 47 +++ .../cases/alarm/expected/recovery-webhook.yml | 33 ++ .../silence-after-graphql-critical.yml | 2 +- .../expected/silence-after-graphql-warn.yml | 2 +- .../alarm/expected/silence-after-webhook.yml | 102 ++++++ .../silence-before-graphql-critical.yml | 2 +- .../expected/silence-before-graphql-warn.yml | 2 +- .../alarm/expected/silence-before-webhook.yml | 16 + .../cases/alarm/mysql/docker-compose.yml | 2 + .../e2e/controller/AlarmController.java | 1 + 53 files changed, 1277 insertions(+), 273 deletions(-) create mode 100644 oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java create mode 100644 oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java create mode 100644 test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml create mode 100644 test/e2e-v2/cases/alarm/expected/recovery-webhook.yml diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index 5b3b3a6c93c0..132bc63d0f1f 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -768,7 +768,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f with: 
e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -832,7 +832,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -893,7 +893,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -956,7 +956,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1052,7 +1052,7 @@ jobs: fi docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java index 1bb61d6d6185..610ca3f30a46 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmCore.java @@ -18,10 +18,9 @@ package org.apache.skywalking.oap.server.core.alarm.provider; -import java.util.Map; -import java.util.Set; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.joda.time.LocalDateTime; import org.joda.time.Minutes; import org.slf4j.Logger; @@ -29,8 +28,11 @@ import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; /** * Alarm core includes metrics values in certain time windows based on alarm settings. By using its internal timer @@ -92,9 +94,16 @@ public void start(List allCallbacks) { } if (!alarmMessageList.isEmpty()) { + List alarmFiringMessageList = getAlarmFiringMessageList(alarmMessageList); + List alarmRecoveryMessageList = getAlarmRecoveryMessageList(alarmMessageList); for (AlarmCallback callback : allCallbacks) { try { - callback.doAlarm(alarmMessageList); + if (!alarmFiringMessageList.isEmpty()) { + callback.doAlarm(alarmFiringMessageList); + } + if (!alarmRecoveryMessageList.isEmpty()) { + callback.doAlarmRecovery(alarmRecoveryMessageList); + } } catch (Exception e) { LOGGER.error(e.getMessage(), e); } @@ -102,7 +111,27 @@ public void start(List allCallbacks) { } } catch (Exception e) { LOGGER.error(e.getMessage(), e); + } catch (Throwable e) { + LOGGER.error(e.getMessage(), e); + } finally { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("move to new time and check"); + } } }, 10, 10, TimeUnit.SECONDS); } + + public static List getAlarmFiringMessageList(List alarmMessageList) { + return 
alarmMessageList + .stream() + .filter(msg -> !(msg instanceof AlarmRecoveryMessage)) + .collect(Collectors.toList()); + } + + public static List getAlarmRecoveryMessageList(List alarmMessageList) { + return alarmMessageList + .stream() + .filter(msg -> msg instanceof AlarmRecoveryMessage) + .collect(Collectors.toList()); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index 633ea5c6465a..7aa35522d1ac 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -62,6 +62,7 @@ public class AlarmRule { private String excludeNamesRegex; private int period; private int silencePeriod; + private int recoveryObservationPeriod; private String message; private Map tags; private Set hooks; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java index acb14d2764f3..5018a3fb2315 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/EventHookCallback.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import java.io.IOException; + import org.apache.skywalking.apm.network.event.v3.Event; import org.apache.skywalking.apm.network.event.v3.Source; import org.apache.skywalking.apm.network.event.v3.Type; @@ -27,6 +28,7 @@ import org.apache.skywalking.oap.server.core.CoreModule; import 
org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.analysis.IDManager; import org.apache.skywalking.oap.server.core.analysis.Layer; import org.apache.skywalking.oap.server.core.query.MetadataQueryService; @@ -40,7 +42,6 @@ /** * EventCallBack: When an alert is present, an event is generated for each alert message. These events are then sent to the internal event analyzer. - * */ public class EventHookCallback implements AlarmCallback { @@ -50,8 +51,8 @@ public class EventHookCallback implements AlarmCallback { private MetadataQueryService getMetadataQueryService() { if (metadataQueryService == null) { this.metadataQueryService = manager.find(CoreModule.NAME) - .provider() - .getService(MetadataQueryService.class); + .provider() + .getService(MetadataQueryService.class); } return metadataQueryService; } @@ -60,11 +61,18 @@ public EventHookCallback(ModuleManager manager) { this.manager = manager; } - @Override public void doAlarm(List alarmMessage) throws Exception { + doAlarmCallback(alarmMessage, false); + } + + public void doAlarmRecovery(List alarmRecoveryMessages) throws Exception { + doAlarmCallback(alarmRecoveryMessages, true); + } + + private void doAlarmCallback(List alarmMessage, boolean isRecovery) throws Exception { EventAnalyzerService analyzerService = manager.find(EventAnalyzerModule.NAME).provider().getService(EventAnalyzerService.class); for (AlarmMessage a : alarmMessage) { - for (Event event : constructCurrentEvent(a)) { + for (Event event : constructCurrentEvent(a, isRecovery)) { analyzerService.analyze(event); } } @@ -79,33 +87,33 @@ private String getLayer(String serviceId) throws IOException { } } - private List constructCurrentEvent(AlarmMessage msg) throws IOException { + private List constructCurrentEvent(AlarmMessage msg, boolean isRecovery) throws 
IOException { List events = new ArrayList<>(2); long now = System.currentTimeMillis(); Event.Builder builder = Event.newBuilder() .setUuid(UUID.randomUUID().toString()) - .setName("Alarm") - .setStartTime(now - (msg.getPeriod() * 60 * 1000)) + .setName(isRecovery ? "AlarmRecovery" : "Alarm") + .setStartTime(isRecovery ? ((AlarmRecoveryMessage) msg).getRecoveryTime() : now - (msg.getPeriod() * 60 * 1000)) .setMessage(msg.getAlarmMessage()) - .setType(Type.Error) - .setEndTime(now); + .setType(isRecovery ? Type.Normal : Type.Error) + .setEndTime(isRecovery ? ((AlarmRecoveryMessage) msg).getRecoveryTime() : now); switch (msg.getScopeId()) { - case DefaultScopeDefine.SERVICE : + case DefaultScopeDefine.SERVICE: IDManager.ServiceID.ServiceIDDefinition serviceIdDef = IDManager.ServiceID.analysisId(msg.getId0()); builder.setSource( - Source.newBuilder() - .setService(serviceIdDef.getName()) - .build() + Source.newBuilder() + .setService(serviceIdDef.getName()) + .build() ); builder.setLayer(getLayer(msg.getId0())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_RELATION : + case DefaultScopeDefine.SERVICE_RELATION: IDManager.ServiceID.ServiceIDDefinition sourceServiceIdDef = IDManager.ServiceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() - .setService(sourceServiceIdDef.getName()) - .build() + .setService(sourceServiceIdDef.getName()) + .build() ); builder.setLayer(getLayer(msg.getId0())); events.add(builder.build()); @@ -118,7 +126,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(msg.getId1())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_INSTANCE : + case DefaultScopeDefine.SERVICE_INSTANCE: IDManager.ServiceInstanceID.InstanceIDDefinition instanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -129,7 +137,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { 
builder.setLayer(getLayer(instanceIdDef.getServiceId())); events.add(builder.build()); break; - case DefaultScopeDefine.SERVICE_INSTANCE_RELATION : + case DefaultScopeDefine.SERVICE_INSTANCE_RELATION: IDManager.ServiceInstanceID.InstanceIDDefinition sourceInstanceIdDef = IDManager.ServiceInstanceID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -149,7 +157,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(destInstanceIdDef.getServiceId())); events.add(builder.build()); break; - case DefaultScopeDefine.ENDPOINT : + case DefaultScopeDefine.ENDPOINT: IDManager.EndpointID.EndpointIDDefinition endpointIDDef = IDManager.EndpointID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() @@ -160,7 +168,7 @@ private List constructCurrentEvent(AlarmMessage msg) throws IOException { builder.setLayer(getLayer(endpointIDDef.getServiceId())); events.add(builder.build()); break; - case DefaultScopeDefine.ENDPOINT_RELATION : + case DefaultScopeDefine.ENDPOINT_RELATION: IDManager.EndpointID.EndpointIDDefinition sourceEndpointIDDef = IDManager.EndpointID.analysisId(msg.getId0()); builder.setSource( Source.newBuilder() diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index b48c94b1c2f5..81b2f5febf07 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -29,6 +29,7 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; + import org.apache.skywalking.mqe.rt.exception.IllegalExpressionException; import org.apache.skywalking.oap.server.core.alarm.provider.discord.DiscordSettings; import 
org.apache.skywalking.oap.server.core.alarm.provider.pagerduty.PagerDutySettings; @@ -50,6 +51,7 @@ * Rule Reader parses the given `alarm-settings.yml` config file, to the target {@link Rules}. */ public class RulesReader { + public static final String RECOVERED = "[Recovered]"; private Map yamlData; private final Set defaultHooks = new HashSet<>(); private final Set allHooks = new HashSet<>(); @@ -111,6 +113,8 @@ private void readRulesConfig(Rules rules) { alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1)); // How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. alarmRule.setSilencePeriod((Integer) settings.getOrDefault("silence-period", alarmRule.getPeriod())); + alarmRule.setRecoveryObservationPeriod((Integer) settings.getOrDefault("recovery-observation-period", + 0)); alarmRule.setMessage( (String) settings.getOrDefault("message", "Alarm caused by Rule " + alarmRule .getAlarmRuleName())); @@ -156,7 +160,7 @@ private void readWebHookConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; WebhookSettings settings = new WebhookSettings( - k.toString(), AlarmHooksType.webhook, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.webhook, (Boolean) config.getOrDefault("is-default", false)); List urls = (List) config.get("urls"); if (urls != null) { settings.getUrls().addAll(urls); @@ -183,7 +187,7 @@ private void readGrpcConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; GRPCAlarmSetting setting = new GRPCAlarmSetting( - k.toString(), AlarmHooksType.gRPC, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.gRPC, (Boolean) config.getOrDefault("is-default", false)); Object targetHost = config.get("target-host"); if (targetHost != null) { @@ -216,10 +220,12 @@ private void readSlackConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; SlackSettings settings = 
new SlackSettings( - k.toString(), AlarmHooksType.slack, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.slack, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List webhooks = (List) config.get("webhooks"); if (webhooks != null) { @@ -245,11 +251,14 @@ private void readWechatConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; WechatSettings settings = new WechatSettings( - k.toString(), AlarmHooksType.wechat, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.wechat, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List webhooks = (List) config.get("webhooks"); if (webhooks != null) { settings.getWebhooks().addAll(webhooks); @@ -274,11 +283,14 @@ private void readDingtalkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; DingtalkSettings settings = new DingtalkSettings( - k.toString(), AlarmHooksType.dingtalk, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.dingtalk, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List> webhooks = (List>) 
config.get("webhooks"); if (webhooks != null) { webhooks.forEach(webhook -> { @@ -307,11 +319,14 @@ private void readFeishuConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; FeishuSettings settings = new FeishuSettings( - k.toString(), AlarmHooksType.feishu, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.feishu, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); + List> webhooks = (List>) config.get("webhooks"); if (webhooks != null) { webhooks.forEach(webhook -> { @@ -340,17 +355,19 @@ private void readWeLinkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); + String recoveryTextTemplate = (String) config.get("recovery-text-template"); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; } List webHookUrls = webhooks.stream().map( - WeLinkSettings.WebHookUrl::generateFromMap + WeLinkSettings.WebHookUrl::generateFromMap ).collect(Collectors.toList()); WeLinkSettings settings = new WeLinkSettings( - k.toString(), AlarmHooksType.welink, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.welink, (Boolean) config.getOrDefault("is-default", false)); settings.setTextTemplate(textTemplate); + settings.setRecoveryTextTemplate(recoveryTextTemplate); settings.setWebhooks(webHookUrls); rules.getWeLinkSettingsMap().put(settings.getFormattedName(), settings); @@ -373,9 +390,11 @@ private void readPagerDutyConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; PagerDutySettings settings = new 
PagerDutySettings( - k.toString(), AlarmHooksType.pagerduty, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.pagerduty, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List integrationKeys = (List) config.get("integration-keys"); if (integrationKeys != null) { @@ -402,17 +421,19 @@ private void readDiscordConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); + String recoveryTextTemplate = (String) config.get("recovery-text-template"); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; } List webHookUrls = webhooks.stream().map( - DiscordSettings.WebHookUrl::generateFromMap + DiscordSettings.WebHookUrl::generateFromMap ).collect(Collectors.toList()); DiscordSettings settings = new DiscordSettings( - k.toString(), AlarmHooksType.discord, (Boolean) config.getOrDefault("is-default", false)); + k.toString(), AlarmHooksType.discord, (Boolean) config.getOrDefault("is-default", false)); settings.setTextTemplate(textTemplate); + settings.setRecoveryTextTemplate(recoveryTextTemplate); settings.setWebhooks(webHookUrls); rules.getDiscordSettingsMap().put(settings.getFormattedName(), settings); @@ -426,7 +447,7 @@ private void readDiscordConfig(Map hooks, Rules rules) { private void checkSpecificHooks(String ruleName, Set hooks) { if (!this.allHooks.containsAll(hooks)) { throw new IllegalArgumentException("rule: [" + ruleName + "] contains invalid hooks." 
+ - " Please check the hook is exist and name format is {hookType}.{hookName}"); + " Please check the hook is exist and name format is {hookType}.{hookName}"); } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index d270611b88ef..6f952e9e42a9 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -19,19 +19,6 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import com.google.gson.JsonObject; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.locks.ReentrantLock; -import java.util.function.Consumer; -import java.util.regex.Pattern; -import java.util.stream.Collectors; import lombok.Getter; import lombok.RequiredArgsConstructor; import lombok.ToString; @@ -42,15 +29,10 @@ import org.apache.skywalking.mqe.rt.exception.ParseErrorListener; import org.apache.skywalking.mqe.rt.grammar.MQELexer; import org.apache.skywalking.mqe.rt.grammar.MQEParser; -import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResult; -import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResultType; -import org.apache.skywalking.oap.server.core.query.mqe.MQEValues; -import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVisitor; -import org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext; -import org.apache.skywalking.oap.server.library.module.ModuleManager; -import 
org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; +import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVisitor; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable; import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder; @@ -58,12 +40,32 @@ import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; +import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResult; +import org.apache.skywalking.oap.server.core.query.mqe.ExpressionResultType; +import org.apache.skywalking.oap.server.core.query.mqe.MQEValues; +import org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext; +import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; import org.joda.time.LocalDateTime; import org.joda.time.Minutes; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + import static 
org.apache.skywalking.oap.server.core.query.type.debugging.DebuggingTraceContext.TRACE_CONTEXT; /** @@ -78,6 +80,7 @@ public class RunningRule { private final int period; private final String expression; private final int silencePeriod; + private final int recoveryObservationPeriod; private final Map windows; private final List includeNames; private final List excludeNames; @@ -100,18 +103,19 @@ public RunningRule(AlarmRule alarmRule, ModuleManager moduleManager) { windows = new ConcurrentHashMap<>(); period = alarmRule.getPeriod(); this.silencePeriod = alarmRule.getSilencePeriod(); + this.recoveryObservationPeriod = alarmRule.getRecoveryObservationPeriod(); this.includeNames = alarmRule.getIncludeNames(); this.excludeNames = alarmRule.getExcludeNames(); this.includeNamesRegex = StringUtil.isNotEmpty(alarmRule.getIncludeNamesRegex()) ? - Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; + Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ? 
- Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; + Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); this.tags = alarmRule.getTags() - .entrySet() - .stream() - .map(e -> new Tag(e.getKey(), e.getValue())) - .collect(Collectors.toList()); + .entrySet() + .stream() + .map(e -> new Tag(e.getKey(), e.getValue())) + .collect(Collectors.toList()); this.hooks = alarmRule.getHooks(); MQELexer lexer = new MQELexer(CharStreams.fromString(alarmRule.getExpression())); MQEParser parser = new MQEParser(new CommonTokenStream(lexer)); @@ -143,9 +147,10 @@ public void in(MetaInAlarm meta, Metrics metrics) { } AlarmEntity entity = new AlarmEntity( - meta.getScope(), meta.getScopeId(), meta.getName(), meta.getId0(), meta.getId1()); + meta.getScope(), meta.getScopeId(), meta.getName(), meta.getId0(), meta.getId1()); - Window window = windows.computeIfAbsent(entity, ignored -> new Window(entity, this.period, this.additionalPeriod)); + Window window = windows.computeIfAbsent(entity, ignored -> new Window(entity, this.period, + this.silencePeriod, this.recoveryObservationPeriod, this.additionalPeriod)); window.add(meta.getMetricsName(), metrics); } @@ -214,38 +219,35 @@ public List check() { windows.forEach((alarmEntity, window) -> { if (window.isExpired()) { expiredEntityList.add(alarmEntity); - return; + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} expired", ruleName, alarmEntity.getName(), + alarmEntity.getId0(), alarmEntity.getId1()); + } + //return; } Optional alarmMessageOptional = window.checkAlarm(); - if (alarmMessageOptional.isPresent()) { - AlarmMessage alarmMessage = alarmMessageOptional.get(); - alarmMessage.setScopeId(alarmEntity.getScopeId()); - alarmMessage.setScope(alarmEntity.getScope()); - alarmMessage.setName(alarmEntity.getName()); - alarmMessage.setId0(alarmEntity.getId0()); - alarmMessage.setId1(alarmEntity.getId1()); - 
alarmMessage.setRuleName(this.ruleName); - alarmMessage.setAlarmMessage(formatter.format(alarmEntity)); - alarmMessage.setStartTime(System.currentTimeMillis()); - alarmMessage.setPeriod(this.period); - alarmMessage.setTags(this.tags); - alarmMessage.setHooks(this.hooks); - alarmMessage.setExpression(expression); - alarmMessage.setMqeMetricsSnapshot(window.mqeMetricsSnapshot); - alarmMessageList.add(alarmMessage); - } + alarmMessageOptional.ifPresent(alarmMessageList::add); }); expiredEntityList.forEach(windows::remove); return alarmMessageList; } + public enum State { + NORMAL, + FIRING, + SILENCED, + OBSERVING_RECOVERY, + RECOVERED + } + /** * A metrics window, based on AlarmRule#period. This window slides with time, just keeps the recent N(period) * buckets. */ public class Window { + @Getter private LocalDateTime endTime; @Getter @@ -253,20 +255,24 @@ public class Window { @Getter private final int size; @Getter - private int silenceCountdown; + private final int period; + @Getter + private final AlarmStateMachine stateMachine; private LinkedList> values; private ReentrantLock lock = new ReentrantLock(); + private AlarmMessage lastAlarmMessage; @Getter private JsonObject mqeMetricsSnapshot; private AlarmEntity entity; - public Window(AlarmEntity entity, int period, int additionalPeriod) { + public Window(AlarmEntity entity, int period, int silencePeriod, int recoveryObservationPeriod, + int additionalPeriod) { this.entity = entity; this.additionalPeriod = additionalPeriod; - this.size = period + additionalPeriod; - // -1 means silence countdown is not running. 
- silenceCountdown = -1; - init(); + this.size = period + additionalPeriod + Math.max(silencePeriod, recoveryObservationPeriod); + this.period = period; + this.stateMachine = new AlarmStateMachine(silencePeriod, recoveryObservationPeriod); + this.init(); } public void moveTo(LocalDateTime current) { @@ -321,8 +327,8 @@ public void add(String metricsName, Metrics metrics) { // also should happen, but maybe if agent/probe mechanism time is not right. if (log.isTraceEnabled()) { log.trace( - "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, - values.size() + "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, + values.size() ); } return; @@ -345,45 +351,70 @@ public void add(String metricsName, Metrics metrics) { } public Optional checkAlarm() { - if (isMatch()) { - /* - * When - * 1. Alarm trigger conditions are satisfied. - * 2. Isn't in silence stage, judged by SilenceCountdown(!=0). - */ - if (silenceCountdown < 1) { - silenceCountdown = silencePeriod; - return Optional.of(new AlarmMessage()); - } else { - silenceCountdown--; - } + boolean match = isMatch(); + if (log.isTraceEnabled()) { + log.trace("RuleName {} AlarmEntity {} {} {} isMatch:{}", ruleName, entity.getName(), entity.getId0(), + entity.getId1(), match); + } + if (match) { + stateMachine.onMatch(); } else { - silenceCountdown--; + stateMachine.onMismatch(); + } + if (stateMachine.currentState == State.FIRING) { + AlarmMessage alarmMessage = buildAlarmMessage(); + lastAlarmMessage = alarmMessage; + return Optional.of(alarmMessage); + } + if (stateMachine.currentState == State.RECOVERED) { + AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(lastAlarmMessage); + lastAlarmMessage = null; + return Optional.of(alarmRecoveryMessage); } return Optional.empty(); } + private AlarmMessage buildAlarmMessage() { + AlarmMessage alarmMessage = new AlarmMessage(); + alarmMessage.setScopeId(entity.getScopeId()); + 
alarmMessage.setScope(entity.getScope()); + alarmMessage.setName(entity.getName()); + alarmMessage.setId0(entity.getId0()); + alarmMessage.setId1(entity.getId1()); + alarmMessage.setRuleName(ruleName); + alarmMessage.setAlarmMessage(formatter.format(entity)); + alarmMessage.setStartTime(System.currentTimeMillis()); + alarmMessage.setPeriod(period); + alarmMessage.setTags(tags); + alarmMessage.setHooks(hooks); + alarmMessage.setExpression(expression); + alarmMessage.setMqeMetricsSnapshot(mqeMetricsSnapshot); + return alarmMessage; + } + private boolean isMatch() { this.lock.lock(); int isMatch = 0; try { TRACE_CONTEXT.set(new DebuggingTraceContext(expression, false, false)); - AlarmMQEVisitor visitor = new AlarmMQEVisitor(moduleManager, this.entity, this.values, this.endTime, this.additionalPeriod); + int metricsSize = period + additionalPeriod; + LinkedList> metricsValues = new LinkedList<>(this.values.subList(size - metricsSize, size)); + AlarmMQEVisitor visitor = new AlarmMQEVisitor(moduleManager, this.entity, metricsValues, this.endTime, this.additionalPeriod); ExpressionResult parseResult = visitor.visit(exprTree); if (StringUtil.isNotBlank(parseResult.getError())) { log.error("expression:" + expression + " error: " + parseResult.getError()); return false; } if (!parseResult.isBoolResult() || - ExpressionResultType.SINGLE_VALUE != parseResult.getType() || - CollectionUtils.isEmpty(parseResult.getResults())) { + ExpressionResultType.SINGLE_VALUE != parseResult.getType() || + CollectionUtils.isEmpty(parseResult.getResults())) { return false; } if (!parseResult.isLabeledResult()) { MQEValues mqeValues = parseResult.getResults().get(0); if (mqeValues != null && - CollectionUtils.isNotEmpty(mqeValues.getValues()) && - mqeValues.getValues().get(0) != null) { + CollectionUtils.isNotEmpty(mqeValues.getValues()) && + mqeValues.getValues().get(0) != null) { isMatch = (int) mqeValues.getValues().get(0).getDoubleValue(); } } else { @@ -401,8 +432,8 @@ private boolean 
isMatch() { // then the isMatch is 1 for (MQEValues mqeValues : parseResult.getResults()) { if (mqeValues != null && - CollectionUtils.isNotEmpty(mqeValues.getValues()) && - mqeValues.getValues().get(0) != null) { + CollectionUtils.isNotEmpty(mqeValues.getValues()) && + mqeValues.getValues().get(0) != null) { isMatch = (int) mqeValues.getValues().get(0).getDoubleValue(); if (isMatch == 1) { break; @@ -447,6 +478,119 @@ private void init() { values.add(null); } } + + public class AlarmStateMachine { + @Getter + private int silenceCountdown; + @Getter + private int recoveryObservationCountdown; + private final int silencePeriod; + private final int recoveryObservationPeriod; + private State currentState; + + public AlarmStateMachine(int silencePeriod, int recoveryObservationPeriod) { + this.currentState = State.NORMAL; + this.silencePeriod = silencePeriod; + this.recoveryObservationPeriod = recoveryObservationPeriod; + this.silenceCountdown = -1; + this.recoveryObservationCountdown = recoveryObservationPeriod; + } + + public void onMatch() { + if (log.isTraceEnabled()) { + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} onMatch silenceCountdown:{} currentState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, currentState); + } + } + silenceCountdown--; + switch (currentState) { + case NORMAL: + case SILENCED: + case OBSERVING_RECOVERY: + case RECOVERED: + if (silenceCountdown < 0) { + transitionTo(State.FIRING); + } + break; + case FIRING: + if (silenceCountdown >= 0) { + transitionTo(State.SILENCED); + } + break; + default: + break; + } + } + + public void onMismatch() { + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} onMismatch silenceCountdown:{} " + + "recoveryObservationCountdown:{} currentState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, + recoveryObservationCountdown, currentState); + } + recoveryObservationCountdown--; + 
silenceCountdown--; + switch (currentState) { + case FIRING: + case SILENCED: + if (this.recoveryObservationCountdown < 0) { + transitionTo(State.RECOVERED); + } else { + transitionTo(State.OBSERVING_RECOVERY); + } + break; + case OBSERVING_RECOVERY: + if (recoveryObservationCountdown < 0) { + transitionTo(State.RECOVERED); + } + break; + case RECOVERED: + transitionTo(State.NORMAL); + break; + case NORMAL: + default: + break; + } + } + + private void transitionTo(State newState) { + if (log.isTraceEnabled()) { + log.trace("RuleName:{} AlarmEntity {} {} {} transitionTo newState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), newState); + } + this.currentState = newState; + switch (newState) { + case NORMAL: + resetCountdowns(); + break; + case FIRING: + this.silenceCountdown = this.silencePeriod; + this.recoveryObservationCountdown = recoveryObservationPeriod; + break; + case SILENCED: + break; + case OBSERVING_RECOVERY: + this.recoveryObservationCountdown = this.recoveryObservationPeriod - 1; + //this.silenceCountdown = -1; + break; + case RECOVERED: + this.recoveryObservationCountdown = this.recoveryObservationPeriod; + break; + } + } + + private void resetCountdowns() { + //silenceCountdown = -1; + recoveryObservationCountdown = this.recoveryObservationPeriod; + } + + public State getCurrentState() { + return currentState; + } + } + } private LinkedList> transformValues(LinkedList> values) { @@ -460,16 +604,16 @@ private LinkedList> transformValues(LinkedList r = new HashMap<>(); result.add(r); if (m instanceof LongValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((LongValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[]{((LongValueHolder) m).getValue()})); } else if (m instanceof IntValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((IntValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new 
Number[]{((IntValueHolder) m).getValue()})); } else if (m instanceof DoubleValueHolder) { - r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[] {((DoubleValueHolder) m).getValue()})); + r.put(name, new TraceLogMetric(m.getTimeBucket(), new Number[]{((DoubleValueHolder) m).getValue()})); } else if (m instanceof LabeledValueHolder) { DataTable dt = ((LabeledValueHolder) m).getValue(); TraceLogMetric l = new TraceLogMetric( - m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()) - .toArray(new Number[0])); + m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()) + .toArray(new Number[0])); l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]); r.put(name, l); } else { diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java index ab2c7efb3297..84e29b72ed4a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java @@ -46,11 +46,7 @@ public class DingtalkHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getDingtalkSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -61,21 +57,24 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || 
CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { final var url = getUrl(webHookUrl); for (final var alarmMessage : messages) { - final var requestBody = String.format( - setting.getTextTemplate(), alarmMessage.getAlarmMessage() - ); + final var requestBody = String.format(getTemplate(setting, isRecovery), + alarmMessage.getAlarmMessage()); post(URI.create(url), requestBody, Map.of()); } } } } + private String getTemplate(DingtalkSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Get webhook url, sign the url when secret is not empty. */ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java index 475c44329c52..d842a150a0fe 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkSettings.java @@ -34,6 +34,7 @@ public class DingtalkSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public DingtalkSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java index 44d53dcc6a5d..0faf690f28e4 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java @@ -29,6 +29,7 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; /** @@ -39,11 +40,7 @@ public class DiscordHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getDiscordSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -55,14 +52,14 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { final var content = String.format( - setting.getTextTemplate(), - alarmMessage.getAlarmMessage() + getTemplate(setting, isRecovery), + alarmMessage.getAlarmMessage() ); sendAlarmMessage(webHookUrl, content); } @@ -70,6 +67,10 @@ public void doAlarm(List alarmMessages) throws Exception { } } + private String getTemplate(DiscordSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Send alarm message to remote endpoint */ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java index ddaea517045d..6e7251219e5c 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordSettings.java @@ -35,6 +35,7 @@ public class DiscordSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public DiscordSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java index 82004e77426c..5b35f8f02e86 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java @@ -48,11 +48,7 @@ public class FeishuHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; - /** - * Send alarm message if the settings not empty - */ - @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getFeishuSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -63,12 +59,12 @@ 
public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var requestBody = getRequestBody(webHookUrl, alarmMessage, setting.getTextTemplate()); + final var requestBody = getRequestBody(webHookUrl, alarmMessage, getTemplate(setting, isRecovery)); try { post(URI.create(webHookUrl.getUrl()), requestBody, Map.of()); } catch (Exception e) { @@ -79,6 +75,10 @@ public void doAlarm(List alarmMessages) throws Exception { } } + private String getTemplate(FeishuSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * deal requestBody,if it has sign set the sign */ diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java index b4a3cef55888..2d578e6a58b9 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuSettings.java @@ -34,6 +34,7 @@ public class FeishuSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; @Builder.Default private List webhooks = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java index 
2ae872f2f13c..5c6d31a9eb17 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java @@ -19,13 +19,16 @@ package org.apache.skywalking.oap.server.core.alarm.provider.grpc; import io.grpc.stub.StreamObserver; + import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; + import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmServiceGrpc; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmTags; import org.apache.skywalking.oap.server.core.alarm.grpc.KeyStringValuePair; @@ -69,6 +72,15 @@ public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { @Override public void doAlarm(List alarmMessages) { + doAlarmCallback(alarmMessages, false); + } + + @Override + public void doAlarmRecovery(List alarmRecoveryMessages) { + doAlarmCallback(alarmRecoveryMessages, true); + } + + private void doAlarmCallback(List alarmMessages, boolean isRecovery) { // recreate gRPC client and stub if host and port configuration changed. 
Map settinsMap = alarmRulesWatcher.getGrpchookSetting(); onGRPCAlarmSettingUpdated(settinsMap); @@ -76,11 +88,15 @@ public void doAlarm(List alarmMessages) { if (settinsMap == null || settinsMap.isEmpty()) { return; } - Map> groupedMessages = groupMessagesByHook(alarmMessages); + Map> groupedMessages = groupMessagesByHook(alarmMessages); groupedMessages.forEach((hook, messages) -> { if (alarmServiceStubMap.containsKey(hook)) { - sendAlarmMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + if (!isRecovery) { + sendAlarmMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + } else { + sendAlarmRecoveryMessages(alarmServiceStubMap.get(hook), messages, settinsMap.get(hook)); + } } }); @@ -92,32 +108,32 @@ private void sendAlarmMessages(AlarmServiceGrpc.AlarmServiceStub alarmServiceStu GRPCStreamStatus status = new GRPCStreamStatus(); StreamObserver streamObserver = - alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarm(new StreamObserver() { - @Override - public void onNext(Response response) { - // ignore empty response - } + alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarm(new StreamObserver() { + @Override + public void onNext(Response response) { + // ignore empty response + } - @Override - public void onError(Throwable throwable) { - status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message failed: {}", throwable.getMessage()); + @Override + public void onError(Throwable throwable) { + status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm message failed: {}", throwable.getMessage()); + } } - } - @Override - public void onCompleted() { - status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message successful."); + @Override + public void onCompleted() { + status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm message successful."); + } } - } - }); + }); alarmMessages.forEach(message -> { 
org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.Builder builder = - org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.newBuilder(); + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage.newBuilder(); builder.setScopeId(message.getScopeId()); builder.setScope(message.getScope()); @@ -127,6 +143,7 @@ public void onCompleted() { builder.setRuleName(message.getRuleName()); builder.setAlarmMessage(message.getAlarmMessage()); builder.setStartTime(message.getStartTime()); + builder.setUuid(message.getUuid()); AlarmTags.Builder alarmTagsBuilder = AlarmTags.newBuilder(); message.getTags().forEach(m -> alarmTagsBuilder.addData(KeyStringValuePair.newBuilder().setKey(m.getKey()).setValue(m.getValue()).build())); builder.setTags(alarmTagsBuilder.build()); @@ -148,18 +165,95 @@ public void onCompleted() { if (log.isDebugEnabled()) { log.debug("Send {} alarm message to {}:{}.", alarmMessages.size(), - alarmSetting.getTargetHost(), alarmSetting.getTargetPort() + alarmSetting.getTargetHost(), alarmSetting.getTargetPort() ); } if (sleepTime > 2000L) { log.warn("Send {} alarm message to {}:{}, wait {} milliseconds.", alarmMessages.size(), - alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime + alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime ); cycle = 2000L; } } -} + } + + private void sendAlarmRecoveryMessages(AlarmServiceGrpc.AlarmServiceStub alarmServiceStub, + List alarmMessages, + GRPCAlarmSetting alarmSetting) { + GRPCStreamStatus status = new GRPCStreamStatus(); + + StreamObserver streamObserver = + alarmServiceStub.withDeadlineAfter(10, TimeUnit.SECONDS).doAlarmRecovery(new StreamObserver() { + @Override + public void onNext(Response response) { + // ignore empty response + } + + @Override + public void onError(Throwable throwable) { + status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm message failed: {}", throwable.getMessage()); + } + } + + @Override + public void onCompleted() { + 
status.done(); + if (log.isDebugEnabled()) { + log.debug("Send alarm message successful."); + } + } + }); + + alarmMessages.forEach(message -> { + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage.Builder builder = + org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage.newBuilder(); + AlarmRecoveryMessage recoveryMessage = (AlarmRecoveryMessage) message; + builder.setScopeId(recoveryMessage.getScopeId()); + builder.setScope(recoveryMessage.getScope()); + builder.setName(recoveryMessage.getName()); + builder.setId0(recoveryMessage.getId0()); + builder.setId1(recoveryMessage.getId1()); + builder.setRuleName(recoveryMessage.getRuleName()); + builder.setAlarmMessage(recoveryMessage.getAlarmMessage()); + builder.setStartTime(recoveryMessage.getStartTime()); + builder.setRecoveryTime(recoveryMessage.getRecoveryTime()); + builder.setUuid(recoveryMessage.getUuid()); + AlarmTags.Builder alarmTagsBuilder = AlarmTags.newBuilder(); + message.getTags().forEach(m -> alarmTagsBuilder.addData(KeyStringValuePair.newBuilder().setKey(m.getKey()).setValue(m.getValue()).build())); + builder.setTags(alarmTagsBuilder.build()); + streamObserver.onNext(builder.build()); + }); + + streamObserver.onCompleted(); + + long sleepTime = 0; + long cycle = 100L; + + // For memory safe of oap, we must wait for the peer confirmation. 
+ while (!status.isDone()) { + try { + sleepTime += cycle; + Thread.sleep(cycle); + } catch (InterruptedException ignored) { + } + + if (log.isDebugEnabled()) { + log.debug("Send {} alarm recovery message to {}:{}.", alarmMessages.size(), + alarmSetting.getTargetHost(), alarmSetting.getTargetPort() + ); + } + + if (sleepTime > 2000L) { + log.warn("Send {} alarm recovery message to {}:{}, wait {} milliseconds.", alarmMessages.size(), + alarmSetting.getTargetHost(), alarmSetting.getTargetPort(), sleepTime + ); + cycle = 2000L; + } + } + } private void onGRPCAlarmSettingUpdated(Map newAlarmSettingMap) { if (newAlarmSettingMap == null || newAlarmSettingMap.isEmpty()) { diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java index a7d6e9c1b6db..cae4283c17dd 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Map; import java.util.UUID; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; @Slf4j @@ -42,7 +43,7 @@ public class PagerDutyHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + protected void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getPagerDutySettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -54,15 +55,15 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = 
settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getIntegrationKeys()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var integrationKey : setting.getIntegrationKeys()) { for (final var alarmMessage : messages) { try { post( - URI.create(PAGER_DUTY_EVENTS_API_V2_URL), - getMessageBody(alarmMessage, integrationKey, setting.getTextTemplate()), Map.of() + URI.create(PAGER_DUTY_EVENTS_API_V2_URL), + getMessageBody(alarmMessage, integrationKey, getTemplate(isRecovery, setting)), Map.of() ); } catch (Exception e) { log.error("Failed to send alarm message to PagerDuty: {}", integrationKey, e); @@ -72,6 +73,10 @@ public void doAlarm(List alarmMessages) throws Exception { } } + private String getTemplate(boolean isRecovery, PagerDutySettings setting) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + private String getMessageBody(AlarmMessage alarmMessage, String integrationKey, String textTemplate) { final var body = new JsonObject(); final var payload = new JsonObject(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java index d36f0577e26c..700e118ee863 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutySettings.java @@ -33,6 +33,7 @@ public class PagerDutySettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List integrationKeys = new ArrayList<>(); diff --git 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java index c87d60d37887..30e622dec20b 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackSettings.java @@ -32,6 +32,7 @@ public class SlackSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java index d1c6edb3a706..72bdad30f9b2 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java @@ -30,6 +30,7 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; /** @@ -43,7 +44,7 @@ public class SlackhookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getSlackSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -55,7 +56,7 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = 
settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } @@ -64,9 +65,9 @@ public void doAlarm(List alarmMessages) throws Exception { final var jsonElements = new JsonArray(); for (AlarmMessage item : messages) { jsonElements.add(GSON.fromJson( - String.format( - setting.getTextTemplate(), item.getAlarmMessage() - ), JsonObject.class)); + String.format( + getTemplate(setting, isRecovery), item.getAlarmMessage() + ), JsonObject.class)); } jsonObject.add("blocks", jsonElements); final var body = GSON.toJson(jsonObject); @@ -78,4 +79,8 @@ public void doAlarm(List alarmMessages) throws Exception { } } } + + private String getTemplate(SlackSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java index 0fd0dbe30453..a07b47508e21 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java @@ -23,9 +23,11 @@ import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.HttpAlarmCallback; + import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.library.util.CollectionUtils; @@ -39,7 +41,7 @@ public class WebhookCallback extends HttpAlarmCallback { private final Gson gson = new Gson(); @Override - public 
void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWebHooks(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -51,7 +53,7 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getUrls()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var url : setting.getUrls()) { diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java index 9fc3e6e0be9e..3f98cd1dfda1 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java @@ -27,6 +27,7 @@ import java.net.URI; import java.util.List; import java.util.Map; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; /** @@ -38,7 +39,7 @@ public class WechatHookCallback extends HttpAlarmCallback { private final AlarmRulesWatcher alarmRulesWatcher; @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWechatSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; @@ -50,13 +51,13 @@ public void doAlarm(List alarmMessages) throws Exception { var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + 
messages)) { continue; } for (final var url : setting.getWebhooks()) { for (final var alarmMessage : messages) { final var requestBody = String.format( - setting.getTextTemplate(), alarmMessage.getAlarmMessage() + getTemplate(setting, isRecovery), alarmMessage.getAlarmMessage() ); try { post(URI.create(url), requestBody, Map.of()); @@ -67,4 +68,8 @@ public void doAlarm(List alarmMessages) throws Exception { } } } + + private String getTemplate(WechatSettings setting, boolean isRecovery) { + return isRecovery ? setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java index f29b897d41e1..9dc6beb6c52f 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatSettings.java @@ -33,6 +33,8 @@ public class WechatSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; + private List webhooks = new ArrayList<>(); public WechatSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java index 5505a05b95e4..91f98da52dc6 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java @@ -22,7 +22,9 @@ import 
com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; + import java.util.Map; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; @@ -37,6 +39,7 @@ import java.util.Locale; import java.util.Optional; import java.util.UUID; + import org.apache.skywalking.oap.server.library.util.CollectionUtils; /** @@ -51,26 +54,25 @@ public class WeLinkHookCallback extends HttpAlarmCallback { * Send alarm message if the settings not empty */ @Override - public void doAlarm(List alarmMessages) throws Exception { + public void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception { Map settingsMap = alarmRulesWatcher.getWeLinkSettings(); if (settingsMap == null || settingsMap.isEmpty()) { return; } - Map> groupedMessages = groupMessagesByHook(alarmMessages); + Map> groupedMessages = groupMessagesByHook(alarmMessages); for (Map.Entry> entry : groupedMessages.entrySet()) { var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + messages)) { continue; } for (final var webHookUrl : setting.getWebhooks()) { final var accessToken = getAccessToken(webHookUrl); for (final var alarmMessage : messages) { final var content = String.format( - setting.getTextTemplate(), - alarmMessage.getAlarmMessage() + getTemplate(setting, isRecovery), alarmMessage.getAlarmMessage() ); sendAlarmMessage(webHookUrl, accessToken, content); } @@ -78,6 +80,10 @@ public void doAlarm(List alarmMessages) throws Exception { } } + private String getTemplate(WeLinkSettings setting, boolean isRecovery) { + return isRecovery ? 
setting.getRecoveryTextTemplate() : setting.getTextTemplate(); + } + /** * Send alarm message to remote endpoint */ @@ -92,9 +98,9 @@ private void sendAlarmMessage(WeLinkSettings.WebHookUrl webHookUrl, String acces body.addProperty("app_msg_id", UUID.randomUUID().toString()); body.add("group_id", groupIds); body.addProperty("content", String.format( - Locale.US, "0<imbody><imagelist/>" + - "<html><![CDATA[<DIV>%s</DIV>]]></html><content><![CDATA[%s]]></content></imbody>", - content, content + Locale.US, "0<imbody><imagelist/>" + + "<html><![CDATA[<DIV>%s</DIV>]]></html><content><![CDATA[%s]]></content></imbody>", + content, content )); body.addProperty("content_type", 0); body.addProperty("client_app_id", "1"); @@ -110,16 +116,16 @@ private String getAccessToken(WeLinkSettings.WebHookUrl webHookUrl) throws IOExc final var clientId = webHookUrl.getClientId(); final var clientSecret = webHookUrl.getClientSecret(); final var response = post( - URI.create(accessTokenUrl), - String.format(Locale.US, "{\"client_id\":%s,\"client_secret\":%s}", clientId, clientSecret), - Collections.emptyMap() + URI.create(accessTokenUrl), + String.format(Locale.US, "{\"client_id\":%s,\"client_secret\":%s}", clientId, clientSecret), + Collections.emptyMap() ); final var gson = new Gson(); final var responseJson = gson.fromJson(response, JsonObject.class); return Optional.ofNullable(responseJson) - .map(r -> r.get("access_token")) - .map(JsonElement::getAsString) - .orElse(""); + .map(r -> r.get("access_token")) + .map(JsonElement::getAsString) + .orElse(""); } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java index 5c4609319921..e4d93e660699 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java 
+++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkSettings.java @@ -34,6 +34,7 @@ public class WeLinkSettings extends AlarmHookSettings { private String textTemplate; + private String recoveryTextTemplate; private List webhooks = new ArrayList<>(); public WeLinkSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto b/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto index 75b9c31ece04..fc180ab757fa 100644 --- a/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto +++ b/oap-server/server-alarm-plugin/src/main/proto/alarm-hook.proto @@ -24,6 +24,8 @@ option java_package = "org.apache.skywalking.oap.server.core.alarm.grpc"; service AlarmService { rpc doAlarm (stream AlarmMessage) returns (Response) { } + rpc doAlarmRecovery (stream AlarmRecoveryMessage) returns (Response) { + } } message AlarmMessage { @@ -36,8 +38,24 @@ message AlarmMessage { string alarmMessage = 7; int64 startTime = 8; AlarmTags tags = 9; + string uuid = 10; } +message AlarmRecoveryMessage { + int64 scopeId = 1; + string scope = 2; + string name = 3; + string id0 = 4; + string id1 = 5; + string ruleName = 6; + string alarmMessage = 7; + int64 startTime = 8; + AlarmTags tags = 9; + string uuid = 10; + int64 recoveryTime = 11; +} + + message AlarmTags { // String key, String value pair. 
repeated KeyStringValuePair data = 1; diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java index 75252274270e..0e3583a1ecd5 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/NotifyHandlerTest.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import com.google.common.collect.Lists; +import org.apache.skywalking.oap.server.core.alarm.AlarmCallback; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.EndpointMetaInAlarm; import org.apache.skywalking.oap.server.core.alarm.EndpointRelationMetaInAlarm; @@ -44,6 +45,8 @@ import org.mockito.quality.Strictness; import org.powermock.reflect.Whitebox; +import java.util.List; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -261,11 +264,22 @@ public void setUp() { notifyHandler = new NotifyHandler(new AlarmRulesWatcher(rules, null, moduleManager), moduleManager); - notifyHandler.init(alarmMessageList -> { - for (AlarmMessage message : alarmMessageList) { - assertNotNull(message); + notifyHandler.init(new AlarmCallback() { + @Override + public void doAlarm(List alarmMessages) throws Exception { + for (AlarmMessage message : alarmMessages) { + assertNotNull(message); + } + } + + @Override + public void doAlarmRecovery(List alarmResolvedMessages) throws Exception { + for (AlarmMessage message : alarmResolvedMessages) { + assertNotNull(message); + } } - }); + } + ); AlarmCore core = mock(AlarmCore.class); diff 
--git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index 656babfc834c..b6884764f917 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -50,6 +50,9 @@ import java.util.Map; import java.util.Objects; +import static org.apache.skywalking.oap.server.core.alarm.provider.AlarmCore.getAlarmFiringMessageList; +import static org.apache.skywalking.oap.server.core.alarm.provider.AlarmCore.getAlarmRecoveryMessageList; + /** * Running rule is the core of how does alarm work. *

@@ -59,11 +62,11 @@ public class RunningRuleTest { @BeforeEach public void setup() { ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_multiple_values", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_multiple_values", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_cpm", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); + "endpoint_cpm", "testColumn", Column.ValueDataType.COMMON_VALUE, 0, Scope.Endpoint.getScopeId()); } @Test @@ -119,12 +122,12 @@ public void testAlarm() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); // check at startTime - 4 - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); // check at startTime - 2 runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(1, alarmMessages.size()); } @@ -152,14 +155,14 @@ public void testAlarmMetricsOutOfDate() throws IllegalExpressionException { // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); } @Test public void testLabeledAlarm() throws IllegalExpressionException { ValueColumnMetadata.INSTANCE.putIfAbsent( - "endpoint_labeled", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, 
Scope.Endpoint.getScopeId()); + "endpoint_labeled", "testColumn", Column.ValueDataType.LABELED_VALUE, 0, Scope.Endpoint.getScopeId()); AlarmRule alarmRule = new AlarmRule(null); alarmRule.setExpression("sum(endpoint_labeled{p='95,99'} > 10) >= 3"); alarmRule.getIncludeMetrics().add("endpoint_labeled"); @@ -202,13 +205,13 @@ private void multipleMetricsAlarm(String expression, int alarmMsgSize) throws Il runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod1, 50)); runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod2, 99)); - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); runningRule.in(getMetaInAlarm(123, "endpoint_cpm"), getMetrics(timeInPeriod3, 60)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(alarmMsgSize, alarmMessages.size()); } @@ -231,6 +234,11 @@ public void testNoAlarm() throws IllegalExpressionException { public void doAlarm(List alarmMessage) { isAlarm[0] = true; } + + @Override + public void doAlarmRecovery(List alarmResolvedMessages) { + isAlarm[0] = false; + } }; LinkedList callbackList = new LinkedList<>(); callbackList.add(assertCallback); @@ -249,15 +257,15 @@ public void doAlarm(List alarmMessage) { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod5, 95)); // check at startTime - 1 - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - 
Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -282,21 +290,78 @@ public void testSilence() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); // check at startTime - 4 - Assertions.assertEquals(0, runningRule.check().size()); //check matches, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //check matches, no alarm // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); - Assertions.assertEquals(1, runningRule.check().size()); //alarm + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm // check at starTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertNotEquals(0, runningRule.check().size()); //alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertEquals(0, runningRule.check().size()); //silence, no alarm - Assertions.assertNotEquals(0, runningRule.check().size()); //alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertNotEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //silence, no alarm + Assertions.assertNotEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + } + + @Test + public void 
testRecoverObservation() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 3"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(15); + alarmRule.setRecoveryObservationPeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeInPeriod1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(6).getMillis()); + long timeInPeriod2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(4).getMillis()); + long timeInPeriod3 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + runningRule.moveTo(startTime.toLocalDateTime()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + runningRule.moveTo(startTime.plusMinutes(8).toLocalDateTime()); + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //no recovery + runningRule.moveTo(startTime.plusMinutes(9).toLocalDateTime()); + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //recoverObserving + Assertions.assertEquals(0, getAlarmRecoveryMessageList(runningRule.check()).size()); //recoverObserving + Assertions.assertEquals(1, getAlarmRecoveryMessageList(runningRule.check()).size()); //recovered + } + + @Test + public void testRecover() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 3"); + 
alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(15); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeInPeriod1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(6).getMillis()); + long timeInPeriod2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(4).getMillis()); + long timeInPeriod3 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + runningRule.moveTo(startTime.toLocalDateTime()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); //alarm + runningRule.moveTo(startTime.plusMinutes(9).toLocalDateTime()); + Assertions.assertEquals(1, getAlarmRecoveryMessageList(runningRule.check()).size()); //recovery } @Test @@ -323,15 +388,15 @@ public void testExclude() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); // check at startTime - 2 - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 1 runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -342,7 +407,7 @@ public void testIncludeNamesRegex() throws IllegalExpressionException { alarmRule.getIncludeMetrics().add("endpoint_percent"); 
alarmRule.setPeriod(10); alarmRule.setMessage( - "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -359,15 +424,15 @@ public void testIncludeNamesRegex() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); // check at startTime - 1 - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 6 runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } @Test @@ -378,7 +443,7 @@ public void testExcludeNamesRegex() throws IllegalExpressionException { alarmRule.getIncludeMetrics().add("endpoint_percent"); alarmRule.setPeriod(10); alarmRule.setMessage( - "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -395,15 +460,15 @@ public void testExcludeNamesRegex() throws IllegalExpressionException { runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); // check at startTime - 1 - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime 
runningRule.moveTo(startTime.toLocalDateTime()); - Assertions.assertEquals(1, runningRule.check().size()); + Assertions.assertEquals(1, getAlarmFiringMessageList(runningRule.check()).size()); // check at startTime + 6 runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); - Assertions.assertEquals(0, runningRule.check().size()); + Assertions.assertEquals(0, getAlarmFiringMessageList(runningRule.check()).size()); } private MetaInAlarm getMetaInAlarm(int id) { @@ -472,7 +537,7 @@ private Metrics getLabeledValueMetrics(long timeBucket, String values) { private AlarmEntity getAlarmEntity(int id) { MetaInAlarm metaInAlarm = getMetaInAlarm(id); return new AlarmEntity(metaInAlarm.getScope(), metaInAlarm.getScopeId(), metaInAlarm.getName(), - metaInAlarm.getId0(), metaInAlarm.getId1() + metaInAlarm.getId0(), metaInAlarm.getId1() ); } @@ -594,13 +659,13 @@ private void assertLabeled(AlarmRule alarmRule, String value1, String value2, St runningRule.in(getMetaInAlarm(123, "endpoint_labeled"), getLabeledValueMetrics(timeInPeriod2, value2)); // check at startTime - 4 - List alarmMessages = runningRule.check(); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(0, alarmMessages.size()); // check at startTime runningRule.moveTo(startTime.toLocalDateTime()); runningRule.in(getMetaInAlarm(123, "endpoint_labeled"), getLabeledValueMetrics(timeInPeriod3, value3)); - alarmMessages = runningRule.check(); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(alarmMsgSize, alarmMessages.size()); } } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java index b6e3a4d3985e..c7d3019c52fa 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java +++ 
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmCallback.java @@ -41,4 +41,6 @@ default Map> groupMessagesByHook(List a } void doAlarm(List alarmMessages) throws Exception; + + void doAlarmRecovery(List alarmRecoveryMessages) throws Exception; } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java index e644eb93062b..2177d6c3c920 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmMessage.java @@ -19,12 +19,16 @@ package org.apache.skywalking.oap.server.core.alarm; import com.google.gson.JsonObject; + import java.util.HashSet; import java.util.Set; + import lombok.Getter; import lombok.Setter; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; + import java.util.List; +import java.util.UUID; /** * Alarm message represents the details of each alarm. 
@@ -45,4 +49,13 @@ public class AlarmMessage { private Set hooks = new HashSet<>(); private String expression; private JsonObject mqeMetricsSnapshot; + private String uuid; + + public AlarmMessage(String uuid) { + this.uuid = uuid; + } + + public AlarmMessage() { + this.uuid = UUID.randomUUID().toString(); + } } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java index 302866f294db..ef83e8641538 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecord.java @@ -47,6 +47,7 @@ public class AlarmRecord extends Record { public static final String INDEX_NAME = "alarm_record"; public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag"; + public static final String UUID = "uuid"; public static final String SCOPE = "scope"; public static final String NAME = "name"; public static final String ID0 = "id0"; @@ -92,6 +93,8 @@ public StorageID id() { private byte[] tagsRawData; @Column(name = SNAPSHOT, storageOnly = true, length = 50000) private String snapshot; + @Column(name = UUID) + private String uuid; public static class Builder implements StorageBuilder { @Override @@ -99,6 +102,7 @@ public AlarmRecord storage2Entity(final Convert2Entity converter) { AlarmRecord record = new AlarmRecord(); record.setScope(((Number) converter.get(SCOPE)).intValue()); record.setName((String) converter.get(NAME)); + record.setUuid((String) converter.get(UUID)); record.setId0((String) converter.get(ID0)); record.setId1((String) converter.get(ID1)); record.setAlarmMessage((String) converter.get(ALARM_MESSAGE)); @@ -115,6 +119,7 @@ public AlarmRecord storage2Entity(final Convert2Entity converter) { public void entity2Storage(final AlarmRecord storageData, final Convert2Storage 
converter) { converter.accept(SCOPE, storageData.getScope()); converter.accept(NAME, storageData.getName()); + converter.accept(UUID, storageData.getUuid()); converter.accept(ID0, storageData.getId0()); converter.accept(ID1, storageData.getId1()); converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage()); diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java new file mode 100644 index 000000000000..257717c201e8 --- /dev/null +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryMessage.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.skywalking.oap.server.core.alarm; + +import lombok.Getter; +import lombok.Setter; + +/** + * Alarm recovery message represents the details of a recovered alarm, including its recovery time.
+ */ +@Setter +@Getter +public class AlarmRecoveryMessage extends AlarmMessage { + private long recoveryTime; + + public AlarmRecoveryMessage(AlarmMessage alarmMessage) { + this.setScopeId(alarmMessage.getScopeId()); + this.setScope(alarmMessage.getScope()); + this.setName(alarmMessage.getName()); + this.setId0(alarmMessage.getId0()); + this.setId1(alarmMessage.getId1()); + this.setRuleName(alarmMessage.getRuleName()); + this.setAlarmMessage(alarmMessage.getAlarmMessage()); + this.setTags(alarmMessage.getTags()); + this.setStartTime(alarmMessage.getStartTime()); + this.setPeriod(alarmMessage.getPeriod()); + this.setHooks(alarmMessage.getHooks()); + this.setExpression(alarmMessage.getExpression()); + this.setMqeMetricsSnapshot(alarmMessage.getMqeMetricsSnapshot()); + this.setUuid(alarmMessage.getUuid()); + this.setRecoveryTime(System.currentTimeMillis()); + } +} diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java new file mode 100644 index 000000000000..ef802bca0390 --- /dev/null +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.skywalking.oap.server.core.alarm; + +import lombok.Getter; +import lombok.Setter; +import org.apache.skywalking.oap.server.core.analysis.Stream; +import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; +import org.apache.skywalking.oap.server.core.analysis.record.Record; +import org.apache.skywalking.oap.server.core.analysis.worker.RecordStreamProcessor; +import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine; +import org.apache.skywalking.oap.server.core.source.ScopeDeclaration; +import org.apache.skywalking.oap.server.core.storage.StorageID; +import org.apache.skywalking.oap.server.core.storage.annotation.BanyanDB; +import org.apache.skywalking.oap.server.core.storage.annotation.Column; +import org.apache.skywalking.oap.server.core.storage.annotation.ElasticSearch; +import org.apache.skywalking.oap.server.core.storage.annotation.SQLDatabase; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Storage; +import org.apache.skywalking.oap.server.core.storage.type.StorageBuilder; + +import java.util.List; + +import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.ALARM_RECOVERY; +import static org.apache.skywalking.oap.server.core.storage.StorageData.TIME_BUCKET; + +@Getter +@Setter +@ScopeDeclaration(id = ALARM_RECOVERY, name = "AlarmRecovery") +@Stream(name = AlarmRecoveryRecord.INDEX_NAME, scopeId = DefaultScopeDefine.ALARM_RECOVERY, builder = AlarmRecoveryRecord.Builder.class, processor = RecordStreamProcessor.class) +@SQLDatabase.ExtraColumn4AdditionalEntity(additionalTable = AlarmRecoveryRecord.ADDITIONAL_TAG_TABLE, parentColumn = TIME_BUCKET) +@BanyanDB.TimestampColumn(AlarmRecoveryRecord.START_TIME) +public class AlarmRecoveryRecord extends Record { + public static final String 
INDEX_NAME = "alarm_recovery_record"; + public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag"; + public static final String UUID = "uuid"; + public static final String SCOPE = "scope"; + public static final String NAME = "name"; + public static final String ID0 = "id0"; + public static final String ID1 = "id1"; + public static final String START_TIME = "start_time"; + public static final String RECOVERY_TIME = "recovery_time"; + public static final String ALARM_MESSAGE = "alarm_message"; + public static final String RULE_NAME = "rule_name"; + public static final String TAGS = "tags"; + public static final String TAGS_RAW_DATA = "tags_raw_data"; + public static final String SNAPSHOT = "snapshot"; + + @Override + public StorageID id() { + return new StorageID() + .append(TIME_BUCKET, getTimeBucket()) + .append(RULE_NAME, ruleName) + .append(ID0, id0) + .append(ID1, id1); + } + + @Column(name = SCOPE) + private int scope; + @Column(name = NAME, storageOnly = true, length = 512) + private String name; + @Column(name = ID0, storageOnly = true, length = 512) + @BanyanDB.SeriesID(index = 0) + private String id0; + @Column(name = ID1, storageOnly = true) + private String id1; + @ElasticSearch.EnableDocValues + @Column(name = START_TIME) + private long startTime; + @ElasticSearch.EnableDocValues + @Column(name = RECOVERY_TIME) + private long recoveryTime; + @Column(name = ALARM_MESSAGE, length = 512) + @ElasticSearch.MatchQuery + @BanyanDB.MatchQuery(analyzer = BanyanDB.MatchQuery.AnalyzerType.SIMPLE) + private String alarmMessage; + @Column(name = RULE_NAME) + private String ruleName; + @Column(name = UUID) + private String uuid; + @Column(name = TAGS, indexOnly = true) + @SQLDatabase.AdditionalEntity(additionalTables = {ADDITIONAL_TAG_TABLE}) + private List tagsInString; + @Column(name = TAGS_RAW_DATA, storageOnly = true, length = Tag.TAG_LENGTH) + private byte[] tagsRawData; + @Column(name = SNAPSHOT, storageOnly = true, length = 50000) + private String 
snapshot; + + public static class Builder implements StorageBuilder { + @Override + public AlarmRecoveryRecord storage2Entity(final Convert2Entity converter) { + AlarmRecoveryRecord record = new AlarmRecoveryRecord(); + record.setScope(((Number) converter.get(SCOPE)).intValue()); + record.setName((String) converter.get(NAME)); + record.setUuid((String) converter.get(UUID)); + record.setId0((String) converter.get(ID0)); + record.setId1((String) converter.get(ID1)); + record.setAlarmMessage((String) converter.get(ALARM_MESSAGE)); + record.setStartTime(((Number) converter.get(START_TIME)).longValue()); + record.setRecoveryTime(((Number) converter.get(RECOVERY_TIME)).longValue()); + record.setTimeBucket(((Number) converter.get(TIME_BUCKET)).longValue()); + record.setRuleName((String) converter.get(RULE_NAME)); + record.setTagsRawData(converter.getBytes(TAGS_RAW_DATA)); + record.setSnapshot((String) converter.get(SNAPSHOT)); + // Don't read the TAGS as they are only for query. + return record; + } + + @Override + public void entity2Storage(final AlarmRecoveryRecord storageData, final Convert2Storage converter) { + converter.accept(SCOPE, storageData.getScope()); + converter.accept(NAME, storageData.getName()); + converter.accept(UUID, storageData.getUuid()); + converter.accept(ID0, storageData.getId0()); + converter.accept(ID1, storageData.getId1()); + converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage()); + converter.accept(START_TIME, storageData.getStartTime()); + converter.accept(RECOVERY_TIME, storageData.getRecoveryTime()); + converter.accept(TIME_BUCKET, storageData.getTimeBucket()); + converter.accept(RULE_NAME, storageData.getRuleName()); + converter.accept(TAGS_RAW_DATA, storageData.getTagsRawData()); + converter.accept(TAGS, storageData.getTagsInString()); + converter.accept(SNAPSHOT, storageData.getSnapshot()); + } + } +} diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java 
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java index 3851ba81a0ed..c4acfa8a00dd 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java @@ -31,6 +31,7 @@ import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -58,12 +59,43 @@ public void doAlarm(List alarmMessage) { } AlarmRecord record = new AlarmRecord(); + record.setUuid(message.getUuid()); + record.setScope(message.getScopeId()); + record.setId0(message.getId0()); + record.setId1(message.getId1()); + record.setName(message.getName()); + record.setAlarmMessage(message.getAlarmMessage()); + record.setStartTime(message.getStartTime()); + record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime())); + record.setRuleName(message.getRuleName()); + Collection tags = appendSearchableTags(message.getTags()); + addAutocompleteTags(tags, TimeBucket.getMinuteTimeBucket(message.getStartTime())); + record.setTagsRawData(gson.toJson(message.getTags()).getBytes(Charsets.UTF_8)); + record.setTagsInString(Tag.Util.toStringList(new ArrayList<>(tags))); + AlarmSnapshotRecord snapshot = new AlarmSnapshotRecord(); + snapshot.setExpression(message.getExpression()); + snapshot.setMetrics(message.getMqeMetricsSnapshot()); + record.setSnapshot(gson.toJson(snapshot)); + RecordStreamProcessor.getInstance().in(record); + }); + } + + @Override + public void doAlarmRecovery(List alarmMessage) { + alarmMessage.forEach(message -> { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Alarm recovery message: {}", message.getAlarmMessage()); + } + AlarmRecoveryMessage alarmRecoveryMessage = (AlarmRecoveryMessage) message; + 
AlarmRecoveryRecord record = new AlarmRecoveryRecord(); + record.setUuid(message.getUuid()); record.setScope(message.getScopeId()); record.setId0(message.getId0()); record.setId1(message.getId1()); record.setName(message.getName()); record.setAlarmMessage(message.getAlarmMessage()); record.setStartTime(message.getStartTime()); + record.setRecoveryTime(alarmRecoveryMessage.getRecoveryTime()); record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime())); record.setRuleName(message.getRuleName()); Collection tags = appendSearchableTags(message.getTags()); @@ -94,7 +126,7 @@ private Collection appendSearchableTags(List tags) { if (configService.getSearchableAlarmTags().contains(tag.getKey())) { final Tag alarmTag = new Tag(tag.getKey(), tag.getValue()); - if (tag.getValue().length() > Tag.TAG_LENGTH || alarmTag.toString().length() > Tag.TAG_LENGTH) { + if (tag.getValue().length() > Tag.TAG_LENGTH || alarmTag.toString().length() > Tag.TAG_LENGTH) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Alarm tag : {} length > : {}, dropped", alarmTag, Tag.TAG_LENGTH); } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java index 6b9ce0f00963..6414613edcfa 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/HttpAlarmCallback.java @@ -26,6 +26,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.time.Duration; +import java.util.List; import java.util.Map; public abstract class HttpAlarmCallback implements AlarmCallback { @@ -58,4 +59,21 @@ protected String post( } return response.body(); } + + /** + * Send alarm message if the settings not empty + */ + public void doAlarm(List alarmMessages) throws Exception { + 
doAlarmCallback(alarmMessages, false); + } + + /** + * Send alarm recovery message if the settings not empty + */ + public void doAlarmRecovery(List alarmRecoveryMessages) throws Exception { + doAlarmCallback(alarmRecoveryMessages, true); + } + + protected abstract void doAlarmCallback(List alarmMessages, boolean isRecovery) throws Exception ; + } diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java index 7ff8e5306ccd..67dd14f234a8 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java @@ -35,6 +35,7 @@ public class AlarmMessage { private String name; private String message; private Long startTime; + private Long recoveryTime; private transient String id1; private final List tags; private List events = new ArrayList<>(2); diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java index 1d51cc193e18..05e1da8e693b 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/source/DefaultScopeDefine.java @@ -151,6 +151,7 @@ public class DefaultScopeDefine { public static final int BROWSER_APP_RESOURCE_PERF = 88; public static final int BROWSER_APP_WEB_INTERACTION_PAGE_PERF = 89; public static final int SW_SPAN_ATTACHED_EVENT = 90; + public static final int ALARM_RECOVERY = 91; /** * Catalog of scope, the metrics processor could use this to group all generated metrics by oal rt. 
diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java index f59fb1cd1564..0b0c9d856af7 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java @@ -19,6 +19,7 @@ package org.apache.skywalking.oap.server.core.storage.query; import com.google.gson.JsonObject; + import java.io.IOException; import java.util.Base64; import java.util.List; @@ -67,13 +68,14 @@ default void parseDataBinary(byte[] dataBinary, List tags) { * Build the alarm message from the alarm record. * The Tags in JDBC storage is base64 encoded, need to decode in different way. */ - default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord) { + default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord, Long recoveryTime) { AlarmMessage message = new AlarmMessage(); message.setId(String.valueOf(alarmRecord.getId0())); message.setId1(String.valueOf(alarmRecord.getId1())); message.setName(alarmRecord.getName()); message.setMessage(alarmRecord.getAlarmMessage()); message.setStartTime(alarmRecord.getStartTime()); + message.setRecoveryTime(recoveryTime); message.setScope(Scope.Finder.valueOf(alarmRecord.getScope())); message.setScopeId(alarmRecord.getScope()); AlarmSnapshot alarmSnapshot = message.getSnapshot(); @@ -89,8 +91,8 @@ default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord) { MQEMetric metrics = new MQEMetric(); metrics.setName(name); List values = GSON.fromJson( - obj.getValue().getAsString(), new TypeToken>() { - }.getType()); + obj.getValue().getAsString(), new TypeToken>() { + }.getType()); metrics.setResults(values); alarmSnapshot.getMetrics().add(metrics); } diff --git 
a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol index abf4c4d1588d..77634b33ce28 160000 --- a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol +++ b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol @@ -1 +1 @@ -Subproject commit abf4c4d1588d16facae4a696032d5f8b68a4ccaf +Subproject commit 77634b33ce28229ff7e9ea9bf5bef28cbdba809f diff --git a/oap-server/server-query-plugin/status-query-plugin/src/main/java/org/apache/skywalking/oap/query/debug/AlarmStatusQueryHandler.java b/oap-server/server-query-plugin/status-query-plugin/src/main/java/org/apache/skywalking/oap/query/debug/AlarmStatusQueryHandler.java index b27797030693..974119701ba5 100644 --- a/oap-server/server-query-plugin/status-query-plugin/src/main/java/org/apache/skywalking/oap/query/debug/AlarmStatusQueryHandler.java +++ b/oap-server/server-query-plugin/status-query-plugin/src/main/java/org/apache/skywalking/oap/query/debug/AlarmStatusQueryHandler.java @@ -149,7 +149,7 @@ public HttpResponse getAlarmRuleContext(@Param("ruleName") String ruleName, @Par runningContext.addProperty("endTime", window.getEndTime().toString()); runningContext.addProperty("additionalPeriod", window.getAdditionalPeriod()); runningContext.addProperty("size", window.getSize()); - runningContext.addProperty("silenceCountdown", window.getSilenceCountdown()); + runningContext.addProperty("silenceCountdown", window.getStateMachine().getSilenceCountdown()); JsonArray metricValues = new JsonArray(); runningContext.add("windowValues", metricValues); diff --git a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java 
b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java index c50a5d1f708a..fffe99666135 100644 --- a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java +++ b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java @@ -24,6 +24,7 @@ import org.apache.skywalking.banyandb.v1.client.StreamQuery; import org.apache.skywalking.banyandb.v1.client.StreamQueryResponse; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; @@ -46,8 +47,12 @@ */ public class BanyanDBAlarmQueryDAO extends AbstractBanyanDBDAO implements IAlarmQueryDAO { private static final Set TAGS = ImmutableSet.of(AlarmRecord.SCOPE, - AlarmRecord.NAME, AlarmRecord.ID0, AlarmRecord.ID1, AlarmRecord.ALARM_MESSAGE, AlarmRecord.START_TIME, - AlarmRecord.RULE_NAME, AlarmRecord.TAGS, AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT); + AlarmRecord.NAME, AlarmRecord.ID0, AlarmRecord.ID1, AlarmRecord.UUID, AlarmRecord.ALARM_MESSAGE, + AlarmRecord.START_TIME, AlarmRecord.RULE_NAME, AlarmRecord.TAGS, AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT); + private static final Set RECOVERY_TAGS = ImmutableSet.of(AlarmRecoveryRecord.SCOPE, + AlarmRecoveryRecord.NAME, AlarmRecord.ID0, AlarmRecoveryRecord.ID1, AlarmRecoveryRecord.UUID, + AlarmRecoveryRecord.ALARM_MESSAGE, AlarmRecoveryRecord.START_TIME, AlarmRecoveryRecord.RECOVERY_TIME, + AlarmRecoveryRecord.RULE_NAME, AlarmRecoveryRecord.TAGS, 
AlarmRecoveryRecord.TAGS_RAW_DATA, AlarmRecoveryRecord.SNAPSHOT); public BanyanDBAlarmQueryDAO(BanyanDBStorageClient client) { super(client); @@ -88,7 +93,8 @@ public void apply(StreamQuery query) { AlarmRecord alarmRecord = builder.storage2Entity( new BanyanDBConverter.StorageToStream(AlarmRecord.INDEX_NAME, rowEntity) ); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); + Long recoveryTime = getAlarmRecoveryTime(alarmRecord.getUuid(), duration); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinary(alarmRecord.getTagsRawData(), alarmMessage.getTags()); } @@ -96,4 +102,26 @@ public void apply(StreamQuery query) { } return alarms; } + + private Long getAlarmRecoveryTime(String uuid, Duration duration) throws IOException { + if (StringUtil.isBlank(uuid)) { + return null; + } + final boolean isColdStage = duration != null && duration.isColdStage(); + StreamQueryResponse resp = query(isColdStage, AlarmRecoveryRecord.INDEX_NAME, RECOVERY_TAGS, + getTimestampRange(duration), new QueryBuilder() { + @Override + public void apply(StreamQuery query) { + query.and(eq(AlarmRecoveryRecord.UUID, uuid)); + } + }); + for (final RowEntity rowEntity : resp.getElements()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity( + new BanyanDBConverter.StorageToStream(AlarmRecoveryRecord.INDEX_NAME, rowEntity) + ); + return alarmRecoveryRecord.getRecoveryTime(); + } + return null; + } } diff --git a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java index 03b2f9eb6087..c3cff6affc97 100644 --- 
a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java +++ b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java @@ -19,9 +19,11 @@ package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.query; import com.google.common.base.Strings; + import java.io.IOException; import java.util.List; import java.util.Objects; + import org.apache.skywalking.library.elasticsearch.requests.search.BoolQueryBuilder; import org.apache.skywalking.library.elasticsearch.requests.search.Query; import org.apache.skywalking.library.elasticsearch.requests.search.Search; @@ -30,6 +32,7 @@ import org.apache.skywalking.library.elasticsearch.response.search.SearchHit; import org.apache.skywalking.library.elasticsearch.response.search.SearchResponse; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; @@ -37,6 +40,7 @@ import org.apache.skywalking.oap.server.core.storage.query.IAlarmQueryDAO; import org.apache.skywalking.oap.server.library.client.elasticsearch.ElasticSearchClient; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.ElasticSearchConverter; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.EsDAO; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.IndexController; @@ -53,11 +57,11 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final 
int li final int from, final Duration duration, final List tags) - throws IOException { + throws IOException { long startTB = duration.getStartTimeBucketInSec(); long endTB = duration.getEndTimeBucketInSec(); final String index = - IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecord.INDEX_NAME); + IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecord.INDEX_NAME); final BoolQueryBuilder query = Query.bool(); if (IndexController.LogicIndicesRegister.isMergedTable(AlarmRecord.INDEX_NAME)) { query.must(Query.term(IndexController.LogicIndicesRegister.RECORD_TABLE_NAME, AlarmRecord.INDEX_NAME)); @@ -81,9 +85,9 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li } final SearchBuilder search = - Search.builder().query(query) - .size(limit).from(from) - .sort(AlarmRecord.START_TIME, Sort.Order.DESC); + Search.builder().query(query) + .size(limit).from(from) + .sort(AlarmRecord.START_TIME, Sort.Order.DESC); SearchResponse response = getClient().search(index, search.build()); @@ -92,7 +96,8 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li for (SearchHit searchHit : response.getHits().getHits()) { AlarmRecord.Builder builder = new AlarmRecord.Builder(); AlarmRecord alarmRecord = builder.storage2Entity(new ElasticSearchConverter.ToEntity(AlarmRecord.INDEX_NAME, searchHit.getSource())); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); + Long recoveryTime = getAlarmRecoveryTime(alarmRecord.getUuid(), duration); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinary(alarmRecord.getTagsRawData(), alarmMessage.getTags()); } @@ -100,4 +105,32 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li } return alarms; } + + private Long getAlarmRecoveryTime(String uuid, Duration duration) { + if (StringUtil.isBlank(uuid)) { + return null; + } + long 
startTB = duration.getStartTimeBucketInSec(); + long endTB = duration.getEndTimeBucketInSec(); + final String index = + IndexController.LogicIndicesRegister.getPhysicalTableName(AlarmRecoveryRecord.INDEX_NAME); + final BoolQueryBuilder query = Query.bool(); + if (IndexController.LogicIndicesRegister.isMergedTable(AlarmRecoveryRecord.INDEX_NAME)) { + query.must(Query.term(IndexController.LogicIndicesRegister.RECORD_TABLE_NAME, AlarmRecoveryRecord.INDEX_NAME)); + } + if (startTB != 0 && endTB != 0) { + query.must(Query.range(AlarmRecord.TIME_BUCKET).gte(startTB).lte(endTB)); + } + query.must(Query.term(AlarmRecoveryRecord.UUID, uuid)); + final SearchBuilder search = + Search.builder().query(query) + .size(1).from(0); /* 'from' is a zero-based offset; 1 would skip the only matching hit */ + SearchResponse response = getClient().search(index, search.build()); + for (SearchHit searchHit : response.getHits().getHits()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity(new ElasticSearchConverter.ToEntity(AlarmRecoveryRecord.INDEX_NAME, searchHit.getSource())); + return alarmRecoveryRecord.getRecoveryTime(); + } + return null; + } } diff --git a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java index 42aa3622aea4..075436f1a2cc 100644 --- a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java +++ b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java @@ -26,20 +26,24 @@ import org.apache.skywalking.oap.server.core.Const; import
org.apache.skywalking.oap.server.core.CoreModule; import org.apache.skywalking.oap.server.core.alarm.AlarmRecord; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryRecord; import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag; import org.apache.skywalking.oap.server.core.config.ConfigService; import org.apache.skywalking.oap.server.core.query.input.Duration; import org.apache.skywalking.oap.server.core.query.type.AlarmMessage; import org.apache.skywalking.oap.server.core.query.type.Alarms; import org.apache.skywalking.oap.server.core.storage.query.IAlarmQueryDAO; +import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity; import org.apache.skywalking.oap.server.library.client.jdbc.hikaricp.JDBCClient; import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.JDBCEntityConverters; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.JDBCTableInstaller; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.SQLAndParameters; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.TableHelper; +import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; @@ -68,20 +72,20 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, Duration duration, final List tags) { if (searchableTagKeys == null) { final ConfigService configService = manager.find(CoreModule.NAME) - .provider() - .getService(ConfigService.class); + .provider() + .getService(ConfigService.class); searchableTagKeys = new HashSet<>(Arrays.asList(configService.getSearchableAlarmTags().split(Const.COMMA))); } // If the tag is not searchable, but is required, then we don't need to run the real query. 
if (tags != null && !searchableTagKeys.containsAll(tags.stream().map(Tag::getKey).collect(toSet()))) { log.warn( - "Searching tags that are not searchable: {}", - tags.stream().map(Tag::getKey).filter(not(searchableTagKeys::contains)).collect(toSet())); + "Searching tags that are not searchable: {}", + tags.stream().map(Tag::getKey).filter(not(searchableTagKeys::contains)).collect(toSet())); return new Alarms(); } final var tables = tableHelper.getTablesForRead( - AlarmRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() + AlarmRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() ); final var alarmMsgs = new ArrayList(); @@ -90,11 +94,13 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { while (resultSet.next()) { AlarmRecord.Builder builder = new AlarmRecord.Builder(); - AlarmRecord alarmRecord = builder.storage2Entity(JDBCEntityConverters.toEntity(resultSet)); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); + Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); + AlarmRecord alarmRecord = builder.storage2Entity(convert2Entity); + Long recoveryTime = getRecoveryTime(alarmRecord.getUuid(), duration); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinaryBase64( - new String(alarmRecord.getTagsRawData(), Charsets.UTF_8), alarmMessage.getTags()); + new String(alarmRecord.getTagsRawData(), Charsets.UTF_8), alarmMessage.getTags()); } alarmMsgs.add(alarmMessage); } @@ -102,12 +108,12 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, }, sqlAndParameters.parameters()); } return new Alarms( - alarmMsgs - .stream() - .sorted(comparing(AlarmMessage::getStartTime).reversed()) - .skip(from) - .limit(limit) - .collect(toList()) + alarmMsgs + .stream() + 
.sorted(comparing(AlarmMessage::getStartTime).reversed()) + .skip(from) + .limit(limit) + .collect(toList()) ); } @@ -129,6 +135,7 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, */ final var timeBucket = TableHelper.getTimeBucket(table); final var tagTable = TableHelper.getTable(AlarmRecord.ADDITIONAL_TAG_TABLE, timeBucket); + if (!CollectionUtils.isEmpty(tags)) { for (int i = 0; i < tags.size(); i++) { sql.append(" inner join ").append(tagTable).append(" "); @@ -138,7 +145,7 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, } } sql.append(" where ") - .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); + .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); parameters.add(AlarmRecord.INDEX_NAME); if (Objects.nonNull(scopeId)) { sql.append(" and ").append(AlarmRecord.SCOPE).append(" = ?"); @@ -167,4 +174,38 @@ protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, return new SQLAndParameters(sql.toString(), parameters); } + + private Long getRecoveryTime(String uuid, Duration duration) throws SQLException { + if (StringUtil.isBlank(uuid)) { + return null; + } + final var tables = tableHelper.getTablesForRead( + AlarmRecoveryRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() + ); + final AlarmRecoveryRecord[] alarmRecoveryRecords = {null}; + for (final var table : tables) { + final var sqlAndParameters = buildSQL4Recovery(uuid, duration, table); + jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { + while (resultSet.next()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); + alarmRecoveryRecords[0] = builder.storage2Entity(convert2Entity); + } + return null; + }, sqlAndParameters.parameters()); + } + return alarmRecoveryRecords[0] == null ? 
null : alarmRecoveryRecords[0].getRecoveryTime(); + } + + private SQLAndParameters buildSQL4Recovery(String uuid, Duration duration, String table) { + final var sql = new StringBuilder(); + final var parameters = new ArrayList<>(); + sql.append("select * from ").append(table); + sql.append(" where ") + .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? "); + parameters.add(AlarmRecoveryRecord.INDEX_NAME); + sql.append(" and ").append(AlarmRecoveryRecord.UUID).append(" = ?"); + parameters.add(uuid); + return new SQLAndParameters(sql.toString(), parameters); + } } diff --git a/skywalking-ui b/skywalking-ui index 1b6f011f0ec5..7fa74bbd170d 160000 --- a/skywalking-ui +++ b/skywalking-ui @@ -1 +1 @@ -Subproject commit 1b6f011f0ec526c553a291f74c9bcaacc84ab451 +Subproject commit 7fa74bbd170d8e91b7d3c06608d1cc04ea5f1aca diff --git a/test/e2e-v2/cases/alarm/alarm-cases.yaml b/test/e2e-v2/cases/alarm/alarm-cases.yaml index aa07203bf8c2..60565405447c 100644 --- a/test/e2e-v2/cases/alarm/alarm-cases.yaml +++ b/test/e2e-v2/cases/alarm/alarm-cases.yaml @@ -31,7 +31,9 @@ - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql alarm autocomplete-values --key=level expected: expected/tag-values.yml # before silence webhook - - query: curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + - query: | + sleep 30; + curl -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/silence-before-webhook.yml # after silence alarm list WARNING,receivers=lisi - query: | @@ -44,3 +46,11 @@ # after silence webhook - query: curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/silence-after-webhook.yml + - query: | + sleep 30; + curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + expected: expected/recovery-webhook.yml + - query: | + sleep 30; + curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read + expected: 
expected/recovery-after-observation-webhook.yml diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml b/test/e2e-v2/cases/alarm/alarm-settings.yml index 04ddbc61cd3f..04c29ae8e2ac 100755 --- a/test/e2e-v2/cases/alarm/alarm-settings.yml +++ b/test/e2e-v2/cases/alarm/alarm-settings.yml @@ -27,7 +27,7 @@ rules: - webhook.custom # service_percentile > 10ms service_percentile_rule: - expression: sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 period: 10 silence-period: 1 message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. @@ -35,10 +35,11 @@ rules: level: WARNING receivers: lisi hooks: - - webhook.none + - webhook.custom comp_rule: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 period: 10 + recovery-observation-period: 3 message: Service {name} response time is more than 10ms and sla is more than 1%. tags: level: CRITICAL diff --git a/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml new file mode 100644 index 000000000000..db025b74a18f --- /dev/null +++ b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +messages: + {{- contains .messages }} + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + {{- end }} diff --git a/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml new file mode 100644 index 000000000000..95eccf59c0aa --- /dev/null +++ b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +messages: + {{- contains .messages }} + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ gt .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + {{- end }} diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml index 0631c162b88d..ebc688b6c4a8 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 metrics: {{- contains .snapshot.metrics }} - name: service_resp_time diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml index e54e4bd6abfd..0476d6efafde 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: 
sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 metrics: {{- contains .snapshot.metrics }} - name: service_percentile{p='50,75,90,95,99'} diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml index 3e9ad6d25bac..c9bd7a4f21ac 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml @@ -23,6 +23,21 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING @@ -36,6 +51,7 @@ messages: ruleName: comp_rule alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL @@ -49,6 +65,21 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. 
startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING @@ -62,6 +93,77 @@ messages: ruleName: comp_rule alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. 
+ startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: CRITICAL + - key: receivers + value: zhangsan + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: comp_rule + alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml index 0631c162b88d..ebc688b6c4a8 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum((service_resp_time > 10) && (service_sla > 100)) >= 1 + expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 metrics: {{- contains .snapshot.metrics }} - name: service_resp_time diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml index e25f5cf61417..dc9283b298bf 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml @@ -41,7 +41,7 @@ msgs: layer: GENERAL {{- end }} snapshot: - expression: sum(service_percentile{p='50,75,90,95,99'} > 10) >= 3 + expression: 
sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 metrics: {{- contains .snapshot.metrics }} - name: service_percentile{p='50,75,90,95,99'} diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml index d34f856f282f..faa28ed8607b 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml @@ -23,6 +23,21 @@ messages: ruleName: service_resp_time_rule alarmMessage: Response time of service e2e-service-provider is increase/decrease in 1 minutes of last 10 minutes. startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + - scopeId: 1 + scope: SERVICE + name: e2e-service-provider + id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + id1: "" + ruleName: service_percentile_rule + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: WARNING @@ -36,6 +51,7 @@ messages: ruleName: comp_rule alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. 
startTime: {{ gt .startTime 0 }} + recoveryTime: {{ le .recoveryTime 0 }} tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/mysql/docker-compose.yml b/test/e2e-v2/cases/alarm/mysql/docker-compose.yml index 0de65f5e04f1..340dac7dff1b 100644 --- a/test/e2e-v2/cases/alarm/mysql/docker-compose.yml +++ b/test/e2e-v2/cases/alarm/mysql/docker-compose.yml @@ -22,6 +22,8 @@ services: - e2e expose: - 3306 + ports: + - 3306 environment: - MYSQL_ROOT_PASSWORD=root@1234 - MYSQL_DATABASE=swtest diff --git a/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java b/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java index fd446702894e..e8b596e33b77 100644 --- a/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java +++ b/test/e2e-v2/java-test-service/e2e-service-provider/src/main/java/org/apache/skywalking/e2e/controller/AlarmController.java @@ -66,6 +66,7 @@ public static class AlarmMessage { private String alarmMessage; private long startTime; private List tags; + private long recoveryTime; } /** From 4b54c1859125a60d95a257c03797d9dec4a75703 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 11:15:09 +0800 Subject: [PATCH 02/21] enhance the alarm kernel with recovered status notification capability #13492 --- .../oap/server/core/alarm/provider/RulesReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index 81b2f5febf07..75a60142aa2c 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ 
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -355,7 +355,7 @@ private void readWeLinkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); - String recoveryTextTemplate = (String) config.get("recovery-text-template"); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; @@ -421,7 +421,7 @@ private void readDiscordConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); - String recoveryTextTemplate = (String) config.get("recovery-text-template"); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; From 638668fe6e108a4c8e6e4359f7942663137e147e Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 11:27:46 +0800 Subject: [PATCH 03/21] enhance the alarm kernel with recovered status notification capability #13492 --- .../query-graphql-plugin/src/main/resources/query-protocol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol index 77634b33ce28..4fc10625ba72 160000 --- a/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol +++ b/oap-server/server-query-plugin/query-graphql-plugin/src/main/resources/query-protocol @@ -1 +1 @@ -Subproject commit 77634b33ce28229ff7e9ea9bf5bef28cbdba809f +Subproject commit 
4fc10625ba72ef4788972b4f7991a535065d609b From 0acfbe584815194e471a862067ccb05c916dc9b3 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 11:30:41 +0800 Subject: [PATCH 04/21] enhance the alarm kernel with recovered status notification capability #13492 --- skywalking-ui | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skywalking-ui b/skywalking-ui index 7fa74bbd170d..3cefbf1bd5d5 160000 --- a/skywalking-ui +++ b/skywalking-ui @@ -1 +1 @@ -Subproject commit 7fa74bbd170d8e91b7d3c06608d1cc04ea5f1aca +Subproject commit 3cefbf1bd5d588355829cbe9dd23cf2253af547e From edc27224f72e7afeaf492565945b8ca016f3b407 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 17:19:19 +0800 Subject: [PATCH 05/21] enhance the alarm kernel with recovered status notification capability #13492 --- docs/en/changes/changes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index a80fe30332e3..3a51dcdff8ad 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -105,6 +105,7 @@ * BanyanDB: support add group prefix (namespace) for BanyanDB groups. * BanyanDB: fix when setting `@BanyanDB.TimestampColumn`, the column should not be indexed. * OAP Self Observability: make Trace analysis metrics separate by label `protocol`, add Zipkin span dropped metrics. 
+* Enhance the alarm kernel with recovered status notification capability #### UI From a7edf5c8e1c8509f3fe279f309727ca43c6a78c1 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 20:10:14 +0800 Subject: [PATCH 06/21] enhance the alarm kernel with recovered status notification capability #13492 --- .../skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java | 1 + test/e2e-v2/cases/alarm/alarm-cases.yaml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java index ef802bca0390..d569a900ab3e 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java @@ -46,6 +46,7 @@ @Stream(name = AlarmRecoveryRecord.INDEX_NAME, scopeId = DefaultScopeDefine.ALARM_RECOVERY, builder = AlarmRecoveryRecord.Builder.class, processor = RecordStreamProcessor.class) @SQLDatabase.ExtraColumn4AdditionalEntity(additionalTable = AlarmRecoveryRecord.ADDITIONAL_TAG_TABLE, parentColumn = TIME_BUCKET) @BanyanDB.TimestampColumn(AlarmRecoveryRecord.START_TIME) +@BanyanDB.Group(streamGroup = BanyanDB.StreamGroup.RECORDS) public class AlarmRecoveryRecord extends Record { public static final String INDEX_NAME = "alarm_recovery_record"; public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag"; diff --git a/test/e2e-v2/cases/alarm/alarm-cases.yaml b/test/e2e-v2/cases/alarm/alarm-cases.yaml index 60565405447c..dc49ba84e6c7 100644 --- a/test/e2e-v2/cases/alarm/alarm-cases.yaml +++ b/test/e2e-v2/cases/alarm/alarm-cases.yaml @@ -47,10 +47,10 @@ - query: curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/silence-after-webhook.yml - query: | - sleep 30; + 
sleep 60; curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/recovery-webhook.yml - query: | - sleep 30; + sleep 60; curl -s -XPOST http://${provider_host}:${provider_9090}/alarm/read expected: expected/recovery-after-observation-webhook.yml From f140f6e72f66fbb12943df94078b6670055fc56c Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sat, 11 Oct 2025 21:30:20 +0800 Subject: [PATCH 07/21] enhance the alarm kernel with recovered status notification capability #13492 --- .github/workflows/skywalking.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index 217ccbe9f8b0..a1db50764bd9 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -768,7 +768,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -832,7 +832,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -893,7 +893,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: ISTIO_VERSION: ${{ 
matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -956,7 +956,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 + uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1052,7 +1052,7 @@ jobs: # fi # docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v # - name: ${{ matrix.test.name }} -# uses: apache/skywalking-infra-e2e@cf589b4a0b9f8e6f436f78e9cfd94a1ee5494180 +# uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f # with: # e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} # - if: ${{ failure() }} From cf0570b71a1b8e22c39f69535842d9bf1d7e8c13 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Wed, 15 Oct 2025 10:39:25 +0800 Subject: [PATCH 08/21] enhance the alarm kernel with recovered status notification capability #13492 --- .../server/core/query/type/AlarmMessage.java | 1 + .../core/storage/query/IAlarmQueryDAO.java | 4 +- .../stream/BanyanDBAlarmQueryDAO.java | 39 +++++++--- .../elasticsearch/query/AlarmQueryEsDAO.java | 46 +++++++---- .../jdbc/common/dao/JDBCAlarmQueryDAO.java | 76 ++++++++++++------- test/e2e-v2/cases/alarm/alarm-settings.yml | 4 +- test/e2e-v2/cases/alarm/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/alarm/es/e2e.yaml | 2 +- .../cases/alarm/es/es-sharding/e2e.yaml | 2 +- .../recovery-after-observation-webhook.yml | 4 +- .../cases/alarm/expected/recovery-webhook.yml | 2 +- .../silence-after-graphql-critical.yml | 2 +- .../expected/silence-after-graphql-warn.yml | 2 +- .../alarm/expected/silence-after-webhook.yml | 18 ++--- .../silence-before-graphql-critical.yml | 2 +- .../expected/silence-before-graphql-warn.yml | 2 +- .../alarm/expected/silence-before-webhook.yml | 4 +- test/e2e-v2/cases/alarm/mysql/e2e.yaml | 2 +- 
test/e2e-v2/cases/alarm/postgres/e2e.yaml | 2 +- 19 files changed, 134 insertions(+), 82 deletions(-) diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java index 67dd14f234a8..ba6a3ee4ec7c 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/query/type/AlarmMessage.java @@ -33,6 +33,7 @@ public class AlarmMessage { private int scopeId; private String id; private String name; + private String uuid; private String message; private Long startTime; private Long recoveryTime; diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java index 0b0c9d856af7..179c74a87d74 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/storage/query/IAlarmQueryDAO.java @@ -68,14 +68,14 @@ default void parseDataBinary(byte[] dataBinary, List tags) { * Build the alarm message from the alarm record. * The Tags in JDBC storage is base64 encoded, need to decode in different way. 
*/ - default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord, Long recoveryTime) { + default AlarmMessage buildAlarmMessage(AlarmRecord alarmRecord) { AlarmMessage message = new AlarmMessage(); message.setId(String.valueOf(alarmRecord.getId0())); message.setId1(String.valueOf(alarmRecord.getId1())); + message.setUuid(alarmRecord.getUuid()); message.setName(alarmRecord.getName()); message.setMessage(alarmRecord.getAlarmMessage()); message.setStartTime(alarmRecord.getStartTime()); - message.setRecoveryTime(recoveryTime); message.setScope(Scope.Finder.valueOf(alarmRecord.getScope())); message.setScopeId(alarmRecord.getScope()); AlarmSnapshot alarmSnapshot = message.getSnapshot(); diff --git a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java index fffe99666135..a2908bdcd83c 100644 --- a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java +++ b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java @@ -37,9 +37,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.stream.Collectors; /** * {@link org.apache.skywalking.oap.server.core.alarm.AlarmRecord} is a stream, @@ -63,7 +66,7 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, Dur final boolean isColdStage = duration != null && duration.isColdStage(); StreamQueryResponse resp = query(isColdStage, AlarmRecord.INDEX_NAME, TAGS, getTimestampRange(duration), 
- new QueryBuilder() { + new QueryBuilder<>() { @Override public void apply(StreamQuery query) { if (Objects.nonNull(scopeId)) { @@ -93,35 +96,51 @@ public void apply(StreamQuery query) { AlarmRecord alarmRecord = builder.storage2Entity( new BanyanDBConverter.StorageToStream(AlarmRecord.INDEX_NAME, rowEntity) ); - Long recoveryTime = getAlarmRecoveryTime(alarmRecord.getUuid(), duration); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinary(alarmRecord.getTagsRawData(), alarmMessage.getTags()); } alarms.getMsgs().add(alarmMessage); } + updateAlarmRecoveryTime(alarms, duration); return alarms; } - private Long getAlarmRecoveryTime(String uuid, Duration duration) throws IOException { - if (StringUtil.isBlank(uuid)) { - return null; + private void updateAlarmRecoveryTime(Alarms alarms, Duration duration) throws IOException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if (alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws IOException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; } final boolean isColdStage = duration != null && duration.isColdStage(); + List uuids = msgs.stream().map(AlarmMessage::getUuid).collect(Collectors.toList()); StreamQueryResponse resp = query(isColdStage, AlarmRecoveryRecord.INDEX_NAME, RECOVERY_TAGS, - getTimestampRange(duration), new QueryBuilder() { + getTimestampRange(duration), + new QueryBuilder<>() { @Override public void apply(StreamQuery query) { - 
query.and(eq(AlarmRecoveryRecord.UUID, uuid)); + query.and(in(AlarmRecoveryRecord.UUID, uuids)); } }); + for (final RowEntity rowEntity : resp.getElements()) { AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity( new BanyanDBConverter.StorageToStream(AlarmRecoveryRecord.INDEX_NAME, rowEntity) ); - return alarmRecoveryRecord.getRecoveryTime(); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); } - return null; + return result; } } diff --git a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java index c3cff6affc97..513d79899e5d 100644 --- a/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java +++ b/oap-server/server-storage-plugin/storage-elasticsearch-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/elasticsearch/query/AlarmQueryEsDAO.java @@ -19,11 +19,6 @@ package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.query; import com.google.common.base.Strings; - -import java.io.IOException; -import java.util.List; -import java.util.Objects; - import org.apache.skywalking.library.elasticsearch.requests.search.BoolQueryBuilder; import org.apache.skywalking.library.elasticsearch.requests.search.Query; import org.apache.skywalking.library.elasticsearch.requests.search.Search; @@ -40,12 +35,18 @@ import org.apache.skywalking.oap.server.core.storage.query.IAlarmQueryDAO; import org.apache.skywalking.oap.server.library.client.elasticsearch.ElasticSearchClient; import org.apache.skywalking.oap.server.library.util.CollectionUtils; -import 
org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.ElasticSearchConverter; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.EsDAO; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.IndexController; import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base.MatchCNameBuilder; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + public class AlarmQueryEsDAO extends EsDAO implements IAlarmQueryDAO { public AlarmQueryEsDAO(ElasticSearchClient client) { @@ -96,20 +97,34 @@ public Alarms getAlarm(final Integer scopeId, final String keyword, final int li for (SearchHit searchHit : response.getHits().getHits()) { AlarmRecord.Builder builder = new AlarmRecord.Builder(); AlarmRecord alarmRecord = builder.storage2Entity(new ElasticSearchConverter.ToEntity(AlarmRecord.INDEX_NAME, searchHit.getSource())); - Long recoveryTime = getAlarmRecoveryTime(alarmRecord.getUuid(), duration); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinary(alarmRecord.getTagsRawData(), alarmMessage.getTags()); } alarms.getMsgs().add(alarmMessage); } + updateAlarmRecoveryTime(alarms, duration); return alarms; } - private Long getAlarmRecoveryTime(String uuid, Duration duration) { - if (StringUtil.isBlank(uuid)) { - return null; + private void updateAlarmRecoveryTime(Alarms alarms, Duration duration) throws IOException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if 
(alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws IOException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; } + List uuids = msgs.stream().map(AlarmMessage::getUuid).collect(Collectors.toList()); long startTB = duration.getStartTimeBucketInSec(); long endTB = duration.getEndTimeBucketInSec(); final String index = @@ -121,16 +136,15 @@ private Long getAlarmRecoveryTime(String uuid, Duration duration) { if (startTB != 0 && endTB != 0) { query.must(Query.range(AlarmRecord.TIME_BUCKET).gte(startTB).lte(endTB)); } - query.must(Query.term(AlarmRecoveryRecord.UUID, uuid)); + query.must(Query.terms(AlarmRecoveryRecord.UUID, uuids)); final SearchBuilder search = - Search.builder().query(query) - .size(1).from(1); + Search.builder().query(query); SearchResponse response = getClient().search(index, search.build()); for (SearchHit searchHit : response.getHits().getHits()) { AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity(new ElasticSearchConverter.ToEntity(AlarmRecoveryRecord.INDEX_NAME, searchHit.getSource())); - return alarmRecoveryRecord.getRecoveryTime(); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); } - return null; + return result; } } diff --git a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java index 075436f1a2cc..ff109be82b21 100644 --- a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java 
+++ b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/dao/JDBCAlarmQueryDAO.java @@ -37,7 +37,6 @@ import org.apache.skywalking.oap.server.library.client.jdbc.hikaricp.JDBCClient; import org.apache.skywalking.oap.server.library.module.ModuleManager; import org.apache.skywalking.oap.server.library.util.CollectionUtils; -import org.apache.skywalking.oap.server.library.util.StringUtil; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.JDBCEntityConverters; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.JDBCTableInstaller; import org.apache.skywalking.oap.server.storage.plugin.jdbc.common.SQLAndParameters; @@ -46,14 +45,17 @@ import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; import static java.util.Comparator.comparing; import static java.util.Objects.nonNull; import static java.util.function.Predicate.not; +import static java.util.stream.Collectors.joining; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toSet; @@ -96,8 +98,7 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, AlarmRecord.Builder builder = new AlarmRecord.Builder(); Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); AlarmRecord alarmRecord = builder.storage2Entity(convert2Entity); - Long recoveryTime = getRecoveryTime(alarmRecord.getUuid(), duration); - AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord, recoveryTime); + AlarmMessage alarmMessage = buildAlarmMessage(alarmRecord); if (!CollectionUtils.isEmpty(alarmRecord.getTagsRawData())) { parseDataBinaryBase64( new String(alarmRecord.getTagsRawData(), Charsets.UTF_8), alarmMessage.getTags()); @@ -107,7 +108,7 @@ public Alarms getAlarm(Integer 
scopeId, String keyword, int limit, int from, return null; }, sqlAndParameters.parameters()); } - return new Alarms( + Alarms alarms = new Alarms( alarmMsgs .stream() .sorted(comparing(AlarmMessage::getStartTime).reversed()) @@ -115,6 +116,44 @@ public Alarms getAlarm(Integer scopeId, String keyword, int limit, int from, .limit(limit) .collect(toList()) ); + updateAlarmRecoveryTime(alarms, duration); + return alarms; + } + + private void updateAlarmRecoveryTime(Alarms alarms, Duration duration) throws SQLException { + List alarmMessages = alarms.getMsgs(); + Map alarmRecoveryRecordMap = getAlarmRecoveryRecord(alarmMessages, duration); + alarmMessages.forEach(alarmMessage -> { + AlarmRecoveryRecord alarmRecoveryRecord = alarmRecoveryRecordMap.get(alarmMessage.getUuid()); + if (alarmRecoveryRecord != null) { + alarmMessage.setRecoveryTime(alarmRecoveryRecord.getRecoveryTime()); + } + }); + + } + + private Map getAlarmRecoveryRecord(List msgs, Duration duration) throws SQLException { + Map result = new HashMap<>(); + if (CollectionUtils.isEmpty(msgs)) { + return result; + } + List uuids = msgs.stream().map(AlarmMessage::getUuid).collect(toList()); + final var tables = tableHelper.getTablesForRead( + AlarmRecoveryRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() + ); + for (final var table : tables) { + final var sqlAndParameters = buildSQL4Recovery(uuids, table); + jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { + while (resultSet.next()) { + AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); + Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); + AlarmRecoveryRecord alarmRecoveryRecord = builder.storage2Entity(convert2Entity); + result.put(alarmRecoveryRecord.getUuid(), alarmRecoveryRecord); + } + return null; + }, sqlAndParameters.parameters()); + } + return result; } protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, int from, @@ -175,37 +214,16 @@ 
protected SQLAndParameters buildSQL(Integer scopeId, String keyword, int limit, return new SQLAndParameters(sql.toString(), parameters); } - private Long getRecoveryTime(String uuid, Duration duration) throws SQLException { - if (StringUtil.isBlank(uuid)) { - return null; - } - final var tables = tableHelper.getTablesForRead( - AlarmRecoveryRecord.INDEX_NAME, duration.getStartTimeBucket(), duration.getEndTimeBucket() - ); - final AlarmRecoveryRecord[] alarmRecoveryRecords = {null}; - for (final var table : tables) { - final var sqlAndParameters = buildSQL4Recovery(uuid, duration, table); - jdbcClient.executeQuery(sqlAndParameters.sql(), resultSet -> { - while (resultSet.next()) { - AlarmRecoveryRecord.Builder builder = new AlarmRecoveryRecord.Builder(); - Convert2Entity convert2Entity = JDBCEntityConverters.toEntity(resultSet); - alarmRecoveryRecords[0] = builder.storage2Entity(convert2Entity); - } - return null; - }, sqlAndParameters.parameters()); - } - return alarmRecoveryRecords[0] == null ? null : alarmRecoveryRecords[0].getRecoveryTime(); - } - - private SQLAndParameters buildSQL4Recovery(String uuid, Duration duration, String table) { + private SQLAndParameters buildSQL4Recovery(List uuids, String table) { final var sql = new StringBuilder(); final var parameters = new ArrayList<>(); sql.append("select * from ").append(table); sql.append(" where ") .append(table).append(".").append(JDBCTableInstaller.TABLE_COLUMN).append(" = ? 
"); parameters.add(AlarmRecoveryRecord.INDEX_NAME); - sql.append(" and ").append(AlarmRecoveryRecord.UUID).append(" = ?"); - parameters.add(uuid); + sql.append(" and ").append(AlarmRecoveryRecord.UUID).append(" in ") + .append(uuids.stream().map(it -> "?").collect(joining(", ", "(", ")"))); + parameters.addAll(uuids); return new SQLAndParameters(sql.toString(), parameters); } } diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml b/test/e2e-v2/cases/alarm/alarm-settings.yml index 04c29ae8e2ac..9bdf5430171f 100755 --- a/test/e2e-v2/cases/alarm/alarm-settings.yml +++ b/test/e2e-v2/cases/alarm/alarm-settings.yml @@ -30,7 +30,7 @@ rules: expression: sum(service_percentile{p='50,75,90,95,99'} > 100) >= 3 period: 10 silence-period: 1 - message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. tags: level: WARNING receivers: lisi @@ -40,7 +40,7 @@ rules: expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1 period: 10 recovery-observation-period: 3 - message: Service {name} response time is more than 10ms and sla is more than 1%. + message: Service {name} response time is more than 100ms and sla is more than 1%. 
tags: level: CRITICAL receivers: zhangsan diff --git a/test/e2e-v2/cases/alarm/banyandb/e2e.yaml b/test/e2e-v2/cases/alarm/banyandb/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/alarm/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/es/e2e.yaml b/test/e2e-v2/cases/alarm/es/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/es/e2e.yaml +++ b/test/e2e-v2/cases/alarm/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml index a1fa0e36b91d..0ab7c649dded 100644 --- a/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/alarm/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml index db025b74a18f..607e3c3eb501 100644 --- a/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/recovery-after-observation-webhook.yml @@ -22,7 +22,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: service_percentile_rule - alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. 
+ alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. startTime: {{ gt .startTime 0 }} recoveryTime: {{ gt .recoveryTime 0 }} tags: @@ -36,7 +36,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ gt .recoveryTime 0 }} tags: diff --git a/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml index 95eccf59c0aa..84faee28ed9e 100644 --- a/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/recovery-webhook.yml @@ -22,7 +22,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: service_percentile_rule - alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
startTime: {{ gt .startTime 0 }} recoveryTime: {{ gt .recoveryTime 0 }} tags: diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml index ebc688b6c4a8..a4b608c285c6 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-critical.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + message: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml index 0476d6efafde..50b81cb282d4 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
tags: - key: level value: WARNING diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml index c9bd7a4f21ac..443889614a94 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-webhook.yml @@ -35,7 +35,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: service_percentile_rule - alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -49,7 +49,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -77,7 +77,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: service_percentile_rule - alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -91,7 +91,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -105,7 +105,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -119,7 +119,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -133,7 +133,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -147,7 +147,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. 
startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -161,7 +161,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml index ebc688b6c4a8..a4b608c285c6 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-critical.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + message: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. tags: - key: level value: CRITICAL diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml index dc9283b298bf..6d19c8185deb 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml @@ -19,7 +19,7 @@ msgs: scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 name: e2e-service-provider - message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. 
tags: - key: level value: WARNING diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml index faa28ed8607b..294d7de7de87 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-webhook.yml @@ -35,7 +35,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: service_percentile_rule - alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + alarmMessage: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 100, p75 > 100, p90 > 100, p95 > 100, p99 > 100. startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: @@ -49,7 +49,7 @@ messages: id0: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 id1: "" ruleName: comp_rule - alarmMessage: Service e2e-service-provider response time is more than 10ms and sla is more than 1%. + alarmMessage: Service e2e-service-provider response time is more than 100ms and sla is more than 1%. 
startTime: {{ gt .startTime 0 }} recoveryTime: {{ le .recoveryTime 0 }} tags: diff --git a/test/e2e-v2/cases/alarm/mysql/e2e.yaml b/test/e2e-v2/cases/alarm/mysql/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/mysql/e2e.yaml +++ b/test/e2e-v2/cases/alarm/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/alarm/postgres/e2e.yaml b/test/e2e-v2/cases/alarm/postgres/e2e.yaml index 4dcb90bf0f7d..2dab123bc37e 100644 --- a/test/e2e-v2/cases/alarm/postgres/e2e.yaml +++ b/test/e2e-v2/cases/alarm/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: 45 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' From 5829a48e0f3a43a2e2365242c6ae5da1a0a8b1c6 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sun, 19 Oct 2025 00:19:14 +0800 Subject: [PATCH 09/21] enhance the alarm kernel with recovered status notification capability #13492 --- .github/workflows/skywalking.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index a1db50764bd9..54c1a8919ce1 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -768,7 +768,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f + uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -832,7 +832,7 @@ jobs: username: ${{ github.repository_owner }} password: 
${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f + uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -893,7 +893,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f + uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -956,7 +956,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f + uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1052,7 +1052,7 @@ jobs: # fi # docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v # - name: ${{ matrix.test.name }} -# uses: apache/skywalking-infra-e2e@4e51bfbdcc3622cedcad90294c2f8c909f96943f +# uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f # with: # e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} # - if: ${{ failure() }} From f97ad0c1304613166fd1c23a9beee3a54bd9ddbe Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sun, 26 Oct 2025 02:45:04 +0800 Subject: [PATCH 10/21] enhance the alarm kernel with recovered status notification capability #13492 --- test/e2e-v2/cases/apisix/otel-collector/e2e.yaml | 2 +- test/e2e-v2/cases/aws/api-gateway/e2e.yaml | 2 +- test/e2e-v2/cases/aws/dynamodb/e2e.yaml | 2 +- test/e2e-v2/cases/aws/eks/e2e.yaml | 2 +- test/e2e-v2/cases/aws/s3/e2e.yaml | 2 +- test/e2e-v2/cases/baseline/banyandb/e2e.yaml | 2 +- 
test/e2e-v2/cases/baseline/es/e2e.yaml | 2 +- test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/exporter/kafka/e2e.yaml | 2 +- test/e2e-v2/cases/flink/e2e.yaml | 2 +- test/e2e-v2/cases/gateway/e2e.yaml | 2 +- test/e2e-v2/cases/go/e2e.yaml | 2 +- test/e2e-v2/cases/kafka/log/e2e.yaml | 2 +- test/e2e-v2/cases/kafka/meter/e2e.yaml | 2 +- test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml | 2 +- test/e2e-v2/cases/log/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/log/es/e2e.yaml | 2 +- test/e2e-v2/cases/log/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/log/fluent-bit/e2e.yaml | 2 +- test/e2e-v2/cases/log/mysql/e2e.yaml | 2 +- test/e2e-v2/cases/log/postgres/e2e.yaml | 2 +- test/e2e-v2/cases/logql/e2e.yaml | 2 +- test/e2e-v2/cases/lua/e2e.yaml | 2 +- test/e2e-v2/cases/menu/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/menu/es/e2e.yaml | 2 +- test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/menu/mysql/e2e.yaml | 2 +- test/e2e-v2/cases/menu/opensearch/e2e.yaml | 2 +- test/e2e-v2/cases/menu/postgres/e2e.yaml | 2 +- test/e2e-v2/cases/meter/e2e.yaml | 2 +- test/e2e-v2/cases/mqe/e2e.yaml | 2 +- test/e2e-v2/cases/nginx/e2e.yaml | 2 +- test/e2e-v2/cases/nodejs/e2e.yaml | 2 +- test/e2e-v2/cases/otlp-traces/e2e.yaml | 2 +- test/e2e-v2/cases/php/e2e.yaml | 2 +- test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml | 2 +- .../e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/promql/e2e.yaml | 2 +- test/e2e-v2/cases/python/e2e.yaml | 2 +- test/e2e-v2/cases/satellite/native-protocols/e2e.yaml | 2 +- test/e2e-v2/cases/simple/auth/e2e.yaml | 2 +- test/e2e-v2/cases/simple/jdk/e2e.yaml | 2 +- test/e2e-v2/cases/simple/mtls/e2e.yaml | 2 +- test/e2e-v2/cases/simple/ssl/e2e.yaml | 2 +- test/e2e-v2/cases/so11y/e2e.yaml | 2 +- test/e2e-v2/cases/storage/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml | 2 +- 
test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml | 2 +- test/e2e-v2/cases/storage/es/e2e.yaml | 2 +- test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/storage/mysql/e2e.yaml | 2 +- test/e2e-v2/cases/storage/opensearch/e2e.yaml | 2 +- test/e2e-v2/cases/storage/postgres/e2e.yaml | 2 +- test/e2e-v2/cases/virtual-mq/e2e.yaml | 2 +- test/e2e-v2/cases/win/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/banyandb/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/es/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/kafka/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/mysql/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/opensearch/e2e.yaml | 2 +- test/e2e-v2/cases/zipkin/postgres/e2e.yaml | 2 +- 63 files changed, 63 insertions(+), 63 deletions(-) diff --git a/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml b/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml index 283c4d3e111a..4dd6328dc443 100644 --- a/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml +++ b/test/e2e-v2/cases/apisix/otel-collector/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9089/info/ method: GET diff --git a/test/e2e-v2/cases/aws/api-gateway/e2e.yaml b/test/e2e-v2/cases/aws/api-gateway/e2e.yaml index 2b221d493b9f..29039e270ebb 100644 --- a/test/e2e-v2/cases/aws/api-gateway/e2e.yaml +++ b/test/e2e-v2/cases/aws/api-gateway/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/dynamodb/e2e.yaml b/test/e2e-v2/cases/aws/dynamodb/e2e.yaml index ef3a9b14802c..72c8217a8524 100644 --- a/test/e2e-v2/cases/aws/dynamodb/e2e.yaml +++ b/test/e2e-v2/cases/aws/dynamodb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/eks/e2e.yaml 
b/test/e2e-v2/cases/aws/eks/e2e.yaml index 938e46a1a3f5..f60b668f3450 100644 --- a/test/e2e-v2/cases/aws/eks/e2e.yaml +++ b/test/e2e-v2/cases/aws/eks/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/aws/s3/e2e.yaml b/test/e2e-v2/cases/aws/s3/e2e.yaml index 7ae7e0d4e22b..cf4e43a45f0f 100644 --- a/test/e2e-v2/cases/aws/s3/e2e.yaml +++ b/test/e2e-v2/cases/aws/s3/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://localhost:9093/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/baseline/banyandb/e2e.yaml b/test/e2e-v2/cases/baseline/banyandb/e2e.yaml index 4dcb90bf0f7d..da9bbc39e26a 100644 --- a/test/e2e-v2/cases/baseline/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/baseline/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/baseline/es/e2e.yaml b/test/e2e-v2/cases/baseline/es/e2e.yaml index 4dcb90bf0f7d..da9bbc39e26a 100644 --- a/test/e2e-v2/cases/baseline/es/e2e.yaml +++ b/test/e2e-v2/cases/baseline/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml index a1fa0e36b91d..272a070c744c 100644 --- a/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/baseline/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 30 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git 
a/test/e2e-v2/cases/exporter/kafka/e2e.yaml b/test/e2e-v2/cases/exporter/kafka/e2e.yaml index d7220c4cfac2..6de4c0bb443d 100644 --- a/test/e2e-v2/cases/exporter/kafka/e2e.yaml +++ b/test/e2e-v2/cases/exporter/kafka/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/flink/e2e.yaml b/test/e2e-v2/cases/flink/e2e.yaml index 2c523e2d9bf3..06e9a080e6d5 100644 --- a/test/e2e-v2/cases/flink/e2e.yaml +++ b/test/e2e-v2/cases/flink/e2e.yaml @@ -32,7 +32,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${jobmanager_host}:${jobmanager_9260}/metrics method: GET diff --git a/test/e2e-v2/cases/gateway/e2e.yaml b/test/e2e-v2/cases/gateway/e2e.yaml index e3270e2dfa67..177f1ec5832c 100644 --- a/test/e2e-v2/cases/gateway/e2e.yaml +++ b/test/e2e-v2/cases/gateway/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/go/e2e.yaml b/test/e2e-v2/cases/go/e2e.yaml index 21b33d4e2519..b5c7aa39e1a7 100644 --- a/test/e2e-v2/cases/go/e2e.yaml +++ b/test/e2e-v2/cases/go/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/correlation method: POST diff --git a/test/e2e-v2/cases/kafka/log/e2e.yaml b/test/e2e-v2/cases/kafka/log/e2e.yaml index 1c80a57c29dc..037b6cad789a 100644 --- a/test/e2e-v2/cases/kafka/log/e2e.yaml +++ b/test/e2e-v2/cases/kafka/log/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/kafka/meter/e2e.yaml b/test/e2e-v2/cases/kafka/meter/e2e.yaml index 
ca7a520eaada..c7f363d6d18c 100644 --- a/test/e2e-v2/cases/kafka/meter/e2e.yaml +++ b/test/e2e-v2/cases/kafka/meter/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml b/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml index a91249e4e4f8..aefc8568d335 100644 --- a/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml +++ b/test/e2e-v2/cases/kafka/simple-so11y/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/log/banyandb/e2e.yaml b/test/e2e-v2/cases/log/banyandb/e2e.yaml index eb1725bc750a..bc8437238c97 100644 --- a/test/e2e-v2/cases/log/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/log/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/es/e2e.yaml b/test/e2e-v2/cases/log/es/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/es/e2e.yaml +++ b/test/e2e-v2/cases/log/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml index 592267acd079..b9b08e0fe888 100644 --- a/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/log/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/fluent-bit/e2e.yaml b/test/e2e-v2/cases/log/fluent-bit/e2e.yaml 
index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/fluent-bit/e2e.yaml +++ b/test/e2e-v2/cases/log/fluent-bit/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/mysql/e2e.yaml b/test/e2e-v2/cases/log/mysql/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/mysql/e2e.yaml +++ b/test/e2e-v2/cases/log/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/log/postgres/e2e.yaml b/test/e2e-v2/cases/log/postgres/e2e.yaml index ca75d051fd7d..3a76cd046fdb 100644 --- a/test/e2e-v2/cases/log/postgres/e2e.yaml +++ b/test/e2e-v2/cases/log/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/logs/trigger method: GET diff --git a/test/e2e-v2/cases/logql/e2e.yaml b/test/e2e-v2/cases/logql/e2e.yaml index 5a4519cc738b..df8f3585e131 100644 --- a/test/e2e-v2/cases/logql/e2e.yaml +++ b/test/e2e-v2/cases/logql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/lua/e2e.yaml b/test/e2e-v2/cases/lua/e2e.yaml index 3a4a2340b77f..53433d47cb5e 100644 --- a/test/e2e-v2/cases/lua/e2e.yaml +++ b/test/e2e-v2/cases/lua/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider-entry_host}:${provider-entry_9090}/nginx/entry/info method: POST diff --git a/test/e2e-v2/cases/menu/banyandb/e2e.yaml b/test/e2e-v2/cases/menu/banyandb/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/banyandb/e2e.yaml +++ 
b/test/e2e-v2/cases/menu/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/es/e2e.yaml b/test/e2e-v2/cases/menu/es/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/es/e2e.yaml +++ b/test/e2e-v2/cases/menu/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml index 0e765d63bf9f..3a53dd4fe4b3 100644 --- a/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/menu/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/mysql/e2e.yaml b/test/e2e-v2/cases/menu/mysql/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/mysql/e2e.yaml +++ b/test/e2e-v2/cases/menu/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/opensearch/e2e.yaml b/test/e2e-v2/cases/menu/opensearch/e2e.yaml index bae14e111aa0..74b8b82d445a 100644 --- a/test/e2e-v2/cases/menu/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/menu/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/menu/postgres/e2e.yaml 
b/test/e2e-v2/cases/menu/postgres/e2e.yaml index 84bc9fa55523..7784f0b2f70e 100644 --- a/test/e2e-v2/cases/menu/postgres/e2e.yaml +++ b/test/e2e-v2/cases/menu/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/meter/e2e.yaml b/test/e2e-v2/cases/meter/e2e.yaml index 0dd74fc67b08..cc4bd72ccb44 100644 --- a/test/e2e-v2/cases/meter/e2e.yaml +++ b/test/e2e-v2/cases/meter/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/mqe/e2e.yaml b/test/e2e-v2/cases/mqe/e2e.yaml index de8031630149..3c0a91b4901a 100644 --- a/test/e2e-v2/cases/mqe/e2e.yaml +++ b/test/e2e-v2/cases/mqe/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/nginx/e2e.yaml b/test/e2e-v2/cases/nginx/e2e.yaml index 1c55aee1de6e..56b3983cba62 100644 --- a/test/e2e-v2/cases/nginx/e2e.yaml +++ b/test/e2e-v2/cases/nginx/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${nginx_host}:${nginx_8080}/test method: POST diff --git a/test/e2e-v2/cases/nodejs/e2e.yaml b/test/e2e-v2/cases/nodejs/e2e.yaml index bccedb26fd23..c26660e3d652 100644 --- a/test/e2e-v2/cases/nodejs/e2e.yaml +++ b/test/e2e-v2/cases/nodejs/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_5001}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/otlp-traces/e2e.yaml b/test/e2e-v2/cases/otlp-traces/e2e.yaml index 1a1e3151873a..f40e58628167 
100644 --- a/test/e2e-v2/cases/otlp-traces/e2e.yaml +++ b/test/e2e-v2/cases/otlp-traces/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8080}/api/products method: GET diff --git a/test/e2e-v2/cases/php/e2e.yaml b/test/e2e-v2/cases/php/e2e.yaml index 83aeb3c1b2eb..3b559c6f5887 100644 --- a/test/e2e-v2/cases/php/e2e.yaml +++ b/test/e2e-v2/cases/php/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 20 + times: -1 url: http://${php_host}:${php_8080}/php/info method: POST diff --git a/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml index 06eae493c5bf..9f9befaa8ea1 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/banyandb/e2e.yaml @@ -83,7 +83,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: GET diff --git a/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml index 1d7f85a49227..4f05c0d2aca3 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/es/e2e.yaml @@ -82,7 +82,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: GET diff --git a/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml index 51425ac79f7a..f3cff405949e 100644 --- a/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/profiling/ebpf/network/es/es-sharding/e2e.yaml @@ -82,7 +82,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${service_service_host}:${service_service_80}/consumer method: GET diff --git 
a/test/e2e-v2/cases/promql/e2e.yaml b/test/e2e-v2/cases/promql/e2e.yaml index 560e4ac6e690..18b3eca522c2 100644 --- a/test/e2e-v2/cases/promql/e2e.yaml +++ b/test/e2e-v2/cases/promql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/python/e2e.yaml b/test/e2e-v2/cases/python/e2e.yaml index baf4fe6139fe..d1c3e537837f 100644 --- a/test/e2e-v2/cases/python/e2e.yaml +++ b/test/e2e-v2/cases/python/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer-py_host}:${consumer-py_9090}/test method: POST diff --git a/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml b/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml index 457cb5303bf6..2461b74e3f25 100644 --- a/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml +++ b/test/e2e-v2/cases/satellite/native-protocols/e2e.yaml @@ -33,7 +33,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/info method: POST diff --git a/test/e2e-v2/cases/simple/auth/e2e.yaml b/test/e2e-v2/cases/simple/auth/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/auth/e2e.yaml +++ b/test/e2e-v2/cases/simple/auth/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/simple/jdk/e2e.yaml b/test/e2e-v2/cases/simple/jdk/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/jdk/e2e.yaml +++ b/test/e2e-v2/cases/simple/jdk/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff 
--git a/test/e2e-v2/cases/simple/mtls/e2e.yaml b/test/e2e-v2/cases/simple/mtls/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/mtls/e2e.yaml +++ b/test/e2e-v2/cases/simple/mtls/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/simple/ssl/e2e.yaml b/test/e2e-v2/cases/simple/ssl/e2e.yaml index ea34c28a7aea..5d4aa0f91278 100644 --- a/test/e2e-v2/cases/simple/ssl/e2e.yaml +++ b/test/e2e-v2/cases/simple/ssl/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/so11y/e2e.yaml b/test/e2e-v2/cases/so11y/e2e.yaml index fae4ea729616..8f22574e06cc 100644 --- a/test/e2e-v2/cases/so11y/e2e.yaml +++ b/test/e2e-v2/cases/so11y/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/e2e.yaml index 840ac9937222..a6d4d8489b5d 100644 --- a/test/e2e-v2/cases/storage/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 5s - times: 40 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml index 86f249bb5420..22d2548b0134 100644 --- a/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/stages/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 5s - times: 40 + times: -1 url: 
http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml b/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml index 93dd16f86739..c93bbedcebe4 100644 --- a/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml +++ b/test/e2e-v2/cases/storage/banyandb/tls/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 5s - times: 40 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/es/e2e.yaml b/test/e2e-v2/cases/storage/es/e2e.yaml index 6c1c49117ed4..35566af71a28 100644 --- a/test/e2e-v2/cases/storage/es/e2e.yaml +++ b/test/e2e-v2/cases/storage/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml index 13dbfb34e1a6..d9126d4002d7 100644 --- a/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/storage/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/mysql/e2e.yaml b/test/e2e-v2/cases/storage/mysql/e2e.yaml index 9c7aebe5f5be..774dc86c914d 100644 --- a/test/e2e-v2/cases/storage/mysql/e2e.yaml +++ b/test/e2e-v2/cases/storage/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/opensearch/e2e.yaml b/test/e2e-v2/cases/storage/opensearch/e2e.yaml index 6c1c49117ed4..35566af71a28 100644 --- 
a/test/e2e-v2/cases/storage/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/storage/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/storage/postgres/e2e.yaml b/test/e2e-v2/cases/storage/postgres/e2e.yaml index 9c7aebe5f5be..774dc86c914d 100644 --- a/test/e2e-v2/cases/storage/postgres/e2e.yaml +++ b/test/e2e-v2/cases/storage/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${consumer_host}:${consumer_9092}/users method: POST body: '{"id":"123","name":"skywalking"}' diff --git a/test/e2e-v2/cases/virtual-mq/e2e.yaml b/test/e2e-v2/cases/virtual-mq/e2e.yaml index d15976319248..7256e86f57d2 100644 --- a/test/e2e-v2/cases/virtual-mq/e2e.yaml +++ b/test/e2e-v2/cases/virtual-mq/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${provider_host}:${provider_9090}/kafka/send method: GET diff --git a/test/e2e-v2/cases/win/e2e.yaml b/test/e2e-v2/cases/win/e2e.yaml index 275eb17eb051..5b884fb7fec3 100644 --- a/test/e2e-v2/cases/win/e2e.yaml +++ b/test/e2e-v2/cases/win/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${sender_host}:${sender_9093}/otel-metrics/send method: GET diff --git a/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml b/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml index 8749cd70d7dc..b77ab08e18d7 100644 --- a/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/banyandb/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/es/e2e.yaml b/test/e2e-v2/cases/zipkin/es/e2e.yaml index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/es/e2e.yaml +++ 
b/test/e2e-v2/cases/zipkin/es/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml b/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml index 5cf2bc49ef05..054913ec43e0 100644 --- a/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/es/es-sharding/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/kafka/e2e.yaml b/test/e2e-v2/cases/zipkin/kafka/e2e.yaml index 90d877db1405..e6b46033097c 100644 --- a/test/e2e-v2/cases/zipkin/kafka/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/kafka/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${sender_host}:${sender_9093}/sendZipkinTrace2Kafka method: POST diff --git a/test/e2e-v2/cases/zipkin/mysql/e2e.yaml b/test/e2e-v2/cases/zipkin/mysql/e2e.yaml index 8749cd70d7dc..b77ab08e18d7 100644 --- a/test/e2e-v2/cases/zipkin/mysql/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/mysql/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml b/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/opensearch/e2e.yaml @@ -31,7 +31,7 @@ setup: trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST diff --git a/test/e2e-v2/cases/zipkin/postgres/e2e.yaml b/test/e2e-v2/cases/zipkin/postgres/e2e.yaml index 8e3ff87be9f1..d55440880c48 100644 --- a/test/e2e-v2/cases/zipkin/postgres/e2e.yaml +++ b/test/e2e-v2/cases/zipkin/postgres/e2e.yaml @@ -31,7 +31,7 @@ setup: 
trigger: action: http interval: 3s - times: 10 + times: -1 url: http://${frontend_host}:${frontend_8081} method: POST From 587b2aadab194f5ad5f32d49419e49092f595d95 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Fri, 31 Oct 2025 02:23:58 +0800 Subject: [PATCH 11/21] chore(e2e): set allowed times to <=0 for endless trigger simulation --- .github/workflows/skywalking.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index 2a7608c7ac04..54fb9422ae85 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -775,7 +775,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f + uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -839,7 +839,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f + uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -900,7 +900,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f + uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -963,7 +963,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml 
clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f + uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1059,7 +1059,7 @@ jobs: # fi # docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v # - name: ${{ matrix.test.name }} -# uses: apache/skywalking-infra-e2e@04c668d691073276fb808f4d1cc7c052d8f2327f +# uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 # with: # e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} # - if: ${{ failure() }} From c4da5d2a8a190b13f0239aa0d5d8b1daf78e3450 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Thu, 6 Nov 2025 15:57:04 +0800 Subject: [PATCH 12/21] chore:add logs for troubleshooting --- .../plugin/jdbc/common/TableHelper.java | 88 +++++++++++-------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java index 95575b6da78a..9d6603c337b2 100644 --- a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java +++ b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java @@ -59,12 +59,16 @@ public class TableHelper { private final ConfigService configService = moduleManager.find(CoreModule.NAME).provider().getService(ConfigService.class); private final LoadingCache tableExistence = - CacheBuilder.newBuilder() + CacheBuilder.newBuilder() .expireAfterWrite(Duration.ofMinutes(10)) .build(new CacheLoader<>() { @Override 
public @NonNull Boolean load(@NonNull String tableName) throws Exception { - return jdbcClient.tableExists(tableName); + boolean tableExists = jdbcClient.tableExists(tableName); + if (!tableExists) { + log.warn("Table {} is not exists.", tableName); + } + return tableExists; } }); @@ -114,17 +118,22 @@ public List getTablesForRead(String modelName, long timeBucketStart, lon } final var ttlTables = getTablesWithinTTL(modelName); - return getTablesInTimeBucketRange(modelName, timeBucketStart, timeBucketEnd) - .stream() - .filter(ttlTables::contains) - .filter(table -> { - try { - return tableExistence.get(table); - } catch (Exception e) { - throw new RuntimeException(e); - } - }) - .collect(toList()); + List tablesForRead = getTablesInTimeBucketRange(modelName, timeBucketStart, timeBucketEnd) + .stream() + .filter(ttlTables::contains) + .filter(table -> { + try { + return tableExistence.get(table); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .collect(toList()); + if (log.isDebugEnabled()) { + log.debug("TablesForRead for {}({}~{}) got {}", modelName, timeBucketStart, timeBucketEnd, + tablesForRead); + } + return tablesForRead; } /** @@ -145,11 +154,16 @@ public List getTablesInTimeBucketRange(String modelName, long timeBucket timeBuckets.add(TimeBucket.getTimeBucket(timestamp, DownSampling.Day)); } - return timeBuckets - .build() - .distinct() - .mapToObj(timeBucket -> getTable(rawTableName, timeBucket)) - .collect(toList()); + List tablesInTimeBucketRange = timeBuckets + .build() + .distinct() + .mapToObj(timeBucket -> getTable(rawTableName, timeBucket)) + .collect(toList()); + if (log.isDebugEnabled()) { + log.debug("TablesInTimeBucketRange for {}({}~{}) got {}", modelName, timeBucketStart, timeBucketEnd, + tablesInTimeBucketRange); + } + return tablesInTimeBucketRange; } public List getTablesWithinTTL(String modelName) { @@ -161,17 +175,21 @@ public List getTablesWithinTTL(String modelName) { } final var ttlTimeBuckets = 
getTTLTimeBuckets(model); - return ttlTimeBuckets - .stream() - .map(it -> getTable(rawTableName, it)) - .filter(table -> { - try { - return tableExistence.get(table); - } catch (Exception e) { - throw new RuntimeException(e); - } - }) - .collect(toList()); + List tableNameList = ttlTimeBuckets + .stream() + .map(it -> getTable(rawTableName, it)) + .filter(table -> { + try { + return tableExistence.get(table); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .collect(toList()); + if (log.isDebugEnabled()) { + log.debug("TablesWithinTTL for {} got {}.", modelName, tableNameList); + } + return tableNameList; } public static String generateId(Model model, String originalID) { @@ -199,12 +217,12 @@ public static long getTimeBucket(String table) { List getTTLTimeBuckets(Model model) { final var ttl = model.isRecord() ? - getConfigService().getRecordDataTTL() : - getConfigService().getMetricsDataTTL(); + getConfigService().getRecordDataTTL() : + getConfigService().getMetricsDataTTL(); return LongStream - .range(0, ttl) - .mapToObj(it -> TimeBucket.getTimeBucket(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(it), DownSampling.Day)) - .distinct() - .collect(toList()); + .range(0, ttl) + .mapToObj(it -> TimeBucket.getTimeBucket(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(it), DownSampling.Day)) + .distinct() + .collect(toList()); } } From c6a8d832bd96a1627dce07c901c38fe62ca9121a Mon Sep 17 00:00:00 2001 From: youjie23 Date: Thu, 6 Nov 2025 16:02:10 +0800 Subject: [PATCH 13/21] chore:add logs for troubleshooting --- .github/workflows/skywalking.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml index a1b2bda135d1..51f384738a88 100644 --- a/.github/workflows/skywalking.yaml +++ b/.github/workflows/skywalking.yaml @@ -780,7 +780,7 @@ jobs: if: matrix.test.docker != null run: docker build -t ${{ matrix.test.docker.name }} -f ${{ 
matrix.test.docker.base }}/${{ matrix.test.docker.file }} ${{ matrix.test.docker.base }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 with: e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} - if: ${{ failure() }} @@ -844,7 +844,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -905,7 +905,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: ${{ matrix.test.name }} - uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: ISTIO_VERSION: ${{ matrix.versions.istio }} KUBERNETES_VERSION: ${{ matrix.versions.kubernetes }} @@ -968,7 +968,7 @@ jobs: shell: bash run: ./mvnw -B -q -f test/e2e-v2/java-test-service/pom.xml clean package - name: Java version ${{ matrix.java-version }} - uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 + uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 env: SW_AGENT_JDK_VERSION: ${{ matrix.java-version }} with: @@ -1064,7 +1064,7 @@ jobs: # fi # docker compose -f ${BANYANDB_DATA_GENERATE_ROOT}/docker-compose.yml down -v # - name: ${{ matrix.test.name }} -# uses: apache/skywalking-infra-e2e@d0b77685ed8175ae2cf9cff1cef20e893ff05a87 +# uses: apache/skywalking-infra-e2e@01b80d98a38154f4f80d9cdb128b9d81727f2b80 # with: # e2e-file: $GITHUB_WORKSPACE/${{ matrix.test.config }} # - if: ${{ failure() }} From 9c8651cd3e3ebe17df8bc2c45305bf45cecb7f18 Mon Sep 17 00:00:00 2001 From: youjie23 
Date: Thu, 6 Nov 2025 17:25:47 +0800 Subject: [PATCH 14/21] Revert "chore:add logs for troubleshooting" This reverts commit c4da5d2a8a190b13f0239aa0d5d8b1daf78e3450. --- .../plugin/jdbc/common/TableHelper.java | 88 ++++++++----------- 1 file changed, 35 insertions(+), 53 deletions(-) diff --git a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java index 9d6603c337b2..95575b6da78a 100644 --- a/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java +++ b/oap-server/server-storage-plugin/storage-jdbc-hikaricp-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/jdbc/common/TableHelper.java @@ -59,16 +59,12 @@ public class TableHelper { private final ConfigService configService = moduleManager.find(CoreModule.NAME).provider().getService(ConfigService.class); private final LoadingCache tableExistence = - CacheBuilder.newBuilder() + CacheBuilder.newBuilder() .expireAfterWrite(Duration.ofMinutes(10)) .build(new CacheLoader<>() { @Override public @NonNull Boolean load(@NonNull String tableName) throws Exception { - boolean tableExists = jdbcClient.tableExists(tableName); - if (!tableExists) { - log.warn("Table {} is not exists.", tableName); - } - return tableExists; + return jdbcClient.tableExists(tableName); } }); @@ -118,22 +114,17 @@ public List getTablesForRead(String modelName, long timeBucketStart, lon } final var ttlTables = getTablesWithinTTL(modelName); - List tablesForRead = getTablesInTimeBucketRange(modelName, timeBucketStart, timeBucketEnd) - .stream() - .filter(ttlTables::contains) - .filter(table -> { - try { - return tableExistence.get(table); - } catch (Exception e) { - throw new 
RuntimeException(e); - } - }) - .collect(toList()); - if (log.isDebugEnabled()) { - log.debug("TablesForRead for {}({}~{}) got {}", modelName, timeBucketStart, timeBucketEnd, - tablesForRead); - } - return tablesForRead; + return getTablesInTimeBucketRange(modelName, timeBucketStart, timeBucketEnd) + .stream() + .filter(ttlTables::contains) + .filter(table -> { + try { + return tableExistence.get(table); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .collect(toList()); } /** @@ -154,16 +145,11 @@ public List getTablesInTimeBucketRange(String modelName, long timeBucket timeBuckets.add(TimeBucket.getTimeBucket(timestamp, DownSampling.Day)); } - List tablesInTimeBucketRange = timeBuckets - .build() - .distinct() - .mapToObj(timeBucket -> getTable(rawTableName, timeBucket)) - .collect(toList()); - if (log.isDebugEnabled()) { - log.debug("TablesInTimeBucketRange for {}({}~{}) got {}", modelName, timeBucketStart, timeBucketEnd, - tablesInTimeBucketRange); - } - return tablesInTimeBucketRange; + return timeBuckets + .build() + .distinct() + .mapToObj(timeBucket -> getTable(rawTableName, timeBucket)) + .collect(toList()); } public List getTablesWithinTTL(String modelName) { @@ -175,21 +161,17 @@ public List getTablesWithinTTL(String modelName) { } final var ttlTimeBuckets = getTTLTimeBuckets(model); - List tableNameList = ttlTimeBuckets - .stream() - .map(it -> getTable(rawTableName, it)) - .filter(table -> { - try { - return tableExistence.get(table); - } catch (Exception e) { - throw new RuntimeException(e); - } - }) - .collect(toList()); - if (log.isDebugEnabled()) { - log.debug("TablesWithinTTL for {} got {}.", modelName, tableNameList); - } - return tableNameList; + return ttlTimeBuckets + .stream() + .map(it -> getTable(rawTableName, it)) + .filter(table -> { + try { + return tableExistence.get(table); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .collect(toList()); } public static String generateId(Model model, String 
originalID) { @@ -217,12 +199,12 @@ public static long getTimeBucket(String table) { List getTTLTimeBuckets(Model model) { final var ttl = model.isRecord() ? - getConfigService().getRecordDataTTL() : - getConfigService().getMetricsDataTTL(); + getConfigService().getRecordDataTTL() : + getConfigService().getMetricsDataTTL(); return LongStream - .range(0, ttl) - .mapToObj(it -> TimeBucket.getTimeBucket(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(it), DownSampling.Day)) - .distinct() - .collect(toList()); + .range(0, ttl) + .mapToObj(it -> TimeBucket.getTimeBucket(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(it), DownSampling.Day)) + .distinct() + .collect(toList()); } } From 7c2b0f56f450dc379a2c02d7030b58624be1e8a6 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Thu, 6 Nov 2025 19:09:38 +0800 Subject: [PATCH 15/21] chore: remove the commented-out code --- .../skywalking/oap/server/core/alarm/provider/RunningRule.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 6f952e9e42a9..3784685d921e 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -223,7 +223,6 @@ public List check() { log.trace("RuleName:{} AlarmEntity {} {} {} expired", ruleName, alarmEntity.getName(), alarmEntity.getId0(), alarmEntity.getId1()); } - //return; } Optional alarmMessageOptional = window.checkAlarm(); @@ -573,7 +572,6 @@ private void transitionTo(State newState) { break; case OBSERVING_RECOVERY: this.recoveryObservationCountdown = this.recoveryObservationPeriod - 1; - //this.silenceCountdown = -1; break; case RECOVERED: 
this.recoveryObservationCountdown = this.recoveryObservationPeriod; @@ -582,7 +580,6 @@ private void transitionTo(State newState) { } private void resetCountdowns() { - //silenceCountdown = -1; recoveryObservationCountdown = this.recoveryObservationPeriod; } From 4dcff48d903e970930d4fbd1f1f91ba9c1bdd474 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Sun, 9 Nov 2025 23:50:17 +0800 Subject: [PATCH 16/21] enhance the alarm kernel with recovered status notification capability #13492 --- docs/en/setup/backend/backend-alarm.md | 123 +++++++++++++++--- .../core/alarm/provider/RulesReader.java | 19 +-- .../core/alarm/provider/RunningRule.java | 6 +- .../dingtalk/DingtalkHookCallback.java | 8 +- .../provider/discord/DiscordHookCallback.java | 11 +- .../provider/feishu/FeishuHookCallback.java | 13 +- .../pagerduty/PagerDutyHookCallback.java | 12 +- .../provider/slack/SlackhookCallback.java | 9 +- .../provider/webhook/WebhookCallback.java | 9 +- .../provider/webhook/WebhookSettings.java | 1 + .../provider/wechat/WechatHookCallback.java | 19 +-- .../provider/welink/WeLinkHookCallback.java | 10 +- .../wechat/WechatHookCallbackTest.java | 38 ++++-- test/e2e-v2/cases/alarm/alarm-settings.yml | 6 + 14 files changed, 209 insertions(+), 75 deletions(-) diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index ed3cfc6f1618..1356fe0028bf 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -39,6 +39,9 @@ The metrics names in the expression could be found in the [list of all potential If the hook name is not specified, the global hook will be used. - **Silence period**. After the alarm is triggered at Time-N (TN), there will be silence during the **TN -> TN + period**. By default, it works in the same manner as **period**. The same Alarm (having the same ID in the same metrics name) may only be triggered once within a period. +- **Recovery observation period**. 
Defines the number of consecutive periods that the alarm condition must remain false before the alarm is considered recovered. When the alarm condition becomes false, the system enters an observation period. If the condition remains false for the specified number of periods, a recovery notification is sent. If the condition becomes true again during the observation period, the alarm returns to the FIRING state. +The default value is 0, which means immediate recovery notification when the condition becomes false. + Such as for a metric, there is a shifting window as following at T7. @@ -52,6 +55,7 @@ Such as for a metric, there is a shifting window as following at T7. For example, expression `avg(service_resp_time) > 1000`, if the value are `1001, 1001, 1001, 1001, 1001, 1001, 1001`, the calculation is `((1001 + 10001 + ... + 1001) / 7) > 1000` and the result would be `1`(true). Then the alarm would be triggered. * In every minute, the window would shift automatically. At T8, Value8 would be cached, and T1/Value1 would be removed from the window. +* If Value8 is 890, the expression will be calculated based on the metric values from T2 to T8, which are `1001, 1001, 1001, 1001, 1001, 1001, 890`. The calculation becomes `((1001 + 1001 + ... + 890) / 7) < 1000`, and the result would be `0`(false). Consequently, the alarm enters an observation period for recovery. If the `Recovery observation period` is not set or is set to `0`, the alarm is considered recovered immediately, and a recovery notification is sent. Otherwise, the system will wait and observe the condition over the specified number of subsequent periods before declaring recovery. **NOTE**: * If the expression include labeled metrics and result has multiple labeled value(e.g. `sum(service_percentile{p='50,75'} > 1000) >= 3`), the alarm will be triggered if any of the labeled value result matches 3 times of the condition(P50 > 1000 or P75 > 1000).
@@ -69,6 +73,8 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 10 + # Number of periods to wait before considering the alarm recovered, default as 0. + recovery-observation-period: 2 message: Successful rate of endpoint {name} is lower than 75% tags: level: WARNING @@ -163,6 +169,14 @@ hooks: "text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**." } } + recovery-text-template: |- + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**." + } + } webhooks: - https://hooks.slack.com/services/x/y/zssss custom1: @@ -192,12 +206,16 @@ webhook: custom1: urls: - http://127.0.0.1/custom1 + recovery-urls: + - http://127.0.0.1/custom1 # headers config is provided to add custom configurations or authentications that are required from the server side. headers: Authorization: Bearer bearer_token custom2: urls: - http://127.0.0.1/custom2 + recovery-urls: + - http://127.0.0.1/custom2 # headers config is provided to add custom configurations or authentications that are required from the server headers: Authorization: Basic basic_token @@ -213,11 +231,13 @@ webhook: The JSON format is based on `List` with the following key information: - **scopeId**, **scope**. All scopes are defined in `org.apache.skywalking.oap.server.core.source.DefaultScopeDefine`. - **name**. Target scope entity name. Please follow the [entity name definitions](#entity-name). +- **uuid**. The unique identifier (UUID) of the alarm, which is consistent between the trigger and recovery messages. - **id0**. The ID of the scope entity that matches with the name. When using the relation scope, it is the source entity ID. - **id1**. When using the relation scope, it is the destination entity ID. Otherwise, it is empty. - **ruleName**. The rule name configured in `alarm-settings.yml`. - **alarmMessage**. The alarm text message. -- **startTime**.
The alarm time measured in milliseconds, which occurs between the current time and the midnight of January 1, 1970 UTC. +- **startTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was triggered. +- **recoveryTime**. The time, in milliseconds since the Unix epoch (January 1, 1970 UTC), when the alarm was recovered. This value is `null` if the alarm has not been recovered. - **tags**. The tags configured in `alarm-settings.yml`. See the following example: @@ -226,11 +246,13 @@ See the following example: "scopeId": 1, "scope": "SERVICE", "name": "serviceA", + "uuid": "uuid1", "id0": "12", "id1": "", - "ruleName": "service_resp_time_rule", + "ruleName": "service_resp_time_rule", "alarmMessage": "alarmMessage xxxx", "startTime": 1560524171000, + "recoveryTime": 1560524771000, "tags": [{ "key": "level", "value": "WARNING" @@ -239,9 +261,10 @@ See the following example: "scopeId": 1, "scope": "SERVICE", "name": "serviceB", + "uuid": "uuid2", "id0": "23", "id1": "", - "ruleName": "service_resp_time_rule", + "ruleName": "service_resp_time_rule", "alarmMessage": "alarmMessage yyy", "startTime": 1560524171000, "tags": [{ @@ -275,6 +298,21 @@ message AlarmMessage { string alarmMessage = 7; int64 startTime = 8; AlarmTags tags = 9; + string uuid = 10; +} + +message AlarmRecoveryMessage { + int64 scopeId = 1; + string scope = 2; + string name = 3; + string id0 = 4; + string id1 = 5; + string ruleName = 6; + string alarmMessage = 7; + int64 startTime = 8; + AlarmTags tags = 9; + string uuid = 10; + int64 recoveryTime = 11; } message AlarmTags { @@ -304,6 +342,14 @@ slack: "text": ":alarm_clock: *Apache Skywalking Alarm* \n **%s**." } } + recovery-text-template: |- + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":green_heart: *Apache SkyWalking Alarm Recovered* \n **%s**." + } + } webhooks: - https://hooks.slack.com/services/x/y/z ``` @@ -322,6 +368,13 @@ wechat: "content": "Apache SkyWalking Alarm: \n %s."
} } + recovery-text-template: |- + { + "msgtype": "text", + "text": { + "content": "Apache SkyWalking Alarm Recovered: \n %s." + } + } webhooks: - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key ``` @@ -341,6 +394,13 @@ dingtalk: "content": "Apache SkyWalking Alarm: \n %s." } } + recovery-text-template: |- + { + "msgtype": "text", + "text": { + "content": "Apache SkyWalking Alarm Recovered: \n %s." + } + } webhooks: - url: https://oapi.dingtalk.com/robot/send?access_token=dummy_token secret: dummysecret @@ -363,6 +423,14 @@ feishu: }, "ats":"feishu_user_id_1,feishu_user_id_2" } + recovery-text-template: |- + { + "msg_type": "text", + "content": { + "text": "Apache SkyWalking Alarm Recovered: \n %s." + }, + "ats":"feishu_user_id_1,feishu_user_id_2" + } webhooks: - url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token secret: dummysecret @@ -376,6 +444,7 @@ welink: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." webhooks: # you may find your own client_id and client_secret in your app, below are dummy, need to change. - client-id: "dummy_client_id" @@ -400,6 +469,7 @@ pagerduty: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." integration-keys: - 5c6d805c9dcf4e03d09dfa81e8789ba1 ``` @@ -415,6 +485,7 @@ discord: default: is-default: true text-template: "Apache SkyWalking Alarm: \n %s." + recovery-text-template: "Apache SkyWalking Alarm Recovered: \n %s." 
webhooks: - url: https://discordapp.com/api/webhooks/1008166889777414645/8e0Am4Zb-YGbBqqbiiq0jSHPTEEaHa4j1vIC-zSSm231T8ewGxgY0_XUYpY-k1nN4HBl username: robot @@ -430,15 +501,37 @@ the sliding window will be destroyed and re-created, causing the Alarm of this s ### Keys with data types of alerting rule configuration file -| Alerting element | Configuration property key | Type | Description | -|----------------------|----------------------------|----------------|--------------------| -| Expression | expression | string | MQE expression | -| Include names | include-names | string array | | -| Exclude names | exclude-names | string array | | -| Include names regex | include-names-regex | string | Java regex Pattern | -| Exclude names regex | exclude-names-regex | string | Java regex Pattern | -| Tags | tags | key-value pair | | -| Period | Period | int | | -| Silence period | silence-period | int | | -| Message | message | string | | -| Hooks | hooks | string array | | +| Alerting element | Configuration property key | Type | Description | +| --------------------------- | --------------------------- | -------------- | ------------------ | +| Expression | expression | string | MQE expression | +| Include names | include-names | string array | | +| Exclude names | exclude-names | string array | | +| Include names regex | include-names-regex | string | Java regex Pattern | +| Exclude names regex | exclude-names-regex | string | Java regex Pattern | +| Tags | tags | key-value pair | | +| Period | period | int | | +| Silence period | silence-period | int | | +| Recovery observation period | recovery-observation-period | int | | +| Message | message | string | | +| Hooks | hooks | string array | | + +## Alarm state transition +The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.3.0 is as follows: +```mermaid +stateDiagram-v2 + [*] --> NORMAL + NORMAL --> FIRING: Expression match
SilencePeriod reached + + FIRING --> SILENCED: Expression match
SilencePeriod reached + FIRING --> OBSERVING_RECOVERY: Expression mismatch
RecoveryObservationPeriod unreached + FIRING --> RECOVERED: Expression mismatch
RecoveryObservationPeriod reached + + SILENCED --> OBSERVING_RECOVERY: Expression mismatch
RecoveryObservationPeriod unreached + SILENCED --> RECOVERED: Expression mismatch
RecoveryObservationPeriod reached + + OBSERVING_RECOVERY --> FIRING: Expression match
SilencePeriod reached + OBSERVING_RECOVERY --> RECOVERED: Expression mismatch
RecoveryObservationPeriod reached + + RECOVERED --> FIRING: Expression match
SilencePeriod reached + RECOVERED --> NORMAL: Expression mismatch +``` \ No newline at end of file diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index 75a60142aa2c..36331230a636 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -51,7 +51,6 @@ * Rule Reader parses the given `alarm-settings.yml` config file, to the target {@link Rules}. */ public class RulesReader { - public static final String RECOVERED = "[Recovered]"; private Map yamlData; private final Set defaultHooks = new HashSet<>(); private final Set allHooks = new HashSet<>(); @@ -165,6 +164,10 @@ private void readWebHookConfig(Map hooks, Rules rules) { if (urls != null) { settings.getUrls().addAll(urls); } + List recoveryUrls = (List) config.get("recovery-urls"); + if (recoveryUrls != null) { + settings.getRecoveryUrls().addAll(recoveryUrls); + } Map headers = (Map) config.getOrDefault("headers", new HashMap<>()); settings.setHeaders(headers); rules.getWebhookSettingsMap().put(settings.getFormattedName(), settings); @@ -224,7 +227,7 @@ private void readSlackConfig(Map hooks, Rules rules) { Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); - Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List webhooks = (List) config.get("webhooks"); @@ -256,7 +259,7 @@ private void readWechatConfig(Map hooks, Rules rules) { Object textTemplate = 
config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); - Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List webhooks = (List) config.get("webhooks"); @@ -288,7 +291,7 @@ private void readDingtalkConfig(Map hooks, Rules rules) { Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); - Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List> webhooks = (List>) config.get("webhooks"); @@ -324,7 +327,7 @@ private void readFeishuConfig(Map hooks, Rules rules) { Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); - Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List> webhooks = (List>) config.get("webhooks"); @@ -355,7 +358,7 @@ private void readWeLinkConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); - String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", ""); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; @@ -393,7 +396,7 @@ private void readPagerDutyConfig(Map hooks, Rules rules) { k.toString(), 
AlarmHooksType.pagerduty, (Boolean) config.getOrDefault("is-default", false)); Object textTemplate = config.getOrDefault("text-template", ""); settings.setTextTemplate((String) textTemplate); - Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + Object recoveryTextTemplate = config.getOrDefault("recovery-text-template", ""); settings.setRecoveryTextTemplate((String) recoveryTextTemplate); List integrationKeys = (List) config.get("integration-keys"); @@ -421,7 +424,7 @@ private void readDiscordConfig(Map hooks, Rules rules) { configs.forEach((k, v) -> { Map config = (Map) v; String textTemplate = (String) config.get("text-template"); - String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", RECOVERED + textTemplate); + String recoveryTextTemplate = (String) config.getOrDefault("recovery-text-template", ""); List> webhooks = (List>) config.get("webhooks"); if (StringUtil.isBlank(textTemplate) || CollectionUtils.isEmpty(webhooks)) { return; diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 3784685d921e..a72201edcc28 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -268,7 +268,7 @@ public Window(AlarmEntity entity, int period, int silencePeriod, int recoveryObs int additionalPeriod) { this.entity = entity; this.additionalPeriod = additionalPeriod; - this.size = period + additionalPeriod + Math.max(silencePeriod, recoveryObservationPeriod); + this.size = period + additionalPeriod /*+ Math.max(silencePeriod, recoveryObservationPeriod)*/; this.period = period; this.stateMachine = new 
AlarmStateMachine(silencePeriod, recoveryObservationPeriod); this.init(); @@ -396,9 +396,7 @@ private boolean isMatch() { int isMatch = 0; try { TRACE_CONTEXT.set(new DebuggingTraceContext(expression, false, false)); - int metricsSize = period + additionalPeriod; - LinkedList> metricsValues = new LinkedList<>(this.values.subList(size - metricsSize, size)); - AlarmMQEVisitor visitor = new AlarmMQEVisitor(moduleManager, this.entity, metricsValues, this.endTime, this.additionalPeriod); + AlarmMQEVisitor visitor = new AlarmMQEVisitor(moduleManager, this.entity, this.values, this.endTime, this.additionalPeriod); ExpressionResult parseResult = visitor.visit(exprTree); if (StringUtil.isNotBlank(parseResult.getError())) { log.error("expression:" + expression + " error: " + parseResult.getError()); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java index 84e29b72ed4a..de179a7ca15b 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/dingtalk/DingtalkHookCallback.java @@ -63,9 +63,11 @@ protected void doAlarmCallback(List alarmMessages, boolean isRecov for (final var webHookUrl : setting.getWebhooks()) { final var url = getUrl(webHookUrl); for (final var alarmMessage : messages) { - final var requestBody = String.format(getTemplate(setting, isRecovery), - alarmMessage.getAlarmMessage()); - post(URI.create(url), requestBody, Map.of()); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = String.format(template, alarmMessage.getAlarmMessage()); + post(URI.create(url), requestBody, Map.of()); 
+ } } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java index 0faf690f28e4..e18f9b84f661 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/discord/DiscordHookCallback.java @@ -31,6 +31,7 @@ import java.util.Map; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm Discord webhook API. @@ -57,11 +58,11 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var content = String.format( - getTemplate(setting, isRecovery), - alarmMessage.getAlarmMessage() - ); - sendAlarmMessage(webHookUrl, content); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var content = String.format(template, alarmMessage.getAlarmMessage()); + sendAlarmMessage(webHookUrl, content); + } } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java index 5b35f8f02e86..8ede5dca487a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/feishu/FeishuHookCallback.java @@ -64,11 +64,14 @@ protected 
void doAlarmCallback(List alarmMessages, boolean isRecov } for (final var webHookUrl : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var requestBody = getRequestBody(webHookUrl, alarmMessage, getTemplate(setting, isRecovery)); - try { - post(URI.create(webHookUrl.getUrl()), requestBody, Map.of()); - } catch (Exception e) { - log.error("Failed to send alarm message to Feishu: {}", webHookUrl.getUrl(), e); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = getRequestBody(webHookUrl, alarmMessage, template); + try { + post(URI.create(webHookUrl.getUrl()), requestBody, Map.of()); + } catch (Exception e) { + log.error("Failed to send alarm message to Feishu: {}", webHookUrl.getUrl(), e); + } } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java index cae4283c17dd..0f9d91696236 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/pagerduty/PagerDutyHookCallback.java @@ -33,6 +33,7 @@ import java.util.UUID; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; @Slf4j @RequiredArgsConstructor @@ -61,10 +62,13 @@ protected void doAlarmCallback(List alarmMessages, boolean isRecov for (final var integrationKey : setting.getIntegrationKeys()) { for (final var alarmMessage : messages) { try { - post( - URI.create(PAGER_DUTY_EVENTS_API_V2_URL), - getMessageBody(alarmMessage, integrationKey, getTemplate(isRecovery, setting)), Map.of() - ); + String template = 
getTemplate(isRecovery, setting); + if (StringUtil.isNotBlank(template)) { + post( + URI.create(PAGER_DUTY_EVENTS_API_V2_URL), + getMessageBody(alarmMessage, integrationKey, template), Map.of() + ); + } } catch (Exception e) { log.error("Failed to send alarm message to PagerDuty: {}", integrationKey, e); } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java index 72bdad30f9b2..c8dc031747f1 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/slack/SlackhookCallback.java @@ -32,6 +32,7 @@ import java.util.Map; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm slack webhook API calls a remote endpoints. 
@@ -64,10 +65,10 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery final var jsonObject = new JsonObject(); final var jsonElements = new JsonArray(); for (AlarmMessage item : messages) { - jsonElements.add(GSON.fromJson( - String.format( - getTemplate(setting, isRecovery), item.getAlarmMessage() - ), JsonObject.class)); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + jsonElements.add(GSON.fromJson(String.format(template, item.getAlarmMessage()), JsonObject.class)); + } } jsonObject.add("blocks", jsonElements); final var body = GSON.toJson(jsonObject); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java index a07b47508e21..0217ddb46967 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallback.java @@ -52,11 +52,12 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); - if (setting == null || CollectionUtils.isEmpty(setting.getUrls()) || CollectionUtils.isEmpty( + List urls = getUrls(setting, isRecovery); + if (setting == null || CollectionUtils.isEmpty(urls) || CollectionUtils.isEmpty( messages)) { continue; } - for (final var url : setting.getUrls()) { + for (final var url : urls) { try { post(URI.create(url), gson.toJson(messages), setting.getHeaders()); } catch (Exception e) { @@ -65,4 +66,8 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery } } } + + private static List getUrls(WebhookSettings setting, boolean isRecovery) { + return 
isRecovery ? setting.getRecoveryUrls() : setting.getUrls(); + } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java index 813bbf4c5739..a1ce830cd6a8 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookSettings.java @@ -34,6 +34,7 @@ @ToString public class WebhookSettings extends AlarmHookSettings { private List urls = new ArrayList<>(); + private List recoveryUrls = new ArrayList<>(); private Map headers = new HashMap<>(); public WebhookSettings(final String name, diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java index 3f98cd1dfda1..618c2192ab4a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallback.java @@ -29,6 +29,7 @@ import java.util.Map; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm wechat webhook API. 
@@ -50,19 +51,19 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery var hookName = entry.getKey(); var messages = entry.getValue(); var setting = settingsMap.get(hookName); - if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty( - messages)) { + if (setting == null || CollectionUtils.isEmpty(setting.getWebhooks()) || CollectionUtils.isEmpty(messages)) { continue; } for (final var url : setting.getWebhooks()) { for (final var alarmMessage : messages) { - final var requestBody = String.format( - getTemplate(setting, isRecovery), alarmMessage.getAlarmMessage() - ); - try { - post(URI.create(url), requestBody, Map.of()); - } catch (Exception e) { - log.error("Failed to send alarm message to Wechat webhook: {}", url, e); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var requestBody = String.format(template, alarmMessage.getAlarmMessage()); + try { + post(URI.create(url), requestBody, Map.of()); + } catch (Exception e) { + log.error("Failed to send alarm message to Wechat webhook: {}", url, e); + } } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java index 91f98da52dc6..ef0237787703 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/welink/WeLinkHookCallback.java @@ -41,6 +41,7 @@ import java.util.UUID; import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Use SkyWalking alarm WeLink webhook API. 
@@ -71,10 +72,11 @@ public void doAlarmCallback(List alarmMessages, boolean isRecovery for (final var webHookUrl : setting.getWebhooks()) { final var accessToken = getAccessToken(webHookUrl); for (final var alarmMessage : messages) { - final var content = String.format( - getTemplate(setting, isRecovery), alarmMessage.getAlarmMessage() - ); - sendAlarmMessage(webHookUrl, accessToken, content); + String template = getTemplate(setting, isRecovery); + if (StringUtil.isNotBlank(template)) { + final var content = String.format(template, alarmMessage.getAlarmMessage()); + sendAlarmMessage(webHookUrl, accessToken, content); + } } } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java index 5f8c5a058bfb..3caa2d2304a6 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java @@ -25,6 +25,7 @@ import com.linecorp.armeria.server.ServerBuilder; import com.linecorp.armeria.testing.junit5.server.ServerExtension; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -39,28 +40,34 @@ import java.util.concurrent.atomic.AtomicInteger; public class WechatHookCallbackTest { + public static final String RECOVERED = "[Recovered]"; private static final AtomicBoolean IS_SUCCESS = new AtomicBoolean(); private static final AtomicInteger 
COUNT = new AtomicInteger(); + private static final AtomicInteger RECOVERY_COUNT = new AtomicInteger(); @RegisterExtension public static final ServerExtension SERVER = new ServerExtension() { @Override protected void configure(ServerBuilder sb) { sb.service("/wechathook/receiveAlarm", (ctx, req) -> HttpResponse.from( - req.aggregate().thenApply(r -> { - final String content = r.content().toStringUtf8(); - final JsonObject jsonObject = new Gson().fromJson(content, JsonObject.class); - final String type = jsonObject.get("msgtype").getAsString(); - if (type.equalsIgnoreCase("text")) { - COUNT.incrementAndGet(); - if (COUNT.get() == 2) { - IS_SUCCESS.set(true); + req.aggregate().thenApply(r -> { + final String content = r.content().toStringUtf8(); + final JsonObject jsonObject = new Gson().fromJson(content, JsonObject.class); + final String type = jsonObject.get("msgtype").getAsString(); + if (type.equalsIgnoreCase("text")) { + COUNT.incrementAndGet(); + final String textContent = ((JsonObject) jsonObject.get("text")).get("content").getAsString(); + if (textContent.startsWith(RECOVERED)) { + RECOVERY_COUNT.incrementAndGet(); + } + if (COUNT.get() == 3 && RECOVERY_COUNT.get() == 1) { + IS_SUCCESS.set(true); + } + return HttpResponse.of(HttpStatus.OK); } - return HttpResponse.of(HttpStatus.OK); - } - return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); - }) + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + }) )); } }; @@ -71,17 +78,21 @@ public void testWechatWebhook() throws Exception { remoteEndpoints.add("http://127.0.0.1:" + SERVER.httpPort() + "/wechathook/receiveAlarm"); Rules rules = new Rules(); String template = "{\"msgtype\":\"text\",\"text\":{\"content\":\"Skywaling alarm: %s\"}}"; + String recoveryTemplate = "{\"msgtype\":\"text\",\"text\":{\"content\":\"" + RECOVERED + "Skywaling alarm: %s\"}}"; WechatSettings setting1 = new WechatSettings("setting1", AlarmHooksType.wechat, true); setting1.setWebhooks(remoteEndpoints); 
setting1.setTextTemplate(template); + setting1.setRecoveryTextTemplate(recoveryTemplate); WechatSettings setting2 = new WechatSettings("setting2", AlarmHooksType.wechat, false); setting2.setWebhooks(remoteEndpoints); setting2.setTextTemplate(template); + setting2.setRecoveryTextTemplate(recoveryTemplate); rules.getWechatSettingsMap().put(setting1.getFormattedName(), setting1); rules.getWechatSettingsMap().put(setting2.getFormattedName(), setting2); AlarmRulesWatcher alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); WechatHookCallback wechatHookCallback = new WechatHookCallback(alarmRulesWatcher); List alarmMessages = new ArrayList<>(2); + List alarmRecoveryMessages = new ArrayList<>(1); AlarmMessage alarmMessage = new AlarmMessage(); alarmMessage.setScopeId(DefaultScopeDefine.SERVICE); alarmMessage.setRuleName("service_resp_time_rule"); @@ -95,6 +106,9 @@ public void testWechatWebhook() throws Exception { anotherAlarmMessage.getHooks().add(setting2.getFormattedName()); alarmMessages.add(anotherAlarmMessage); wechatHookCallback.doAlarm(alarmMessages); + AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(anotherAlarmMessage); + alarmRecoveryMessages.add(alarmRecoveryMessage); + wechatHookCallback.doAlarmRecovery(alarmMessages); Assertions.assertTrue(IS_SUCCESS.get()); } } diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml b/test/e2e-v2/cases/alarm/alarm-settings.yml index 9bdf5430171f..8679261c4441 100755 --- a/test/e2e-v2/cases/alarm/alarm-settings.yml +++ b/test/e2e-v2/cases/alarm/alarm-settings.yml @@ -51,9 +51,15 @@ hooks: is-default: true urls: - http://provider:9090/alarm/receive + recovery-urls: + - http://provider:9090/alarm/receive custom: urls: - http://provider:9090/alarm/receive + recovery-urls: + - http://provider:9090/alarm/receive none: urls: - http://none:9090/alarm/receive + recovery-urls: + - http://none:9090/alarm/receive \ No newline at end of file From 5307bafe5541daa51f87ed6447d21f5325c24df9 Mon Sep 17 00:00:00 
2001 From: youjie23 Date: Sun, 9 Nov 2025 23:55:19 +0800 Subject: [PATCH 17/21] enhance the alarm kernel with recovered status notification capability #13492 --- .../skywalking/oap/server/core/alarm/provider/RunningRule.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index a72201edcc28..f23805abbe8f 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -268,7 +268,7 @@ public Window(AlarmEntity entity, int period, int silencePeriod, int recoveryObs int additionalPeriod) { this.entity = entity; this.additionalPeriod = additionalPeriod; - this.size = period + additionalPeriod /*+ Math.max(silencePeriod, recoveryObservationPeriod)*/; + this.size = period + additionalPeriod; this.period = period; this.stateMachine = new AlarmStateMachine(silencePeriod, recoveryObservationPeriod); this.init(); From ca113a5c6d9c71facca4927068e35eb52c361fca Mon Sep 17 00:00:00 2001 From: youjie23 Date: Wed, 12 Nov 2025 18:06:13 +0800 Subject: [PATCH 18/21] enhance the alarm kernel with recovered status notification capability #13492 --- dist-material/alarm-settings.yml | 37 ++- .../config-examples/alarm-settings.yml | 38 ++- docs/en/setup/backend/backend-alarm.md | 2 +- docs/en/status/query_alarm_runtime_status.md | 5 + .../alarm/provider/AlarmStatusWatcher.java | 4 +- .../core/alarm/provider/RunningRule.java | 8 +- .../alarm/provider/grpc/GRPCCallback.java | 15 +- .../provider/status/AlarmRuleDetail.java | 1 + .../provider/status/AlarmRunningContext.java | 1 + .../core/alarm/provider/RunningRuleTest.java | 292 +++++++++++++++++- 
.../provider/grpc/AlarmMockReceiver.java | 27 ++ .../provider/grpc/GRPChookCallbackTest.java | 12 + .../provider/webhook/WebhookCallbackTest.java | 29 +- 13 files changed, 445 insertions(+), 26 deletions(-) diff --git a/dist-material/alarm-settings.yml b/dist-material/alarm-settings.yml index 5dd6a9d2abf4..261f2714aea9 100644 --- a/dist-material/alarm-settings.yml +++ b/dist-material/alarm-settings.yml @@ -23,6 +23,8 @@ rules: expression: sum(service_resp_time > 1000) >= 3 period: 10 silence-period: 5 + # Number of periods to wait before considering the alarm recovered, default as 0. + recovery-observation-period: 3 message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes. # service_resp_time_rule: # expression: avg(service_resp_time) > 1000 @@ -35,16 +37,20 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 3 + # Number of periods to wait before considering the alarm recovered, default as 0.
+ recovery-observation-period: 2 message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes service_resp_time_percentile_rule: expression: sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3 period: 10 silence-period: 5 + recovery-observation-period: 3 message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000 service_instance_resp_time_rule: expression: sum(service_instance_resp_time > 1000) >= 2 period: 10 silence-period: 5 + recovery-observation-period: 2 message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes database_access_resp_time_rule: expression: sum(database_access_resp_time > 1000) >= 2 @@ -63,11 +69,36 @@ rules: # silence-period: 5 # message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes + #hooks: # webhook: # default: # is-default: true # urls: -# - http://127.0.0.1/notify/ -# - http://127.0.0.1/go-wechat/ - +# - http://127.0.0.1/default/alarm +# recovery-urls: +# - http://127.0.0.1/default/alarm-recovery +# custom1: +# urls: +# - http://127.0.0.1/custom1/alarm +# recovery-urls: +# - http://127.0.0.1/custom1/alarm-recovery +# wechat: +# default: +# is-default: true +# text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm: \n %s." +# } +# } +# recovery-text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm Recovered: \n %s." 
+# } +# } +# webhooks: +# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key diff --git a/dist-material/config-examples/alarm-settings.yml b/dist-material/config-examples/alarm-settings.yml index afd68583ba26..969e860f9186 100644 --- a/dist-material/config-examples/alarm-settings.yml +++ b/dist-material/config-examples/alarm-settings.yml @@ -23,6 +23,8 @@ rules: period: 10 # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period. silence-period: 10 + # Number of periods to wait before considering the alarm recovered, default as 0. + recovery-observation-period: 3 message: Successful rate of endpoint {name} is lower than 75% tags: level: WARNING @@ -43,7 +45,35 @@ rules: silence-period: 5 message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes -#webhooks: -# - http://127.0.0.1/notify/ -# - http://127.0.0.1/go-wechat/ - +#hooks: +# webhook: +# default: +# is-default: true +# urls: +# - http://127.0.0.1/default/alarm +# recovery-urls: +# - http://127.0.0.1/default/alarm-recovery +# custom1: +# urls: +# - http://127.0.0.1/custom1/alarm +# recovery-urls: +# - http://127.0.0.1/custom1/alarm-recovery +# wechat: +# default: +# is-default: true +# text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm: \n %s." +# } +# } +# recovery-text-template: |- +# { +# "msgtype": "text", +# "text": { +# "content": "Apache SkyWalking Alarm Recovered: \n %s."
+# } +# } +# webhooks: +# - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index 2e8a55aad391..d4d56f103316 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -516,7 +516,7 @@ the sliding window will be destroyed and re-created, causing the Alarm of this s | Hooks | hooks | string array | | ## Alarm state transition -The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.3.0 is as follows: +The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.4.0 is as follows: ```mermaid stateDiagram-v2 [*] --> NORMAL diff --git a/docs/en/status/query_alarm_runtime_status.md b/docs/en/status/query_alarm_runtime_status.md index 389f1d616e41..a8b5aba2887f 100644 --- a/docs/en/status/query_alarm_runtime_status.md +++ b/docs/en/status/query_alarm_runtime_status.md @@ -63,6 +63,7 @@ Return the detailed information of the alarm running rule. "expression": "sum(service_resp_time > 1000) >= 1", "period": 10, "silencePeriod": 10, + "recoveryObservationPeriod": 2, "additionalPeriod": 0, "includeEntityNames": [], "excludeEntityNames": [], @@ -97,6 +98,7 @@ Return the detailed information of the alarm running rule. "expression": "sum(service_resp_time > 1000) >= 1", "period": 10, "silencePeriod": 10, + "recoveryObservationPeriod": 2, "additionalPeriod": 0, "includeEntityNames": [], "excludeEntityNames": [], @@ -157,6 +159,7 @@ Return the running context of the alarm rule. "additionalPeriod": 0, "size": 10, "silenceCountdown": 10, + "recoveryObservationCountdown": 2, "entityName": "v2|mock_b_service|default|test-cluster|-", "windowValues": [ { @@ -220,6 +223,7 @@ Return the running context of the alarm rule. 
"additionalPeriod": 0, "size": 0, "silenceCountdown": 0, + "recoveryObservationCountdown": 0, "windowValues": [] } } @@ -228,6 +232,7 @@ Return the running context of the alarm rule. ``` `size` is the window size. Equal to the `period + additionalPeriod`. `silenceCountdown` is the countdown of the silence period. -1 means silence countdown is not running. +`recoveryObservationCountdown` is the countdown of the recovery observation period. `windowValues` is the original metrics data. The `index` is the index of the window, starting from 0. `mqeMetricsSnapshot` is the metrics data in the MQE format. When checking conditions, these data will be calculated according to the expression. diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java index 1b476d038aff..1cda9bb0050c 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java @@ -92,6 +92,7 @@ public String getAlarmRuleById(final String ruleId) { ruleDetail.setExpression(rule.getExpression()); ruleDetail.setPeriod(rule.getPeriod()); ruleDetail.setSilencePeriod(rule.getSilencePeriod()); + ruleDetail.setRecoveryObservationPeriod(rule.getRecoveryObservationPeriod()); ruleDetail.setAdditionalPeriod(rule.getAdditionalPeriod()); ruleDetail.setIncludeEntityNames(rule.getIncludeNames()); ruleDetail.setExcludeEntityNames(rule.getExcludeNames()); @@ -135,7 +136,8 @@ public String getAlarmRuleContext(final String ruleName, final String entityName runningContext.setEndTime(window.getEndTime().toString()); runningContext.setAdditionalPeriod(window.getAdditionalPeriod()); runningContext.setSize(window.getSize()); - 
runningContext.setSilenceCountdown(window.getSilenceCountdown()); + runningContext.setSilenceCountdown(window.getStateMachine().getSilenceCountdown()); + runningContext.setRecoveryObservationCountdown(window.getStateMachine().getRecoveryObservationCountdown()); window.scanWindowValues(values -> { for (int i = 0; i < values.size(); i++) { AlarmRunningContext.WindowValue windowValue = new AlarmRunningContext.WindowValue(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index f23805abbe8f..befaca340756 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -360,12 +360,12 @@ public Optional checkAlarm() { } else { stateMachine.onMismatch(); } - if (stateMachine.currentState == State.FIRING) { + if (stateMachine.getCurrentState() == State.FIRING) { AlarmMessage alarmMessage = buildAlarmMessage(); lastAlarmMessage = alarmMessage; return Optional.of(alarmMessage); } - if (stateMachine.currentState == State.RECOVERED) { + if (stateMachine.getCurrentState() == State.RECOVERED) { AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(lastAlarmMessage); lastAlarmMessage = null; return Optional.of(alarmRecoveryMessage); @@ -483,6 +483,7 @@ public class AlarmStateMachine { private int recoveryObservationCountdown; private final int silencePeriod; private final int recoveryObservationPeriod; + @Getter private State currentState; public AlarmStateMachine(int silencePeriod, int recoveryObservationPeriod) { @@ -581,9 +582,6 @@ private void resetCountdowns() { recoveryObservationCountdown = this.recoveryObservationPeriod; } - public State getCurrentState() { - return currentState; 
- } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java index 5c6d31a9eb17..17f415336abe 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java @@ -54,7 +54,6 @@ public class GRPCCallback implements AlarmCallback { public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { this.alarmRulesWatcher = alarmRulesWatcher; - this.alarmSettingMap = new HashMap<>(); this.alarmServiceStubMap = new HashMap<>(); this.grpcClientMap = new HashMap<>(); Map alarmSettingMap = alarmRulesWatcher.getGrpchookSetting(); @@ -67,6 +66,7 @@ public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { alarmServiceStubMap.put(name, AlarmServiceGrpc.newStub(grpcClient.getChannel())); } }); + this.alarmSettingMap = alarmSettingMap; } } @@ -117,9 +117,7 @@ public void onNext(Response response) { @Override public void onError(Throwable throwable) { status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message failed: {}", throwable.getMessage()); - } + log.warn("Send alarm message failed: {}", throwable.getMessage()); } @Override @@ -193,16 +191,14 @@ public void onNext(Response response) { @Override public void onError(Throwable throwable) { status.done(); - if (log.isDebugEnabled()) { - log.debug("Send alarm message failed: {}", throwable.getMessage()); - } + log.warn("Send alarm recovery message failed: {}", throwable.getMessage()); } @Override public void onCompleted() { status.done(); if (log.isDebugEnabled()) { - log.debug("Send alarm message successful."); + log.debug("Send alarm recovery message successful."); } } }); @@ -219,8 +215,8 @@ public void onCompleted() { 
builder.setRuleName(recoveryMessage.getRuleName()); builder.setAlarmMessage(recoveryMessage.getAlarmMessage()); builder.setStartTime(recoveryMessage.getStartTime()); - builder.setRecoveryTime(recoveryMessage.getRecoveryTime()); builder.setUuid(recoveryMessage.getUuid()); + builder.setRecoveryTime(recoveryMessage.getRecoveryTime()); AlarmTags.Builder alarmTagsBuilder = AlarmTags.newBuilder(); message.getTags().forEach(m -> alarmTagsBuilder.addData(KeyStringValuePair.newBuilder().setKey(m.getKey()).setValue(m.getValue()).build())); builder.setTags(alarmTagsBuilder.build()); @@ -287,5 +283,6 @@ private void onGRPCAlarmSettingUpdated(Map newAlarmSet alarmServiceStubMap.put(name, AlarmServiceGrpc.newStub(grpcClient.getChannel())); } }); + alarmSettingMap = newAlarmSettingMap; } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java index 9978da65e6e7..a745f44062c7 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRuleDetail.java @@ -31,6 +31,7 @@ public class AlarmRuleDetail { private String expression; private int period; private int silencePeriod; + private int recoveryObservationPeriod; private int additionalPeriod; private List includeEntityNames = new ArrayList<>(); private List excludeEntityNames = new ArrayList<>(); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java index d0ee7e52fe5e..8d98d8960fa9 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java @@ -31,6 +31,7 @@ public class AlarmRunningContext { private int additionalPeriod; private int size; private int silenceCountdown; + private int recoveryObservationCountdown; private String entityName; private List windowValues = new ArrayList<>(); private JsonObject mqeMetricsSnapshot; diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index b6884764f917..36ef84472b82 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -3,7 +3,7 @@ * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with + * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -668,4 +668,294 @@ private void assertLabeled(AlarmRule alarmRule, String value1, String value2, St alarmMessages = getAlarmFiringMessageList(runningRule.check()); Assertions.assertEquals(alarmMsgSize, alarmMessages.size()); } + + @Test + public void testAlarmStateMachine_NoSilenceNoRecoveryObservation() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_no_silence_no_recovery"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 2"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + long timeBucket1 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()); + long timeBucket2 = TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(1).getMillis()); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket2, 71)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + long timeBucket3 = TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket3, 80)); + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover immediately"); + 
Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + long timeBucket4 = TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()); + runningRule.in(getMetaInAlarm(123), getMetrics(timeBucket4, 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + List messages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, messages.size(), "Should be empty"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_OnlySilencePeriod() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_only_silence"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setSilencePeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()), 72)); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + 
Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 72)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(3).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm after silence"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(4).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(4).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(5).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(5).toLocalDateTime()); + alarmMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should recover immediately"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(6).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); + alarmMessages 
= getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_OnlyRecoveryObservationPeriod() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_only_recovery_observation"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setRecoveryObservationPeriod(1); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(1).getMillis()), 72)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should trigger alarm"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + List recoveryMessages = 
getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover yet"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after observation"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(3).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should be normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_SilenceGreaterThanRecovery() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_silence_gt_recovery"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(5); + alarmRule.setSilencePeriod(3); + alarmRule.setRecoveryObservationPeriod(2); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + + 
runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size()); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + for (int i = 0; i <= 3; i++) { + runningRule.moveTo(startTime.plusMinutes(i).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(i).getMillis()), 72)); + + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + if (i < 3) { + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced at minute " + i); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + } else { + Assertions.assertEquals(1, alarmMessages.size(), "Should fire after silence period"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + } + } + for (int i = 0; i <= 2; i++) { + runningRule.moveTo(startTime.plusMinutes(8 + i).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics( + TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(8 + i).getMillis()), 80)); + if (i < 2) { + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover immediately"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + } else { + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after observation period"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + } + } + runningRule.moveTo(startTime.plusMinutes(11).toLocalDateTime()); + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should recover after observation period"); + 
Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + @Test + public void testAlarmStateMachine_RecoveryGreaterThanSilence() throws IllegalExpressionException { + AlarmRule alarmRule = new AlarmRule(null); + alarmRule.setAlarmRuleName("test_recovery_gt_silence"); + alarmRule.setExpression("sum(endpoint_percent < 75) >= 1"); + alarmRule.getIncludeMetrics().add("endpoint_percent"); + alarmRule.setPeriod(3); + alarmRule.setSilencePeriod(2); + alarmRule.setRecoveryObservationPeriod(3); + alarmRule.setTags(new HashMap() {{ + put("key", "value"); + }}); + + RunningRule runningRule = new RunningRule(alarmRule, null); + + DateTime startTime = DateTime.now(); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.minusMinutes(2).getMillis()), 70)); + + runningRule.moveTo(startTime.toLocalDateTime()); + List alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size()); + RunningRule.Window window = getWindow(runningRule, 123); + RunningRule.Window.AlarmStateMachine stateMachine = window.getStateMachine(); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()), 72)); + runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced"); + Assertions.assertEquals(RunningRule.State.SILENCED, stateMachine.getCurrentState()); + + 
runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime()); + alarmMessages = getAlarmFiringMessageList(runningRule.check()); + Assertions.assertEquals(1, alarmMessages.size(), "Should fire after silence period"); + Assertions.assertEquals(RunningRule.State.FIRING, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(4).toLocalDateTime()); + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(4).getMillis()), 80)); + List recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should not recover immediately"); + Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(5).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should still in observation"); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(6).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should still in observation"); + + runningRule.in(getMetaInAlarm(123), getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(7).getMillis()), 80)); + runningRule.moveTo(startTime.plusMinutes(7).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(1, recoveryMessages.size(), "Should recover after full observation period"); + Assertions.assertEquals(RunningRule.State.RECOVERED, stateMachine.getCurrentState()); + + runningRule.moveTo(startTime.plusMinutes(8).toLocalDateTime()); + recoveryMessages = getAlarmRecoveryMessageList(runningRule.check()); + Assertions.assertEquals(0, recoveryMessages.size(), "Should be 
normal"); + Assertions.assertEquals(RunningRule.State.NORMAL, stateMachine.getCurrentState()); + } + + private RunningRule.Window getWindow(RunningRule runningRule, int entityId) { + Map windows = runningRule.getWindows(); + AlarmEntity entity = getAlarmEntity(entityId); + return windows.get(entity); + } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java index edd87ce9110d..80b0f1b44b1a 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/AlarmMockReceiver.java @@ -21,6 +21,7 @@ import io.grpc.stub.StreamObserver; import lombok.extern.slf4j.Slf4j; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.grpc.AlarmServiceGrpc; import org.apache.skywalking.oap.server.core.alarm.grpc.Response; import org.apache.skywalking.oap.server.library.server.ServerException; @@ -67,5 +68,31 @@ public void onCompleted() { } }; } + + @Override public StreamObserver doAlarmRecovery(StreamObserver responseObserver) { + return new StreamObserver() { + @Override + public void onNext(AlarmRecoveryMessage value) { + log.info("received alarm recovery message: {}", value.toString()); + } + + @Override + public void onError(Throwable throwable) { + responseObserver.onError(throwable); + if (log.isDebugEnabled()) { + log.debug("received alarm recovery message error."); + } + } + + @Override + public void onCompleted() { + responseObserver.onNext(Response.newBuilder().build()); + responseObserver.onCompleted(); + if (log.isDebugEnabled()) { + 
log.debug("received alarm recovery message completed."); + } + } + }; + } } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java index 9c2934108b49..ec78e2de61c2 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPChookCallbackTest.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.List; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -38,6 +39,7 @@ public class GRPChookCallbackTest { private AlarmRulesWatcher alarmRulesWatcher; private List alarmMessageList; + private List alarmRecoveryMessageList; @BeforeEach public void init() throws Exception { @@ -54,11 +56,13 @@ public void init() throws Exception { alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); grpcCallback = new GRPCCallback(alarmRulesWatcher); mockAlarmMessage(setting1.getFormattedName(), setting2.getFormattedName()); + mockAlarmRecoveryMessage(setting1.getFormattedName(), setting2.getFormattedName()); } @Test public void doAlarm() { grpcCallback.doAlarm(alarmMessageList); + grpcCallback.doAlarmRecovery(alarmRecoveryMessageList); } @Test @@ -71,6 +75,7 @@ public void testGauchoSettingClean() { alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); grpcCallback = new GRPCCallback(alarmRulesWatcher); grpcCallback.doAlarm(alarmMessageList); + 
grpcCallback.doAlarmRecovery(alarmRecoveryMessageList); } private void mockAlarmMessage(String hook1, String hook2) { @@ -96,4 +101,11 @@ private void mockAlarmMessage(String hook1, String hook2) { alarmMessage2.getHooks().add(hook1); alarmMessageList = Lists.newArrayList(alarmMessage, alarmMessage2); } + + private void mockAlarmRecoveryMessage(String hook1, String hook2) { + AlarmRecoveryMessage alarmRecoveryMessage0 = new AlarmRecoveryMessage(alarmMessageList.get(0)); + AlarmRecoveryMessage alarmRecoveryMessage1 = new AlarmRecoveryMessage(alarmMessageList.get(1)); + + alarmRecoveryMessageList = Lists.newArrayList(alarmRecoveryMessage0, alarmRecoveryMessage1); + } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java index a9d40e2a255e..b9770ef3c972 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/webhook/WebhookCallbackTest.java @@ -26,6 +26,7 @@ import com.linecorp.armeria.server.ServerBuilder; import com.linecorp.armeria.testing.junit5.server.ServerExtension; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; +import org.apache.skywalking.oap.server.core.alarm.AlarmRecoveryMessage; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmHooksType; import org.apache.skywalking.oap.server.core.alarm.provider.AlarmRulesWatcher; import org.apache.skywalking.oap.server.core.alarm.provider.Rules; @@ -73,6 +74,25 @@ protected void configure(ServerBuilder sb) { IS_SUCCESS.set(false); return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); }))); + + sb.service("/webhook/receiveAlarmRecovery", (ctx, req) -> 
HttpResponse.from(req.aggregate().thenApply(r -> { + final String content = r.content().toStringUtf8(); + List alarmMessages = new Gson().fromJson(content, new TypeToken>() { + }.getType()); + if (alarmMessages.size() != 1) { + IS_SUCCESS.set(false); + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + } + if (Objects.equals(alarmMessages.get(0).getId0(), "1")) { + if (alarmMessages.get(0).getRecoveryTime() > 0) { + IS_SUCCESS.set(true); + COUNTER.incrementAndGet(); + return HttpResponse.of(HttpStatus.OK); + } + } + IS_SUCCESS.set(false); + return HttpResponse.of(HttpStatus.INTERNAL_SERVER_ERROR); + }))); } }; @@ -80,9 +100,12 @@ protected void configure(ServerBuilder sb) { public void testWebhook() throws Exception { List remoteEndpoints = new ArrayList<>(); remoteEndpoints.add("http://127.0.0.1:" + SERVER.httpPort() + "/webhook/receiveAlarm"); + List remoteEndpointsForRecovery = new ArrayList<>(); + remoteEndpointsForRecovery.add("http://127.0.0.1:" + SERVER.httpPort() + "/webhook/receiveAlarmRecovery"); Rules rules = new Rules(); WebhookSettings setting1 = new WebhookSettings("setting1", AlarmHooksType.webhook, true); setting1.setUrls(remoteEndpoints); + setting1.setRecoveryUrls(remoteEndpointsForRecovery); WebhookSettings setting2 = new WebhookSettings("setting2", AlarmHooksType.webhook, false); setting2.setUrls(remoteEndpoints); rules.getWebhookSettingsMap().put(setting1.getFormattedName(), setting1); @@ -106,8 +129,10 @@ public void testWebhook() throws Exception { anotherAlarmMessage.getHooks().add(setting2.getFormattedName()); alarmMessages.add(anotherAlarmMessage); webhookCallback.doAlarm(alarmMessages); - + List alarmRecoveryMessages = new ArrayList<>(1); + alarmRecoveryMessages.add(new AlarmRecoveryMessage(alarmMessage)); + webhookCallback.doAlarmRecovery(alarmRecoveryMessages); Assertions.assertTrue(IS_SUCCESS.get()); - Assertions.assertEquals(2, COUNTER.get()); + Assertions.assertEquals(3, COUNTER.get()); } } \ No newline at end of file From 
4c1e2c691c61d778bdeb14dd1d793b2ce113db41 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Thu, 13 Nov 2025 01:16:59 +0800 Subject: [PATCH 19/21] fix Copilot review and CI fail --- docs/en/setup/backend/backend-alarm.md | 4 ++-- .../oap/server/core/alarm/provider/RunningRule.java | 7 +++---- .../oap/server/core/alarm/provider/grpc/GRPCCallback.java | 1 + .../oap/server/core/alarm/provider/RunningRuleTest.java | 2 +- .../core/alarm/provider/wechat/WechatHookCallbackTest.java | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index d4d56f103316..f1a053b7d1b3 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -252,8 +252,8 @@ See the following example: "ruleName": "service_resp_time_rule", "alarmMessage": "alarmMessage xxxx", "startTime": 1560524171000, - "recoveryTime": 15596606810000, - "tags": [{ + "recoveryTime": 1560524351000, + "tags": [{ "key": "level", "value": "WARNING" }] diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index befaca340756..e226b950dd84 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -223,6 +223,7 @@ public List check() { log.trace("RuleName:{} AlarmEntity {} {} {} expired", ruleName, alarmEntity.getName(), alarmEntity.getId0(), alarmEntity.getId1()); } + return; } Optional alarmMessageOptional = window.checkAlarm(); @@ -496,10 +497,8 @@ public AlarmStateMachine(int silencePeriod, int recoveryObservationPeriod) { public void onMatch() { if (log.isTraceEnabled()) { - if (log.isTraceEnabled()) { - 
log.trace("RuleName:{} AlarmEntity {} {} {} onMatch silenceCountdown:{} currentState:{}", - ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, currentState); - } + log.trace("RuleName:{} AlarmEntity {} {} {} onMatch silenceCountdown:{} currentState:{}", + ruleName, entity.getName(), entity.getId0(), entity.getId1(), silenceCountdown, currentState); } silenceCountdown--; switch (currentState) { diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java index 17f415336abe..68b913a75917 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/grpc/GRPCCallback.java @@ -56,6 +56,7 @@ public GRPCCallback(AlarmRulesWatcher alarmRulesWatcher) { this.alarmRulesWatcher = alarmRulesWatcher; this.alarmServiceStubMap = new HashMap<>(); this.grpcClientMap = new HashMap<>(); + this.alarmSettingMap = new HashMap<>(); Map alarmSettingMap = alarmRulesWatcher.getGrpchookSetting(); if (CollectionUtils.isNotEmpty(alarmSettingMap)) { alarmSettingMap.forEach((name, alarmSetting) -> { diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index 36ef84472b82..e6030177f069 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -3,7 +3,7 @@ * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License");you may not use this file except in compliance with + * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java index 3caa2d2304a6..c70b7763657e 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/wechat/WechatHookCallbackTest.java @@ -92,7 +92,7 @@ public void testWechatWebhook() throws Exception { AlarmRulesWatcher alarmRulesWatcher = new AlarmRulesWatcher(rules, null, null); WechatHookCallback wechatHookCallback = new WechatHookCallback(alarmRulesWatcher); List alarmMessages = new ArrayList<>(2); - List alarmRecoveryMessages = new ArrayList<>(1); + List alarmRecoveryMessages = new ArrayList<>(1); AlarmMessage alarmMessage = new AlarmMessage(); alarmMessage.setScopeId(DefaultScopeDefine.SERVICE); alarmMessage.setRuleName("service_resp_time_rule"); @@ -108,7 +108,7 @@ public void testWechatWebhook() throws Exception { wechatHookCallback.doAlarm(alarmMessages); AlarmRecoveryMessage alarmRecoveryMessage = new AlarmRecoveryMessage(anotherAlarmMessage); alarmRecoveryMessages.add(alarmRecoveryMessage); - wechatHookCallback.doAlarmRecovery(alarmMessages); + wechatHookCallback.doAlarmRecovery(alarmRecoveryMessages); Assertions.assertTrue(IS_SUCCESS.get()); } } From 
3b8e9c5b994080143e0ac8aff462ab0daafdef46 Mon Sep 17 00:00:00 2001 From: youjie23 Date: Mon, 17 Nov 2025 00:39:45 +0800 Subject: [PATCH 20/21] Sync UI --- skywalking-ui | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skywalking-ui b/skywalking-ui index 30927258d669..6eaf7fe26da7 160000 --- a/skywalking-ui +++ b/skywalking-ui @@ -1 +1 @@ -Subproject commit 30927258d66934278a401b2defa0d9592e7d1974 +Subproject commit 6eaf7fe26da704cf54d1371ac489b3c8f458fbb8 From dad19b86b34252a9557d0cb02a76a4a3b6ee602e Mon Sep 17 00:00:00 2001 From: youjie23 Date: Mon, 17 Nov 2025 14:16:05 +0800 Subject: [PATCH 21/21] docs:update changes.md and backend-alarm.md --- docs/en/changes/changes.md | 2 ++ docs/en/setup/backend/backend-alarm.md | 27 +++++++++++++------------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index 3a13c5bc8283..e06dad626901 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -8,6 +8,8 @@ * Enhance the alarm kernel with recovered status notification capability #### UI +* Fix the missing icon in new native trace view. +* Enhance the alert page to show the recovery time of resolved alerts. #### Documentation diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index f1a053b7d1b3..60a60eaae8a2 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -517,21 +517,22 @@ the sliding window will be destroyed and re-created, causing the Alarm of this s ## Alarm state transition The overall alarm state transition after the introduction of alarm restoration detection and notification since version 10.4.0 is as follows: + ```mermaid stateDiagram-v2 [*] --> NORMAL - NORMAL --> FIRING: Expression match<br/>SilencePeriod reached - - FIRING --> SILENCED: Expression match<br/>SilencePeriod reached - FIRING --> OBSERVING_RECOVERY: Expression mismatch<br/>RecoveryObservationPeriod unreached - FIRING --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached - - SILENCED --> OBSERVING_RECOVERY: Expression mismatch<br/>RecoveryObservationPeriod unreached - SILENCED --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached - - OBSERVING_RECOVERY --> FIRING: Expression match<br/>SilencePeriod reached - OBSERVING_RECOVERY --> RECOVERED: Expression mismatch<br/>RecoveryObservationPeriod reached + NORMAL --> FIRING: Expression true<br/>not in silence period - RECOVERED --> FIRING: Expression match<br/>SilencePeriod reached - RECOVERED --> NORMAL: Expression mismatch + FIRING --> SILENCED: Expression true<br/>in silence period + FIRING --> OBSERVING_RECOVERY: Expression false<br/>in recovery window + FIRING --> RECOVERED: Expression false<br/>not in recovery window + + OBSERVING_RECOVERY --> FIRING: Expression true<br/>not in silence period + OBSERVING_RECOVERY --> RECOVERED: Expression false<br/>not in recovery window + + SILENCED --> RECOVERED: Expression false<br/>not in recovery window + SILENCED --> OBSERVING_RECOVERY: Expression false<br/>in recovery window + + RECOVERED --> FIRING: Expression true<br/>not in silence period + RECOVERED --> NORMAL: Expression false ``` \ No newline at end of file