138 changes: 138 additions & 0 deletions sap-hana-mixin/alerts.libsonnet
Review comment (Contributor):
Can we use `without(...)` instead of `by(...)`? From the docs:

> `sum by (job, sid, host)` drops labels; prefer `sum without (...)` to preserve labels for alert routing.
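
For illustration, a sketch of the difference on the memory-usage expression from this diff, assuming `database_name` is the only label these host-level series carry beyond `job`, `sid`, and `host` (the actual label set may differ):

```promql
# by(): keeps only the listed labels; every other label is dropped from the result.
100 * sum by (job, sid, host) (hanadb_host_memory_used_total_mb)
  / sum by (job, sid, host) (hanadb_host_memory_alloc_limit_mb)

# without(): drops only the listed labels, so any remaining labels survive
# and stay available for alert routing.
100 * sum without (database_name) (hanadb_host_memory_used_total_mb)
  / sum without (database_name) (hanadb_host_memory_alloc_limit_mb)
```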

@@ -0,0 +1,138 @@
{
  new(this): {
    local config = this.config,
    groups: [
      {
        name: 'sap-hana-alerts',
        rules: [
          {
            alert: 'SapHanaHighCpuUtilization',
            expr: |||
              sum without (database_name) (hanadb_cpu_busy_percent{%(filteringSelector)s}) > %(alertsCriticalHighCpuUsage)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'CPU utilization is high.',
              description: (
                'The CPU usage is at {{ printf "%%.2f" $value }}%% on {{$labels.core}} on {{$labels.host}} which is above the threshold of %(alertsCriticalHighCpuUsage)s%%.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaHighPhysicalMemoryUsage',
            expr: |||
              100 * sum by (job, sid, host)(hanadb_host_memory_resident_mb{%(filteringSelector)s}) / sum by (job, sid, host) (hanadb_host_memory_physical_total_mb{%(filteringSelector)s}) > %(alertsCriticalHighPhysicalMemoryUsage)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Current physical memory usage of the host is approaching capacity.',
              description: (
                'The physical memory usage is at {{ printf "%%.2f" $value }}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighPhysicalMemoryUsage)s%%.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaMemAllocLimitBelowRecommendation',
            expr: |||
              100 * sum by (job, sid, host) (hanadb_host_memory_alloc_limit_mb{%(filteringSelector)s}) / sum by (job, sid, host) (hanadb_host_memory_physical_total_mb{%(filteringSelector)s}) < %(alertsWarningLowMemAllocLimit)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Memory allocation limit set below recommended limit.',
              description: (
                'The memory allocation limit is set at {{ printf "%%.2f" $value }}%% on {{$labels.host}} which is below the recommended value of %(alertsWarningLowMemAllocLimit)s%%.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaHighMemoryUsage',
            expr: |||
              100 * sum by (job, sid, host) (hanadb_host_memory_used_total_mb{%(filteringSelector)s}) / sum by (job, sid, host) (hanadb_host_memory_alloc_limit_mb{%(filteringSelector)s}) > %(alertsCriticalHighMemoryUsage)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Current SAP HANA memory usage is approaching capacity.',
              description: (
                'The memory usage is at {{ printf "%%.2f" $value }}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighMemoryUsage)s%%.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaHighDiskUtilization',
            expr: |||
              100 * sum by (job, sid, host) (hanadb_disk_total_used_size_mb{%(filteringSelector)s}) / sum by (job, sid, host) (hanadb_disk_total_size_mb{%(filteringSelector)s}) > %(alertsCriticalHighDiskUtilization)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'SAP HANA disk is approaching capacity.',
              description: (
                'The disk usage is at {{ printf "%%.2f" $value }}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighDiskUtilization)s%%.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaHighSqlExecutionTime',
            expr: |||
              avg without (database_name, port, service, sql_type) (hanadb_sql_service_elap_per_exec_avg_ms{%(filteringSelector)s}) / 1000 > %(alertsCriticalHighSqlExecutionTime)s
Review comment (Member):
This is a minor thing, but in conjunction with my other comment on the config:

Suggest we swap the alert threshold variables to milliseconds to match the native unit of the metric itself.

This would apply to both (see the sketch after the list):

  • alertsCriticalHighSqlExecutionTime
  • alertsCriticalHighReplicationShippingTime
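
A sketch of the simplified expression if the threshold variable were defined in milliseconds (hypothetical, not part of this PR as written); the description string would also need its unit changed from seconds to milliseconds:

```
avg without (database_name, port, service, sql_type) (hanadb_sql_service_elap_per_exec_avg_ms{%(filteringSelector)s}) > %(alertsCriticalHighSqlExecutionTime)s
```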

            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'SAP HANA SQL average execution time is high.',
              description: (
                'The average SQL execution time is at {{ printf "%%.2f" $value }}s on {{$labels.host}} which is above the threshold of %(alertsCriticalHighSqlExecutionTime)ss.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaHighReplicationShippingTime',
            expr: |||
              avg without (database_name, port, secondary_port, replication_mode) (hanadb_sr_ship_delay{%(filteringSelector)s}) > %(alertsCriticalHighReplicationShippingTime)s
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'SAP HANA system replication log shipping delay is high.',
              description: (
                'The average system replication log shipping delay is at {{ printf "%%.2f" $value }}s from primary site {{$labels.site_name}} to replica site {{$labels.secondary_site_name}} which is above the threshold of %(alertsCriticalHighReplicationShippingTime)ss.'
              ) % config,
            },
          },
          {
            alert: 'SapHanaReplicationStatusError',
            expr: |||
              hanadb_sr_replication{%(filteringSelector)s} == 4
            ||| % config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'SAP HANA system replication status signifies an error.',
              description: 'The replication status of replica {{$labels.secondary_site_name}} is ERROR',
            },
          },
        ],
      },
    ],
  },
}
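
For reference, jsonnet's `%` operator fills the `%(...)s` placeholders from `config` and collapses `%%` to a literal `%`. With a hypothetical `filteringSelector` of `job="sap-hana"` and the default threshold of 80, the first rule's expression would render roughly as:

```promql
sum without (database_name) (hanadb_cpu_busy_percent{job="sap-hana"}) > 80
```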
147 changes: 0 additions & 147 deletions sap-hana-mixin/alerts/alerts.libsonnet

This file was deleted.

56 changes: 40 additions & 16 deletions sap-hana-mixin/config.libsonnet
@@ -1,19 +1,43 @@
{
  _config+:: {
    dashboardTags: ['sap-hana-mixin'],
    dashboardPeriod: 'now-1h',
    dashboardTimezone: 'default',
    dashboardRefresh: '1m',

    // alerts thresholds
    alertsCriticalHighCpuUsage: 80, // percent 0-100
    alertsCriticalHighPhysicalMemoryUsage: 80, // percent 0-100
    alertsWarningLowMemAllocLimit: 90, // percent 0-100
    alertsCriticalHighMemoryUsage: 80, // percent 0-100
    alertsCriticalHighDiskUtilization: 80, //percent 0-100
    alertsCriticalHighSqlExecutionTime: 1, // second
    alertsCriticalHighReplicationShippingTime: 1, //second

    enableLokiLogs: true,
    local this = self,

    // Filtering and label configuration
    filteringSelector: '', // set to apply static filters to all queries and alerts, i.e. job="bar"

    groupLabels: ['job', 'sid'],
    instanceLabels: ['host'],

    // Dashboard configuration
    uid: 'sap-hana',
    dashboardNamePrefix: 'SAP HANA',
    dashboardTags: ['sap-hana-mixin'],
    dashboardPeriod: 'now-1h',
    dashboardTimezone: 'default',
    dashboardRefresh: '1m',

    // Alert thresholds
    alertsCriticalHighCpuUsage: 80, // percent 0-100
    alertsCriticalHighPhysicalMemoryUsage: 80, // percent 0-100
    alertsWarningLowMemAllocLimit: 90, // percent 0-100
    alertsCriticalHighMemoryUsage: 80, // percent 0-100
    alertsCriticalHighDiskUtilization: 80, // percent 0-100
    alertsCriticalHighSqlExecutionTime: 1, // second
    alertsCriticalHighReplicationShippingTime: 1, // second
Review comment (Member) on lines +24 to +25:
Just had a thought: given that the native metrics use milliseconds, we could match that here. It would simplify our alert expressions a bit and lower the chance of issues during maintenance.
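
A sketch of what those two defaults could look like if switched to the metric's native unit (values are illustrative, not taken from this PR):

```jsonnet
alertsCriticalHighSqlExecutionTime: 1000,  // milliseconds
alertsCriticalHighReplicationShippingTime: 1000,  // milliseconds
```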


    // Loki logs configuration
    enableLokiLogs: true,
    extraLogLabels: ['level'],
    logsVolumeGroupBy: 'level',
    showLogsVolume: true,

    // Metrics source for signals
    metricsSource: ['prometheus'],

    // Signal definitions
    signals: {
      system: (import './signals/system.libsonnet')(this),
      instance: (import './signals/instance.libsonnet')(this),
      performance: (import './signals/performance.libsonnet')(this),
    },
  }
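
For illustration, downstream users would typically tune these defaults by overriding `_config`; a minimal sketch, assuming the mixin is vendored under `sap-hana-mixin/` and exposes an entry point named `mixin.libsonnet` (both paths are assumptions, not confirmed by this PR):

```jsonnet
// Hypothetical consumer file.
(import 'sap-hana-mixin/mixin.libsonnet') + {
  _config+:: {
    // Restrict all queries and alerts to one scrape job.
    filteringSelector: 'job="integrations/sap-hana"',
    // Raise the CPU alert threshold from the default of 80%.
    alertsCriticalHighCpuUsage: 90,
  },
}
```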