Skip to content

Commit 1384a67

Browse files
committed
feat: enhance health monitoring configuration and reporting
- Added new environment variables for health monitoring thresholds to improve configurability.
- Updated the health snapshot service to include additional memory metrics and refined the status reporting logic.
- Modified the health controller to reflect the updated status handling for health snapshots.
- Improved the health settings page to provide clearer data visualization and user feedback based on health metrics.
1 parent 73982a6 commit 1384a67

File tree

6 files changed

+255
-65
lines changed

6 files changed

+255
-65
lines changed

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
SESAME_HEALTH_HEAP_THRESHOLD_MB=1024
2+
SESAME_HEALTH_RSS_THRESHOLD_MB=3072
3+
SESAME_HEALTH_NATIVE_DERIVE_MIN_GROWTH_MB=256

apps/api/src/core/backends/backends.service.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ export class BackendsService extends AbstractQueueProcessor {
466466
attempts: 1,
467467
},
468468
);
469-
console.log('job', job)
469+
// console.log('job', job)
470470
const optionals = {};
471471
if (!options?.async) {
472472
optionals['processedAt'] = new Date();

apps/api/src/core/health/health-collector.service.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ export class HealthCollectorService implements OnModuleInit {
2323
try {
2424
const snapshot = await this.healthSnapshotService.collectSnapshot()
2525
await this.healthHistoryService.appendSnapshot({
26-
status: snapshot.status || 'unknown',
26+
status: snapshot.status === 'error' ? 'down' : snapshot.status || 'unknown',
2727
details: snapshot.details || {},
2828
system: snapshot.system || {},
2929
futureChecks: snapshot.futureChecks || {},

apps/api/src/core/health/health-snapshot.service.ts

Lines changed: 124 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,51 @@
11
import { Injectable } from '@nestjs/common'
22
import { statfs } from 'fs/promises'
33
import { cpus, loadavg, totalmem } from 'os'
4-
import { DiskHealthIndicator, HealthCheckError, HealthCheckResult, HealthCheckService, HttpHealthIndicator, MemoryHealthIndicator, MongooseHealthIndicator } from '@nestjs/terminus'
4+
import { DiskHealthIndicator, HealthCheckError, HealthCheckResult, HealthCheckService, HealthIndicatorResult, HttpHealthIndicator, MemoryHealthIndicator, MongooseHealthIndicator } from '@nestjs/terminus'
55

66
const MEMORY_MULTIPLIER = 1024 * 1024
77
const GIGABYTE_MULTIPLIER = 1024 * 1024 * 1024
88
const CPU_LOAD_THRESHOLD = 0.85
99
const DISK_THRESHOLD_PERCENT = 0.95
10-
const HEAP_MEMORY_THRESHOLD_MB = 512
11-
const RSS_MEMORY_THRESHOLD_MB = 512
10+
const IS_DEV = process.env.NODE_ENV !== 'production'
11+
12+
/**
 * Reads an environment variable as a strictly positive integer.
 *
 * The whole value must be a decimal integer (an optional leading `+` and
 * surrounding whitespace are tolerated). Partially numeric input such as
 * `"512MB"`, `"1e3"` or `"1.5"` is rejected rather than silently truncated,
 * which is what `Number.parseInt` would do on its own.
 *
 * @param key - Name of the variable to look up in `process.env`.
 * @returns The parsed integer, or `null` when the variable is unset, empty,
 *          malformed, zero, negative, or too large to be a safe integer.
 */
const readPositiveIntegerEnv = (key: string): number | null => {
  const rawValue = process.env[key]?.trim()
  if (!rawValue) {
    return null
  }

  // Validate the full string first: parseInt stops at the first non-digit,
  // so "1e3" would otherwise be accepted as 1.
  if (!/^\+?\d+$/.test(rawValue)) {
    return null
  }

  const parsedValue = Number.parseInt(rawValue, 10)
  if (!Number.isSafeInteger(parsedValue) || parsedValue <= 0) {
    return null
  }

  return parsedValue
}
25+
26+
/**
 * Resolves a threshold (in MB) from the environment, falling back to a
 * built-in default.
 *
 * Lookup order:
 *  1. `baseKey` itself,
 *  2. `${baseKey}_DEV` or `${baseKey}_PROD`, chosen by `IS_DEV`,
 *  3. `defaults.dev` or `defaults.prod`, chosen by `IS_DEV`.
 *
 * @param baseKey - Environment variable name without the environment suffix.
 * @param defaults - Fallback values for development and production.
 * @returns The first positive integer found, otherwise the matching default.
 */
const resolveThresholdMb = (baseKey: string, defaults: { dev: number; prod: number }): number => {
  const scopedKey = IS_DEV ? `${baseKey}_DEV` : `${baseKey}_PROD`
  const fallback = IS_DEV ? defaults.dev : defaults.prod

  const genericValue = readPositiveIntegerEnv(baseKey)
  if (genericValue) {
    return genericValue
  }

  const scopedValue = readPositiveIntegerEnv(scopedKey)
  if (scopedValue) {
    return scopedValue
  }

  return fallback
}
34+
35+
// Heap threshold in MB: taken from SESAME_HEALTH_HEAP_THRESHOLD_MB (or its _DEV/_PROD variant),
// otherwise 1024 outside production and 512 in production.
const HEAP_MEMORY_THRESHOLD_MB = resolveThresholdMb('SESAME_HEALTH_HEAP_THRESHOLD_MB', { dev: 1024, prod: 512 })
36+
// RSS threshold in MB: taken from SESAME_HEALTH_RSS_THRESHOLD_MB (or its _DEV/_PROD variant),
// otherwise 3072 outside production and 1024 in production.
const RSS_MEMORY_THRESHOLD_MB = resolveThresholdMb('SESAME_HEALTH_RSS_THRESHOLD_MB', { dev: 3072, prod: 1024 })
37+
// Number of native-memory samples kept in the rolling window; the drift check in
// buildMemoryNativeIndicator only evaluates once this many samples are available.
const NATIVE_MEMORY_DERIVE_MIN_SAMPLES = 6
38+
// Minimum growth (MB) across the sample window before native memory is reported as drifting.
// NOTE(review): "DERIVE" in these names appears to mean "drift" — confirm before renaming anything.
const NATIVE_MEMORY_DERIVE_MIN_GROWTH_MB = resolveThresholdMb('SESAME_HEALTH_NATIVE_DERIVE_MIN_GROWTH_MB', { dev: 256, prod: 128 })
1239

1340
export type HealthSnapshotPayload = HealthCheckResult & {
1441
system: {
1542
memory: {
1643
heapUsedMb: number
1744
heapTotalMb: number
1845
rssMb: number
46+
externalMb: number
47+
arrayBuffersMb: number
48+
nativeMb: number
1949
totalSystemMemoryMb: number
2050
}
2151
cpu: {
@@ -42,6 +72,8 @@ export type HealthSnapshotPayload = HealthCheckResult & {
4272

4373
@Injectable()
4474
export class HealthSnapshotService {
75+
private nativeMemoryHistory: number[] = []
76+
4577
public constructor(
4678
private readonly health: HealthCheckService,
4779
private readonly mongoose: MongooseHealthIndicator,
@@ -51,26 +83,34 @@ export class HealthSnapshotService {
5183
) { }
5284

5385
public async collectSnapshot(): Promise<HealthSnapshotPayload> {
54-
const healthResult = await this.health.check([
55-
() => this.checkMongoose(),
56-
() => this.http.pingCheck('http-github', 'https://github.com'),
57-
() => this.checkStorage(),
58-
() => this.checkMemoryHeap(),
59-
() => this.checkMemoryRss(),
60-
() => this.checkCpu(),
61-
])
86+
const healthResult = await this.collectHealthResult()
6287

6388
const memoryUsage = process.memoryUsage()
6489
const cpuCount = Math.max(cpus().length, 1)
6590
const [load1m, load5m, load15m] = loadavg()
91+
const externalMb = Number((memoryUsage.external / MEMORY_MULTIPLIER).toFixed(2))
92+
const arrayBuffersMb = Number((memoryUsage.arrayBuffers / MEMORY_MULTIPLIER).toFixed(2))
93+
const nativeMb = Number((externalMb + arrayBuffersMb).toFixed(2))
94+
const memoryNativeIndicator = this.buildMemoryNativeIndicator(nativeMb, externalMb, arrayBuffersMb)
95+
96+
const details = {
97+
...((healthResult.details || {}) as HealthIndicatorResult),
98+
memory_native: memoryNativeIndicator,
99+
}
100+
const hasAnyDown = Object.values(details).some((indicator) => indicator?.status === 'down')
66101

67102
return {
68103
...healthResult,
104+
status: hasAnyDown ? 'error' : 'ok',
105+
details,
69106
system: {
70107
memory: {
71108
heapUsedMb: Number((memoryUsage.heapUsed / MEMORY_MULTIPLIER).toFixed(2)),
72109
heapTotalMb: Number((memoryUsage.heapTotal / MEMORY_MULTIPLIER).toFixed(2)),
73110
rssMb: Number((memoryUsage.rss / MEMORY_MULTIPLIER).toFixed(2)),
111+
externalMb,
112+
arrayBuffersMb,
113+
nativeMb,
74114
totalSystemMemoryMb: Number((totalmem() / MEMORY_MULTIPLIER).toFixed(2)),
75115
},
76116
cpu: {
@@ -96,6 +136,79 @@ export class HealthSnapshotService {
96136
}
97137
}
98138

139+
/**
 * Runs all terminus indicators and always resolves to a `HealthCheckResult`
 * when the failure is a health failure (as opposed to an unexpected error).
 *
 * `HealthCheckService.check()` signals a failed check by throwing an HTTP
 * exception (ServiceUnavailableException) whose response body is the full
 * `HealthCheckResult`; it does not surface `HealthCheckError` to the caller.
 * That result is recovered here so the snapshot can still be built and
 * persisted with the failing indicators. The `HealthCheckError` branch is
 * kept for indicators or versions that do let it propagate.
 *
 * @returns The terminus result, whether healthy or not.
 * @throws Any error that is neither a terminus failure result nor a `HealthCheckError`.
 */
private async collectHealthResult(): Promise<HealthCheckResult> {
  try {
    return await this.health.check([
      () => this.checkMongoose(),
      () => this.http.pingCheck('http-github', 'https://github.com'),
      () => this.checkStorage(),
      () => this.checkMemoryHeap(),
      () => this.checkMemoryRss(),
      () => this.checkCpu(),
    ])
  } catch (error: unknown) {
    const failedResult = this.extractFailedHealthResult(error)
    if (failedResult) {
      return failedResult
    }

    if (error instanceof HealthCheckError) {
      const details = this.extractHealthErrorDetails(error)
      return {
        status: 'error',
        info: {},
        error: details,
        details,
      }
    }

    throw error
  }
}

/**
 * Extracts the `HealthCheckResult` carried by an HTTP exception thrown from
 * `HealthCheckService.check()`.
 *
 * Duck-typed on `getResponse()` so no extra exception class has to be
 * imported; anything whose response lacks `status`/`details` is ignored.
 *
 * @param error - The value caught from `health.check()`.
 * @returns The embedded result, or `null` when `error` does not carry one.
 */
private extractFailedHealthResult(error: unknown): HealthCheckResult | null {
  if (typeof error !== 'object' || error === null) {
    return null
  }

  const getResponse = (error as { getResponse?: unknown }).getResponse
  if (typeof getResponse !== 'function') {
    return null
  }

  const response: unknown = getResponse.call(error)
  if (typeof response !== 'object' || response === null) {
    return null
  }

  const { status, details } = response as Partial<HealthCheckResult>
  if (typeof status !== 'string' || typeof details !== 'object' || details === null) {
    return null
  }

  return response as HealthCheckResult
}
163+
164+
/**
 * Converts a `HealthCheckError` into indicator-style details.
 *
 * @param error - The terminus error to inspect.
 * @returns `error.causes` when it is a non-empty object; otherwise a single
 *          `unknown` indicator marked `down` that carries the error message.
 */
private extractHealthErrorDetails(error: HealthCheckError): HealthIndicatorResult {
  const causes = error.causes || {}
  const hasCauses = typeof causes === 'object' && causes !== null && Object.keys(causes).length > 0

  if (!hasCauses) {
    // Nothing usable was attached to the error: synthesize a generic failing entry.
    return {
      unknown: {
        status: 'down',
        message: error.message,
      },
    }
  }

  return causes as HealthIndicatorResult
}
177+
178+
/**
 * Records the latest native-memory sample and derives a drift indicator from
 * the rolling window of the last `NATIVE_MEMORY_DERIVE_MIN_SAMPLES` samples.
 *
 * The indicator is `down` only when the window is full, every sample is
 * strictly greater than the previous one, and the growth between the oldest
 * and newest sample reaches `NATIVE_MEMORY_DERIVE_MIN_GROWTH_MB`.
 *
 * Side effect: mutates `this.nativeMemoryHistory` (push, and shift once the
 * window is exceeded).
 *
 * NOTE(review): the caller passes `nativeMb = externalMb + arrayBuffersMb`;
 * Node documents `arrayBuffers` as already included in `external`, so the
 * absolute value may be double-counted — confirm.
 *
 * @param nativeMb - Current native memory figure in MB.
 * @param externalMb - Current `external` memory in MB (reported as-is).
 * @param arrayBuffersMb - Current `arrayBuffers` memory in MB (reported as-is).
 * @returns The indicator payload, including growth, threshold and sample count.
 */
private buildMemoryNativeIndicator(nativeMb: number, externalMb: number, arrayBuffersMb: number): {
  status: 'up' | 'down'
  nativeMb: number
  externalMb: number
  arrayBuffersMb: number
  growthMb: number
  growthThresholdMb: number
  sampleCount: number
} {
  this.nativeMemoryHistory.push(nativeMb)
  if (this.nativeMemoryHistory.length > NATIVE_MEMORY_DERIVE_MIN_SAMPLES) {
    this.nativeMemoryHistory.shift()
  }

  const sampleCount = this.nativeMemoryHistory.length
  // `??` rather than `||`: an oldest sample of exactly 0 is a valid baseline and
  // must not be replaced by the current value (which would force growth to 0).
  const firstValue = this.nativeMemoryHistory[0] ?? nativeMb
  const growthMb = Number((nativeMb - firstValue).toFixed(2))
  const hasEnoughSamples = sampleCount >= NATIVE_MEMORY_DERIVE_MIN_SAMPLES
  const isStrictlyIncreasing =
    hasEnoughSamples &&
    this.nativeMemoryHistory.every((value, index, values) => index === 0 || value > values[index - 1])
  const isDrifting = isStrictlyIncreasing && growthMb >= NATIVE_MEMORY_DERIVE_MIN_GROWTH_MB

  return {
    status: isDrifting ? 'down' : 'up',
    nativeMb,
    externalMb,
    arrayBuffersMb,
    growthMb,
    growthThresholdMb: NATIVE_MEMORY_DERIVE_MIN_GROWTH_MB,
    sampleCount,
  }
}
211+
99212
private checkCpu(): Record<string, { status: 'up' | 'down'; load1mPerCore: number; threshold: number; cores: number }> {
100213
const cpuCount = Math.max(cpus().length, 1)
101214
const perCoreLoad = loadavg()[0] / cpuCount

apps/api/src/core/health/health.controller.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ export class HealthController {
101101
if (!latestSnapshot) {
102102
const freshSnapshot = await this.healthSnapshotService.collectSnapshot()
103103
await this.healthHistoryService.appendSnapshot({
104-
status: freshSnapshot.status || 'unknown',
104+
status: freshSnapshot.status === 'error' ? 'down' : freshSnapshot.status || 'unknown',
105105
details: freshSnapshot.details || {},
106106
system: freshSnapshot.system || {},
107107
futureChecks: freshSnapshot.futureChecks || {},

0 commit comments

Comments
 (0)