Skip to content

Commit 56a2aaf

Browse files
committed
Add health reporting
1 parent 374b975 commit 56a2aaf

16 files changed

Lines changed: 657 additions & 8 deletions

File tree

agent/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ require (
3131
github.com/go-ole/go-ole v1.2.6 // indirect
3232
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
3333
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
34+
github.com/shoenig/go-m1cpu v0.1.6 // indirect
3435
github.com/tklauser/go-sysconf v0.3.12 // indirect
3536
github.com/tklauser/numcpus v0.6.1 // indirect
3637
github.com/yusufpapurcu/wmi v1.2.4 // indirect

agent/go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF
5353
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
5454
github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI=
5555
github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk=
56+
github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM=
57+
github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ=
58+
github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU=
59+
github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
5660
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
5761
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
5862
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=

agent/internal/agent/reporting.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,25 @@ import (
55
"log"
66
"os"
77
"runtime"
8+
"sync"
89
"time"
910

1011
"techulus/cloud-agent/internal/container"
12+
"techulus/cloud-agent/internal/health"
1113
agenthttp "techulus/cloud-agent/internal/http"
1214
"techulus/cloud-agent/internal/logs"
1315

1416
"github.com/shirou/gopsutil/v3/disk"
1517
"github.com/shirou/gopsutil/v3/mem"
1618
)
1719

20+
// Package-level state used when attaching health data to status reports.
var (
	// agentStartTime is captured once at package initialisation; reported
	// uptime is measured from this instant.
	agentStartTime = time.Now()

	// agentVersion is the version string placed in the agent health section.
	// It stays "dev" until SetAgentVersion replaces it.
	agentVersion = "dev"

	// healthCollectMu is held by ReportStatus while it reads and updates
	// lastHealthCollect and gathers the health sections.
	healthCollectMu sync.Mutex

	// lastHealthCollect is when health data was last gathered. Its zero value
	// makes the first report collect immediately.
	lastHealthCollect time.Time
)
26+
1827
func (a *Agent) HeartbeatLoop(ctx context.Context) {
1928
ticker := time.NewTicker(10 * time.Second)
2029
defer ticker.Stop()
@@ -29,6 +38,10 @@ func (a *Agent) HeartbeatLoop(ctx context.Context) {
2938
}
3039
}
3140

41+
func SetAgentVersion(version string) {
42+
agentVersion = version
43+
}
44+
3245
func (a *Agent) ReportStatus(includeResources bool) {
3346
report := &agenthttp.StatusReport{
3447
PublicIP: a.PublicIP,
@@ -42,6 +55,23 @@ func (a *Agent) ReportStatus(includeResources bool) {
4255
report.Meta = GetSystemMeta()
4356
}
4457

58+
healthCollectMu.Lock()
59+
if time.Since(lastHealthCollect) >= 60*time.Second {
60+
report.HealthStats = health.CollectSystemStats()
61+
report.NetworkHealth = health.CollectNetworkHealth("wg0")
62+
report.ContainerHealth = health.CollectContainerHealth()
63+
report.AgentHealth = &agenthttp.AgentHealth{
64+
Version: agentVersion,
65+
UptimeSecs: int64(time.Since(agentStartTime).Seconds()),
66+
}
67+
lastHealthCollect = time.Now()
68+
log.Printf("[health] collected: cpu=%.1f%%, mem=%.1f%%, disk=%.1f%%, network=%v, containers=%d running",
69+
report.HealthStats.CpuUsagePercent, report.HealthStats.MemoryUsagePercent,
70+
report.HealthStats.DiskUsagePercent, report.NetworkHealth.TunnelUp,
71+
report.ContainerHealth.RunningContainers)
72+
}
73+
healthCollectMu.Unlock()
74+
4575
containers, err := container.List()
4676
if err == nil {
4777
for _, c := range containers {

agent/internal/health/health.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package health
2+
3+
import (
4+
"os/exec"
5+
"strconv"
6+
"strings"
7+
"time"
8+
9+
"github.com/shirou/gopsutil/v3/cpu"
10+
"github.com/shirou/gopsutil/v3/disk"
11+
"github.com/shirou/gopsutil/v3/mem"
12+
)
13+
14+
// SystemStats is a point-in-time snapshot of host resource usage as filled in
// by CollectSystemStats. Any metric that could not be read keeps its zero
// value.
type SystemStats struct {
	// CpuUsagePercent is the aggregate CPU utilisation as a percentage.
	CpuUsagePercent float64 `json:"cpuUsagePercent"`

	// MemoryUsagePercent is the share of memory in use as a percentage.
	MemoryUsagePercent float64 `json:"memoryUsagePercent"`
	// MemoryUsedMb is used memory in bytes divided by 1024*1024, truncated.
	MemoryUsedMb int `json:"memoryUsedMb"`

	// DiskUsagePercent is the share of the "/" filesystem in use.
	DiskUsagePercent float64 `json:"diskUsagePercent"`
	// DiskUsedGb is used bytes on "/" divided by 1024^3, truncated.
	DiskUsedGb int `json:"diskUsedGb"`
}
21+
22+
// NetworkPeerHealth summarises one tunnel peer as seen by
// CollectNetworkHealth.
type NetworkPeerHealth struct {
	// ID is a shortened form of the peer's public key.
	ID string `json:"id"`
	// LastSeenSecs is the number of seconds since the peer's last handshake.
	// It is 0 when no handshake has been recorded or the timestamp could not
	// be parsed.
	LastSeenSecs int `json:"lastSeenSecs"`
	// Reachable is true when the last handshake is less than 180 seconds old.
	Reachable bool `json:"reachable"`
}
27+
28+
type NetworkHealth struct {
29+
TunnelUp bool `json:"tunnelUp"`
30+
PeerCount int `json:"peerCount"`
31+
Peers []NetworkPeerHealth `json:"peers"`
32+
}
33+
34+
// ContainerHealth summarises the container runtime as seen by
// CollectContainerHealth.
type ContainerHealth struct {
	// RuntimeResponsive is true when listing containers succeeded.
	RuntimeResponsive bool `json:"runtimeResponsive"`

	// RunningContainers counts containers whose state is "running".
	RunningContainers int `json:"runningContainers"`
	// StoppedContainers counts containers in any other state.
	StoppedContainers int `json:"stoppedContainers"`

	// StorageUsedGb is used bytes divided by 1024^3 for the filesystem that
	// holds the runtime's storage root; 0 when it could not be determined.
	StorageUsedGb float64 `json:"storageUsedGb"`
}
40+
41+
// AgentHealthInfo carries version, uptime and last-sync information about the
// agent process.
//
// NOTE(review): nothing in the visible code populates this type; status
// reports use a separate AgentHealth struct — confirm whether both are needed.
type AgentHealthInfo struct {
	// Version is the agent's version string.
	Version string `json:"version"`
	// UptimeSecs is the agent's uptime in seconds.
	UptimeSecs int64 `json:"uptimeSecs"`

	// LastSyncSuccess records whether the most recent sync succeeded.
	LastSyncSuccess bool `json:"lastSyncSuccess"`
	// LastSyncAt is when the most recent sync happened.
	// NOTE(review): the string format is not established here — confirm.
	LastSyncAt string `json:"lastSyncAt"`
}
47+
48+
func CollectSystemStats() *SystemStats {
49+
stats := &SystemStats{}
50+
51+
cpuPercent, err := cpu.Percent(time.Second, false)
52+
if err == nil && len(cpuPercent) > 0 {
53+
stats.CpuUsagePercent = cpuPercent[0]
54+
}
55+
56+
memInfo, err := mem.VirtualMemory()
57+
if err == nil {
58+
stats.MemoryUsagePercent = memInfo.UsedPercent
59+
stats.MemoryUsedMb = int(memInfo.Used / 1024 / 1024)
60+
}
61+
62+
diskInfo, err := disk.Usage("/")
63+
if err == nil {
64+
stats.DiskUsagePercent = diskInfo.UsedPercent
65+
stats.DiskUsedGb = int(diskInfo.Used / 1024 / 1024 / 1024)
66+
}
67+
68+
return stats
69+
}
70+
71+
func CollectNetworkHealth(interfaceName string) *NetworkHealth {
72+
health := &NetworkHealth{
73+
TunnelUp: false,
74+
Peers: []NetworkPeerHealth{},
75+
}
76+
77+
cmd := exec.Command("wg", "show", interfaceName, "dump")
78+
output, err := cmd.CombinedOutput()
79+
if err != nil {
80+
return health
81+
}
82+
83+
health.TunnelUp = true
84+
85+
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
86+
if len(lines) < 1 {
87+
return health
88+
}
89+
90+
for i, line := range lines {
91+
if i == 0 {
92+
continue
93+
}
94+
95+
fields := strings.Split(line, "\t")
96+
if len(fields) < 5 {
97+
continue
98+
}
99+
100+
publicKey := fields[0]
101+
lastHandshake := fields[4]
102+
103+
var lastSeenSecs int
104+
reachable := false
105+
106+
if lastHandshake != "0" {
107+
ts, err := parseUnixTimestamp(lastHandshake)
108+
if err == nil {
109+
lastSeenSecs = int(time.Since(ts).Seconds())
110+
reachable = lastSeenSecs < 180
111+
}
112+
}
113+
114+
health.Peers = append(health.Peers, NetworkPeerHealth{
115+
ID: publicKey[:8],
116+
LastSeenSecs: lastSeenSecs,
117+
Reachable: reachable,
118+
})
119+
}
120+
121+
health.PeerCount = len(health.Peers)
122+
123+
return health
124+
}
125+
126+
// parseUnixTimestamp interprets s as a base-10 count of whole seconds since
// the Unix epoch and returns the corresponding time. When s is not a valid
// 64-bit integer it returns the zero time.Time together with the strconv
// error.
func parseUnixTimestamp(s string) (time.Time, error) {
	secs, parseErr := strconv.ParseInt(s, 10, 64)
	if parseErr == nil {
		return time.Unix(secs, 0), nil
	}
	return time.Time{}, parseErr
}
133+
134+
func CollectContainerHealth() *ContainerHealth {
135+
health := &ContainerHealth{
136+
RuntimeResponsive: false,
137+
}
138+
139+
cmd := exec.Command("podman", "ps", "-a", "--format", "{{.State}}")
140+
output, err := cmd.CombinedOutput()
141+
if err != nil {
142+
return health
143+
}
144+
145+
health.RuntimeResponsive = true
146+
147+
states := strings.Split(strings.TrimSpace(string(output)), "\n")
148+
for _, state := range states {
149+
if state == "" {
150+
continue
151+
}
152+
if state == "running" {
153+
health.RunningContainers++
154+
} else {
155+
health.StoppedContainers++
156+
}
157+
}
158+
159+
infoCmd := exec.Command("podman", "system", "info", "--format", "{{.Store.GraphRoot}}")
160+
infoOutput, err := infoCmd.CombinedOutput()
161+
if err == nil {
162+
graphRoot := strings.TrimSpace(string(infoOutput))
163+
if graphRoot != "" {
164+
diskInfo, err := disk.Usage(graphRoot)
165+
if err == nil {
166+
health.StorageUsedGb = float64(diskInfo.Used) / 1024 / 1024 / 1024
167+
}
168+
}
169+
}
170+
171+
return health
172+
}

agent/internal/http/client.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"time"
1414

1515
"techulus/cloud-agent/internal/crypto"
16+
"techulus/cloud-agent/internal/health"
1617
)
1718

1819
type Client struct {
@@ -234,13 +235,22 @@ type Resources struct {
234235
DiskGb int `json:"diskGb"`
235236
}
236237

238+
// AgentHealth is the agent-process section of a StatusReport.
type AgentHealth struct {
	// Version is the agent's version string.
	Version string `json:"version"`
	// UptimeSecs is how long the agent has been running, in whole seconds.
	UptimeSecs int64 `json:"uptimeSecs"`
}
242+
237243
type StatusReport struct {
238-
Resources *Resources `json:"resources,omitempty"`
239-
PublicIP string `json:"publicIp,omitempty"`
240-
PrivateIP string `json:"privateIp,omitempty"`
241-
Meta map[string]string `json:"meta,omitempty"`
242-
Containers []ContainerStatus `json:"containers"`
243-
DnsInSync bool `json:"dnsInSync,omitempty"`
244+
Resources *Resources `json:"resources,omitempty"`
245+
PublicIP string `json:"publicIp,omitempty"`
246+
PrivateIP string `json:"privateIp,omitempty"`
247+
Meta map[string]string `json:"meta,omitempty"`
248+
Containers []ContainerStatus `json:"containers"`
249+
DnsInSync bool `json:"dnsInSync,omitempty"`
250+
HealthStats *health.SystemStats `json:"healthStats,omitempty"`
251+
NetworkHealth *health.NetworkHealth `json:"networkHealth,omitempty"`
252+
ContainerHealth *health.ContainerHealth `json:"containerHealth,omitempty"`
253+
AgentHealth *AgentHealth `json:"agentHealth,omitempty"`
244254
}
245255

246256
func (c *Client) ReportStatus(report *StatusReport) error {

web/app/(dashboard)/dashboard/page.tsx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { Box } from "lucide-react";
22
import Link from "next/link";
3-
import { listProjects, listServers } from "@/db/queries";
3+
import { getClusterHealth, listProjects, listServers } from "@/db/queries";
4+
import { ClusterHealthSummary } from "@/components/cluster/cluster-health-summary";
45
import { CreateProjectDialog } from "@/components/project/create-project-dialog";
56
import { ServerList } from "@/components/server/server-list";
67
import {
@@ -20,13 +21,16 @@ import {
2021
} from "@/components/ui/item";
2122

2223
export default async function DashboardPage() {
23-
const [servers, projects] = await Promise.all([
24+
const [servers, projects, clusterHealth] = await Promise.all([
2425
listServers(),
2526
listProjects(),
27+
getClusterHealth(),
2628
]);
2729

2830
return (
2931
<div className="container max-w-7xl mx-auto px-4 py-6 space-y-12">
32+
{servers.length > 0 && <ClusterHealthSummary initialData={clusterHealth} />}
33+
3034
<div className="space-y-6">
3135
<div className="flex items-center justify-between">
3236
<div>

web/app/(dashboard)/dashboard/servers/[id]/page.tsx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { LogViewer } from "@/components/logs/log-viewer";
1212
import { Label } from "@/components/ui/label";
1313
import { ServerDangerZone } from "@/components/server/server-danger-zone";
1414
import { ServerHeader } from "@/components/server/server-header";
15+
import { ServerHealthDetails } from "@/components/server/server-health-details";
1516
import { ServerServices } from "@/components/server/server-services";
1617
import { formatRelativeTime } from "@/lib/date";
1718

@@ -139,6 +140,16 @@ export default async function ServerDetailPage({
139140
</CardContent>
140141
</Card>
141142

143+
<ServerHealthDetails
144+
serverId={id}
145+
initialData={{
146+
healthStats: server.healthStats,
147+
networkHealth: server.networkHealth,
148+
containerHealth: server.containerHealth,
149+
agentHealth: server.agentHealth,
150+
}}
151+
/>
152+
142153
<ServerServices serverId={id} />
143154

144155
<div className="space-y-2">
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
export const dynamic = "force-dynamic";
2+
3+
import { auth } from "@/lib/auth";
4+
import { headers } from "next/headers";
5+
import { getClusterHealth } from "@/db/queries";
6+
7+
/**
 * Returns the current cluster health as JSON.
 *
 * The caller's session is resolved from the incoming request headers.
 * Requests without a session receive a 401 response with an
 * `{ error: "Unauthorized" }` body and the health query is not run.
 */
export async function GET() {
	const requestHeaders = await headers();
	const session = await auth.api.getSession({ headers: requestHeaders });

	if (session) {
		return Response.json(await getClusterHealth());
	}

	return Response.json({ error: "Unauthorized" }, { status: 401 });
}

0 commit comments

Comments
 (0)