-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathresource_monitor.py
More file actions
192 lines (161 loc) · 6.52 KB
/
Copy pathresource_monitor.py
File metadata and controls
192 lines (161 loc) · 6.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""
resource_monitor.py
Samples gateway + worker process resource usage every second.
Run in a separate terminal alongside load_tester.py.
Usage:
python3 resource_monitor.py [output_csv]
# default output: logs/resource_<timestamp>.csv
Output CSV columns:
timestamp,
rss_gateway_mb, rss_workers_mb, rss_total_mb,
pss_gateway_mb, pss_workers_mb, pss_total_mb,
cpu_gateway_pct, worker_count
RSS vs PSS:
RSS sums each process's resident pages independently — CoW shared pages
are counted N times for N processes that share them. PSS (Proportional
Set Size, /proc/<pid>/smaps_rollup) divides shared pages by the number
of sharers, so summing PSS across all processes equals true physical
memory usage. Use PSS for fair memory comparisons across baselines that
differ in worker count (e.g. Static-15 vs EWMA+CUSUM).
How to match with load_tester phases:
The CSV uses wall-clock timestamps. Cross-reference with the test_*.log
timestamps to identify which samples fall in which phase.
CSCI 599: Network Systems for Cloud Computing
University of Southern California
"""
import os
import sys
import csv
import time
import signal
import psutil
from datetime import datetime
# Delay, in seconds, that main() sleeps between two consecutive samples.
SAMPLE_INTERVAL = 1.0 # seconds
# Name that find_gateway_proc() compares against each process's name and
# executable path when looking for the gateway.
SERVER_BIN_NAME = "server" # matches the compiled binary name
def find_gateway_proc():
    """Find the C++ gateway process by executable name.

    Walks every running process and returns the first one whose process
    name, or whose executable's file name, equals ``SERVER_BIN_NAME``.

    The executable is matched on its basename rather than with a substring
    test on the whole path, so unrelated processes whose path merely
    contains the word (e.g. ``~/.vscode-server/.../node`` or
    ``/usr/lib/openssh/sftp-server``) are not mistaken for the gateway.

    Returns:
        The matching ``psutil.Process``, or ``None`` when nothing matches.
    """
    for proc in psutil.process_iter(['pid', 'name', 'exe']):
        try:
            info = proc.info
            if info['name'] == SERVER_BIN_NAME:
                return proc
            exe_path = info['exe']
            # 'exe' may be None/empty when psutil could not resolve it.
            if exe_path and os.path.basename(exe_path) == SERVER_BIN_NAME:
                return proc
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process vanished or is off-limits; keep scanning.
            continue
    return None
def count_workers():
    """
    Return the number of worker socket files currently present.

    The result is the number of filesystem paths matching
    /tmp/faas_worker_*.sock. Using this as a live-worker count relies on
    the project convention (described by the original author; not
    verifiable from this file) that every worker creates its socket file
    on bind() and the C++ server unlink()s it when the worker is killed,
    which avoids scanning /proc file descriptors.
    """
    import glob

    socket_paths = glob.iglob("/tmp/faas_worker_*.sock")
    # Count lazily instead of materialising the list of paths.
    return sum(1 for _ in socket_paths)
def read_pss_mb(pid, proc_root="/proc"):
    """
    Read PSS (Proportional Set Size) for a process from
    <proc_root>/<pid>/smaps_rollup and return it in megabytes.

    PSS charges each process 1/N of any page shared by N processes, so
    summing PSS over a set of processes does not double-count pages they
    share (e.g. via CoW), unlike RSS.

    Args:
        pid: Process id (anything that formats into the path).
        proc_root: Mount point of procfs. Defaults to "/proc"; override
            when procfs is mounted elsewhere (e.g. a host /proc bind-mounted
            into a container).

    Returns:
        The value of the "Pss:" line converted from kB to MB, or 0.0 when
        the value cannot be obtained for any reason: process gone,
        permission denied, kernel too old to expose smaps_rollup, any other
        I/O error, a file without a "Pss:" line, or a malformed line.
    """
    try:
        with open(f"{proc_root}/{pid}/smaps_rollup") as f:
            for line in f:
                # "Pss:" (with the colon) does not match the Pss_Anon: /
                # Pss_File: / Pss_Shmem: breakdown lines.
                if line.startswith("Pss:"):
                    # format: "Pss:    12345 kB"
                    return int(line.split()[1]) / 1024.0
    except (OSError, ValueError, IndexError):
        # OSError covers FileNotFoundError, ProcessLookupError and
        # PermissionError plus any other read failure; ValueError/IndexError
        # cover a non-numeric or missing value. All mean "no reading".
        pass
    return 0.0
def sample(gateway_proc):
    """
    Collect one row of gateway and worker metrics.

    Args:
        gateway_proc: psutil.Process handle of the gateway.

    Returns:
        A dict keyed by the CSV column names (timestamp, rss_*/pss_* in MB,
        cpu_gateway_pct, worker_count), or None when the gateway's memory
        or CPU counters cannot be read (psutil.NoSuchProcess or
        psutil.AccessDenied).

    cpu_percent() is called without an interval, so it reports usage since
    the previous call on the same handle; the caller's sleep between
    samples is what provides the measurement window.
    """
    bytes_per_mb = 1024 * 1024

    try:
        gateway_rss_mb = gateway_proc.memory_info().rss / bytes_per_mb
        gateway_cpu = gateway_proc.cpu_percent()  # since last call (~1s)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None

    gateway_pss_mb = read_pss_mb(gateway_proc.pid)
    live_workers = count_workers()

    # Worker memory is summed over every process whose second cmdline
    # argument names worker_template.py (CoW path: the template and its
    # forked children) or ends with worker.py (--no-cow ablation path).
    worker_rss_bytes = 0.0
    worker_pss_bytes = 0.0
    try:
        for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
            try:
                argv = proc.info['cmdline'] or []
                if len(argv) < 2:
                    continue
                target = argv[1]
                is_worker = ('worker_template.py' in target
                             or target.endswith('worker.py'))
                if not is_worker:
                    continue
                worker_rss_bytes += proc.memory_info().rss
                worker_pss_bytes += read_pss_mb(proc.info['pid']) * bytes_per_mb
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process disappeared or is unreadable; skip it.
                pass
    except Exception:
        # Best effort: keep whatever was accumulated before the failure.
        pass

    worker_rss_mb = worker_rss_bytes / bytes_per_mb
    worker_pss_mb = worker_pss_bytes / bytes_per_mb

    row = {}
    row["timestamp"] = datetime.now().strftime("%H:%M:%S")
    row["rss_gateway_mb"] = round(gateway_rss_mb, 2)
    row["rss_workers_mb"] = round(worker_rss_mb, 2)
    row["rss_total_mb"] = round(gateway_rss_mb + worker_rss_mb, 2)
    row["pss_gateway_mb"] = round(gateway_pss_mb, 2)
    row["pss_workers_mb"] = round(worker_pss_mb, 2)
    row["pss_total_mb"] = round(gateway_pss_mb + worker_pss_mb, 2)
    row["cpu_gateway_pct"] = round(gateway_cpu, 1)
    row["worker_count"] = live_workers
    return row
def main():
    """Wait for the gateway, then write one CSV row per sample until stopped.

    The output path is ``sys.argv[1]`` when given, otherwise
    ``logs/resource_<timestamp>.csv``. The directory that will contain the
    CSV is created if missing (``logs`` in the default case).

    SIGINT / SIGTERM handlers are installed before the wait for the gateway
    begins, so Ctrl+C or a ``kill`` stops the monitor cleanly both while
    waiting and while sampling. Sampling also ends when ``sample()`` returns
    ``None`` (gateway no longer readable). Each row is flushed immediately
    so the CSV is usable even if the monitor is killed hard.
    """
    started = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = sys.argv[1] if len(sys.argv) > 1 else f"logs/resource_{started}.csv"

    # Create the directory the CSV is actually written to, not just "logs",
    # so a custom output path in a fresh directory does not fail on open().
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    stop_requested = False

    def _request_stop(signum, frame):
        # Only set a flag; the loops below notice it on their next pass.
        nonlocal stop_requested
        stop_requested = True

    signal.signal(signal.SIGINT, _request_stop)
    signal.signal(signal.SIGTERM, _request_stop)

    print(f"Resource monitor starting. Output -> {out_path}")
    print("Waiting for gateway process...")

    gateway = find_gateway_proc()
    while gateway is None and not stop_requested:
        time.sleep(0.5)
        gateway = find_gateway_proc()
    if gateway is None:
        print("Stopped before a gateway process was found.")
        return

    print(f"Gateway found: pid={gateway.pid}")
    try:
        # Prime the CPU counter; the first real reading comes after the
        # first sleep in the sampling loop.
        gateway.cpu_percent()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Gateway vanished right after discovery; sample() will report it.
        pass
    print(f"Sampling every {SAMPLE_INTERVAL:g}s. Ctrl+C to stop.\n")

    fieldnames = [
        "timestamp",
        "rss_gateway_mb", "rss_workers_mb", "rss_total_mb",
        "pss_gateway_mb", "pss_workers_mb", "pss_total_mb",
        "cpu_gateway_pct", "worker_count",
    ]

    with open(out_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        while not stop_requested:
            row = sample(gateway)
            if row is None:
                print("Gateway process exited. Stopping.")
                break

            writer.writerow(row)
            f.flush()

            print(f"  {row['timestamp']} "
                  f"RSS={row['rss_total_mb']:6.1f}MB "
                  f"PSS={row['pss_total_mb']:6.1f}MB "
                  f"(gw={row['pss_gateway_mb']:.1f} + workers={row['pss_workers_mb']:.1f}) "
                  f"CPU={row['cpu_gateway_pct']:.1f}% "
                  f"workers={row['worker_count']}")

            time.sleep(SAMPLE_INTERVAL)

    print(f"\nDone. CSV saved -> {out_path}")


# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()