Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
- adjust the following settings for `uvicorn` as environment variables:
- HOST
- PORT
- WORKERS (Default : 3)
- WORKERS (Default : 1)
- LOG_LEVEL=(Default : warning)

⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly.
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ Replace `<PID>` with the pid of ./start.sh script.
- adjust the following settings for `uvicorn` as environment variables:
- HOST (Default: 0.0.0.0)
- PORT (Default: 8000)
- WORKERS (Default : 3)
- WORKERS (Default : 1)
- LOG_LEVEL (Default : warning)

Example:
Expand Down Expand Up @@ -145,9 +145,9 @@ Check out [dashboards](./dashboards) directory for Json files. including CPU & M
- [x] Check and Unregister *stat* metrics for containers that are not running
- [x] Design and develop a static website to showcase Documentation, new features, etc.
- [x] Enable functionality and smoke testing in ci
- [x] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand.
- [ ] Design grafana dashboards and share them on grafana cloud
- [ ] Add unit tests
- [ ] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand.

## Contributions
Welcome to CXP! This project is now production-ready, and we encourage contributions to enhance its functionality, optimize code, and add new features.
Expand Down
60 changes: 60 additions & 0 deletions alerting/sample_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Prometheus alerting rules for the container exporter (cxp_* metrics).
# Thresholds: warning at >80% sustained usage, critical at >90%; all alerts
# require the condition to hold for 5 minutes before firing.
groups:
  - name: container-status
    rules:
      - alert: ContainerDown
        expr: cxp_container_status == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.container_name }} is down"
          description: "No status heartbeat for container '{{ $labels.container_name }}' in the last 5 minutes."

  - name: container-resource-usage
    rules:
      - alert: HighCPUUsage
        # Averaged over 5m to avoid flapping on short CPU spikes.
        expr: avg_over_time(cxp_cpu_percentage[5m]) > 80.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.container_name }}"
          description: "CPU usage has averaged >80% for more than 5 minutes."

      - alert: CriticalCPUUsage
        expr: avg_over_time(cxp_cpu_percentage[5m]) > 90.0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU usage on {{ $labels.container_name }}"
          description: "CPU usage has averaged >90% for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: avg_over_time(cxp_memory_percentage[5m]) > 80.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.container_name }}"
          description: "Memory usage has averaged >80% for more than 5 minutes."

      - alert: CriticalMemoryUsage
        expr: avg_over_time(cxp_memory_percentage[5m]) > 90.0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on {{ $labels.container_name }}"
          description: "Memory usage has averaged >90% for more than 5 minutes."

  - name: exporter-health
    rules:
      - alert: ExporterDown
        # absent() fires when the exporter itself stops producing the metric,
        # which ContainerDown (== 0) cannot detect.
        expr: absent(cxp_container_status)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Container exporter metrics missing"
          description: "No cxp_container_status metric scraped for more than 5 minutes; exporter may be down or unreachable."
15 changes: 6 additions & 9 deletions container_exporter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from asyncio import gather
from aiodocker import Docker
from aiodocker.containers import DockerContainer
from stats import get_docker_stats as stat
Expand All @@ -9,21 +8,22 @@
from contextlib import asynccontextmanager
from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
from logging import basicConfig, error, ERROR
from settings.settings import settings

docker_client: Docker

@asynccontextmanager
async def lifespan(app: FastAPI):
global docker_client
docker_client = Docker()

yield

await docker_client.close()

app = FastAPI(lifespan=lifespan)

gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
Expand All @@ -42,15 +42,12 @@ async def get_containers(all=False) -> list[DockerContainer]:
return await docker_client.containers.list(all=all)

def update_container_status(running_containers:list[DockerContainer]):

current_names = [c._container.get("Names")[0][1:] for c in running_containers]
for name in current_names:
gauge_container_status.labels(container_name=name).set(1)
for c in running_containers:
gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)

# Async metrics gathering
async def container_stats( running_containers: list[DockerContainer]):
tasks = [stat.get_container_stats(container) for container in running_containers]
all_stats = await gather(*tasks)
all_stats = await stat.get_containers_stats(running_containers)

for stats in all_stats:
name = stats[0]['name'][1:]
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ multidict==6.4.4
prometheus-client==0.17.1
propcache==0.3.1
pydantic==2.11.5
pydantic-settings==2.9.1
pydantic_core==2.33.2
python-dotenv==1.1.0
sniffio==1.3.1
starlette==0.46.2
typing-inspection==0.4.1
Expand Down
2 changes: 2 additions & 0 deletions scripts/healthcheck-ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ log "Starting CI Healthcheck..."
log "Spinning up test container: $container_name"
docker run -d --name "$container_name" alpine sleep 60 >/dev/null || fail "Failed to start container"

sleep 3

log "Checking root endpoint..."
if curl --silent --fail http://localhost:8000/ > "${log_dir}/index.txt"; then
pass "Root endpoint responded successfully."
Expand Down
12 changes: 12 additions & 0 deletions settings/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# settings.py
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application configuration, read from environment variables with an
    optional ``.env`` file fallback.

    Field defaults apply when neither an environment variable nor a .env
    entry is present.
    """

    # pydantic-settings 2.x style: the inner ``class Config`` is deprecated
    # in favor of ``model_config`` with SettingsConfigDict.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Deployment environment name (e.g. "production", "development").
    CONTAINER_EXPORTER_ENV: str = "production"
    # Enable debug behaviour when true.
    CONTAINER_EXPORTER_DEBUG: bool = False
    # True: clear stale metric label sets entirely; False: keep them at 0
    # to preserve time-series continuity.
    CONTAINER_EXPORTER_CLEAR_METRICS: bool = True


# Module-level singleton imported by the rest of the application.
settings = Settings()
2 changes: 1 addition & 1 deletion start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Configurable variables
HOST=${HOST:-0.0.0.0}
PORT=${PORT:-8000}
WORKERS=${WORKERS:-3}
WORKERS=${WORKERS:-1}
LOG_LEVEL=${LOG_LEVEL:-warning}

echo "Starting Container Exporter..."
Expand Down
78 changes: 45 additions & 33 deletions stats/get_docker_stats.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,61 @@
from aiodocker.docker import DockerContainer
from asyncio import gather

def calculate_cpu_percentage(stats:dict) -> float:
cpu_percent = 0

cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
def calculate_cpu_percentage(stats: dict) -> float:
cpu_stats = stats.get('cpu_stats', {})
precpu_stats = stats.get('precpu_stats', {})
total = cpu_stats.get('cpu_usage', {}).get('total_usage')
prev_total = precpu_stats.get('cpu_usage', {}).get('total_usage')
system = cpu_stats.get('system_cpu_usage')
prev_system = precpu_stats.get('system_cpu_usage')
n_cpus = cpu_stats.get('online_cpus')

system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
if None in (total, prev_total, system, prev_system, n_cpus):
return 0.0

number_cpus = stats['cpu_stats']['online_cpus']
if cpu_delta is not None and system_delta is not None and number_cpus is not None:
cpu_percent = (cpu_delta / system_delta) * number_cpus * 100.0
cpu_delta = total - prev_total
system_delta = system - prev_system

return cpu_percent
if system_delta <= 0:
return 0.0

def calculate_memory_percentage(stats) -> float:
memory_percent = 0
memory_usage_bytes = 0

memory_usage_bytes = stats['memory_stats']['usage']
memory_limit = stats['memory_stats']['limit']
if memory_usage_bytes is not None and memory_limit is not None:
memory_percent = (memory_usage_bytes / memory_limit) * 100.0
return (cpu_delta / system_delta) * n_cpus * 100.0


def calculate_memory_percentage(stats: dict) -> float:
    """Return container memory usage as a percentage of its configured limit.

    Yields 0.0 when the memory stats section, the usage value, or the limit
    is missing, or when the limit is zero (avoids division by zero).
    """
    memory = stats.get('memory_stats', {})
    used = memory.get('usage')
    cap = memory.get('limit')

    if used is not None and cap is not None and cap != 0:
        return (used / cap) * 100.0
    return 0.0

return memory_percent

def calculate_memory_bytes(stats) -> bytes:
memory_usage_bytes = stats['memory_stats']['usage']
mem_stats = stats.get('memory_stats', {}) or {}
memory_usage_bytes = mem_stats.get('usage')

if memory_usage_bytes is not None:
return memory_usage_bytes
return 0
return 0.0

def calculate_disk_io(stats) -> bytes:
def calculate_disk_io(stats: dict) -> bytes:
disk_io_read = 0
disk_io_write = 0

if "blkio_stats" in stats and "io_service_bytes_recursive" in stats["blkio_stats"]:
io_service_bytes_recursive = stats["blkio_stats"]["io_service_bytes_recursive"]
io_list = stats.get("blkio_stats", {}) \
.get("io_service_bytes_recursive") or []

for io_stat in io_list:
op = io_stat.get("op")
value = io_stat.get("value", 0)
if op == "read":
disk_io_read += value
elif op == "write":
disk_io_write += value

if io_service_bytes_recursive is not None:
for io_stat in io_service_bytes_recursive:
if "op" in io_stat and "value" in io_stat:
if io_stat["op"] == "read":
disk_io_read += io_stat["value"]
elif io_stat["op"] == "write":
disk_io_write += io_stat["value"]

return disk_io_read, disk_io_write

Expand All @@ -60,6 +72,6 @@ def calculate_network_io(stats) -> bytes:

return network_rx_bytes, network_tx_bytes

async def get_container_stats(container:DockerContainer):
stats = await container.stats(stream=False)
return stats
async def get_containers_stats(containers:list[DockerContainer]):
    """Fetch a one-shot stats snapshot for every container concurrently.

    Results are returned in the same order as *containers* (gather preserves
    argument order).
    """
    pending = (container.stats(stream=False) for container in containers)
    return await gather(*pending)
11 changes: 8 additions & 3 deletions utils/metrics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from aiodocker.containers import DockerContainer
from typing import Union, Iterable
from prometheus_client import Gauge, Counter

from settings.settings import settings
PromMetric = Union[Gauge, Counter]

def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
Expand All @@ -15,8 +15,13 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
for labels in metric._metrics:
name = labels[0]
if name not in active_set:
metric.clear()

if settings.CONTAINER_EXPORTER_CLEAR_METRICS:
metric.clear()
elif isinstance(metric, Gauge):
metric.labels(container_name=name).set(0)
else:
metric.clear()

for metric in persistent_metrics:
for labels in list(metric._metrics):
name = labels[0]
Expand Down