Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
- adjust the following settings for `uvicorn` as environment variables:
- HOST
- PORT
- WORKERS (Default : 3)
- WORKERS (Default : 1)
- LOG_LEVEL=(Default : warning)

⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly.
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ Replace `<PID>` with the pid of ./start.sh script.
- adjust the following settings for `uvicorn` as environment variables:
- HOST (Default: 0.0.0.0)
- PORT (Default: 8000)
- WORKERS (Default : 3)
- WORKERS (Default : 1)
- LOG_LEVEL (Default : warning)

Example:
Expand Down Expand Up @@ -145,9 +145,9 @@ Check out [dashboards](./dashboards) directory for Json files. including CPU & M
- [x] Check and Unregister *stat* metrics for containers that are not running
- [x] Design and develop a static website to showcase Documentation, new features, etc.
- [x] Enable functionality and smoke testing in ci
- [x] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand.
- [ ] Design grafana dashboards and share them on grafana cloud
- [ ] Add unit tests
- [ ] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand.

## Contributions
Welcome to CXP! This project is now production-ready, and we encourage contributions to enhance its functionality, optimize code, and add new features.
Expand Down
60 changes: 60 additions & 0 deletions alerting/sample_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Prometheus alerting rules for the container exporter (cxp_* metrics).
# Thresholds: warning at >80% sustained usage, critical at >90%; all alerts
# require the condition to hold for 5 minutes before firing.
groups:
  - name: container-status
    rules:
      - alert: ContainerDown
        expr: cxp_container_status == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.container_name }} is down"
          description: "No status heartbeat for container '{{ $labels.container_name }}' in the last 5 minutes."

  - name: container-resource-usage
    rules:
      - alert: HighCPUUsage
        # Averaged over 5m to avoid flapping on short CPU spikes.
        expr: avg_over_time(cxp_cpu_percentage[5m]) > 80.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.container_name }}"
          description: "CPU usage has averaged >80% for more than 5 minutes."

      - alert: CriticalCPUUsage
        expr: avg_over_time(cxp_cpu_percentage[5m]) > 90.0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU usage on {{ $labels.container_name }}"
          description: "CPU usage has averaged >90% for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: avg_over_time(cxp_memory_percentage[5m]) > 80.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.container_name }}"
          description: "Memory usage has averaged >80% for more than 5 minutes."

      - alert: CriticalMemoryUsage
        expr: avg_over_time(cxp_memory_percentage[5m]) > 90.0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on {{ $labels.container_name }}"
          description: "Memory usage has averaged >90% for more than 5 minutes."

  - name: exporter-health
    rules:
      - alert: ExporterDown
        # absent() fires when the exporter itself stops producing the metric,
        # which ContainerDown (== 0) cannot detect.
        expr: absent(cxp_container_status)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Container exporter metrics missing"
          description: "No cxp_container_status metric scraped for more than 5 minutes; exporter may be down or unreachable."
15 changes: 6 additions & 9 deletions container_exporter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from asyncio import gather
from aiodocker import Docker
from aiodocker.containers import DockerContainer
from stats import get_docker_stats as stat
Expand All @@ -9,21 +8,22 @@
from contextlib import asynccontextmanager
from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
from logging import basicConfig, error, ERROR
from settings.settings import settings

docker_client: Docker

@asynccontextmanager
async def lifespan(app: FastAPI):
global docker_client
docker_client = Docker()

yield

await docker_client.close()

app = FastAPI(lifespan=lifespan)

gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
Expand All @@ -42,15 +42,12 @@ async def get_containers(all=False) -> list[DockerContainer]:
return await docker_client.containers.list(all=all)

def update_container_status(running_containers:list[DockerContainer]):

current_names = [c._container.get("Names")[0][1:] for c in running_containers]
for name in current_names:
gauge_container_status.labels(container_name=name).set(1)
for c in running_containers:
gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)

# Async metrics gathering
async def container_stats( running_containers: list[DockerContainer]):
tasks = [stat.get_container_stats(container) for container in running_containers]
all_stats = await gather(*tasks)
all_stats = await stat.get_containers_stats(running_containers)

for stats in all_stats:
name = stats[0]['name'][1:]
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ multidict==6.4.4
prometheus-client==0.17.1
propcache==0.3.1
pydantic==2.11.5
pydantic-settings==2.9.1
pydantic_core==2.33.2
python-dotenv==1.1.0
sniffio==1.3.1
starlette==0.46.2
typing-inspection==0.4.1
Expand Down
2 changes: 2 additions & 0 deletions scripts/healthcheck-ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ log "Starting CI Healthcheck..."
log "Spinning up test container: $container_name"
docker run -d --name "$container_name" alpine sleep 60 >/dev/null || fail "Failed to start container"

sleep 3

log "Checking root endpoint..."
if curl --silent --fail http://localhost:8000/ > "${log_dir}/index.txt"; then
pass "Root endpoint responded successfully."
Expand Down
12 changes: 12 additions & 0 deletions settings/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# settings.py
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application configuration, read from environment variables with an
    optional ``.env`` file fallback.

    Field defaults apply when neither an environment variable nor a .env
    entry is present.
    """

    # pydantic-settings 2.x style: the inner ``class Config`` is deprecated
    # in favor of ``model_config`` with SettingsConfigDict.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Deployment environment name (e.g. "production", "development").
    CONTAINER_EXPORTER_ENV: str = "production"
    # Enable debug behaviour when true.
    CONTAINER_EXPORTER_DEBUG: bool = False
    # True: clear stale metric label sets entirely; False: keep them at 0
    # to preserve time-series continuity.
    CONTAINER_EXPORTER_CLEAR_METRICS: bool = True


# Module-level singleton imported by the rest of the application.
settings = Settings()
2 changes: 1 addition & 1 deletion start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Configurable variables
HOST=${HOST:-0.0.0.0}
PORT=${PORT:-8000}
WORKERS=${WORKERS:-3}
WORKERS=${WORKERS:-1}
LOG_LEVEL=${LOG_LEVEL:-warning}

echo "Starting Container Exporter..."
Expand Down
78 changes: 45 additions & 33 deletions stats/get_docker_stats.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,61 @@
from aiodocker.docker import DockerContainer
from asyncio import gather

def calculate_cpu_percentage(stats:dict) -> float:
cpu_percent = 0

cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
def calculate_cpu_percentage(stats: dict) -> float:
cpu_stats = stats.get('cpu_stats', {})
precpu_stats = stats.get('precpu_stats', {})
total = cpu_stats.get('cpu_usage', {}).get('total_usage')
prev_total = precpu_stats.get('cpu_usage', {}).get('total_usage')
system = cpu_stats.get('system_cpu_usage')
prev_system = precpu_stats.get('system_cpu_usage')
n_cpus = cpu_stats.get('online_cpus')

system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
if None in (total, prev_total, system, prev_system, n_cpus):
return 0.0

number_cpus = stats['cpu_stats']['online_cpus']
if cpu_delta is not None and system_delta is not None and number_cpus is not None:
cpu_percent = (cpu_delta / system_delta) * number_cpus * 100.0
cpu_delta = total - prev_total
system_delta = system - prev_system

return cpu_percent
if system_delta <= 0:
return 0.0

def calculate_memory_percentage(stats) -> float:
memory_percent = 0
memory_usage_bytes = 0

memory_usage_bytes = stats['memory_stats']['usage']
memory_limit = stats['memory_stats']['limit']
if memory_usage_bytes is not None and memory_limit is not None:
memory_percent = (memory_usage_bytes / memory_limit) * 100.0
return (cpu_delta / system_delta) * n_cpus * 100.0


def calculate_memory_percentage(stats: dict) -> float:
    """Return container memory usage as a percentage of its configured limit.

    Yields 0.0 when the memory stats section, the usage value, or the limit
    is missing, or when the limit is zero (avoids division by zero).
    """
    memory = stats.get('memory_stats', {})
    used = memory.get('usage')
    cap = memory.get('limit')

    if used is not None and cap is not None and cap != 0:
        return (used / cap) * 100.0
    return 0.0

return memory_percent

def calculate_memory_bytes(stats) -> bytes:
memory_usage_bytes = stats['memory_stats']['usage']
mem_stats = stats.get('memory_stats', {}) or {}
memory_usage_bytes = mem_stats.get('usage')

if memory_usage_bytes is not None:
return memory_usage_bytes
return 0
return 0.0

def calculate_disk_io(stats) -> bytes:
def calculate_disk_io(stats: dict) -> bytes:
disk_io_read = 0
disk_io_write = 0

if "blkio_stats" in stats and "io_service_bytes_recursive" in stats["blkio_stats"]:
io_service_bytes_recursive = stats["blkio_stats"]["io_service_bytes_recursive"]
io_list = stats.get("blkio_stats", {}) \
.get("io_service_bytes_recursive") or []

for io_stat in io_list:
op = io_stat.get("op")
value = io_stat.get("value", 0)
if op == "read":
disk_io_read += value
elif op == "write":
disk_io_write += value

if io_service_bytes_recursive is not None:
for io_stat in io_service_bytes_recursive:
if "op" in io_stat and "value" in io_stat:
if io_stat["op"] == "read":
disk_io_read += io_stat["value"]
elif io_stat["op"] == "write":
disk_io_write += io_stat["value"]

return disk_io_read, disk_io_write

Expand All @@ -60,6 +72,6 @@ def calculate_network_io(stats) -> bytes:

return network_rx_bytes, network_tx_bytes

async def get_container_stats(container:DockerContainer):
stats = await container.stats(stream=False)
return stats
async def get_containers_stats(containers:list[DockerContainer]):
    """Fetch a one-shot stats snapshot for every container concurrently.

    Results are returned in the same order as *containers* (gather preserves
    argument order).
    """
    pending = (container.stats(stream=False) for container in containers)
    return await gather(*pending)
11 changes: 8 additions & 3 deletions utils/metrics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from aiodocker.containers import DockerContainer
from typing import Union, Iterable
from prometheus_client import Gauge, Counter

from settings.settings import settings
PromMetric = Union[Gauge, Counter]

def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
Expand All @@ -15,8 +15,13 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
for labels in metric._metrics:
name = labels[0]
if name not in active_set:
metric.clear()

if settings.CONTAINER_EXPORTER_CLEAR_METRICS:
metric.clear()
elif isinstance(metric, Gauge):
metric.labels(container_name=name).set(0)
else:
metric.clear()

for metric in persistent_metrics:
for labels in list(metric._metrics):
name = labels[0]
Expand Down