diff --git a/CHANGELOG.md b/CHANGELOG.md index 75ab654..d9bc377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ - adjust the following settings for `uvicorn` as environment variables: - HOST - PORT - - WORKERS (Default : 3) + - WORKERS (Default : 1) - LOG_LEVEL=(Default : warning) ⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly. diff --git a/README.md b/README.md index b57642a..e145d53 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Replace `` with the pid of ./start.sh script. - adjust the following settings for `uvicorn` as environment variables: - HOST (Default: 0.0.0.0) - PORT (Default: 8000) - - WORKERS (Default : 3) + - WORKERS (Default : 1) - LOG_LEVEL (Default : warning) Example: @@ -145,9 +145,9 @@ Check out [dashboards](./dashboards) directory for Json files. including CPU & M - [x] Check and Unregister *stat* metrics for containers that are not running - [x] Design and develop a static website to showcase Documentation, new features, etc. - [x] Enable functionality and smoke testing in ci + - [x] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand. - [ ] Design grafana dashboards and share them on grafana cloud - [ ] Add unit tests - - [ ] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand. ## Contributions Welcome to CXP! 
This project is production-ready now, and we encourage contributions to enhance its functionality, optimize code, and add new features diff --git a/alerting/sample_rules.yml b/alerting/sample_rules.yml new file mode 100644 index 0000000..64fd899 --- /dev/null +++ b/alerting/sample_rules.yml @@ -0,0 +1,60 @@ +groups: +- name: container-status + rules: + - alert: ContainerDown + expr: cxp_container_status == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Container {{ $labels.container_name }} is down" + description: "Container '{{ $labels.container_name }}' has been reporting status 0 (not running) for the last 5 minutes." + +- name: container-resource-usage + rules: + - alert: HighCPUUsage + expr: avg_over_time(cxp_cpu_percentage[5m]) > 80.0 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage on {{ $labels.container_name }}" + description: "CPU usage has averaged >80% for more than 5 minutes." + + - alert: CriticalCPUUsage + expr: avg_over_time(cxp_cpu_percentage[5m]) > 90.0 + for: 5m + labels: + severity: critical + annotations: + summary: "Critical CPU usage on {{ $labels.container_name }}" + description: "CPU usage has averaged >90% for more than 5 minutes." + + - alert: HighMemoryUsage + expr: avg_over_time(cxp_memory_percentage[5m]) > 80.0 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage on {{ $labels.container_name }}" + description: "Memory usage has averaged >80% for more than 5 minutes." + + - alert: CriticalMemoryUsage + expr: avg_over_time(cxp_memory_percentage[5m]) > 90.0 + for: 5m + labels: + severity: critical + annotations: + summary: "Critical memory usage on {{ $labels.container_name }}" + description: "Memory usage has averaged >90% for more than 5 minutes." 
+ +- name: exporter-health + rules: + - alert: ExporterDown + expr: absent(cxp_container_status) + for: 5m + labels: + severity: critical + annotations: + summary: "Container exporter metrics missing" + description: "No cxp_container_status metric scraped for more than 5 minutes; exporter may be down or unreachable." diff --git a/container_exporter.py b/container_exporter.py index 02ea88a..626307b 100755 --- a/container_exporter.py +++ b/container_exporter.py @@ -1,4 +1,3 @@ -from asyncio import gather from aiodocker import Docker from aiodocker.containers import DockerContainer from stats import get_docker_stats as stat @@ -9,6 +8,7 @@ from contextlib import asynccontextmanager from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels from logging import basicConfig, error, ERROR +from settings.settings import settings docker_client: Docker @@ -16,14 +16,14 @@ async def lifespan(app: FastAPI): global docker_client docker_client = Docker() - + yield await docker_client.close() app = FastAPI(lifespan=lifespan) -gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name']) +gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name']) gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name']) gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name']) gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name']) @@ -42,15 +42,12 @@ async def get_containers(all=False) -> list[DockerContainer]: return await docker_client.containers.list(all=all) def update_container_status(running_containers:list[DockerContainer]): - - current_names = [c._container.get("Names")[0][1:] for c in running_containers] - for name in current_names: - 
gauge_container_status.labels(container_name=name).set(1) + for c in running_containers: + gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2) # Async metrics gathering async def container_stats( running_containers: list[DockerContainer]): - tasks = [stat.get_container_stats(container) for container in running_containers] - all_stats = await gather(*tasks) + all_stats = await stat.get_containers_stats(running_containers) for stats in all_stats: name = stats[0]['name'][1:] diff --git a/requirements.txt b/requirements.txt index 0b256ea..efd195c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,9 @@ multidict==6.4.4 prometheus-client==0.17.1 propcache==0.3.1 pydantic==2.11.5 +pydantic-settings==2.9.1 pydantic_core==2.33.2 +python-dotenv==1.1.0 sniffio==1.3.1 starlette==0.46.2 typing-inspection==0.4.1 diff --git a/scripts/healthcheck-ci.sh b/scripts/healthcheck-ci.sh index 7c9d446..47d34bc 100644 --- a/scripts/healthcheck-ci.sh +++ b/scripts/healthcheck-ci.sh @@ -31,6 +31,8 @@ log "Starting CI Healthcheck..." log "Spinning up test container: $container_name" docker run -d --name "$container_name" alpine sleep 60 >/dev/null || fail "Failed to start container" +sleep 3 + log "Checking root endpoint..." if curl --silent --fail http://localhost:8000/ > "${log_dir}/index.txt"; then pass "Root endpoint responded successfully." 
diff --git a/settings/settings.py b/settings/settings.py new file mode 100644 index 0000000..6d0a9e4 --- /dev/null +++ b/settings/settings.py @@ -0,0 +1,12 @@ +# settings.py +from pydantic_settings import BaseSettings + +class Settings(BaseSettings): + CONTAINER_EXPORTER_ENV: str = "production" + CONTAINER_EXPORTER_DEBUG: bool = False + CONTAINER_EXPORTER_CLEAR_METRICS: bool = True + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + +settings = Settings() \ No newline at end of file diff --git a/start.sh b/start.sh index 5178ebe..83a95d8 100755 --- a/start.sh +++ b/start.sh @@ -3,7 +3,7 @@ # Configurable variables HOST=${HOST:-0.0.0.0} PORT=${PORT:-8000} -WORKERS=${WORKERS:-3} +WORKERS=${WORKERS:-1} LOG_LEVEL=${LOG_LEVEL:-warning} echo "Starting Container Exporter..." diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py index 7b54ffd..67a0e14 100644 --- a/stats/get_docker_stats.py +++ b/stats/get_docker_stats.py @@ -1,49 +1,61 @@ from aiodocker.docker import DockerContainer +from asyncio import gather -def calculate_cpu_percentage(stats:dict) -> float: - cpu_percent = 0 - - cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage'] +def calculate_cpu_percentage(stats: dict) -> float: + cpu_stats = stats.get('cpu_stats', {}) + precpu_stats = stats.get('precpu_stats', {}) + total = cpu_stats.get('cpu_usage', {}).get('total_usage') + prev_total = precpu_stats.get('cpu_usage', {}).get('total_usage') + system = cpu_stats.get('system_cpu_usage') + prev_system = precpu_stats.get('system_cpu_usage') + n_cpus = cpu_stats.get('online_cpus') - system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage'] + if None in (total, prev_total, system, prev_system, n_cpus): + return 0.0 - number_cpus = stats['cpu_stats']['online_cpus'] - if cpu_delta is not None and system_delta is not None and number_cpus is not None: - cpu_percent = (cpu_delta / system_delta) * 
number_cpus * 100.0 + cpu_delta = total - prev_total + system_delta = system - prev_system - return cpu_percent + if system_delta <= 0: + return 0.0 -def calculate_memory_percentage(stats) -> float: - memory_percent = 0 - memory_usage_bytes = 0 - - memory_usage_bytes = stats['memory_stats']['usage'] - memory_limit = stats['memory_stats']['limit'] - if memory_usage_bytes is not None and memory_limit is not None: - memory_percent = (memory_usage_bytes / memory_limit) * 100.0 + return (cpu_delta / system_delta) * n_cpus * 100.0 + + +def calculate_memory_percentage(stats: dict) -> float: + mem_stats = stats.get('memory_stats', {}) + usage = mem_stats.get('usage') + limit = mem_stats.get('limit') + + if usage is None or limit is None or limit == 0: + return 0.0 + + return (usage / limit) * 100.0 - return memory_percent def calculate_memory_bytes(stats) -> bytes: - memory_usage_bytes = stats['memory_stats']['usage'] + mem_stats = stats.get('memory_stats', {}) or {} + memory_usage_bytes = mem_stats.get('usage') + if memory_usage_bytes is not None: return memory_usage_bytes - return 0 + return 0.0 -def calculate_disk_io(stats) -> bytes: +def calculate_disk_io(stats: dict) -> bytes: disk_io_read = 0 disk_io_write = 0 - if "blkio_stats" in stats and "io_service_bytes_recursive" in stats["blkio_stats"]: - io_service_bytes_recursive = stats["blkio_stats"]["io_service_bytes_recursive"] + io_list = stats.get("blkio_stats", {}) \ + .get("io_service_bytes_recursive") or [] + + for io_stat in io_list: + op = io_stat.get("op") + value = io_stat.get("value", 0) + if op == "read": + disk_io_read += value + elif op == "write": + disk_io_write += value - if io_service_bytes_recursive is not None: - for io_stat in io_service_bytes_recursive: - if "op" in io_stat and "value" in io_stat: - if io_stat["op"] == "read": - disk_io_read += io_stat["value"] - elif io_stat["op"] == "write": - disk_io_write += io_stat["value"] return disk_io_read, disk_io_write @@ -60,6 +72,6 @@ def 
calculate_network_io(stats) -> bytes: return network_rx_bytes, network_tx_bytes -async def get_container_stats(container:DockerContainer): - stats = await container.stats(stream=False) - return stats \ No newline at end of file +async def get_containers_stats(containers:list[DockerContainer]): + tasks = [container.stats(stream=False) for container in containers] + return await gather(*tasks) \ No newline at end of file diff --git a/utils/metrics.py b/utils/metrics.py index 84aa709..ddaad3e 100644 --- a/utils/metrics.py +++ b/utils/metrics.py @@ -1,7 +1,7 @@ from aiodocker.containers import DockerContainer from typing import Union, Iterable from prometheus_client import Gauge, Counter - +from settings.settings import settings PromMetric = Union[Gauge, Counter] def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]): @@ -15,8 +15,13 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom for labels in metric._metrics: name = labels[0] if name not in active_set: - metric.clear() - + if settings.CONTAINER_EXPORTER_CLEAR_METRICS: + metric.clear() + elif isinstance(metric, Gauge): + metric.labels(container_name=name).set(0) + else: + metric.clear() + for metric in persistent_metrics: for labels in list(metric._metrics): name = labels[0]