diff --git a/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_process.json b/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_process.json new file mode 100644 index 00000000..e7cc8d06 --- /dev/null +++ b/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_process.json @@ -0,0 +1,1175 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Uses the process exporter data coupled with a user's choice of process selected from a dropdown menu to render metrics for the selected process.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 84, + "links": [], + "panels": [ + { + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# Metrics for ${process}\n\nThese are metrics exposed by the [process exporter](https://github.com/ncabatoff/process-exporter). To use it, select your datasource, your instance, and the process you would like to review.", + "mode": "markdown" + }, + "pluginVersion": "11.5.2", + "title": "Process Metrics", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "How long has this process group been running. 
This is the duration of the oldest process in the group.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 4 + }, + "id": 1, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "time() - namedprocess_namegroup_oldest_start_time_seconds{groupname=\"${process}\",instance=~\"${instance}\"}", + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "A count of how many processes are currently running in the selected process group, commonly referred to as \"instances\".", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 4 + }, + "id": 2, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false 
+ }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_num_procs{groupname=\"${process}\", instance=~\"${instance}\"}", + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 4 + }, + "id": 3, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_num_threads{instance=~\"${instance}\", groupname=\"${process}\"}", + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Thread Count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "This metric shows the number of open file descriptors for the Application. 
File descriptors are handles used by processes to access:\n\n- Files on disk\n- Network sockets (TCP/UDP connections)\n- Pipes and other inter-process communication\n- Standard input/output/error streams", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 4 + }, + "id": 4, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_open_filedesc{groupname=\"${process}\", instance=~\"${instance}\"}", + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "File Descriptors", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "These metrics show the per-second rate of CPU time consumed by the Application (averaged over a 5-minute window), broken down by CPU mode. 
What Each Value Means:\n\nUser Mode:\n- CPU time spent executing the application's code\n- Normal program operations, calculations, business logic\n- Time spent in user space (not kernel operations)\n\nSystem Mode:\n- CPU time spent in kernel space on behalf of this process\n- System calls, file I/O, network operations, memory management\n- OS operations requested by the application", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Rate per second", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "timezone": [ + "browser" + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_cpu_seconds_total{instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "instant": false, + "legendFormat": "{{mode}} - {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Time", 
+ "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "This shows the memory usage breakdown for the application process:\n\n- Virtual Memory: The total address space allocated to the process. This includes all mapped memory regions - libraries, heap space, stack, and memory that may not be physically loaded. A large size is normal for Node.js applications.\n\n- Resident Memory: The actual physical RAM currently being used by the process. This is the \"real\" memory consumption that affects system performance.\n\n- Swapped Memory: A low or zero swapped memory indicates the system has sufficient RAM and isn't under memory pressure.\n\n- Proportional Resident: Should show the process's fair share of shared memory.\n\n- Proportional Swapped: Would show fair share of swapped shared memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory in MiB", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + 
"timezone": [ + "browser" + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_memory_bytes{memtype=\"resident\", instance=~\"${instance}\", groupname=\"${process}\"}", + "instant": false, + "legendFormat": "{{memtype}} - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_memory_bytes{memtype=\"virtual\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": true, + "instant": false, + "legendFormat": "{{memtype}} - {{instance}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_memory_bytes{memtype=\"swapped\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{memtype}} - {{instance}}", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_memory_bytes{memtype=\"proportionalSwapped\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{memtype}} - {{instance}}", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_memory_bytes{memtype=\"proportionalResident\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{memtype}} - {{instance}}", + "range": true, + "refId": "E" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "Context switches occur when the CPU stops executing one process and starts executing another. This happens when:\n\n- A process voluntarily yields control (waiting for I/O, sleeping)\n- The OS forcibly switches processes (time slice expired, higher priority process needs CPU)\n\nThe Two Types:\nVoluntary (ctxswitchtype=\"voluntary\"):\n\n- Process willingly gives up CPU (e.g., waiting for disk I/O, network response)\n- Generally indicates normal, efficient behavior\n- Higher numbers often mean the process is I/O bound\n\nNon-voluntary (ctxswitchtype=\"nonvoluntary\"):\n\n- OS forcibly preempts the process (time slice expired)\n- Can indicate CPU-bound processes or resource contention\n- Very high numbers may suggest performance issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Rate per second", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "timezone": [ + "browser" + ], + "tooltip": { + "hideZeros": false, + 
"mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_context_switches_total{ctxswitchtype=\"voluntary\", instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "instant": false, + "legendFormat": "{{ctxswitchtype}} - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_context_switches_total{ctxswitchtype=\"nonvoluntary\", instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "hide": false, + "instant": false, + "legendFormat": "{{ctxswitchtype}} - {{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "Voluntary & Nonvoluntary Context Switches", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "This metric shows major page faults for the Application, and the value of 0 indicates the process has experienced zero major page faults since it started. Page faults are when a process tries to access memory that isn't currently in physical RAM. 
There are two types:\n\n- Minor page fault: Memory exists but isn't mapped to the process (quick fix)\n- Major page fault: Memory must be loaded from disk storage (slow operation)\n\n0 major page faults indicates:\n- The process hasn't needed to load memory pages from disk\n- All required memory has stayed in physical RAM\n- No swapping to disk has occurred for this process\n- Generally indicates good memory management\n\nMajor page faults are expensive because:\n- Disk I/O is ~1000x slower than RAM access\n- Can cause performance hiccups\n- May indicate memory pressure or swap usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Rate per second", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "timezone": [ + "browser" + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": 
"rate(namedprocess_namegroup_major_page_faults_total{instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "instant": false, + "legendFormat": "Major Page Faults - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_minor_page_faults_total{instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "hide": false, + "instant": false, + "legendFormat": "Minor Page Faults - {{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "Major & Minor Page Faults", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "description": "This panel shows the number of processes in each state for the Application.\nThe process states are:\n\n- Running: Currently executing on CPU\n- Sleeping: Waiting for I/O or events\n- Waiting: Blocked in uninterruptible sleep (usually disk I/O)\n- Zombie: Terminated but not yet cleaned up by parent\n- Other: Any other process states not in the above categories (e.g., stopped by SIGSTOP)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "timezone": [ + "browser" + ], + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_states{state=\"Running\", instance=~\"${instance}\", groupname=\"${process}\"}", + "instant": false, + "legendFormat": "{{state}} - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_states{state=\"Sleeping\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{state}} - {{instance}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_states{state=\"Waiting\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{state}} - {{instance}}", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_states{state=\"Zombie\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{state}} - {{instance}}", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P0147FA0CB911A4EC" + }, + "editorMode": "code", + "expr": "namedprocess_namegroup_states{state=\"Other\", instance=~\"${instance}\", groupname=\"${process}\"}", + "hide": false, + "instant": false, + "legendFormat": "{{state}} - {{instance}}", + 
"range": true, + "refId": "E" + } + ], + "title": "Process States", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "These metrics measure how much data the application reads from and writes to storage (like hard drives or network storage).\n\n- Read Bytes: The total amount of data your application has pulled in - this could be configuration files, data files, logs it's processing, or any other files it needs to do its work.\n\n- Write Bytes: The total amount of data your application has saved out - this includes log files it creates, reports it generates, database records it saves, or any other output it produces.\n\nHeavy reading might mean your application is processing large datasets. Heavy writing might mean it's generating lots of output or logs. These numbers help you understand how much your application uses storage resources and network bandwidth. Sudden changes in these patterns can indicate if something unusual is happening - like if your application stops writing (which might mean it's stuck) or starts reading excessively (which might mean it's processing more data than expected).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_read_bytes_total{instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "instant": false, + "legendFormat": "Read Bytes - {{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(namedprocess_namegroup_write_bytes_total{instance=~\"${instance}\", groupname=\"${process}\"}[5m])", + "hide": false, + "instant": false, + "legendFormat": "Write Bytes - {{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "Disk I/O Operations", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "", + "schemaVersion": 40, + "tags": [ + "IAP", + "Itential" + ], + "templating": { + "list": [ + { + "allowCustomValue": false, + "current": { + "text": "Prometheus IAP", + "value": "P0147FA0CB911A4EC" + }, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": [ + "pe-iap01:9256" + ], + "value": [ + "pe-iap01:9256" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(namedprocess_namegroup_context_switches_total,instance)", + "includeAll": false, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(namedprocess_namegroup_context_switches_total,instance)", + "refId": 
"PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "Pronghorn AutomationGateway Adapter", + "value": "Pronghorn AutomationGateway Adapter" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(namedprocess_namegroup_context_switches_total,groupname)", + "label": "Process", + "name": "process", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(namedprocess_namegroup_context_switches_total,groupname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Itential Process Health", + "uid": "fexofg2vrx24gd", + "version": 9, + "weekStart": "sunday" +} \ No newline at end of file diff --git a/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_weblog.json b/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_weblog.json new file mode 100644 index 00000000..36447bb5 --- /dev/null +++ b/roles/grafana/files/definitions/default/grafana_dashboard_definitions_iap_weblog.json @@ -0,0 +1,1042 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 60, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(web_requests_total[$__rate_interval]))", + "legendFormat": "Total Request Rate", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + 
"sizing": "auto" + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "web_error_rate", + "legendFormat": "Error Rate", + "refId": "A" + } + ], + "title": "Current Error Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*2.." + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*4.." + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*5.." 
+ }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(web_status_codes_total[$__rate_interval])) by (status)", + "legendFormat": "HTTP {{status}}", + "refId": "A" + } + ], + "title": "HTTP Status Codes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(web_endpoints_total[$__rate_interval])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "Requests by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(web_users_total[$__rate_interval])) by (user)", + "legendFormat": "{{user}}", + "refId": "A" + } + ], + "title": "Requests by User", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, rate(web_response_size_bytes_bucket[$__rate_interval]))", + "legendFormat": "50th percentile", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, rate(web_response_size_bytes_bucket[$__rate_interval]))", + "legendFormat": "90th percentile", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, rate(web_response_size_bytes_bucket[$__rate_interval]))", + "legendFormat": "95th percentile", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"histogram_quantile(0.99, rate(web_response_size_bytes_bucket[$__rate_interval]))", + "legendFormat": "99th percentile", + "refId": "D" + } + ], + "title": "Response Size Percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Endpoint" + }, + "properties": [ + { + "id": "custom.width", + "value": 200 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, sum(rate(web_endpoints_total[$__rate_interval])) by (endpoint))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Top 10 Endpoints by Request Rate", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Requests/sec", + "endpoint": "Endpoint" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "User" + }, + "properties": [ + { + "id": "custom.width", + "value": 200 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, sum(rate(web_users_total[$__rate_interval])) by (user))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "Top 10 Users by Request Rate", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Requests/sec", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 9, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + 
{ + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(web_status_codes_total[$__range])) by (status)", + "legendFormat": "HTTP {{status}}", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false, + "viz": false + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 10, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(web_endpoints_total[$__range])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "Endpoint Distribution", + "type": "piechart" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 40, + "tags": [ + "itential", + "iap", + "web-logs", + "prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus IAP", + "value": "P0147FA0CB911A4EC" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "IAP Web Server Logs", + "uid": "iap-web-logs", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git 
#!/usr/bin/env python3
"""Prometheus exporter for a JSON-lines web server access log.

The script tails the log file given by ``--log-file``, turns every complete
JSON record into Prometheus counter/histogram/gauge updates, and serves the
metrics over HTTP on ``--port``.  The byte offset of the last fully consumed
line is persisted to a small side file so a restart resumes where it left off.
"""
import json
import time
from collections import Counter, defaultdict, deque
from prometheus_client import start_http_server, Counter as PromCounter, Histogram, Gauge
import argparse
from pathlib import Path
from urllib.parse import urlparse

# Prometheus metrics
request_count = PromCounter('web_requests_total', 'Total web requests', ['method', 'status', 'user', 'endpoint'])
request_size = Histogram('web_request_size_bytes', 'Size of web requests', ['method'])
response_size = Histogram('web_response_size_bytes', 'Size of web responses', ['status'])
response_time = Histogram('web_response_time_seconds', 'Response time', ['method', 'status'])
status_codes = PromCounter('web_status_codes_total', 'HTTP status codes', ['status'])
endpoints = PromCounter('web_endpoints_total', 'Requests by endpoint', ['endpoint', 'method'])
users = PromCounter('web_users_total', 'Requests by user', ['user', 'method'])
error_rate = Gauge('web_error_rate', 'Current error rate (5xx responses)')

# Number of most recent requests the error-rate gauge is computed over.
ERROR_RATE_WINDOW = 100


class JSONLogParser:
    """Incrementally parses a JSON-lines access log and updates the metrics above."""

    def __init__(self, log_file, position_file=None):
        """Set up the parser.

        Args:
            log_file: Path of the JSON-lines log to follow.
            position_file: Optional path of the file used to persist the read
                offset.  Defaults to ``<log_file>.position``.
        """
        self.log_file = Path(log_file)
        # Track file position (a byte offset) for incremental parsing
        if position_file:
            self.position_file = Path(position_file)
        else:
            self.position_file = Path(f"{log_file}.position")
        self.position = self._load_position()
        self.stats = defaultdict(int)
        # Sliding window of booleans (True == 5xx) for the error-rate gauge;
        # the deque drops the oldest entry automatically once it is full.
        self.recent_requests = deque(maxlen=ERROR_RATE_WINDOW)

    def _load_position(self):
        """Return the byte offset stored on disk, or 0 if absent/unreadable."""
        try:
            if self.position_file.exists():
                with open(self.position_file, 'r') as f:
                    # A negative offset would make seek() fail on every pass.
                    return max(0, int(f.read().strip()))
        except (ValueError, OSError):
            pass
        return 0

    def _save_position(self):
        """Persist the current byte offset; failure is reported, not raised."""
        try:
            with open(self.position_file, 'w') as f:
                f.write(str(self.position))
        except OSError as e:
            print(f"Warning: Could not save position: {e}")

    def parse_line(self, line):
        """Parse a single JSON log line.

        Returns:
            The decoded dict with ``status`` and ``result_length`` converted to
            ``int``, or ``None`` when the line is not valid JSON, is not a JSON
            object, lacks a required field, or has a non-numeric value in one
            of the two numeric fields.
        """
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON line: {e}")
            return None

        # Valid JSON that is not an object (e.g. a bare number or a list)
        # cannot carry the fields we need.
        if not isinstance(data, dict):
            return None

        # Validate required fields
        required_fields = ['remote_addr', 'method', 'url', 'status', 'result_length']
        if not all(field in data for field in required_fields):
            return None

        # Convert numeric fields.  TypeError covers null/list/dict values and
        # OverflowError covers the non-standard "Infinity" literal that
        # json.loads accepts.
        try:
            data['status'] = int(data['status'])
            data['result_length'] = int(data['result_length'])
        except (TypeError, ValueError, OverflowError):
            return None

        return data

    def extract_endpoint(self, url):
        """Collapse a request URL into a coarse endpoint label.

        ``/status*`` and ``/myTtl*`` map to fixed labels, ``/api/...`` is
        grouped by its first two path segments, paths shorter than 20
        characters are kept as-is, and longer paths are reduced to their first
        segment.  Every label starts with ``/``.  If anything goes wrong the
        original ``url`` is returned unchanged.
        """
        try:
            # An empty path would otherwise become an empty label value.
            path = urlparse(url).path or '/'

            # Group common patterns
            if path.startswith('/status'):
                return '/status'
            if path.startswith('/myTtl'):
                return '/myTtl'
            if path.startswith('/api/'):
                # Group API endpoints by first two segments
                parts = path.split('/')
                if len(parts) >= 3 and parts[2]:
                    return f"/{parts[1]}/{parts[2]}"
                return f"/{parts[1]}"

            # For other paths, use the full path if short ...
            if len(path) < 20:
                return path
            # ... otherwise only the first segment, keeping the leading slash
            # so the label has the same shape as all the others.
            if '/' in path[1:]:
                return '/' + path.lstrip('/').split('/')[0]
            return path

        except Exception:
            return url

    def normalize_remote_addr(self, addr):
        """Strip the ``::ffff:`` prefix of IPv6-mapped IPv4 addresses.

        Non-string input (for example a JSON ``null``) yields ``'unknown'``.
        """
        if not isinstance(addr, str):
            return 'unknown'
        if addr.startswith('::ffff:'):
            return addr[7:]  # Remove IPv6 prefix
        return addr

    def process_logs(self):
        """Process log entries appended since the last call.

        The file is read in binary mode so that ``self.position`` is a real
        byte offset that can be compared with the file size.  Only lines that
        end in a newline are consumed; a trailing partial line is left for the
        next pass.  The offset is advanced before a line is handled so one bad
        record can never block the exporter, and it is saved even when an
        error interrupts the batch so handled lines are not counted twice.
        """
        if not self.log_file.exists():
            print(f"Log file {self.log_file} not found")
            return

        # Check if log file was rotated (size decreased)
        try:
            current_size = self.log_file.stat().st_size
        except OSError:
            return
        if current_size < self.position:
            print("Log rotation detected, resetting position")
            self.position = 0

        lines_processed = 0
        try:
            with open(self.log_file, 'rb') as f:
                # Seek to last position
                f.seek(self.position)

                for raw in f:
                    if not raw.endswith(b'\n'):
                        # Writer is still in the middle of this record.
                        break
                    self.position += len(raw)

                    line = raw.decode('utf-8', errors='replace')
                    if line.strip():  # Skip empty lines
                        parsed = self.parse_line(line)
                        if parsed:
                            self.update_metrics(parsed)
                            lines_processed += 1

        except Exception as e:
            print(f"Error processing logs: {e}")
        finally:
            self._save_position()

        if lines_processed > 0:
            print(f"Processed {lines_processed} new log entries")

    def update_metrics(self, data):
        """Update Prometheus metrics and running stats from one parsed record.

        Args:
            data: Dict as returned by :meth:`parse_line`.
        """
        # ``or`` also replaces JSON null / empty strings, which would otherwise
        # surface as the label value "None" or "".
        method = str(data.get('method') or 'UNKNOWN')
        status = data.get('status', 0)
        status_str = str(status)
        user = str(data.get('remote_user') or 'anonymous')
        url = str(data.get('url') or '/')
        result_length = data.get('result_length', 0)
        remote_addr = self.normalize_remote_addr(data.get('remote_addr', 'unknown'))

        # Extract endpoint for grouping
        endpoint = self.extract_endpoint(url)

        # Update main counters
        request_count.labels(
            method=method,
            status=status_str,
            user=user,
            endpoint=endpoint
        ).inc()

        # Update specific metric counters
        status_codes.labels(status=status_str).inc()
        endpoints.labels(endpoint=endpoint, method=method).inc()
        users.labels(user=user, method=method).inc()

        # Update histograms
        response_size.labels(status=status_str).observe(result_length)

        # Track recent requests for error rate calculation; the deque keeps
        # only the last ERROR_RATE_WINDOW entries.
        self.recent_requests.append(status >= 500)

        # Update error rate gauge (the window is non-empty after the append)
        error_rate.set(sum(self.recent_requests) / len(self.recent_requests))

        # Track some basic stats for logging
        self.stats['total_requests'] += 1
        self.stats[f'status_{status_str}'] += 1
        self.stats['total_bytes'] += result_length

        # Log interesting events
        if status >= 500:
            print(f"Server error: {method} {url} -> {status} (user: {user})")
        elif status == 401 or status == 403:
            print(f"Auth issue: {method} {url} -> {status} (user: {user}, ip: {remote_addr})")

    def print_summary(self):
        """Print request, byte and per-status totals accumulated so far."""
        total = self.stats['total_requests']
        if total > 0:
            print("\n=== Statistics Summary ===")
            print(f"Total requests processed: {total}")
            print(f"Total bytes served: {self.stats['total_bytes']:,}")

            # Print status code breakdown (local name chosen so it does not
            # shadow the module-level ``status_codes`` metric)
            per_status = {k: v for k, v in self.stats.items() if k.startswith('status_')}
            if per_status:
                print("Status codes:")
                for status, count in sorted(per_status.items()):
                    percentage = (count / total) * 100
                    print(f"  {status.replace('status_', '')}: {count} ({percentage:.1f}%)")


def main():
    """Parse CLI arguments, start the metrics server and poll the log forever."""
    parser = argparse.ArgumentParser(description='JSON Web Log Prometheus Exporter')
    parser.add_argument('--log-file', required=True, help='Path to JSON web server log file')
    parser.add_argument('--port', type=int, default=8000, help='Metrics port (default: 8000)')
    parser.add_argument('--interval', type=int, default=30, help='Parse interval in seconds (default: 30)')
    parser.add_argument('--summary-interval', type=int, default=300, help='Summary print interval in seconds (default: 300)')
    parser.add_argument('--position-file', default=None, help='Where to store the read offset (default: <log-file>.position)')

    args = parser.parse_args()

    # Initialize log parser
    log_parser = JSONLogParser(args.log_file, position_file=args.position_file)

    # Start Prometheus metrics server
    start_http_server(args.port)
    print(f"Metrics server started on port {args.port}")
    print(f"Monitoring JSON log file: {args.log_file}")
    print(f"Metrics available at: http://localhost:{args.port}/metrics")

    last_summary = time.time()

    # Main processing loop
    try:
        while True:
            log_parser.process_logs()

            # Print summary periodically
            if time.time() - last_summary >= args.summary_interval:
                log_parser.print_summary()
                last_summary = time.time()

            time.sleep(args.interval)

    except KeyboardInterrupt:
        print("\nShutting down...")
        log_parser.print_summary()


if __name__ == '__main__':
    main()
[Unit]
Description=Itential IAP Web Log Prometheus Exporter
After=network.target
Wants=network.target

[Service]
Type=simple
User=itential
Group=itential
WorkingDirectory=/opt/itential/exporters
ExecStart=/usr/bin/python3 /opt/itential/exporters/iap-web-log-exporter.py --log-file /var/log/itential/platform/webserver.log --port 8000 --interval 30
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal

# Security settings
NoNewPrivileges=yes
PrivateTmp=yes
ProtectSystem=strict
# read-only instead of yes: PYTHONPATH below points into /home/itential, and
# ProtectHome=yes would make /home inaccessible to the service.
ProtectHome=read-only
ReadWritePaths=/opt/itential/exporters
# The exporter stores its read offset in "<log-file>.position", i.e. next to
# the log; ProtectSystem=strict would otherwise make that directory read-only.
# The leading "-" keeps the unit startable if the directory does not exist yet.
ReadWritePaths=-/var/log/itential/platform

# Environment
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
Environment=PYTHONPATH=/home/itential/.local/lib/python3.11/site-packages
# Unbuffered stdout so the script's print() output reaches the journal promptly.
Environment=PYTHONUNBUFFERED=1

[Install]
WantedBy=multi-user.target