diff --git a/Dockerfile b/Dockerfile index d868971..ebc601c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,11 +2,7 @@ FROM node:22-alpine MAINTAINER Digitransit version: 0.1 RUN mkdir -p /usr/src/app WORKDIR /usr/src/app -ENV CHECK_INTERVAL_MINUTES 5 -ENV DEBUG "" ENV TZ "Europe/Helsinki" -ENV DOCKER_USER "" -ENV DOCKER_AUTH "" RUN apk add --update \ python3 \ diff --git a/README.md b/README.md index 86d5421..8d638fb 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,16 @@ Autodeployer also takes care of restarting dependant deployments. Additionally, some deployments are restarted periodically. +## Env variable configuration + +The following environment variables should be set: +* "SLACK_ACCESS_TOKEN" access token used for sending slack messages through a Slack app +* "MONITORING_SLACK_CHANNEL_ID" slack channel id (not the name) for most of the Slack messages +* "ALERT_SLACK_CHANNEL_ID" slack channel id (not the name) for sending messages about image freshness checks +* "DOCKER_USER" docker user that is used for interacting with the Docker API +* "DOCKER_AUTH" docker password that is used for interacting with the Docker API +* "TZ" optional timezone (defaults to "Europe/Helsinki") + ## Prerequisites Deployments should have the following labels defined as deployer uses `app` as an identifier for finding deployments/pods. @@ -65,3 +75,16 @@ Restarts deployment at 04:30. Attempts to restart deployment stop after deployme ### restartLimitInterval: "240" Optional label that defines in minutes how long time has to be since the last restart for a restart to trigger at the time defined in "restartAt" label. If "restartLimitInterval" is not defined, the default value will be 1080 minutes (18 hours). + +## Deployment image freshness monitoring + +Optionally, it can be checked that an image has been updated within the last 12 hours. 
+ +This can be enabled with `checkImageFreshnessAt` label that defines when the check is done in `hh.mm` format: + +```yaml + metadata: + labels: + update: "auto" + checkImageFreshnessAt: "09.00" +``` diff --git a/package-lock.json b/package-lock.json index cd8e3b0..0996e6c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,7 @@ "dependencies": { "@dagrejs/graphlib": "^2.2.4", "@kubernetes/client-node": "^1.0.0", - "@slack/webhook": "^7.0.4" + "axios": "^1.7.9" }, "devDependencies": { "chai": "^5.1.2", @@ -382,31 +382,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@slack/types": { - "version": "2.14.0", - "resolved": "https://registry.npmjs.org/@slack/types/-/types-2.14.0.tgz", - "integrity": "sha512-n0EGm7ENQRxlXbgKSrQZL69grzg1gHLAVd+GlRVQJ1NSORo0FrApR7wql/gaKdu2n4TO83Sq/AmeUOqD60aXUA==", - "license": "MIT", - "engines": { - "node": ">= 12.13.0", - "npm": ">= 6.12.0" - } - }, - "node_modules/@slack/webhook": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/@slack/webhook/-/webhook-7.0.4.tgz", - "integrity": "sha512-JDJte2dbJCcq1/GCMBYJH6fj+YS4n5GuPjT4tF3O1NPN6pFPCR9yA/apRh9sdfhdFG7hadiRgmiQqC4GLgNkZg==", - "license": "MIT", - "dependencies": { - "@slack/types": "^2.9.0", - "@types/node": ">=18.0.0", - "axios": "^1.7.8" - }, - "engines": { - "node": ">= 18", - "npm": ">= 8.6.0" - } - }, "node_modules/@types/js-yaml": { "version": "4.0.5", "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.5.tgz", diff --git a/package.json b/package.json index edeb5ac..395ef04 100644 --- a/package.json +++ b/package.json @@ -12,9 +12,9 @@ "license": "(AGPL-3.0 OR EUPL-1.2)", "type": "module", "dependencies": { + "@dagrejs/graphlib": "^2.2.4", "@kubernetes/client-node": "^1.0.0", - "@slack/webhook": "^7.0.4", - "@dagrejs/graphlib": "^2.2.4" + "axios": "^1.7.9" }, "devDependencies": { "chai": "^5.1.2", diff --git a/src/dep-deployment-restarter.js b/src/dep-deployment-restarter.js index 88caa5e..3d4a308 100644 --- 
a/src/dep-deployment-restarter.js +++ b/src/dep-deployment-restarter.js @@ -1,6 +1,6 @@ import graphlib from '@dagrejs/graphlib' import { build, isSubGraphStable, deploymentsNeedingRestart } from './graph.js' -import { postSlackMessage } from './util.js' +import { postMonitoringSlackMessage } from './util.js' /* * Automatically restarts dependand deployments in controlled manner. This is @@ -34,7 +34,7 @@ export default { const deploymentGraph = build(deployments) if (graphlib.alg.findCycles(deploymentGraph).length > 0) { console.log('Bummer! Graph has cycle, %s', deploymentGraph.toJSON()) - postSlackMessage('Deployments are configured to restart each other in a cycle.') + postMonitoringSlackMessage('Deployments are configured to restart each other in a cycle.') } else { deploymentsNeedingRestart(deploymentGraph).filter(({ from, value }) => { console.log('deployment %s needs restart', from) diff --git a/src/graph.js b/src/graph.js index a299c67..f61f14c 100644 --- a/src/graph.js +++ b/src/graph.js @@ -1,5 +1,5 @@ import { Graph } from '@dagrejs/graphlib' -import { postSlackMessage } from './util.js' +import { postMonitoringSlackMessage } from './util.js' function addDepEdges (graph, deployment, deployments) { const deploymentLabels = deployment.metadata.labels @@ -13,7 +13,7 @@ function addDepEdges (graph, deployment, deployments) { graph.setEdge(deploymentName, dependency, { delay }) } else { console.log(`${dependency} does not exist but is defined as a dependency for a deployment`) - postSlackMessage(`${dependency} does not exist but is defined as a dependency for a deployment`) + postMonitoringSlackMessage(`${dependency} does not exist but is defined as a dependency for a deployment`) } }) } @@ -52,11 +52,9 @@ export function hasPendingDependentRestarts (graph, deploymentId) { export function build (deployments) { const graph = new Graph({ directed: true }) - console.log('adding vertexes') deployments.forEach(deployment => { 
graph.setNode(deployment.metadata.labels.app, deployment) }) - console.log('adding edges') deployments.forEach(deployment => { if (deployment.metadata.labels.restartAfterDeployments) { addDepEdges(graph, deployment, deployments) @@ -86,3 +84,22 @@ export function deploymentsNeedingRestart (graph) { } return deployments } + +export function deploymentsNeedingImageFreshnessCheck (graph, currentDate) { + const deployments = [] + for (const node of graph.nodes()) { + const deployment = graph.node(node) + const checkTime = deployment.metadata.labels.checkImageFreshnessAt + if (checkTime) { + // time format is hh.mm + const checkTimeParts = checkTime.split('.') + const checkDate = new Date(currentDate.getFullYear(), currentDate.getMonth(), currentDate.getDate(), checkTimeParts[0], checkTimeParts[1]) + const timeDifferenceSeconds = Math.round((currentDate.getTime() - checkDate.getTime()) / 1000) + // Between 0 and 5 minutes since the checkTime, this is to avoid duplicate checks + if (timeDifferenceSeconds >= 0 && timeDifferenceSeconds <= 5 * 60) { + deployments.push(deployment) + } + } + } + return deployments +} diff --git a/src/image-freshness-monitor.js b/src/image-freshness-monitor.js new file mode 100644 index 0000000..241ef64 --- /dev/null +++ b/src/image-freshness-monitor.js @@ -0,0 +1,50 @@ +import { build, deploymentsNeedingImageFreshnessCheck } from './graph.js' +import { postAlertSlackMessage } from './util.js' + +/* + * Automatically checks that the image + tag combination used by the deployment + * has been updated within the last 12 hours. If not, a message is sent to slack. 
+ * Configured with labels as follows: + * checkImageFreshnessAt: "hh.mm" + * where checkImageFreshnessAt defines when the check should be done (roughly, + * might be delayed by 0-5 mins) + */ +export default { + command: (deployments, context) => { + console.log('Checking for a need to do image freshness checks') + const deploymentGraph = build(deployments) + const now = new Date() + const deploymentsNeedingCheck = deploymentsNeedingImageFreshnessCheck(deploymentGraph, now) + if (deploymentsNeedingCheck.length === 0) { + console.log('Found no deployments that need an image freshness check') + return + } + const promises = [] + deploymentsNeedingCheck.forEach(deployment => { + const deploymentId = deployment.metadata.labels.app + const image = deployment.spec.template.spec.containers[0].image + console.log(`Deployment ${deployment.metadata.labels.app} needs image freshness check`) + promises.push(new Promise((resolve) => { + context.dockerRepo.getImageDate(image).then(repoImageDate => { + // check that image is older than 12 hours old + if (repoImageDate && repoImageDate < now.getTime() - 12 * 60 * 60 * 1000) { + console.log('%s image has not been updated within the last 12 hours', deploymentId) + resolve(deployment.metadata.labels.app) + } else { + console.log('%s image has been updated within the last 12 hours', deploymentId) + resolve(null) + } + }).catch((err) => { + console.log(err) + resolve(null) + }) + })) + }) + Promise.all(promises).then((values) => { + const deploymentsWithOldImages = values.filter(value => value != null) + if (deploymentsWithOldImages.length > 0) { + postAlertSlackMessage(`:boom: These deployments have not been updated within the last 12 hours: ${deploymentsWithOldImages.join(', ')} :boom:`) + } + }) + } +} diff --git a/src/index.js b/src/index.js index db20d50..9d750ec 100644 --- a/src/index.js +++ b/src/index.js @@ -3,10 +3,11 @@ import dockerRepo from './dockerRepo.js' import imageDeployer from './image-deployer.js' import 
depDeploymentRestarter from './dep-deployment-restarter.js' import cronDeploymentRestarter from './cron-deployment-restarter.js' +import imageFreshnessMonitor from './image-freshness-monitor.js' -const CHECK_INTERVAL = (process.env.CHECK_INTERVAL_MINUTES || 5) * 60 * 1000 +const CHECK_INTERVAL = 5 * 60 * 1000 -const actions = [imageDeployer, depDeploymentRestarter, cronDeploymentRestarter] +const actions = [imageDeployer, depDeploymentRestarter, cronDeploymentRestarter, imageFreshnessMonitor] const logError = (name, e) => { console.log('%s: Error occurred %s', name, e) diff --git a/src/util.js b/src/util.js index 43fa267..c4b034c 100644 --- a/src/util.js +++ b/src/util.js @@ -1,24 +1,50 @@ -import { IncomingWebhook } from '@slack/webhook' +import axios from 'axios' -const url = process.env.SLACK_WEBHOOK_URL || null -let webhook -if (process.env.ENVIRONMENT_TYPE === 'DEV') { - webhook = url !== null ? new IncomingWebhook(url, { username: 'Configuration checker', channel: 'digitransit_monitoring_dev' }) : null -} else { - webhook = url !== null ? 
new IncomingWebhook(url, { username: 'Configuration checker', channel: 'digitransit_monitoring_prd' }) : null +const MONITORING_CHANNEL_ID = process.env.MONITORING_SLACK_CHANNEL_ID +const MONITORING_USERNAME = `Configuration checker ${process.env.ENVIRONMENT_TYPE}` + +const ALERT_CHANNEL_ID = process.env.ALERT_SLACK_CHANNEL_ID +const ALERT_USERNAME = `Image freshness monitor ${process.env.ENVIRONMENT_TYPE}` + +const headers = { + Authorization: `Bearer ${process.env.SLACK_ACCESS_TOKEN}`, + 'Content-Type': 'application/json', + Accept: '*/*' } -export function postSlackMessage (message) { - if (webhook === null) { - process.stdout.write(`Not sending to slack: ${message}\n`) - return +/* + * Posts a message to Slack with the chat.postMessage Web API method. + * When SLACK_ACCESS_TOKEN is not set, the text is only logged and nothing is sent. + * text is the message body, username the display name and channel the id of the target channel. + */ +function postSlackMessage (text, username, channel) { + if (!process.env.SLACK_ACCESS_TOKEN) { + console.log('Not sending to slack: ' + text) + return } - webhook.send({ text: message }) - .then(() => { - process.stdout.write(`Sent to slack: ${message}\n`) + axios.post('https://slack.com/api/chat.postMessage', { + channel, + text, + username + }, { headers }) + .then(response => { + // Slack replies with HTTP 200 even when the call fails; success is reported in the `ok` field of the body + if (response.status !== 200 || !response.data?.ok) { + console.log(`Slack message was not sent successfully. 
Response: ${JSON.stringify(response.data)}`) + } else { + console.log(`Sent to slack: ${text}`) + } }) - .catch((err) => { - process.stdout.write(`ERROR sending to slack : ${err}\n`) + .catch(error => { + console.log(`Something went wrong when trying to send message to Slack:\n${error}`) }) } + +export function postMonitoringSlackMessage (text) { + postSlackMessage(text, MONITORING_USERNAME, MONITORING_CHANNEL_ID) +} + +export function postAlertSlackMessage (text) { + postSlackMessage(text, ALERT_USERNAME, ALERT_CHANNEL_ID) +} diff --git a/test/graph-test.js b/test/graph-test.js index d621d0c..9427959 100644 --- a/test/graph-test.js +++ b/test/graph-test.js @@ -1,7 +1,7 @@ import { expect } from 'chai' import { describe, it } from 'mocha' import graphlib from '@dagrejs/graphlib' -import { build, isSubGraphStable, hasPendingDependentRestarts, deploymentsNeedingRestart } from './../src/graph.js' +import { build, isSubGraphStable, hasPendingDependentRestarts, deploymentsNeedingRestart, deploymentsNeedingImageFreshnessCheck } from './../src/graph.js' const NOW = new Date().getTime() @@ -113,4 +113,21 @@ describe('graph-builder', function () { deploymentGraph = build(testApps) expect(deploymentsNeedingRestart(deploymentGraph).length).to.equal(1) }) + + it('Graph should return deployments needing image freshness check', () => { + // app2 should be checked between 09:00 and 09:05 + const testApps = [ + appConfig('app1', NOW, {}, true), + appConfig('app2', NOW, { checkImageFreshnessAt: '09.00' }, true) + ] + const deploymentGraph = build(testApps) + const currentDate = new Date('2025-01-01T09:01:00') + expect(deploymentsNeedingImageFreshnessCheck(deploymentGraph, currentDate).length).to.equal(1) + + const beforeDate = new Date('2025-01-01T08:59:00') + expect(deploymentsNeedingImageFreshnessCheck(deploymentGraph, beforeDate).length).to.equal(0) + + const afterDate = new Date('2025-01-01T09:06:00') + expect(deploymentsNeedingImageFreshnessCheck(deploymentGraph, 
afterDate).length).to.equal(0) + 
}) })