diff --git a/.env.example b/.env.example deleted file mode 100644 index 5fe1f9d..0000000 --- a/.env.example +++ /dev/null @@ -1,2 +0,0 @@ -AWS_ACCESS_KEY_ID= -AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/README.md b/README.md index 0655eea..53607b1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,945 @@ # CTFp - CTF Pilot's CTF Platform -> [!WARNING] +> [!TIP] +> If you are looking for **how to build challenges for CTFp**, please check out the **[CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template)** and **[CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit)** repositories. + +CTFp (CTF Pilot's CTF Platform) is a CTF platform designed to host large-scale Capture The Flag (CTF) competitions, with a focus on scalability, resilience, and ease of use. +The platform uses Kubernetes as the underlying orchestration system, where the management, scoreboard, and challenge infrastructure are deployed as Kubernetes resources. It then leverages GitOps through [ArgoCD](https://argo-cd.readthedocs.io/en/stable/) for managing the platform's configuration and deployments, including the CTF challenges. + +CTFp acts as the orchestration layer for deploying and managing the platform, while utilizing a variety of CTF Pilot's components to provide the full functionality of the platform. + +CTFp provides a CLI tool for managing the deployment of the platform, but it is possible to use the individual Terraform components directly if desired. To manage the platform after initial deployment, you will primarily interact with the Kubernetes cluster using `kubectl`, ArgoCD, and the other monitoring systems deployed. + +> [!IMPORTANT] +> In order to run CTFp properly, you will need to have a working knowledge of **Cloud**, **Kubernetes**, **Terraform/OpenTofu**, **GitOps**, and **CTFd**. +> The platform is designed to work with CTF Pilot's Challenges ecosystem, to ensure secure hosting of CTF challenges. +> +> This platform is not intended for beginners, and it is assumed that you have prior experience with these technologies and systems. +> Incorrect handling of Kubernetes resources can lead to data loss, downtime, and security vulnerabilities. +> Incorrectly configured challenges may lead to security vulnerabilities or platform instability. + +This platform deploys real-world infrastructure and will incur costs when deployed. 
+ +## Table of Contents + +- [CTFp - CTF Pilot's CTF Platform](#ctfp---ctf-pilots-ctf-platform) + - [Table of Contents](#table-of-contents) + - [Features](#features) + - [Quick start](#quick-start) + - [How to run](#how-to-run) + - [Pre-requisites](#pre-requisites) + - [Environments](#environments) + - [Configuring the platform](#configuring-the-platform) + - [CLI Tool](#cli-tool) + - [Commands](#commands) + - [`init` - Initialize Platform Configuration](#init---initialize-platform-configuration) + - [`generate-keys` - Generate SSH Keys](#generate-keys---generate-ssh-keys) + - [`insert-keys` - Insert SSH Keys into Configuration](#insert-keys---insert-ssh-keys-into-configuration) + - [`generate-images` - Generate Custom Server Images](#generate-images---generate-custom-server-images) + - [`generate-backend` - Generate Terraform Backend Configuration](#generate-backend---generate-terraform-backend-configuration) + - [`deploy` - Deploy Platform Components](#deploy---deploy-platform-components) + - [`destroy` - Destroy Platform Components](#destroy---destroy-platform-components) + - [Workflow Overview](#workflow-overview) + - [Guides](#guides) + - [Updating sizes of nodes in a running platform](#updating-sizes-of-nodes-in-a-running-platform) + - [Deploying a new challenge](#deploying-a-new-challenge) + - [Updating a challenge](#updating-a-challenge) + - [Deploying a page](#deploying-a-page) + - [The CLI tool does not seem to support my setup](#the-cli-tool-does-not-seem-to-support-my-setup) + - [Architecture](#architecture) + - [Directory structure](#directory-structure) + - [Overview](#overview) + - [Cluster](#cluster) + - [Ops](#ops) + - [Platform](#platform) + - [Challenges](#challenges) + - [Challenge deployment](#challenge-deployment) + - [Network](#network) + - [Cluster networking](#cluster-networking) + - [Challenge networking](#challenge-networking) + - [Getting help](#getting-help) + - [Contributing](#contributing) + - [Background](#background) + - [License](#license) + - [Code of Conduct](#code-of-conduct) + +## Features + +CTFp offers a wide range of features to facilitate the deployment and management of CTF competitions. 
Below is an overview of the key features: + +- **Infrastructure & Deployment** + - **Multi-environment support** with isolated configurations for Test, Dev, and Production + - **Component-based architecture** with four deployable components: Cluster, Ops, Platform, and Challenges + - **Infrastructure as Code** using Terraform/OpenTofu with automated state management and S3 backend + - **Multi-region Kubernetes clusters** on Hetzner Cloud with configurable node types and auto-scaling + - **Custom server images** generation using Packer + - **Cloudflare DNS integration** for management, platform, and CTF zones +- **Operations & Monitoring** + - **GitOps workflow** powered by ArgoCD for automated deployments + - **Comprehensive monitoring** with Prometheus, Grafana, and metrics exporters + - **Log aggregation** via Filebeat to Elasticsearch + - **Traefik ingress controller** with SSL certificate management (cert-manager) + - **Discord webhook notifications** for platform events + - **Automated descheduling** for optimal resource distribution +- **Scoreboard** + - **Customizable CTFd scoreboard deployment** allowing for bring-your-own CTFd configuration + - **Auto deployment of CTFd configuration** providing a ready-to-use CTFd instance + - **Flexible CTF settings** supporting a large portion of CTFd's configuration options + - **S3 storage configuration** for challenge files and user uploads in CTFd + - **Clustered database setup** with MariaDB operator and automated backups to S3 + - **Redis caching** with Redis operator for ease of use + - **Automatic deployment of CTFd pages** from GitHub +- **Challenge Management** + - **Full support for CTF Pilot's Challenges ecosystem**, including KubeCTF integration + - **Support for three challenge deployment modes**: Isolated, Shared, and Instanced + - **Git-based deployment** with branch-specific configurations + - **IP whitelisting** for challenge access control + - **Custom fallback pages** for errors and instancing states +- **CLI Tool** + - **Simple command-line interface** for managing the deployment and lifecycle of the platform + - **Modular commands** for initializing, deploying, destroying, and managing components + - **Environment management** for handling multiple deployment environments (Test, Dev, Prod) + - **State management** with automated backend configuration, with states stored in S3 + - **Plan generation and review** before applying changes + - **Under 20 minutes** deployment time for the entire platform (excluding image generation) + - **Fully configured through configuration files** for easy setup and management + +## Quick start + +> [!TIP] +> **This is a quick start guide for getting the platform up and running, and acts as a quick reference guide.** +> If it is your first time working with CTFp, we recommend going through the full documentation for a more in-depth understanding of the platform and its components. + +To use the CTFp CLI tool, you first need to clone the repository: + +```bash +git clone https://github.com/ctfpilot/ctfp +cd ctfp +``` + +First, you need to initialize the platform configuration for your desired environment (test, dev, prod): + +```bash +./ctfp.py init +``` + +> [!NOTE] +> You can add `--test`, `--dev` or `--prod` to specify the environment you want to initialize. +> The default environment is `test` (`--test`). +> +> Used in all commands except the `generate-images` command, as it asks for the Hetzner Cloud project to use when generating images. 
+ +Next, you need to fill out the configuration located in the `automated..tfvars` file. + +In order to deploy, ensure you have SSH keys created, and inserted into your configuration: + +```bash +./ctfp.py generate-keys --insert +``` + +To create the server images used for the Kubernetes cluster nodes, run: + +```bash +./ctfp.py generate-images +``` + +To use the Terraform modules, you need to generate the backend configuration for each component. + +```bash +./ctfp.py generate-backend cluster +./ctfp.py generate-backend ops +./ctfp.py generate-backend platform +./ctfp.py generate-backend challenges +``` + +*Replace ``, ``, and `` with your S3 bucket details.* + +Finally, you can deploy the entire platform with: + +```bash +./ctfp.py deploy all +``` + +To destroy the entire platform, run: + +```bash +./ctfp.py destroy all +``` + +`all` can be replaced with any of the individual components: `cluster`, `ops`, `platform`, `challenges`. + +To interact with the cluster, run the following command to configure your `kubectl` context: + +```bash +source kubectl.sh [test|dev|prod] +``` + +*`source` is required to set the environment variables in your current shell session.* + +## How to run + +### Pre-requisites + +In order to even deploy the platform, the following software needs to be installed on your local machine: + +- [OpenTofu](https://opentofu.org) (Alternative version of [Terraform](https://www.terraform.io/downloads.html)) +- [Packer](https://developer.hashicorp.com/packer/tutorials/docker-get-started/get-started-install-cli#installing-packer) - For initial generation of server images +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) - For interacting with the Kubernetes cluster +- [hcloud CLI tool](https://github.com/hetznercloud/cli) - For interacting with the Hetzner Cloud API (Otherwise use the Hetzner web interface) +- SSH client - For connecting to the servers +- Python 3 - For running the CTFp CLI tool +- Python package [`python-hcl2`](https://github.com/amplify-education/python-hcl2) - Required by the CTFp CLI tool for parsing Terraform configuration files + +And the following is required in order to deploy the platform: + +- [Hetzner Cloud](https://www.hetzner.com/cloud) account with one or more Hetzner Cloud projects +- [Hetzner Cloud API Token](https://console.hetzner.cloud/projects) - For authenticating with the Hetzner Cloud API +- [Hetzner S3 buckets](https://console.hetzner.cloud/projects) - For storing the Terraform state files, backups, and challenge data. We recommend using 3 separate buckets with separate access keys for security reasons +- [Cloudflare](https://www.cloudflare.com/) account +- [Cloudflare API Token](https://dash.cloudflare.com/profile/api-tokens) - For authenticating with the Cloudflare API +- [3 Cloudflare-managed domains](https://dash.cloudflare.com/) - For allowing the system to allocate a domain for the Kubernetes cluster. Used to allocate management, platform, and challenge domains. +- SMTP mail server - To allow CTFd to send emails to users (Password resets, notifications, etc.). The system is set up to allow outbound connections to [Brevo](https://brevo.com) SMTP on port 587. +- [Discord](https://discord.com) channels to receive notifications. One for monitoring alerts and one for first-blood notifications. +- GitHub repository following [CTF Pilot's Challenges template](https://github.com/ctfpilot/challenges-template) for CTF challenges and CTFd pages - A Git repository containing the CTF challenges to be deployed. 
This should be your own private repository using the CTF Pilot Challenges Template as a base. This may also contain the pages to be used in CTFd. +- GitHub repository containing the CTFd configuration - We recommend forking [CTF Pilot's CTFd configuration repository](https://github.com/ctfpilot/ctfd). +- Access tokens to access the GitHub repositories and container registry - Fine-grained personal access tokens and Personal Access Tokens (PAT) with read access to the repositories containing the CTF challenges and CTFd configuration, as well as the GitHub container registry. We recommend setting up a bot account for this purpose. +- [Elasticsearch endpoint](https://www.elastic.co/) - Elasticsearch instance with an endpoint and user credentials for log aggregation. Used to connect Filebeat to Elasticsearch. + +### Environments + +CTFp supports three different environments for deployment: + +- **Test**: Intended for testing and experimentation. This environment is suitable for trying out new features, configurations, and updates without affecting the production environment. It is recommended to use smaller server sizes and fewer nodes to minimize costs. +- **Dev**: Intended for development and staging purposes. This environment is suitable for testing new challenges, configurations, and updates before deploying them to production. It should closely resemble the production environment in terms of server sizes and configurations, but can still be scaled down to save costs. +- **Prod**: Intended for hosting live CTF competitions. This environment should be configured for high availability, performance, and security. It is recommended to use larger server sizes, more nodes, and robust configurations to ensure a smooth experience for participants. + +The environments are configured through separate `automated.<environment>.tfvars` files, allowing for isolated configurations and deployments. + +In the CLI tool, you can specify the environment using the `--test`, `--dev`, or `--prod` flags in the commands. If no flag is provided, the default environment is `test`. + +### Configuring the platform + +> [!TIP] +> To understand the full configuration options and their implications, please refer to the documentation in the `automated.<environment>.tfvars` or [`template.automated.tfvars`](./template.automated.tfvars) file. + +To configure the platform, you need to fill out the `automated.<environment>.tfvars` file located in the root of the repository. + +It contains a number of configuration options for the platform. +Each configuration option is documented within the file, with an explanation and a list of its possible values. + +An automated check verifies that all values are filled out correctly when running the CLI tool. +Therefore, be sure to fill out all required values before attempting to deploy the platform. +Non-required values are commented out by default, and can be left as is if the default value is acceptable. + +The configuration file is the single source of truth for the platform's configuration, and is used by the CLI tool to deploy and manage the platform. +If values in the configuration file are changed, the changes will be applied to the platform during the next deployment. +If the platform is manually changed outside of the CLI tool, the changes will be reverted during the next deployment. + +> [!IMPORTANT] +> The `template.automated.tfvars` file is git tracked, and **MUST NOT** be changed in the repository to include sensitive information. +> Instead, copy the file to `automated.<environment>.tfvars` and fill out the values there.
+> The `automated..tfvars` files are git ignored, and will not be tracked by git. +> +> The file can be initialized using the `./ctfp.py init` command. + +Each component is not fully configurable, and may in certain situations require advanced configuration. These configurations are not included in the main configuration file. +These options are either intended to be static, or require manual configuration through the individual Terraform components. +Changing these options may lead to instability or data loss, and should be done with caution. + +### CLI Tool + +The CTFp CLI tool is a Python script that can be executed directly from the command line, and manages the deployment and lifecycle of the CTFp platform. + +**Prerequisites:** + +1. Install required Python dependencies: + + ```bash + pip install -r requirements.txt + ``` + + This installs `python-hcl2`, which is required for parsing Terraform configuration files. + +2. Ensure the script has executable permissions: + + ```bash + chmod +x ctfp.py + ``` + +**Running the CLI tool:** + +You can now run commands directly: + +```bash +./ctfp.py [options] +``` + +Alternatively, you can always run it explicitly with Python: + +```bash +python3 ctfp.py [options] +``` + +Both methods are functionally equivalent. The direct execution method (first example) is more convenient for regular use. + +#### Commands + +> [!TIP] +> You can run any command with the `--help` flag to get more information about the command and its options. +> For example: `./ctfp.py deploy --help` > -> We are currently in the process of publishing the CTFp system. -> Meanwhile, some components may not be present or fully functional. +> Available commands: +> +> - `init` - Initialize Platform Configuration +> - `generate-keys` - Generate SSH Keys +> - `insert-keys` - Insert SSH Keys into Configuration +> - `generate-images` - Generate Custom Server Images +> - `generate-backend` - Generate Terraform Backend Configuration +> - `deploy` - Deploy Platform Components +> - `destroy` - Destroy Platform Components + +Below is a detailed overview of each available command: + +##### `init` - Initialize Platform Configuration + +Initializes the platform configuration for a specified environment by creating an `automated..tfvars` file based on the template. + +**Syntax:** + +```bash +./ctfp.py init [--force] [--test|--dev|--prod] +``` + +**Options:** + +- `--force`: Force overwrite the configuration file if it already exists (by default, the tool prompts before overwriting) +- `--test`: Initialize TEST environment (default) +- `--dev`: Initialize DEV environment +- `--prod`: Initialize PROD environment + +**Example:** + +```bash +./ctfp.py init --test +./ctfp.py init --prod --force +``` + +**Output:** Creates `automated.test.tfvars`, `automated.dev.tfvars`, or `automated.prod.tfvars` in the repository root. + +##### `generate-keys` - Generate SSH Keys + +Generates SSH keys (ed25519) required for accessing the cluster nodes. Optionally inserts the base64-encoded keys directly into the configuration file. 
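+ +> [!NOTE] +> As a rough manual equivalent (a sketch; the `keys/k8s-<environment>` file naming is an assumption based on the key paths documented below), the same keys can be produced with standard tooling: +> +> ```bash +> # Generate an ed25519 key pair for the test environment +> ssh-keygen -t ed25519 -f keys/k8s-test -N "" +> +> # Base64-encode the keys for use in the configuration file +> base64 -w0 keys/k8s-test +> base64 -w0 keys/k8s-test.pub +> ``` +> +> In most cases, prefer `generate-keys --insert`, as it also updates the configuration file for you.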
+ +**Syntax:** + +```bash +./ctfp.py generate-keys [--insert] [--test|--dev|--prod] +``` + +**Options:** + +- `--insert`: Automatically insert the generated keys into the `automated..tfvars` file +- `--test`: Generate keys for TEST environment (default) +- `--dev`: Generate keys for DEV environment +- `--prod`: Generate keys for PROD environment + +**Example:** + +```bash +./ctfp.py generate-keys --insert --test +./ctfp.py generate-keys --dev +``` + +**Output:** Creates `keys/k8s-.pub` (public key) and `keys/k8s-` (private key) in the `keys/` directory. + +##### `insert-keys` - Insert SSH Keys into Configuration + +Manually inserts previously generated SSH keys into the configuration file. Useful if keys were generated separately or if you need to update existing keys. + +**Syntax:** + +```bash +./ctfp.py insert-keys [--test|--dev|--prod] +``` + +**Options:** + +- `--test`: Insert keys for TEST environment (default) +- `--dev`: Insert keys for DEV environment +- `--prod`: Insert keys for PROD environment + +**Example:** + +```bash +./ctfp.py insert-keys --test +./ctfp.py insert-keys --prod +``` + +**Prerequisite:** Keys must already exist in the `keys/` directory. + +##### `generate-images` - Generate Custom Server Images + +Generates custom Packer images for Kubernetes cluster nodes. These images are used when provisioning the cluster infrastructure on Hetzner Cloud. + +**Syntax:** + +```bash +./ctfp.py generate-images +``` + +> [!NOTE] +> The `generate-images` command does not use environment flags. It requires you to select the Hetzner Cloud project interactively during execution. + +**Output:** Packer creates and uploads custom images to your Hetzner Cloud project. + +**Time:** This is typically the longest-running operation, taking 5-15 minutes. + +##### `generate-backend` - Generate Terraform Backend Configuration + +Generates the Terraform backend configuration file (`backend.tf`) for the specified environment. This file configures the S3 backend for storing Terraform state files. + +**Syntax:** + +```bash +./ctfp.py generate-backend +``` + +**Arguments:** + +- ``: Component for which to generate the backend configuration: `cluster`, `ops`, `platform`, or `challenges` +- ``: Name of the S3 bucket to use for storing the Terraform state +- ``: Region where the S3 bucket is located +- ``: Endpoint URL for the S3-compatible storage. For example `nbg1.your-objectstorage.com` for Hetzner Cloud Object Storage in `nbg1` region. + +**Example:** + +```bash +./ctfp.py generate-backend cluster ctfp-cluster-state nbg1 nbg1.your-objectstorage.com +./ctfp.py generate-backend platform ctfp-platform-state fsn1 fsn1.your-objectstorage.com +``` + +**Output:** Creates a HCL configuration for the specified component's Terraform backend in the `backend/generated/` directory. + +See more about this command in the [backend directory](./backend). + +##### `deploy` - Deploy Platform Components + +Deploys one or more components of the platform to the specified environment. Can deploy individual components or the entire platform at once. 
+ +**Syntax:** + +```bash +./ctfp.py deploy <component> [--auto-apply] [--test|--dev|--prod] +``` + +**Arguments:** + +- `<component>`: Component to deploy: `cluster`, `ops`, `platform`, `challenges`, or `all` + - `cluster`: Provisions Kubernetes infrastructure on Hetzner Cloud + - `ops`: Deploys operational tools (ArgoCD, monitoring, logging, ingress) + - `platform`: Deploys CTFd scoreboard and associated services + - `challenges`: Deploys CTF challenges infrastructure + - `all`: Deploys all components in sequence + +**Options:** + +- `--auto-apply`: Automatically apply Terraform changes without interactive prompts (use with extreme caution) +- `--test`: Deploy to TEST environment (default) +- `--dev`: Deploy to DEV environment +- `--prod`: Deploy to PROD environment + +**Example:** + +```bash +./ctfp.py deploy all --test +./ctfp.py deploy cluster --prod +./ctfp.py deploy platform --dev --auto-apply +``` + +**Deployment Order:** When deploying `all`, components are deployed in this order: `cluster` → `ops` → `platform` → `challenges`. Each component must be successfully deployed before the next begins. + +**Output:** Creates Terraform state files in the `terraform/` directory and outputs deployment status and timing information. + +##### `destroy` - Destroy Platform Components + +> [!WARNING] +> Destroying the platform will **delete all data** associated with the environment, including databases, user data, and challenge instances. This action cannot be undone. Always ensure you have backups before destroying production environments. + +Destroys one or more components of the platform. This is the reverse of `deploy` and tears down infrastructure, databases, and services. + +**Syntax:** + +```bash +./ctfp.py destroy <component> [--auto-apply] [--test|--dev|--prod] +``` + +**Arguments:** + +- `<component>`: Component to destroy: `cluster`, `ops`, `platform`, `challenges`, or `all` + +**Options:** + +- `--auto-apply`: Automatically confirm destruction without interactive prompts (use with extreme caution) +- `--test`: Destroy TEST environment (default) +- `--dev`: Destroy DEV environment +- `--prod`: Destroy PROD environment + +**Example:** + +```bash +./ctfp.py destroy all --prod +./ctfp.py destroy challenges --test --auto-apply +``` + +**Destruction Order:** When destroying `all`, components are destroyed in reverse order: `challenges` → `platform` → `ops` → `cluster`. This ensures dependencies are properly cleaned up. + +### Workflow Overview + +The workflow for deploying and managing CTFp can be summarized in the following key phases: + +1. **Setup Phase**: + - Clone the repository and generate backend configurations. + +2. **Preparation Phase**: + - Generate custom server images (one-time setup per Hetzner project). + - Generate SSH keys. + - Create needed pre-requisites. + - Configure the platform using the `automated.<environment>.tfvars` file. + +3. **Deployment Phase**: + - Deploy components in sequence: `Cluster → Ops → Platform → Challenges`. + - Use `deploy all` for automated deployment or deploy components individually. + +4. **Live Operations**: + - Monitor the platform using tools like ArgoCD, Grafana, and Prometheus. + - Manage challenges, and apply updates as needed. + +5. **Teardown Phase**: + - Destroy components in reverse order: `Challenges → Platform → Ops → Cluster`. + - Use `destroy all` for automated teardown or destroy components individually.
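+ +The phases above can be condensed into a single command sequence. The following is a sketch of a first test-environment run, built only from the commands documented in this README; replace `<bucket>`, `<region>`, and `<endpoint>` with your own S3 details, and fill out `automated.test.tfvars` before deploying: + +```bash +# Setup +git clone https://github.com/ctfpilot/ctfp && cd ctfp +pip install -r requirements.txt +./ctfp.py generate-backend cluster <bucket> <region> <endpoint> +./ctfp.py generate-backend ops <bucket> <region> <endpoint> +./ctfp.py generate-backend platform <bucket> <region> <endpoint> +./ctfp.py generate-backend challenges <bucket> <region> <endpoint> + +# Preparation +./ctfp.py init --test +./ctfp.py generate-keys --insert --test +./ctfp.py generate-images +# (fill out automated.test.tfvars before continuing) + +# Deployment +./ctfp.py deploy all --test + +# Teardown, once the environment is no longer needed +./ctfp.py destroy all --test +```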
+ +### Guides + +#### Updating sizes of nodes in a running platform + +> [!TIP] +> When upgrading existing clusters, it is recommended to drain node pools before changing their sizes, to avoid disruption of running workloads. +> Update one node pool at a time, to minimize the impact on the cluster. + +When updating the sizes of nodes in an existing cluster, it is important to follow a specific procedure to ensure a smooth transition and avoid downtime or data loss. +Below are the steps to update the sizes of nodes in an existing cluster: + +1. **Drain the Node Pool**: Before making any changes, drain the node pool that you intend to update. This will safely evict all workloads from the nodes in the pool, allowing them to be rescheduled on other nodes in the cluster. + + ```bash + # List nodes + kubectl get nodes + + # Drain each node in the node pool + kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data + ``` + + *You will need to repeat this for each node in the node pool. You can use tools such as [`draino`](https://github.com/planetlabs/draino) to automate this process.* + +2. **Update the Configuration**: Modify the `automated.<environment>.tfvars` file to reflect the new sizes for the nodes in the node pool. Ensure that you only change the sizes for the specific node pool you are updating. +3. **Deploy the Changes**: Use the CTFp CLI tool to deploy the changes to the cluster. This will apply the updated configuration and resize the nodes in the specified node pool. + + ```bash + ./ctfp.py deploy cluster --<env> + ``` + + *Replace `<env>` with the appropriate environment flag (`--test`, `--dev`, or `--prod`).* +4. **Monitor the Deployment**: Keep an eye on the deployment process to ensure that the nodes are resized correctly and that there are no issues. You can use `kubectl get nodes` to check the status of the nodes in the cluster. +5. **Uncordon the Node Pool**: Once the nodes have been resized and are ready, uncordon the node pool to allow workloads to be scheduled on the nodes again. + + ```bash + kubectl uncordon <node-name> + ``` + + *Repeat this for each node in the node pool.* +6. **Verify the Changes**: Finally, verify that the workloads are running correctly on the resized nodes and that there are no issues in the cluster. +7. **Repeat for Other Node Pools**: If you have multiple node pools to update, repeat the above steps for each node pool, one at a time. + +> [!WARNING] +> Changing node sizes can lead to temporary disruption of workloads. +> Always ensure that you have backups of critical data before making changes to the cluster configuration. + +Changes to the `scale_type` will only affect new nodes being created, and will not resize existing nodes, as the deployment of these nodes is done as resources are needed. + +You may need to manually intervene to resize existing nodes if required, or delete them, forcing the system to create new nodes with the updated sizes. However, this may lead to downtime for workloads running on the nodes being deleted. + +> [!NOTE] +> Downscaling nodes may not be possible, depending on the initial size of the nodes and the new size. + +Hetzner does not support downsizing nodes if they were initially created with a larger size. +In such cases, the nodes will need to be deleted, forcing the system to create new nodes with the desired size. + +#### Deploying a new challenge + +To deploy a new challenge, you will need to add the challenge to the configuration file, and then deploy the changes to the platform.
+ +Challenges are split into three types: + +- `static` - Static challenge, often with a handout (files, puzzles, etc.). +- `shared` - Challenge with a single instance for all teams to connect to. +- `instanced` - Challenge with individual instances for each team. + +The challenge should be formatted using the [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template), and built using the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenge Schema](https://github.com/ctfpilot/challenge-schema). + +In the configuration file, you will need to add the challenge under the `Challenges configuration` section. + +For static challenges, add the challenge under the `challenges_static` list: + +```hcl +challenges_static = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +For shared challenges, add the challenge under the `challenges_shared` list: + +```hcl +challenges_shared = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +For instanced challenges, add the challenge under the `challenges_instanced` list: + +```hcl +challenges_instanced = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +An example of this, using [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example), would look like this: + +```hcl +challenges_static = { + forensics = ["oh-look-a-flag"], +} +challenges_shared = { + web = ["the-shared-site"], +} +challenges_instanced = { + web = ["where-robots-cannot-search"], + misc = ["a-true-connection"], +} +``` + +In order to deploy the new challenge, you need to deploy the `challenges` component using the CLI tool: + +```bash +./ctfp.py deploy challenges --<env> +``` + +To remove a challenge, delete it from the configuration file, and then deploy the `challenges` component again. + +Challenge changes are automatically and continuously deployed through ArgoCD, so no manual intervention is required after the initial deployment. + +#### Updating a challenge + +Challenge updates are handled through the Git repository containing the challenges. + +If a challenge's slug has been changed, you need to remove the old slug from the configuration file, and add the new slug. +For this, follow the [Deploying a new challenge](#deploying-a-new-challenge) guide. + +#### Deploying a page + +To deploy a new page to CTFd, you will need to add the page to a Git repository that should be formatted using the [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template), and built using the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Page Schema](https://github.com/ctfpilot/page-schema). + +In the configuration file, you will need to add the page under the `Pages configuration` section. + +For pages, add the page under the `pages` list: + +```hcl +pages = [ + "<page-slug>" +] +``` + +An example of this, using the [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example), would look like this: + +```hcl +pages = ["index"] +``` + +In order to deploy the new page, you need to deploy the `platform` component using the CLI tool: + +```bash +./ctfp.py deploy platform --<env> +``` + +To remove a page, you need to remove it from the configuration file, and then deploy the `platform` component again. + +Page changes are automatically and continuously deployed through ArgoCD, so no manual intervention is required after the initial deployment.
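+ +Since challenge and page changes are synchronized by ArgoCD after the initial Terraform deployment, you can verify from the cluster side that they have been picked up. A minimal check (assuming ArgoCD runs in the `argocd` namespace; adjust the namespace and application names to your installation): + +```bash +# List ArgoCD applications with their sync and health status +kubectl get applications.argoproj.io -n argocd + +# Or use the ArgoCD CLI, if you have it configured against the cluster +argocd app list +```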
+ +#### The CLI tool does not seem to support my setup + +The CLI tool is designed to cover a wide range of deployment scenarios, but it may be that your specific setup requires some customization in each Terraform component. + +Each component is located in its own directory, and can be deployed manually using OpenTofu/terraform commands. + +However, be aware that the CLI tool also manages the Terraform backend configuration, and you will need to set this up manually if you choose to deploy the components manually. + +Documentation is located within each component directory, explaining the configuration options and how to deploy the component manually. +A template tfvars file is also located in each component directory in `tfvars/template.tfvars`, explaining the configuration options available for that component. + +## Architecture + +CTFp is composed of four main components, each responsible for different aspects of the platform's functionality: + +1. **Cluster**: Responsible for provisioning and managing the underlying Kubernetes cluster infrastructure on Hetzner Cloud. + This includes setting up the necessary servers, networking, and storage resources required for the cluster to operate. + This can be found in the [`cluster`](./cluster) directory, and as the `cluster` component in the CLI tool. +2. **Ops** (Operations): Focuses on deploying and managing the operational tools and monitoring systems for the platform. + This includes setting up ArgoCD, monitoring, logging, ingress controllers, and other essential services that ensure the smooth operation of the platform. + This can be found in the [`ops`](./ops) directory, and as the `ops` component in the CLI tool. +3. **Platform**: Handles the deployment and configuration of the CTFd scoreboard and its associated services. + This includes setting up the database, caching, and storage solutions required for the scoreboard to function effectively. + This can be found in the [`platform`](./platform) directory, and as the `platform` component in the CLI tool. +4. **Challenges**: Manages the deployment and configuration of the CTF challenges. + This includes setting up the necessary resources and configurations to host and manage the challenges securely and efficiently. + This can be found in the [`challenges`](./challenges) directory, and as the `challenges` component in the CLI tool. + +Each component is designed to be modular and can be deployed independently or together, allowing for flexibility in managing the platform's infrastructure and services. + +### Directory structure + +The CTFp repository is structured as follows: + +```txt +ctfp/ +├── backend/ # Terraform backend configurations +├── keys/ # Generated SSH keys +├── terraform/ # Terraform plans +├── tf-modules/ # Reusable Terraform modules +├── cluster/ # Cluster component Terraform configurations +├── ops/ # Ops component Terraform configurations +├── platform/ # Platform component Terraform configurations +├── challenges/ # Challenges component Terraform configurations +├── ctfp.py # CTFp CLI tool +├── kubectl.sh # Script for configuring kubectl context +├── README.md # This README file +├── requirements.txt # Python dependencies for the CLI tool +├── template.automated.tfvars # Template for CTFp CLI configuration +└── ... # Other files and directories, such as license, contributing guidelines, etc. 
+``` + +### Overview + +![CTFp Architecture](./docs/attachments/architecture/overview.svg) + +The above figure, details how the different components come together to form the complete CTFp platform. +It highlights the central elements: [CTFd](https://github.com/ctfpilot/ctfd), DB Cluster, Redis, [CTFd-manager](https://github.com/ctfpilot/ctfd-manager), [KubeCTF](https://github.com/ctfpilot/kube-ctf), monitoring and deployment flow. + +*The figure serves as an overview of the platform's architecture, and does therefore not include all components and services involved in the platform.* + +#### Cluster + +The Cluster component is responsible for provisioning and managing the Kubernetes cluster infrastructure on Hetzner Cloud. + +It deploys a [kube-hetzner](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner) cluster within the Hetzner Cloud environment, setting up the necessary servers, networking, and storage resources required for the cluster to operate. + +Specifically, it handles: + +- **Cluster provisioning**: Creating and configuring the Kubernetes cluster using Hetzner Cloud resources. +- **Node management**: Setting up and managing the worker nodes that will run the workloads. + This includes configuring node pools, scaling, and updating nodes as needed, along with setting up the node-autoscaler for automatic scaling based on demand. +- **Networking**: Configuring the network settings to ensure proper communication between cluster components. + This includes setting up a private network, configuring VPN connectivity between the nodes and setting up Flannel CNI for pod networking. + It opens the required firewall rules to allow communication between nodes, and outbound connections to required services. +- **Storage**: Setting up storage controller (CSI) to use Hetzner Block storage volumes. +- **Traefik proxy**: Deploying Traefik as the ingress controller for managing incoming traffic to the cluster. + +If an alternative cluster setup is desired, the Cluster component can be replaced with a different Kubernetes cluster, as long as it meets the requirements for running the platform. + +**Cluster requirements**: + +The Kubernetes cluster used for CTFp must meet the following requirements: + +- Kubernetes version 1.33 or higher +- Traefik ingress controller, with correctly configured load balancer +- Persistent storage support (CSI). You may use whatever storage solution you prefer, as long as it supports dynamic provisioning of Persistent Volumes, and is set as the default storage class. +- Provides a kubeconfig file for the cluster, to allow the CLI tool to interact with the cluster. This config should have full admin access to the cluster. +- Has at least a single node with the taint `cluster.ctfpilot.com/node=scaler:PreferNoSchedule` for running challenge instances. + *May be skipped, if no instanced challenges are to be deployed, or you change the taints in the challenge deployment configuration.* +- Enough resources to run the platform components. + *This depends on the CTFd setup, challenges and CTF size.* +- Has correct firewall rules to allow outbound connections to required services, such as logging aggregation, SMTP servers, Discord, Cloudflare API, GitHub, and reverse connections from challenges (if they need internet access). +- Flannel CNI installed for networking. +- Cert-manager is not installed, as it is managed by the Ops component. 
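+ +If you bring your own cluster, a few of the requirements above can be checked quickly with `kubectl` (a sketch; node names and the number of tainted nodes depend on your setup): + +```bash +# Kubernetes version (must be 1.33 or higher) +kubectl version + +# The default storage class must support dynamic provisioning +kubectl get storageclass + +# List node taints to confirm at least one node carries the challenge scheduling taint +kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}' + +# Add the taint manually if it is missing +kubectl taint nodes <node-name> cluster.ctfpilot.com/node=scaler:PreferNoSchedule +```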
+ +#### Ops + +The Ops component is responsible for deploying and managing the operational tools, services, and configurations required for the platform to function. + +It deploys essential infrastructure components on top of the Kubernetes cluster, providing foundational services that other platform components depend on. This component must be deployed after the Cluster and before the Platform and Challenges components. + +Specifically, it deploys the following: + +- **ArgoCD**: GitOps continuous delivery tool used to deploy and manage applications within the Kubernetes cluster. ArgoCD continuously synchronizes the cluster state with Git repositories, enabling declarative infrastructure management. +- **Cert-manager**: Certificate management system for automating TLS/SSL certificate provisioning and renewal. It integrates with Cloudflare for DNS validation challenges. +- **Traefik configuration**: Deploys additional Helm chart configuration for the Traefik ingress controller already present in the cluster, enabling advanced routing and middleware features, along with additional logging with Filebeat log aggregation. +- **Descheduler**: Continuously rebalances the cluster by evicting workloads from nodes, ensuring optimal resource utilization and distribution across available nodes. +- **Error Fallback**: Deploys [CTF Pilot's Error Fallback](https://github.com/ctfpilot/error-fallback) page service, providing custom error pages for HTTP error responses (e.g., 404, 502, 503). +- **Filebeat**: Log aggregation and forwarding system that sends logs to Elasticsearch or other log aggregation services, enabling centralized logging and analysis. +- **MariaDB Operator**: Kubernetes operator for managing MariaDB database instances. Allows automated provisioning, scaling, and management of MySQL-compatible databases. +- **Redis Operator**: Kubernetes operator for managing Redis cache instances. Enables automated deployment and management of Redis clusters for caching and data storage. +- **Prometheus & Grafana Stack**: Comprehensive monitoring and visualization solution. Prometheus scrapes metrics from cluster components, while Grafana provides dashboards for monitoring cluster health, resource usage, and application performance. Custom dashboards for Kubernetes, CTFd, and KubeCTF are included. +- **Alertmanager**: Alerting system integrated with Prometheus, used to send notifications based on defined alerting rules. Configured to send alerts to Discord channels for monitoring purposes. + +#### Platform + +The Platform component is responsible for deploying and managing the CTFd scoreboard and its associated services. + +It handles the complete setup of the CTF competition's scoring system, database infrastructure, and management services. The Platform component must be deployed after both the Cluster and Ops components, as it depends on services provided by the Ops component. + +Specifically, it deploys the following: + +- **CTFd**: The main CTF scoreboard application. This is deployed as a customizable instance that manages team registration, challenge submissions, scoring, and leaderboards. It deploys using the provided CTFd configuration from the defined GitHub repository. See [CTF Pilot's CTFd configuration](https://github.com/ctfpilot/ctfd) for more information. +- [**CTFd-manager**](https://github.com/ctfpilot/ctfd-manager): A companion service for CTFd that provides automated configuration management and administrative functions. 
It handles initial setup of CTFd and continuous synchronization of pages and challenges. +- **MariaDB database cluster**: A highly available database cluster for storing CTFd data, user accounts, challenge information, and competition state. Deployed using the MariaDB Operator with automated backups to S3. +- **Redis caching layer**: A Redis cluster for caching CTFd data and improving performance. +- **S3 storage configuration**: Integration with S3-compatible object storage for storing challenge files, user uploads, and other assets uploaded to CTFd. +- **Metrics and monitoring**: Deploys metrics exporters and monitoring configurations specific to the CTFd instance for tracking performance and availability. +- **Pages deployment**: Automatically deploys CTF-related pages (e.g., rules, schedule, information pages) from the defined GitHub repository using [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). +- **Traefik ingress configuration**: Sets up ingress routing rules to expose CTFd and related services through the Traefik ingress controller. +- **Initial CTFd setup**: Configures initial CTFd settings, such as competition name, start/end times, and other global settings using [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). + +The Platform automatically sets up Kubernetes secrets and configurations for the components deployed, so that this information is not required to be tracked within Git. +This means that critical secrets are stored within Kubernetes secrets once the Platform component is deployed. + +Backups of the database are automatically created and stored in the configured S3 storage, allowing for disaster recovery and data retention. Currently backups are configured to run every 15 minutes, and retained for 30 days. +Backups are stored as cleartext SQL dump files, so ensure that the S3 storage has proper access policies in place to prevent unauthorized access. + +#### Challenges + +The Challenges component is responsible for managing the deployment and configuration of CTF challenges within the platform. + +It handles the infrastructure setup required to host, isolate, and manage challenges across the Kubernetes cluster. Challenge instances can be deployed in different modes (static, shared or instanced), and the component manages the networking, resource allocation, and lifecycle of challenge containers. The Challenges component must be deployed after the Cluster, Ops, and Platform components. + +Specifically, it manages the following: + +- **Challenge deployment infrastructure**: Sets up the necessary Kubernetes resources for hosting challenges. This includes namespaces, network policies, and RBAC configurations for proper challenge isolation and access control. +- **KubeCTF integration**: Integrates with [KubeCTF](https://github.com/ctfpilot/kube-ctf) to enable dynamic challenge instance management. [KubeCTF](https://github.com/ctfpilot/kube-ctf) handles the creation, scaling, and destruction of challenge instances. +- **Challenge mode support**: Supports three deployment modes: + - **Static challenges**: Challenges that are deployed as static files (e.g., forensics challenges) and are only deployed to CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). + - **Shared challenges**: Challenges that have a single instance shared among all teams (e.g., web challenges). This is deployed through ArgoCD. + - **Instanced challenges**: Challenges that have individual instances for each team (e.g., dynamic web challenges). 
This is managed through [KubeCTF](https://github.com/ctfpilot/kube-ctf). +- **IP whitelisting**: Implements IP-based access control to challenges, allowing restrictions on which IPs or networks can access specific challenges. For public access, the `0.0.0.0/0` CIDR can be used. +- **Custom fallback pages**: Deploys custom error pages for various challenge states (e.g., instancing fallback page for when a challenge is being provisioned). +- **Challenge deployment and configuration management**: Deploys challenge deployment configurations through ArgoCD, allowing for GitOps-style management of challenge definitions and updates, controlling it through defined GitHub repository and defined challenge slugs to be deployed. + +Challenges are deployed and managed through Git repositories, with configurations defined in challenge definition files. Use the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template) for challenge development. + +By default, the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) deployment templates use taints to control which nodes challenge instances are scheduled on. Therefore, the cluster must have at least one node with the taint `cluster.ctfpilot.com/node=scaler:PreferNoSchedule` if using Instanced challenges, to ensure challenge instances are properly scheduled and deployed. + +### Challenge deployment + +![CTFp Challenge Deployment](./docs/attachments/architecture/challenge-deployment.svg) + +The challenge deployment system, utilizes a combination of GitOps principles and dynamic instance management to efficiently deploy and manage CTF challenges. + +It is built to use [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template) for preparing the challenge definitions, and ArgoCD for deploying the challenge configurations to the Kubernetes cluster. +Here, ArgoCD continuously monitors the defined GitHub repository for changes, and automatically applies updates to the cluster. + +Static challenges are deployed as configurations for CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager), while Shared challenges are deployed as single instances through ArgoCD. +Instanced challenges are managed through [KubeCTF](https://github.com/ctfpilot/kube-ctf), where ArgoCD deploys deployment templates to [KubeCTF](https://github.com/ctfpilot/kube-ctf). + +Container images can be stored in any container registry, as long as the Kubernetes cluster has access to pull the images. +By default, pull secrets are configured for GitHub Container Registry, and are currently **not** configurable through the platform configuration. +Any additional pull secrets must be created manually in the cluster, and referenced in the challenge deployment configuration. + +For more information on how to develop challenges, see the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template). An example challenges repository can be found at [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example). + +### Network + +The following diagrams provide an overview of CTFp's cluster and challenge networking setups. 
+ +#### Cluster networking + +![CTFp Cluster Networking Overview](./docs/attachments/architecture/cluster-network-architecture.svg) + +CTFp requires three domains, as it configures different services under different domains: + +- **Management domain**: Used for accessing the management services, such as ArgoCD, Grafana, and Prometheus. + This domain should only be distributed to administrators. +- **Platform domain**: Used for accessing the CTFd scoreboard and related services. + This domain is distributed to participants for accessing the CTF platform. +- **CTF domain**: Used for accessing the challenges. + This domain is also distributed to participants for accessing the challenges. + +The platform does not require you to allocate the full top-level domain (TLD) for CTFp, as subdomains for each of the three domains can be configured. + +Management and Platform domains are configured to be proxied through Cloudflare, to take advantage of their CDN and DDoS protection services. +CTF domain is not proxied, as challenges often require direct access to the challenge instances. + +Domain management is built into the system, and DNS entries are therefore automatically created and managed through Cloudflare's API. + +Hetzner Cloud's Load Balancers are used to distribute incoming traffic to the Traefik ingress controllers deployed on each node in the cluster. +Within the cluster, Traefik handles routing of incoming requests to the appropriate services based on the configured ingress rules. +Network is shared between nodes using Hetzner Cloud's private networking, ensuring efficient and secure communication between cluster components. + +#### Challenge networking + +![CTFp Challenge Networking Overview](./docs/attachments/architecture/challenge-network-architecture.svg) + +As described in the [Cluster networking](#cluster-networking) section, CTFp utilizes three main domains for different purposes. +Challenges are accessed through the CTF domain, which is specifically designated for hosting and serving challenge instances, and are therefore not proxied through Cloudflare; they point directly to the Hetzner Cloud Load Balancers. + +This load balancer is set up to forward all incoming traffic to the Traefik ingress controllers deployed within the Kubernetes cluster. + +Traefik supports TCP and HTTP(S) routing, allowing it to handle a wide range of challenge types and protocols. +However, a limited number of middleware options are available for TCP routing, so ensure that your challenges are compatible with the available features. + +IP whitelisting is implemented at the ingress level, allowing challenges to restrict access based on IP addresses or CIDR ranges. + +By default, HTTP(S) traffic is configured with fallback middleware, providing custom error pages for various HTTP error responses (e.g., 404, 502, 503). +When an instanced challenge is being provisioned, the custom error page will inform the user that the challenge is being started and automatically refresh the page until the challenge is ready. + +Shared and Instanced challenges are deployed within either `ctfpilot-challenges` or `ctfpilot-challenges-instanced` namespaces, while static challenges are only deployed to CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). +The two namespaces are configured with network policies to restrict any outgoing local traffic, allowing only outbound internet access. + +Challenges can therefore not talk to each other, nor communicate across multiple deployments. 
+If your challenge requires multiple containers, they need to be deployed within the same challenge deployment, and set up in a sidecar pattern. + +Cluster DNS is not available for challenges, so any service discovery must be handled through external DNS services. +Challenges allow for multiple endpoints to be defined, across both HTTP(S) and TCP protocols. + +TCP endpoints are handled either through a custom Traefik port (only available for shared TCP challenges), or as an SSL TCP endpoint using SNI routing (recommended). +Hetzner limits the number of ports available for Load Balancers, so ensure that you plan accordingly when deploying challenges requiring TCP endpoints using custom ports. +*Currently, configuring custom ports for TCP endpoints is not supported through the platform configuration, and must be set up manually after deployment, or manually in the cluster Terraform module.* + +SSL TCP connections can be made using one of the following command examples: + +```bash +# Using openssl +openssl s_client -connect <host>:443 -servername <host> + +# Netcat +ncat --ssl <host> 443 +``` + +*The netcat command is the one displayed in the [CTFd plugin for Kube-CTF](https://github.com/ctfpilot/ctfd-kubectf-plugin).* + +We understand that this increases the complexity of connecting to challenges, but it provides a way to easily and dynamically allocate TCP endpoints without the need for managing multiple ports on the Load Balancer. + +## Getting help + +If you need help or have questions regarding CTFp, you can reach out through the following channels: + +- **GitHub Issues**: You can open an issue in the [CTFp GitHub repository](https://github.com/ctfpilot/ctfp/issues) for bug reports, feature requests, or general questions. +- **Discord**: Join the [CTF Pilot Discord server](https://discord.ctfpilot.com) to engage with the community, ask questions, and get support from other users and contributors. + +*The project is delivered as-is, and we do not provide official support services. However, we encourage community engagement and collaboration to help each other out.* +*Contributors and maintainers may assist with questions and issues as time permits.* ## Contributing @@ -21,14 +957,25 @@ To administrate the CLA signing process, we are using **[CLA assistant lite](htt ## Background -CTF Pilot started as a CTF Platform project, originating in **[Brunnerne](https://github.com/brunnerne)**. +CTF Pilot started as a CTF platform project, originating in **[Brunnerne](https://github.com/brunnerne)**. + +The goal of the project is to provide a scalable, resilient, and easy-to-use CTF platform for hosting large-scale Capture The Flag competitions, starting with BrunnerCTF 2025. + +The project is still in active development, and we welcome contributions from the community to help improve and expand the platform's capabilities. ## License -CTFp is licensed under a dual license, the **PolyForm Noncommercial License 1.0.0** for non-commercial use, and a **Commercial License** for commercial use. +CTFp is licensed under a dual license, the **PolyForm Noncommercial License 1.0.0** for non-commercial use, and a **Commercial License** for commercial use. You can find the full license for non-commercial use in the **[LICENSE.md](LICENSE.md)** file. For commercial licensing, please contact **[The0Mikkel](https://github.com/The0Mikkel)**.
+Without commercial licensing, the platform **MUST NOT** be used for commercial purposes, including but not limited to: + +- Hosting CTF competitions for profit +- Hosting a CTF as a commercial organization, even if the CTF itself is free or only provided to internal users +- Offering CTF hosting as a paid service +- Using the platform in any commercial product or service + We encourage all modifications and contributions to be shared back with the community, for example through pull requests to this repository. We also encourage all derivative works to be publicly available under **PolyForm Noncommercial License 1.0.0**. At all times must the license terms be followed. diff --git a/cluster/kube.tf b/cluster/kube.tf index a24f974..1b88ec3 100644 --- a/cluster/kube.tf +++ b/cluster/kube.tf @@ -227,7 +227,7 @@ module "kube-hetzner" { }, { name = "challs-1", - server_type = var.scale_type, + server_type = var.challs_type, location = var.region_1, labels = [ "ressource-type=node", diff --git a/cluster/tfvars/template.tfvars b/cluster/tfvars/template.tfvars index 0f9813c..e8cb3cf 100644 --- a/cluster/tfvars/template.tfvars +++ b/cluster/tfvars/template.tfvars @@ -35,35 +35,44 @@ cluster_dns_ctf = "" # The domain name to use for # Cluster configuration # ------------------------ # WARNING: Changing region while the cluster is running will cause all servers in the group to be destroyed and recreated. -# For optimal performance, it is recommended to use the same region for all servers. -# Region 1 is used for scale nodes and loadbalancer. -# Possible values: fsn1, hel1, nbg1 -region_1 = "fsn1" # Region for subgroup 1 -region_2 = "fsn1" # Region for subgroup 2 -region_3 = "fsn1" # Region for subgroup 3 +# For optimal performance, it is recommended to use the same region for all servers. If you want redundancy, use different regions for each group. +# Region 1 is used for challs nodes, scale nodes and loadbalancer. +# Possible values: fsn1, hel1, nbg1, ash, hil, sin - See https://docs.hetzner.com/cloud/general/locations/ +region_1 = "nbg1" # Region for group 1, challs nodes, scale nodes and loadbalancer +region_2 = "nbg1" # Region for group 2 +region_3 = "nbg1" # Region for group 3 network_zone = "eu-central" # Hetzner network zone. Possible values: "eu-central", "us-east", "us-west", "ap-southeast". Regions must be within the network zone. # Servers -# Server definitions are split into three groups: Control Plane, Agents, and Scale. Control plane and agents has three groups each, and scale has one group. +# Server definitions are split into four groups: Control Plane, Agents, Challs and Scale. Control plane and agents has three groups each, while challs and scale is one group each. # Each group can be scaled and defined independently, to allow for smooth transitions between different server types and sizes. # Control planes are the servers that run the Kubernetes control plane, and are responsible for managing the cluster. # Agents are the servers that run the workloads, and scale is used to scale the cluster up or down dynamically. -# Scale is automatically scaled agent nodes, which is handled by the cluster autoscaler. It is optional, and can be used to scale the cluster up or down dynamically. +# Challs are the servers that run the CTF challenges. +# Scale is automatically scaled agent nodes, which is handled by the cluster autoscaler. It is optional, and can be used to scale the cluster up or down dynamically if there is not enough resources in the cluster. 
+# Challs and scale nodes are placed in region_1, and are tainted so that normal resources prefer agent nodes, while still allowing scheduling on challs and scale nodes if needed.
 
 # Server types. See https://www.hetzner.com/cloud
+# Control plane nodes - Nodes that run the Kubernetes control plane components.
 control_plane_type_1 = "cx23" # Control plane group 1
 control_plane_type_2 = "cx23" # Control plane group 2
 control_plane_type_3 = "cx23" # Control plane group 3
+# Agent nodes - Nodes that run general workloads, excluding CTF challenges.
 agent_type_1 = "cx33" # Agent group 1
 agent_type_2 = "cx33" # Agent group 2
 agent_type_3 = "cx33" # Agent group 3
+# Challenge nodes - Nodes dedicated to running CTF challenges.
+challs_type = "cx33" # CTF challenge nodes
+# Scale nodes - Nodes that are automatically scaled by the cluster autoscaler. These nodes are used to scale the cluster up or down dynamically.
 scale_type = "cx33" # Scale group
 
 # Server count
+# Control plane nodes - Nodes that run the Kubernetes control plane components.
 # Minimum of 1 control plane across all groups. 1 in each group is recommended for HA.
 control_plane_count_1 = 1 # Number of control plane nodes in group 1
 control_plane_count_2 = 1 # Number of control plane nodes in group 2
 control_plane_count_3 = 1 # Number of control plane nodes in group 3
+# Agent nodes - Nodes that run general workloads, excluding CTF challenges.
 # Minimum of 1 agent across all groups. 1 in each group is recommended for HA.
 agent_count_1 = 1 # Number of agent nodes in group 1
 agent_count_2 = 1 # Number of agent nodes in group 2
diff --git a/cluster/variables.tf b/cluster/variables.tf
index d90bc6e..4f326af 100644
--- a/cluster/variables.tf
+++ b/cluster/variables.tf
@@ -138,6 +138,11 @@ variable "agent_type_3" {
   default     = "cx32"
 }
 
+variable "challs_type" {
+  type        = string
+  description = "CTF challenge nodes server type"
+  default     = "cx32"
+}
 variable "scale_type" {
   type        = string
   description = "Scale group server type"
diff --git a/ctfp.py b/ctfp.py
index 9d38352..b5f1137 100755
--- a/ctfp.py
+++ b/ctfp.py
@@ -701,12 +701,8 @@ def get_filename_tfvars(environment="test"):
         :param environment: The environment name (test, dev, prod)
         :return: The filename for the tfvars file
         '''
-
-        prefix = ""
-        if environment != "test":
-            prefix = f"{environment}."
- return f"automated.{prefix}tfvars" + return f"automated.{environment}.tfvars" @staticmethod def load_tfvars(file_path: str): @@ -895,6 +891,9 @@ def init_terraform(self, path, components: str = ""): try: # Check if tfvars file exists and is valid self.check_values() + + # Load backend connection credentials + self.load_backend_credentials() # Check if backend config exists if not TFBackend.backend_exists(components): @@ -1032,7 +1031,7 @@ def check_values(self): # Check if fields include "<" or ">" def check_placeholders(value): - if isinstance(value, str) and value.startswith("<") and value.endswith(">"): + if isinstance(value, str) and (value.startswith("<") or value.startswith("https://github.com/<")) and value.endswith(">"): return True elif isinstance(value, dict): for v in value.values(): @@ -1050,6 +1049,25 @@ def check_placeholders(value): Logger.info(f"{self.get_filename_tfvars()} is filled out correctly") + + def load_backend_credentials(self): + ''' + Load S3 backend credentials from automated.tfvars, to set Terraform S3 connection credentials + ''' + + # Load tfvars file + tfvars_data = TFVARS.safe_load_tfvars(self.get_path_tfvars()) + + # Set environment variables for S3 backend + os.environ["AWS_ACCESS_KEY_ID"] = tfvars_data.get("terraform_backend_s3_access_key", "") + os.environ["AWS_SECRET_ACCESS_KEY"] = tfvars_data.get("terraform_backend_s3_secret_key", "") + + if os.environ["AWS_ACCESS_KEY_ID"] == "" or os.environ["AWS_SECRET_ACCESS_KEY"] == "": + Logger.error("S3 backend credentials not found in automated.tfvars. Please fill out terraform_backend_s3_access_key and terraform_backend_s3_secret_key as they are required to run the Terraform components.") + exit(1) + + Logger.info(f"S3 backend credentials loaded") + def cluster_deploy(self): Logger.info("Deploying the cluster") @@ -1265,10 +1283,6 @@ def challenges_destroy(self): class CLI: def run(self): Logger.info("Starting CTF-Pilot CLI") - Logger.info("Checking availability of requried tools") - self.platform_check() - self.tool_check() - Logger.success("Required Tools are available") args = Args() if args.parser is None: @@ -1298,6 +1312,11 @@ def run(self): args.print_help() exit(1) + Logger.info("Checking availability of required tools") + self.platform_check() + self.tool_check() + Logger.success("Required Tools are available") + # Run the subcommand try: namespace.func(namespace) diff --git a/docs/attachments/architecture/challenge-deployment.drawio b/docs/attachments/architecture/challenge-deployment.drawio new file mode 100644 index 0000000..43c9d4a --- /dev/null +++ b/docs/attachments/architecture/challenge-deployment.drawio @@ -0,0 +1,166 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/attachments/architecture/challenge-deployment.png b/docs/attachments/architecture/challenge-deployment.png new file mode 100644 index 0000000..a75cefa Binary files /dev/null and b/docs/attachments/architecture/challenge-deployment.png differ diff --git a/docs/attachments/architecture/challenge-deployment.svg b/docs/attachments/architecture/challenge-deployment.svg new file mode 100644 index 0000000..553aaff --- /dev/null +++ b/docs/attachments/architecture/challenge-deployment.svg @@ -0,0 +1 @@ +
+[SVG: "Challenge deployment" diagram (challenge developer, GitHub, GitHub Actions, container registry, ArgoCD, KubeCTF, CTFd manager, CTFd, Kubernetes cluster)]
\ No newline at end of file
diff --git a/docs/attachments/architecture/challenge-network-architecture.drawio b/docs/attachments/architecture/challenge-network-architecture.drawio
new file mode 100644
index 0000000..725a121
--- /dev/null
+++ b/docs/attachments/architecture/challenge-network-architecture.drawio
@@ -0,0 +1,104 @@
+[draw.io XML content omitted]
\ No newline at end of file
diff --git a/docs/attachments/architecture/challenge-network-architecture.png b/docs/attachments/architecture/challenge-network-architecture.png
new file mode 100644
index 0000000..125154d
Binary files /dev/null and b/docs/attachments/architecture/challenge-network-architecture.png differ
diff --git a/docs/attachments/architecture/challenge-network-architecture.svg b/docs/attachments/architecture/challenge-network-architecture.svg
new file mode 100644
index 0000000..0dcd4d2
--- /dev/null
+++ b/docs/attachments/architecture/challenge-network-architecture.svg
@@ -0,0 +1 @@
+[SVG: "Challenge Network architecture" diagram (user request via the Hetzner Cloud load balancer, routed by Traefik to the challenge, with TCP and fallback branching)]
\ No newline at end of file
diff --git a/docs/attachments/architecture/cluster-network-architecture.drawio b/docs/attachments/architecture/cluster-network-architecture.drawio
new file mode 100644
index 0000000..9c85d9f
--- /dev/null
+++ b/docs/attachments/architecture/cluster-network-architecture.drawio
@@ -0,0 +1,214 @@
+[draw.io XML content omitted]
\ No newline at end of file
diff --git a/docs/attachments/architecture/cluster-network-architecture.png b/docs/attachments/architecture/cluster-network-architecture.png
new file mode 100644
index 0000000..8717a6a
Binary files /dev/null and b/docs/attachments/architecture/cluster-network-architecture.png differ
diff --git a/docs/attachments/architecture/cluster-network-architecture.svg b/docs/attachments/architecture/cluster-network-architecture.svg
new file mode 100644
index 0000000..1f23851
--- /dev/null
+++ b/docs/attachments/architecture/cluster-network-architecture.svg
@@ -0,0 +1 @@
+[SVG: "Cluster Network architecture" diagram (user, Cloudflare proxy, platform/management/CTF domains, Hetzner Cloud load balancer, Traefik, control plane, agent, challs and scale nodes on a private network)]
\ No newline at end of file
diff --git a/docs/attachments/architecture/overview.drawio b/docs/attachments/architecture/overview.drawio
new file mode 100644
index 0000000..d65521c
--- /dev/null
+++ b/docs/attachments/architecture/overview.drawio
@@ -0,0 +1,297 @@
+[draw.io XML content omitted]
\ No newline at end of file
diff --git a/docs/attachments/architecture/overview.png b/docs/attachments/architecture/overview.png
new file mode 100644
index 0000000..87a00ba
Binary files /dev/null and b/docs/attachments/architecture/overview.png differ
diff --git a/docs/attachments/architecture/overview.svg b/docs/attachments/architecture/overview.svg
new file mode 100644
index 0000000..cd1b8db
--- /dev/null
+++ b/docs/attachments/architecture/overview.svg
@@ -0,0 +1 @@
+[SVG: "Architecture overview" diagram (CTFp Terraform projects Cluster, Ops, Platform and Challenges deploying ArgoCD, CTFd, Redis, DB cluster, KubeCTF, CTFd-manager, Prometheus/Grafana, logging, and shared/instanced challenges on the Kubernetes cluster)]
\ No newline at end of file
diff --git a/kubectl.sh b/kubectl.sh
old mode 100644
new mode 100755
index 6bb997b..1fd9b1d
--- a/kubectl.sh
+++ b/kubectl.sh
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
 # Select environment between test, dev or prod
-# Usage: ./kubectl-setup.sh [test|dev|prod]
+# Usage: source ./kubectl.sh [test|dev|prod]
 
 CTFP_EXECUTE=true
 if [ -z "$1" ]; then
-    echo "Usage: $0 [test|dev|prod]"
+    echo "Usage: source $0 [test|dev|prod]"
     CTFP_EXECUTE=false
 fi
diff --git a/template.automated.tfvars b/template.automated.tfvars
index 1f355ac..0ee1eb3 100644
--- a/template.automated.tfvars
+++ b/template.automated.tfvars
@@ -2,39 +2,56 @@
 # Clone this file to `automated.tfvars` and fill in the values.
 # This file (`template.automated.tfvars`) is git tracked, and MUST NOT be changed in the repository to include sensitive information.
 
+# ------------------------
+# CLI Tool configuration
+# ------------------------
+# The following variables are used by the CLI tool to configure the backend connection.
+# Specifically setting the credentials to access the Terraform S3 backend.
+terraform_backend_s3_access_key = "" # Access key for the S3 backend
+terraform_backend_s3_secret_key = "" # Secret key for the S3 backend
+
 # ------------------------
 # Cluster configuration
 # ------------------------
 # WARNING: Changing region while the cluster is running will cause all servers in the group to be destroyed and recreated.
-# For optimal performance, it is recommended to use the same region for all servers.
-# Region 1 is used for scale nodes and loadbalancer.
-# Possible values: fsn1, hel1, nbg1
-region_1 = "nbg1" # Region for subgroup 1
-region_2 = "nbg1" # Region for subgroup 2
-region_3 = "nbg1" # Region for subgroup 3
+# For optimal performance, it is recommended to use the same region for all servers. If you want redundancy, use different regions for each group.
+# Region 1 is used for challs nodes, scale nodes and loadbalancer.
+# Possible values: fsn1, hel1, nbg1, ash, hil, sin - See https://docs.hetzner.com/cloud/general/locations/
+region_1 = "nbg1" # Region for group 1, challs nodes, scale nodes and loadbalancer
+region_2 = "nbg1" # Region for group 2
+region_3 = "nbg1" # Region for group 3
 network_zone = "eu-central" # Hetzner network zone. Possible values: "eu-central", "us-east", "us-west", "ap-southeast". Regions must be within the network zone.
 
 # Servers
-# Server definitions are split into three groups: Control Plane, Agents, and Scale. Control plane and agents has three groups each, and scale has one group.
+# Server definitions are split into four groups: Control Plane, Agents, Challs and Scale. Control plane and agents have three groups each, while challs and scale have one group each.
 # Each group can be scaled and defined independently, to allow for smooth transitions between different server types and sizes.
 # Control planes are the servers that run the Kubernetes control plane, and are responsible for managing the cluster.
 # Agents are the servers that run the workloads, and scale is used to scale the cluster up or down dynamically.
-# Scale is automatically scaled agent nodes, which is handled by the cluster autoscaler. It is optional, and can be used to scale the cluster up or down dynamically.
+# Challs are the servers that run the CTF challenges.
+# Scale is automatically scaled agent nodes, which is handled by the cluster autoscaler. It is optional, and can be used to scale the cluster up or down dynamically if there are not enough resources in the cluster.
+# Challs and scale nodes are placed in region_1, and are tainted so that normal resources prefer agent nodes, while still allowing scheduling on challs and scale nodes if needed.
 
 # Server types. See https://www.hetzner.com/cloud
+# Control plane nodes - Nodes that run the Kubernetes control plane components.
 control_plane_type_1 = "cx23" # Control plane group 1
 control_plane_type_2 = "cx23" # Control plane group 2
 control_plane_type_3 = "cx23" # Control plane group 3
+# Agent nodes - Nodes that run general workloads, excluding CTF challenges.
 agent_type_1 = "cx33" # Agent group 1
 agent_type_2 = "cx33" # Agent group 2
 agent_type_3 = "cx33" # Agent group 3
+# Challenge nodes - Nodes dedicated to running CTF challenges.
+challs_type = "cx33" # CTF challenge nodes
+# Scale nodes - Nodes that are automatically scaled by the cluster autoscaler. These nodes are used to scale the cluster up or down dynamically.
 scale_type = "cx33" # Scale group
 
 # Server count
+# Control plane nodes - Nodes that run the Kubernetes control plane components.
 # Minimum of 1 control plane across all groups. 1 in each group is recommended for HA.
 control_plane_count_1 = 1 # Number of control plane nodes in group 1
 control_plane_count_2 = 1 # Number of control plane nodes in group 2
 control_plane_count_3 = 1 # Number of control plane nodes in group 3
+# Agent nodes - Nodes that run general workloads, excluding CTF challenges.
 # Minimum of 1 agent across all groups. 1 in each group is recommended for HA.
 agent_count_1 = 1 # Number of agent nodes in group 1
 agent_count_2 = 1 # Number of agent nodes in group 2