diff --git a/ANSIBLE_TOWER_ALTERNATIVES_ANALYSIS.md b/ANSIBLE_TOWER_ALTERNATIVES_ANALYSIS.md new file mode 100644 index 0000000000..07530c4b6b --- /dev/null +++ b/ANSIBLE_TOWER_ALTERNATIVES_ANALYSIS.md @@ -0,0 +1,350 @@ +# Ansible Tower vs Open-Source Alternatives: Production Infrastructure Analysis + +## 1. What is Ansible Tower (Now "Red Hat Ansible Automation Platform")? + +Ansible Tower is Red Hat's **commercial, licensed** web-based UI and REST API for Ansible. It provides: + +- Visual dashboard for job status and inventory +- Role-based access control (RBAC) +- Job scheduling and workflow orchestration +- Centralized logging and auditing +- Credential management +- Multi-tenant support +- REST API for integration + +### Licensing Cost (Why We Need Alternatives) + +| Tier | Cost (Approx.) | +|------|----------------| +| Standard | ~$13,000/year (up to 100 nodes) | +| Premium | ~$17,500/year (up to 100 nodes) | +| Enterprise (unlimited) | Custom pricing ($50,000+) | + +**Verdict:** Too expensive for teams without budget for licensing. + +--- + +## 2. Open-Source Alternatives Comparison + +### 2.1 AWX (Ansible AWX) + +**What:** The upstream open-source project behind Ansible Tower. Essentially Tower without the Red Hat support/license. 
+ +| Category | Details | +|----------|--------| +| License | Apache 2.0 (Fully Free) | +| Maintained By | Red Hat / Community | +| Deployment | Kubernetes (via AWX Operator), Docker | +| UI | Full web dashboard (identical to Tower) | +| RBAC | Yes (full role-based access control) | +| API | Full REST API | +| Job Scheduling | Yes | +| Credential Vault | Yes (integrates with HashiCorp Vault, CyberArk) | +| Notifications | Slack, Email, Webhook, PagerDuty | +| Audit/Logging | Full job logging with centralized history | +| Scalability | Horizontal (Kubernetes-native) | +| Community | Very active (GitHub: 14k+ stars) | + +**Security Level: HIGH** +- Built-in credential encryption (AES-256) +- RBAC with granular permissions +- LDAP/SAML/OAuth2 authentication +- Audit trail for all operations +- Secret management integration +- Container-isolated job execution + +**Production Readiness: 8.5/10** +- Same codebase as Tower +- No commercial SLA or support (community only) +- Frequent releases (can be unstable between versions) + +--- + +### 2.2 Semaphore UI + +**What:** A lightweight, modern open-source alternative to Ansible Tower/AWX. 
+ +| Category | Details | +|----------|--------| +| License | MIT (Fully Free) | +| Maintained By | Community | +| Deployment | Single binary, Docker, or package manager | +| UI | Clean, modern web dashboard | +| RBAC | Yes (team-based access) | +| API | REST API | +| Job Scheduling | Yes (cron-based) | +| Credential Vault | Built-in key store | +| Notifications | Slack, Email, Telegram, Microsoft Teams | +| Audit/Logging | Task execution history | +| Scalability | Vertical (single instance) | +| Community | Active (GitHub: 10k+ stars) | + +**Security Level: MEDIUM-HIGH** +- Encrypted credential storage +- Team-based access control +- LDAP authentication support +- Audit logging +- No native secret manager integration (limited vs AWX) +- Smaller attack surface (simpler architecture) + +**Production Readiness: 7/10** +- Very lightweight and easy to maintain +- Less feature-rich than AWX +- Excellent for small-to-medium infrastructure +- Limited horizontal scaling + +--- + +### 2.3 Rundeck (PagerDuty Process Automation - Community Edition) + +**What:** A general-purpose operations automation platform (not Ansible-specific but supports Ansible as a plugin). 
+ +| Category | Details | +|----------|--------| +| License | Apache 2.0 (Community Edition is Free) | +| Maintained By | PagerDuty / Community | +| Deployment | Java application (WAR/Docker/RPM/DEB) | +| UI | Full web dashboard | +| RBAC | Yes (ACL policies) | +| API | Full REST API | +| Job Scheduling | Yes (cron + event-based triggers) | +| Credential Vault | Yes (KeyStorage with plugins for Vault, AWS SSM, Thycotic) | +| Notifications | Email, Slack, Webhook, PagerDuty, custom plugins | +| Audit/Logging | Full audit trail with centralized logging | +| Scalability | Horizontal (clustering in enterprise) | +| Community | Mature (GitHub: 5k+ stars, 10+ years) | + +**Security Level: HIGH** +- ACL-based fine-grained access control +- Key storage with encryption +- LDAP/Active Directory/OAuth integration +- Full audit trail +- Plugin-based secret management +- SSH key management +- Node filtering with security context + +**Production Readiness: 8/10** +- Very mature and battle-tested +- Supports Ansible, Terraform, scripts, and more +- Core automation features available in community edition (clustering is Enterprise-only) +- Requires Java (JVM overhead) + +--- + +### 2.4 StackStorm (ST2) + +**What:** An event-driven automation platform with powerful workflow orchestration. 
+ +| Category | Details | +|----------|--------| +| License | Apache 2.0 (Fully Free) | +| Maintained By | StackStorm / Community | +| Deployment | Docker, Kubernetes, packages | +| UI | Web UI (st2web) | +| RBAC | Yes (with enterprise features in open-source) | +| API | Full REST API | +| Job Scheduling | Yes + Event-driven triggers | +| Credential Vault | Datastore with encryption | +| Notifications | Slack, Email, Webhook, ChatOps (native) | +| Audit/Logging | Full execution history and audit | +| Scalability | Horizontal (microservices architecture) | +| Community | Active (GitHub: 6k+ stars) | + +**Security Level: HIGH** +- RBAC with fine-grained permissions +- Encrypted datastore for secrets +- LDAP/PAM authentication +- API key + token-based auth +- Audit trail +- HTTPS enforcement +- HashiCorp Vault integration + +**Production Readiness: 7.5/10** +- Excellent for event-driven automation +- More complex to set up than AWX +- Strong ChatOps integration +- Steeper learning curve + +--- + +### 2.5 Foreman + Ansible Plugin + +**What:** A complete lifecycle management tool with native Ansible integration. 
+ +| Category | Details | +|----------|--------| +| License | GPL v3 (Fully Free) | +| Maintained By | Red Hat / Community | +| Deployment | RPM/DEB packages, containerized | +| UI | Full web dashboard | +| RBAC | Yes (roles and permissions) | +| API | Full REST API | +| Job Scheduling | Yes (via Remote Execution plugin) | +| Credential Vault | Smart Proxy-based | +| Notifications | Email, Webhook | +| Audit/Logging | Full audit trail | +| Scalability | Horizontal (Smart Proxies) | +| Community | Mature (10+ years) | + +**Security Level: HIGH** +- Fine-grained RBAC +- LDAP/IPA/AD authentication +- Full audit logging +- SSL/TLS everywhere +- Smart Proxy for distributed security +- Puppet CA integration + +**Production Readiness: 8/10** +- Very mature for provisioning + configuration management +- Heavier to install and maintain +- Best when you also need provisioning/lifecycle management + +--- + +## 3. Security Comparison Matrix + +| Feature | AWX | Semaphore | Rundeck | StackStorm | Foreman | +|---------|-----|-----------|---------|------------|--------| +| Credential Encryption | AES-256 | Yes | Yes | Yes | Yes | +| RBAC | Full | Team-based | ACL-based | Full | Full | +| LDAP/AD Auth | Yes | Yes | Yes | Yes | Yes | +| SAML/SSO | Yes | No | Enterprise | No | Yes | +| OAuth2 | Yes | No | Yes | No | Yes | +| Audit Trail | Full | Basic | Full | Full | Full | +| Vault Integration | Yes | No | Yes (plugins) | Yes | Limited | +| API Auth (Token) | Yes | Yes | Yes | Yes | Yes | +| Container Isolation | Yes | No | Plugin | Yes | No | +| CVE History | Low | Very Low | Low | Low | Medium | +| Compliance Ready | Yes | Limited | Yes | Yes | Yes | + +### Security Rating Summary + +| Tool | Security Score | Notes | +|------|---------------|-------| +| **AWX** | 9/10 | Enterprise-grade security, same as Tower | +| **Rundeck** | 8.5/10 | Mature security model, ACL-based | +| **StackStorm** | 8/10 | Strong but complex to configure | +| **Foreman** | 8/10 | Excellent with IPA/LDAP 
integration | +| **Semaphore** | 7/10 | Good for size, lacks advanced SSO | + +--- + +## 4. Final Recommendation for Production + +### PRIMARY RECOMMENDATION: AWX + +**Why AWX is the best choice for your production infrastructure:** + +1. **Identical to Ansible Tower** - Same codebase, same features, zero migration cost if you ever move to Tower +2. **Highest security** - AES-256 encryption, full RBAC, SAML/SSO, Vault integration +3. **Kubernetes-native** - Deploy via AWX Operator on K8s/OpenShift (aligns with your DO180 containerization path) +4. **Full REST API** - Integrate with CI/CD pipelines, monitoring, and other tools +5. **Active community** - Rapid bug fixes and security patches +6. **Credential management** - Secure handling of SSH keys, cloud credentials, vault passwords +7. **Job isolation** - Each playbook runs in an isolated container +8. **Scalable** - Horizontal scaling on Kubernetes + +### SECONDARY RECOMMENDATION: Semaphore UI + +**When to choose Semaphore instead:** + +- Small team (< 10 people) +- Fewer than 50 managed nodes +- Need simplicity over features +- Limited Kubernetes expertise +- Want minimal maintenance overhead + +### TERTIARY RECOMMENDATION: Rundeck + +**When to choose Rundeck instead:** + +- Multi-tool environment (not just Ansible, also Terraform, scripts, etc.) +- Need mature, battle-tested solution +- Java/JVM is acceptable in your stack +- Need fine-grained ACL policies + +--- + +## 5. 
Recommended AWX Deployment Architecture (Production) + +``` + +-------------------+ + | Load Balancer | + | (HAProxy/Nginx) | + +--------+----------+ + | + +--------------+--------------+ + | | + +--------v--------+ +---------v-------+ + | AWX Web Pod | | AWX Web Pod | + | (Django + UI) | | (Django + UI) | + +--------+---------+ +--------+--------+ + | | + +--------v-----------------------------v--------+ + | AWX Task Pods | + | (Ansible Job Runners) | + | [Container-isolated execution] | + +------------------------+----------------------+ + | + +---------------+---------------+ + | | | + +--------v---+ +--------v---+ +--------v---+ + | PostgreSQL | | Redis | | Receptor | + | (Database) | | (Cache/ | | (Mesh for | + | | | Queue) | | remote) | + +-------------+ +------------+ +------------+ +``` + +### Minimum Hardware Requirements (Production) + +| Component | CPU | RAM | Storage | +|-----------|-----|-----|--------| +| AWX Web (x2) | 2 vCPU | 4 GB | 20 GB | +| AWX Task (x2) | 4 vCPU | 8 GB | 40 GB | +| PostgreSQL | 2 vCPU | 4 GB | 100 GB | +| Redis | 1 vCPU | 2 GB | 10 GB | + +--- + +## 6. Quick Start - AWX on Kubernetes + +```bash +# 1. Install AWX Operator +kubectl apply -f https://raw.githubusercontent.com/ansible/awx-operator/main/deploy/awx-operator.yaml + +# 2. 
# ---------------------------------------------------------------------------
# NOTE(review): the text below preceded this script on the same extracted
# lines of the patch. It is truncated residue from earlier files in the patch
# (end of a Markdown quick-start snippet, then the tail of a backup manifest
# -- presumably manifests/05-backup-cronjob.yaml; confirm). Its beginning is
# missing, so it is preserved verbatim as comments rather than rewritten.
#
#   Create AWX instance
#   +cat < /backups/awx_backup_${TIMESTAMP}.dump
#   + # Keep only last 7 days of backups
#   + find /backups -name "awx_backup_*.dump" -mtime +7 -delete
#   + volumeMounts:
#   + - name: backup-storage
#   + mountPath: /backups
#   + volumes:
#   + - name: backup-storage
#   + persistentVolumeClaim:
#   + claimName: awx-backup-pvc
#   + restartPolicy: OnFailure
#   +---
#   +apiVersion: v1
#   +kind: PersistentVolumeClaim
#   +metadata:
#   + name: awx-backup-pvc
#   + namespace: awx
#   +spec:
#   + accessModes:
#   + - ReadWriteOnce
#   + resources:
#   + requests:
#   + storage: 50Gi
#   + storageClassName: local-path
#
# Patch header for the script that follows:
#   diff --git a/awx-deployment/scripts/deploy-awx.sh b/awx-deployment/scripts/deploy-awx.sh
#   new file mode 100644, index 0000000000..49f7b45b45, @@ -0,0 +1,80 @@
# ---------------------------------------------------------------------------

#!/bin/bash
# AWX Production Deployment Script
# Deploys AWX on Kubernetes for managing 150 servers
#
# Usage: ./deploy-awx.sh
#
# Requires kubectl on PATH with access to the target cluster, and the
# manifests directory at ../manifests relative to this script.
#
# Exit status: 0 on success; non-zero when a prerequisite is missing, any
# kubectl step fails, or the AWX web pod is not Running within the wait window.

set -euo pipefail

MANIFESTS_DIR="$(dirname -- "$0")/../manifests"
readonly MANIFESTS_DIR

# Readiness polling: WAIT_ATTEMPTS polls, WAIT_INTERVAL seconds apart.
readonly WAIT_ATTEMPTS=60
readonly WAIT_INTERVAL=10

# Print an error message to stderr and stop the deployment.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

echo "=== AWX Production Deployment ==="
echo "Target: 150 server infrastructure"
echo ""

# Check prerequisites
if ! command -v kubectl &> /dev/null; then
  echo "ERROR: kubectl not found. Install Kubernetes first." >&2
  echo "Run: ./install-k3s.sh" >&2
  exit 1
fi

# Verify cluster is ready
if ! kubectl cluster-info &> /dev/null; then
  die "Cannot connect to Kubernetes cluster."
fi

# Verify every manifest up front so a missing file cannot leave the cluster
# half-deployed.
for manifest in \
  00-namespace.yaml \
  01-awx-operator.yaml \
  02-postgres-pvc.yaml \
  03-awx-production.yaml \
  05-backup-cronjob.yaml; do
  [[ -f "${MANIFESTS_DIR}/${manifest}" ]] \
    || die "Manifest not found: ${MANIFESTS_DIR}/${manifest}"
done

echo "[1/5] Creating namespace..."
kubectl apply -f "${MANIFESTS_DIR}/00-namespace.yaml"

echo "[2/5] Deploying AWX Operator..."
kubectl apply -f "${MANIFESTS_DIR}/01-awx-operator.yaml"

echo "Waiting for AWX Operator to be ready..."
kubectl wait --namespace awx \
  --for=condition=available deployment/awx-operator-controller-manager \
  --timeout=300s

echo "[3/5] Creating persistent storage..."
kubectl apply -f "${MANIFESTS_DIR}/02-postgres-pvc.yaml"

echo "[4/5] Deploying AWX instance..."
kubectl apply -f "${MANIFESTS_DIR}/03-awx-production.yaml"

echo "Waiting for AWX to be ready (this may take 5-10 minutes)..."
echo "You can monitor progress with: kubectl -n awx get pods -w"

# Wait for AWX web pod to be ready.
# The pod list is captured before matching: piping kubectl straight into
# `grep -q` lets grep exit early, and under pipefail the resulting SIGPIPE in
# kubectl would turn a successful match into a failed condition.
awx_ready=0
for ((attempt = 1; attempt <= WAIT_ATTEMPTS; attempt++)); do
  pod_list="$(kubectl -n awx get pods || true)"
  if grep -q "awx-production-web.*Running" <<< "${pod_list}"; then
    awx_ready=1
    break
  fi
  echo " Still deploying... (${attempt}/${WAIT_ATTEMPTS})"
  sleep "${WAIT_INTERVAL}"
done

# Do not report success when the wait window expired without a Running pod.
if ((awx_ready == 0)); then
  die "AWX web pod did not reach Running state in time. Inspect with: kubectl -n awx get pods"
fi

echo "[5/5] Setting up backup schedule..."
kubectl apply -f "${MANIFESTS_DIR}/05-backup-cronjob.yaml"

echo ""
echo "=== Deployment Complete ==="
echo ""

# Get admin password.
# The lookup is best-effort: a missing secret must not abort the script (it
# would under errexit + pipefail if the pipeline were assigned directly).
ADMIN_PASSWORD=""
if encoded_password="$(kubectl -n awx get secret awx-production-admin-password \
  -o jsonpath="{.data.password}" 2> /dev/null)"; then
  ADMIN_PASSWORD="$(printf '%s' "${encoded_password}" \
    | base64 --decode 2> /dev/null || true)"
fi

# NOTE(review): this prints the admin password to the terminal (and to any
# log capturing stdout). Kept for compatibility; consider printing only the
# kubectl command that retrieves it.
if [[ -n "$ADMIN_PASSWORD" ]]; then
  echo "AWX Admin Credentials:"
  echo " Username: admin"
  echo " Password: ${ADMIN_PASSWORD}"
  echo ""
fi

echo "AWX URL: https://awx.yourdomain.com"
echo ""
echo "Next steps:"
echo " 1. Configure TLS: Update manifests/04-ingress-tls.yaml with your certificates"
echo " 2. Add your 150 servers to the inventory"
echo " 3. Configure LDAP authentication"
echo " 4. Set up credential vaults"
echo " 5. Import playbooks from your Git repositories"
# ---------------------------------------------------------------------------
# NOTE(review): in the extracted patch this line began with the tail of the
# previous script's final echo:
#   Import playbooks from your Git repositories"
# followed by the patch header for this script:
#   diff --git a/awx-deployment/scripts/health-check.sh b/awx-deployment/scripts/health-check.sh
#   new file mode 100644, index 0000000000..3476349e40, @@ -0,0 +1,63 @@
# ---------------------------------------------------------------------------

#!/bin/bash
# AWX Production Health Check Script
# Run this periodically to verify AWX is healthy
#
# Prints pod, storage, API, task-log and resource-usage sections for the
# "awx" namespace, then a summary line with the number of issues found.
#
# Environment:
#   AWX_HEALTH_STRICT  set to 1 to exit with status 1 when any issue was
#                      detected. Default 0: exit 0 after reporting.
#
# The script still aborts (non-zero) if listing pods or PVCs fails outright.

set -euo pipefail

echo "=== AWX Production Health Check ==="
echo "Date: $(date)"
echo ""

ERRORS=0

# Check pods
echo "[Pods Status]"
kubectl -n awx get pods -o wide
echo ""

# Check if all pods are running.
# awk does the counting because it exits 0 even when nothing is counted;
# `grep -v ... | wc -l` exits 1 when every pod is healthy, which under
# pipefail + errexit terminated the script in exactly the healthy case.
NOT_RUNNING=$(kubectl -n awx get pods --no-headers \
  | awk '!/Running|Completed/ { n++ } END { print n + 0 }')
if ((NOT_RUNNING > 0)); then
  echo "WARNING: ${NOT_RUNNING} pod(s) not in Running state"
  ERRORS=$((ERRORS + 1))
fi

# Check PVC usage
echo "[Storage Status]"
kubectl -n awx get pvc
echo ""

# Check AWX web endpoint
echo "[Web Endpoint]"
# `|| true`: a missing service must reach the "not found" branch below
# instead of aborting the script under errexit.
AWX_SVC=$(kubectl -n awx get svc awx-production-service \
  -o jsonpath='{.spec.clusterIP}' 2> /dev/null || true)
if [[ -n "$AWX_SVC" ]]; then
  # curl already prints 000 via -w when the request fails, so the failure
  # path must not append a second "000". --max-time keeps a dead endpoint
  # from hanging the check.
  HTTP_CODE=$(curl -sk --max-time 10 -o /dev/null -w "%{http_code}" \
    "http://${AWX_SVC}:80/api/v2/ping/" 2> /dev/null || true)
  HTTP_CODE=${HTTP_CODE:-000}
  if [[ "$HTTP_CODE" == "200" ]]; then
    echo " API Status: HEALTHY (HTTP ${HTTP_CODE})"
  else
    echo " API Status: UNHEALTHY (HTTP ${HTTP_CODE})"
    ERRORS=$((ERRORS + 1))
  fi
else
  echo " AWX service not found"
  ERRORS=$((ERRORS + 1))
fi
echo ""

# Check recent jobs: show up to 5 error/failure lines from the last 20 log
# lines of the awx-task pods. A failed log fetch is reported as such rather
# than as "No recent errors".
echo "[Recent Job Status]"
if task_logs=$(kubectl -n awx logs -l app.kubernetes.io/component=awx-task \
  --tail=20 2> /dev/null); then
  recent_errors=$(grep -i "error\|fail" <<< "${task_logs}" | tail -5 || true)
  if [[ -n "${recent_errors}" ]]; then
    printf '%s\n' "${recent_errors}"
  else
    echo " No recent errors"
  fi
else
  echo " Unable to read awx-task logs"
fi
echo ""

# Check resource usage
echo "[Resource Usage]"
kubectl -n awx top pods 2> /dev/null || echo " Metrics server not available"
echo ""

# Summary
echo "=== Summary ==="
if ((ERRORS == 0)); then
  echo "Status: ALL HEALTHY"
else
  echo "Status: ${ERRORS} ISSUE(S) DETECTED"
  # Opt-in failure status so schedulers/monitors can alert on issues.
  if [[ "${AWX_HEALTH_STRICT:-0}" == "1" ]]; then
    exit 1
  fi
fi

# NOTE(review): the extracted patch continues on this line with the start of
# the next file header: diff --git a/awx-deployment/scripts/install-k3s.sh
# ---------------------------------------------------------------------------
# NOTE(review): patch header fragment that preceded this script on the line:
#   b/awx-deployment/scripts/install-k3s.sh
#   new file mode 100644, index 0000000000..02e617760e, @@ -0,0 +1,48 @@
# ---------------------------------------------------------------------------

#!/bin/bash
# AWX Production - K3s Installation Script
# This script installs a lightweight Kubernetes cluster for AWX
#
# Usage: sudo ./install-k3s.sh
#
# Steps: root check, RAM check (interactive confirmation below the
# recommended minimum), K3s install with traefik and servicelb disabled,
# wait for the node, install the NGINX ingress controller, wait for it.
#
# Exit status: 0 on success; non-zero if not root, the user declines to
# continue, a download/install step fails, or a wait times out.

set -euo pipefail

readonly MIN_MEM_GB=8
readonly K3S_INSTALL_URL="https://get.k3s.io"
readonly K3S_KUBECONFIG="/etc/rancher/k3s/k3s.yaml"
# NOTE(review): this tracks the moving `main` branch; consider pinning a
# released controller tag for reproducible installs.
readonly INGRESS_MANIFEST_URL="https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/cloud/deploy.yaml"

# Print an error message to stderr and stop the installation.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Poll until `kubectl get <args>` lists at least one object.
# `kubectl wait` can fail immediately with "no matching resources found"
# when the objects have not been created yet, so callers poll first.
# Arguments: $1 - number of attempts (2 s apart); rest - kubectl get args
# Returns:   0 once something is listed, 1 if attempts are exhausted
#######################################
wait_until_exists() {
  local attempts=$1
  shift
  local i found
  for ((i = 1; i <= attempts; i++)); do
    found="$(kubectl get "$@" -o name 2> /dev/null || true)"
    if [[ -n "${found}" ]]; then
      return 0
    fi
    sleep 2
  done
  return 1
}

echo "=== AWX Production: Installing K3s ==="

# Check if running as root
if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root" >&2
  exit 1
fi

# System requirements check.
# Read MiB and round to the nearest GiB: `free -g` truncates, so a host sold
# as 8 GB (slightly under 8 GiB usable) was reported as 7 and always warned.
# LC_ALL=C keeps the "Mem:" label stable across locales.
total_mem_mib=$(LC_ALL=C free -m | awk '/^Mem:/{print $2}')
TOTAL_MEM=$(((total_mem_mib + 512) / 1024))
if ((TOTAL_MEM < MIN_MEM_GB)); then
  echo "WARNING: Minimum 8 GB RAM recommended. Found: ${TOTAL_MEM} GB"
  REPLY=""
  # read fails on EOF / non-interactive stdin; treat that as "no" with a
  # message instead of letting errexit kill the script silently.
  read -p "Continue anyway? (y/n) " -n 1 -r || true
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborted: insufficient memory and no confirmation given." >&2
    exit 1
  fi
fi

# Install K3s.
# Download the installer to a temp file first so a truncated or failed
# download is never fed to a root shell.
# NOTE(review): --write-kubeconfig-mode 644 makes the admin kubeconfig
# world-readable on this host; confirm that is acceptable.
installer="$(mktemp)"
trap 'rm -f -- "${installer}"' EXIT
curl -sfL -o "${installer}" "${K3S_INSTALL_URL}"
sh "${installer}" \
  --write-kubeconfig-mode 644 \
  --disable traefik \
  --disable servicelb

# Make sure the kubectl calls below talk to the cluster just installed, even
# if the caller's environment points at another kubeconfig.
export KUBECONFIG="${K3S_KUBECONFIG}"

# Wait for K3s to be ready
echo "Waiting for K3s to be ready..."
wait_until_exists 60 nodes \
  || die "K3s node did not register in time"
kubectl wait --for=condition=Ready nodes --all --timeout=120s

# Install NGINX Ingress Controller
# NOTE(review): servicelb is disabled above while this is the "cloud"
# provider manifest; confirm how the controller Service is expected to get
# an external address in this setup.
kubectl apply -f "${INGRESS_MANIFEST_URL}"

echo "Waiting for ingress controller..."
wait_until_exists 60 pods \
  --namespace ingress-nginx \
  --selector=app.kubernetes.io/component=controller \
  || die "ingress controller pod was not created in time"
kubectl wait --namespace ingress-nginx \
  --for=condition=ready pod \
  --selector=app.kubernetes.io/component=controller \
  --timeout=120s

echo ""
echo "=== K3s Installation Complete ==="
echo "Kubeconfig: /etc/rancher/k3s/k3s.yaml"
echo "Next step: Run ./deploy-awx.sh"