-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
136 lines (130 loc) · 4.36 KB
/
docker-compose.yml
File metadata and controls
136 lines (130 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# LandmarkDiff Docker Compose configuration
#
# Services:
# app - Main LandmarkDiff Gradio demo (CPU-only, lightweight)
# gpu - GPU-accelerated demo using runtime CUDA image (recommended)
# app-gpu - GPU-accelerated demo using devel CUDA image
# docs - Sphinx documentation builder
# train - Training service (GPU required, use with: docker compose run train)
#
# Usage:
# docker compose up app # CPU demo on :7860
# docker compose up gpu # GPU demo on :7861
# docker compose up app-gpu # GPU demo on :7860 (devel image)
# docker compose run docs # Build Sphinx docs into docs/_build/
# docker compose --profile training run train # Run training
#
# GPU setup: see docs/docker-gpu.md for prerequisites and troubleshooting.
---
services:
  # ── CPU app (default) ─────────────────────────────────────────
  # Runs the Gradio demo in TPS (CPU) mode. No GPU required.
  # Access at http://localhost:7860
  app:
    build:
      context: .
      dockerfile: Dockerfile.cpu
    ports:
      - "7860:7860"
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      # Named volume so downloaded model caches survive container rebuilds.
      - model-cache:/root/.cache
    environment:
      - LANDMARKDIFF_MODE=tps
    restart: unless-stopped
    healthcheck:
      # NOTE(review): assumes curl is installed in Dockerfile.cpu — confirm.
      test: ["CMD", "curl", "-f", "http://localhost:7860/"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period so startup (e.g. model loading) doesn't count as failure.
      start_period: 30s
# ── GPU app (runtime image) ───────────────────────────────────
# GPU-accelerated demo using the smaller runtime CUDA image.
# Requires nvidia-container-toolkit installed on the host.
# Access at http://localhost:7861
# See docs/docker-gpu.md for setup instructions.
gpu:
build:
context: .
dockerfile: Dockerfile.gpu
ports:
- "7861:7860"
volumes:
- ./data:/app/data
- ./checkpoints:/app/checkpoints
- ./models:/app/models
- model-cache:/root/.cache
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
- CUDA_VISIBLE_DEVICES=0
restart: unless-stopped
# ── GPU app (devel image) ────────────────────────────────────
# Full GPU-accelerated demo with ControlNet inference.
# Uses the larger devel CUDA image (needed if compiling extensions).
# Requires nvidia-container-toolkit installed on the host.
# Access at http://localhost:7860
app-gpu:
build:
context: .
dockerfile: Dockerfile
ports:
- "7860:7860"
volumes:
- ./data:/app/data
- ./checkpoints:/app/checkpoints
- model-cache:/root/.cache
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
- CUDA_VISIBLE_DEVICES=0
restart: unless-stopped
# ── Docs builder ──────────────────────────────────────────────
# Builds Sphinx documentation into docs/_build/html/.
# Usage: docker compose run docs
docs:
image: python:3.11-slim
working_dir: /app
volumes:
- .:/app:ro
- ./docs/_build:/app/docs/_build
command: >
sh -c "pip install --quiet -r docs/requirements.txt &&
sphinx-build -b html docs/ docs/_build/html"
# ── Training (optional) ──────────────────────────────────────
# Runs ControlNet fine-tuning. Requires GPU.
# Usage: docker compose --profile training run train
train:
build:
context: .
dockerfile: Dockerfile
volumes:
- ./data:/app/data
- ./checkpoints:/app/checkpoints
- model-cache:/root/.cache
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
command: >
python scripts/train_controlnet.py
--data_dir /app/data/synthetic_pairs
--output_dir /app/checkpoints
--num_train_steps 10000
profiles:
- training
# ── Named volumes ─────────────────────────────────────────────
volumes:
  # Shared cache mounted at /root/.cache in every service; persists
  # downloaded models across container rebuilds. Explicit empty mapping
  # (instead of a bare null value) per YAML best practice.
  model-cache: {}