services:

  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical: 0 & 7 | Container: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host # Replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      # Read the token from the host environment or an .env file; never commit real tokens.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 65536
      --kv-cache-dtype fp8_e4m3
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]
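
  # SGLang serves an OpenAI-compatible API, so the persona endpoint can be
  # smoke-tested from the host once the server reports ready (a sketch; the
  # model name must match --model-path above):
  #
  #   curl http://localhost:3000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "google/gemma-4-26b-a4b-it",
  #          "messages": [{"role": "user", "content": "ping"}]}'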

  # --- TTS ---
  tts:
    build:
      context: . # This allows the build to see the Qwen3-TTS folder at the root
      dockerfile: ./swarm-control/indra-tts-server/Dockerfile
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS # Keep this so the app finds the local code
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS # Keep this for live code edits
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]
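
  # depends_on above only orders container startup; if the TTS app exposes a
  # health endpoint, a healthcheck makes that ordering meaningful. This is a
  # sketch only: the /health path is hypothetical, and it assumes curl is
  # installed in the image -- substitute whatever indra-tts-server actually serves:
  #
  #   healthcheck:
  #     test: ["CMD", "curl", "-f", "http://localhost:8002/health"]
  #     interval: 10s
  #     timeout: 5s
  #     retries: 12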

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000" # Explicitly match your host user UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to your cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
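
  # Speaches mirrors the OpenAI audio API, so transcription can be smoke-tested
  # through the remapped host port (a sketch; sample.wav is a placeholder file):
  #
  #   curl http://localhost:8005/v1/audio/transcriptions \
  #     -F "file=@sample.wav" \
  #     -F "model=deepdml/faster-whisper-large-v3-turbo-ct2"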

  # ==========================================
  # TIER 1: COMMAND LOBE (GPUs 1 & 2)
  # ==========================================
  coder_next:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 1, 2
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3001:3001"
    command: >
      python3 -m sglang.launch_server
      --model-path Qwen/Qwen3-Coder-Next-FP8
      --tp 2
      --port 3001
      --host 0.0.0.0
      --hf-chat-template-name tool_use
      --mem-fraction-static 0.95
      --context-length 131072
      --trust-remote-code
      --tool-call-parser qwen3_coder
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['1', '2']
              capabilities: [gpu]
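
  # The CUDA_VISIBLE_DEVICES comments above assume Docker re-indexes the
  # reserved physical GPUs from 0 inside each container. That can be verified
  # per service once it is up, e.g.:
  #
  #   docker compose exec coder_next nvidia-smi --list-gpus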

  # ==========================================
  # TIER 2: HETEROGENEOUS MODELS (GPUs 3-6; GPU 6 is shared with Tier 3)
  # ==========================================
  qwen_27b:
    image: lmsysorg/sglang:dev
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 3, 4
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3002:3002"
    command: >
      python3 -m sglang.launch_server
      --model-path Qwen/Qwen3.6-27B-FP8
      --tp 2
      --port 3002
      --host 0.0.0.0
      --mem-fraction-static 0.85
      --context-length 131072
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --trust-remote-code
    cap_add:
      - SYS_NICE
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3', '4']
              capabilities: [gpu]
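
  # SGLang exposes simple status routes, useful before wiring this tier into a
  # router (a sketch against the ports above):
  #
  #   curl http://localhost:3002/health          # liveness
  #   curl http://localhost:3002/get_model_info  # confirms the loaded model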
  gemma_31b:
    build: ./swarm-control/persona
    ipc: host
    cap_add:
      - SYS_NICE
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 5, 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3003:3003"
    command: >
      python3 -m sglang.launch_server
      --model-path RedHatAI/gemma-4-31B-it-FP8-Dynamic
      --tokenizer-path google/gemma-4-31B-it
      --tp 2
      --port 3003
      --host 0.0.0.0
      --mem-fraction-static 0.80
      --context-length 131072
      --kv-cache-dtype fp8_e4m3
      --trust-remote-code
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['5', '6']
              capabilities: [gpu]
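
  # gemma_31b reserves physical GPUs 5 & 6 while the embeddings service below
  # reserves GPU 6, so the two share a card; the 0.80 mem-fraction above
  # presumably leaves VRAM headroom for that. Watch the split from the host:
  #
  #   nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv -l 5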

  # ==========================================
  # TIER 3: MEMORY (GPU 6, shared with gemma_31b)
  # ==========================================
  embeddings:
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    environment:
      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/data
    ports:
      - "8000:8000"
    command: >
      --model-id google/embeddinggemma-300m
      --max-client-batch-size 1024
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6']
              capabilities: [gpu]
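
  # Text Embeddings Inference answers on its /embed route, so the memory tier
  # can be verified with:
  #
  #   curl http://localhost:8000/embed \
  #     -H "Content-Type: application/json" \
  #     -d '{"inputs": "hello swarm"}'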

# --- Networks ---
networks:
  default:
    name: swarm-network
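
# Because the default network has a fixed name, sibling compose projects can
# join the swarm by declaring it external in their own compose files:
#
#   networks:
#     default:
#       name: swarm-network
#       external: true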