---
# Docker Compose stack for the model "swarm".
#
# GPU convention: each service pins *physical* GPUs via deploy.…device_ids;
# inside the container the visible devices are re-indexed from 0, which is why
# CUDA_VISIBLE_DEVICES uses container-local indices (0, 1) rather than the
# physical ones noted in the comments.
#
# SECURITY NOTE(review): a real Hugging Face token was previously hard-coded
# here in plain text. It has been moved to environment interpolation — define
# HUGGING_FACE_HUB_TOKEN in a .env file (git-ignored) or export it before
# `docker compose up`. The previously committed token must be revoked.

services:
  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical GPUs: 0 & 7 | Container: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host  # replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in .env}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 32768
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]

  # --- TTS ---
  # Physical GPU: 7 | Container: 0
  tts:
    build: ./swarm-control/indra-tts-server
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000"  # explicitly match the host user's UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to the cache on
      # first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder (container runs as /home/ubuntu).
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]

  # ==========================================
  # TIER 1: COMMAND LOBE (GPUs 1 & 2)
  # ==========================================
  coder_next:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # mapped from physical 1, 2
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in .env}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3001:3001"
    # NOTE(review): this service uses the `sglang serve` CLI entry point while
    # the others use `python3 -m sglang.launch_server` — confirm both are
    # available in the pinned image versions.
    command: >
      sglang serve
      --model-path Qwen/Qwen3-Coder-Next-FP8
      --tp 2
      --port 3001
      --host 0.0.0.0
      --hf-chat-template-name tool_use
      --mem-fraction-static 0.95
      --context-length 32768
      --trust-remote-code
      --tool-call-parser qwen3_coder
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['1', '2']
              capabilities: [gpu]

  # ==========================================
  # TIER 2: HETEROGENEOUS MODELS (GPUs 3, 4, 5)
  # ==========================================
  qwen_27b:
    image: lmsysorg/sglang:dev
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # mapped from physical 3, 4
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in .env}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3002:3002"
    command: >
      python3 -m sglang.launch_server
      --model-path Qwen/Qwen3.6-27B-FP8
      --tp 2
      --port 3002
      --host 0.0.0.0
      --mem-fraction-static 0.85
      --context-length 131072
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --trust-remote-code
    cap_add:
      - SYS_NICE
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3', '4']
              capabilities: [gpu]

  gemma_31b:
    build: ./swarm-control/persona
    ipc: host
    cap_add:
      - SYS_NICE
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # mapped from physical 5, 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in .env}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3003:3003"
    command: >
      python3 -m sglang.launch_server
      --model-path RedHatAI/gemma-4-31B-it-FP8-Dynamic
      --tokenizer-path google/gemma-4-31B-it
      --tp 2
      --port 3003
      --host 0.0.0.0
      --mem-fraction-static 0.85
      --context-length 65536
      --kv-cache-dtype fp8_e4m3
      --trust-remote-code
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['5', '6']
              capabilities: [gpu]

  # ==========================================
  # TIER 3: MEMORY (GPU 6)
  # ==========================================
  embeddings:
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    environment:
      - CUDA_VISIBLE_DEVICES=0  # mapped from physical 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in .env}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/data
    ports:
      - "8000:8000"
    command: >
      --model-id jinaai/jina-embeddings-v2-base-code
      --max-client-batch-size 1024
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6']
              capabilities: [gpu]

# --- Networks ---
networks:
  default:
    name: swarm-network