# NOTE(review): this file was committed with all structure collapsed onto a few
# lines (unparseable YAML); reconstructed with standard 2-space indentation.
# SECURITY: the original file embedded a live Hugging Face token in plaintext.
# That token must be revoked at https://huggingface.co/settings/tokens.
# It is now read from the host environment / .env file via ${HUGGING_FACE_HUB_TOKEN}.
services:
  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical: 0 & 7 | Container: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host  # Replaces shm_size to avoid shared memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # Corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 32768
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]

  # --- TTS ---
  # Physical: 7 | Container: 0
  tts:
    build: ./swarm-control/indra-tts-server
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000"  # Explicitly match your host user UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to your cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]

  # ==========================================
  # TIER 1: COMMAND LOBE (GPUs 1 & 2)
  # ==========================================
  coder_next:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # Mapped from physical 1, 2
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3001:3001"
    command: >
      sglang serve
      --model-path Qwen/Qwen3-Coder-Next-FP8
      --tp 2
      --port 3001
      --host 0.0.0.0
      --hf-chat-template-name tool_use
      --mem-fraction-static 0.95
      --context-length 32768
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['1', '2']
              capabilities: [gpu]

  # ==========================================
  # TIER 2: SWARM LOBE (GPUs 3, 4, 5)
  # ==========================================
  swarm_router:
    image: lmsysorg/sgl-model-gateway:latest
    ports:
      - "4000:4000"
    command: >
      --port 4000
      --worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
      --policy cache_aware
    depends_on:
      - worker_1
      - worker_2
      - worker_3
    networks:
      - default

  worker_1:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 3
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3002:3002"
    command: >
      sglang serve
      --model-path Qwen/Qwen3.6-35B-A3B-FP8
      --tp 1
      --port 3002
      --host 0.0.0.0
      --mem-fraction-static 0.9
      --context-length 131072
      --kv-cache-dtype fp8_e4m3
      --allow-auto-truncate
      --max-running-requests 256
      --chunked-prefill-size 2048
      --schedule-policy lpm
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --hf-chat-template-name tool_use
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3']
              capabilities: [gpu]

  worker_2:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 4
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3003:3003"
    command: >
      sglang serve
      --model-path Qwen/Qwen3.6-35B-A3B-FP8
      --tp 1
      --port 3003
      --host 0.0.0.0
      --mem-fraction-static 0.9
      --context-length 131072
      --kv-cache-dtype fp8_e4m3
      --allow-auto-truncate
      --max-running-requests 256
      --chunked-prefill-size 2048
      --schedule-policy lpm
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --hf-chat-template-name tool_use
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['4']
              capabilities: [gpu]

  worker_3:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 5
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3004:3004"
    command: >
      sglang serve
      --model-path Qwen/Qwen3.6-35B-A3B-FP8
      --tp 1
      --port 3004
      --host 0.0.0.0
      --mem-fraction-static 0.9
      --context-length 131072
      --kv-cache-dtype fp8_e4m3
      --allow-auto-truncate
      --max-running-requests 256
      --chunked-prefill-size 2048
      --schedule-policy lpm
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --hf-chat-template-name tool_use
      --trust-remote-code
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['5']
              capabilities: [gpu]

  # ==========================================
  # TIER 3: MEMORY LOBE (GPU 6)
  # ==========================================
  embeddings:
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    environment:
      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      # TEI uses /data as its internal cache directory
      - /mnt/nvme3n1/swarm/huggingface_cache:/data
    ports:
      - "8000:8000"
    command: >
      --model-id jinaai/jina-embeddings-v2-base-code
      --max-client-batch-size 1024
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6']
              capabilities: [gpu]

# --- Networks ---
networks:
  default:
    name: swarm-network