services:

  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical: 0 & 7 | Container: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host # Replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      # Read the token from the host environment or an .env file; never commit real tokens.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 65536
      --kv-cache-dtype fp8_e4m3
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]
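
  # SGLang serves an OpenAI-compatible API, so the persona endpoint can be
  # smoke-tested from the host once the server reports ready (a sketch; the
  # model name must match --model-path above):
  #
  #   curl http://localhost:3000/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "google/gemma-4-26b-a4b-it",
  #          "messages": [{"role": "user", "content": "ping"}]}'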

  # --- TTS ---
  tts:
    build:
      context: . # This allows the build to see the Qwen3-TTS folder at the root
      dockerfile: ./swarm-control/indra-tts-server/Dockerfile
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS # Keep this so the app finds the local code
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS # Keep this for live code edits
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]
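
  # depends_on above only orders container startup; if the TTS app exposes a
  # health endpoint, a healthcheck makes that ordering meaningful. This is a
  # sketch only: the /health path is hypothetical, and it assumes curl is
  # installed in the image -- substitute whatever indra-tts-server actually serves:
  #
  #   healthcheck:
  #     test: ["CMD", "curl", "-f", "http://localhost:8002/health"]
  #     interval: 10s
  #     timeout: 5s
  #     retries: 12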

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000" # Explicitly match your host user UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to your cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
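
  # Speaches mirrors the OpenAI audio API, so transcription can be smoke-tested
  # through the remapped host port (a sketch; sample.wav is a placeholder file):
  #
  #   curl http://localhost:8005/v1/audio/transcriptions \
  #     -F "file=@sample.wav" \
  #     -F "model=deepdml/faster-whisper-large-v3-turbo-ct2"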

  # ==========================================
  # TIER 1: COMMAND LOBE (GPUs 1 & 2)
  # ==========================================
  coder_next:
    image: lmsysorg/sglang:latest
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 1, 2
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3001:3001"
    command: >
      python3 -m sglang.launch_server
      --model-path Qwen/Qwen3-Coder-Next-FP8
      --tp 2
      --port 3001
      --host 0.0.0.0
      --hf-chat-template-name tool_use
      --mem-fraction-static 0.95
      --context-length 131072
      --trust-remote-code
      --tool-call-parser qwen3_coder
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['1', '2']
              capabilities: [gpu]
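
  # The CUDA_VISIBLE_DEVICES comments above assume Docker re-indexes the
  # reserved physical GPUs from 0 inside each container. That can be verified
  # per service once it is up, e.g.:
  #
  #   docker compose exec coder_next nvidia-smi --list-gpus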

  # ==========================================
  # TIER 2: HETEROGENEOUS MODELS (GPUs 3-6; GPU 6 is shared with Tier 3)
  # ==========================================
  qwen_27b:
    image: lmsysorg/sglang:dev
    ipc: host
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 3, 4
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3002:3002"
    command: >
      python3 -m sglang.launch_server
      --model-path Qwen/Qwen3.6-27B-FP8
      --tp 2
      --port 3002
      --host 0.0.0.0
      --mem-fraction-static 0.85
      --context-length 131072
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --trust-remote-code
    cap_add:
      - SYS_NICE
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3', '4']
              capabilities: [gpu]
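
  # SGLang exposes simple status routes, useful before wiring this tier into a
  # router (a sketch against the ports above):
  #
  #   curl http://localhost:3002/health          # liveness
  #   curl http://localhost:3002/get_model_info  # confirms the loaded model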
  gemma_31b:
    build: ./swarm-control/persona
    ipc: host
    cap_add:
      - SYS_NICE
    environment:
      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 5, 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3003:3003"
    command: >
      python3 -m sglang.launch_server
      --model-path RedHatAI/gemma-4-31B-it-FP8-Dynamic
      --tokenizer-path google/gemma-4-31B-it
      --tp 2
      --port 3003
      --host 0.0.0.0
      --mem-fraction-static 0.80
      --context-length 131072
      --kv-cache-dtype fp8_e4m3
      --trust-remote-code
      --tool-call-parser gemma4
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['5', '6']
              capabilities: [gpu]
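
  # gemma_31b reserves physical GPUs 5 & 6 while the embeddings service below
  # reserves GPU 6, so the two share a card; the 0.80 mem-fraction above
  # presumably leaves VRAM headroom for that. Watch the split from the host:
  #
  #   nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv -l 5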

  # ==========================================
  # TIER 3: MEMORY (GPU 6, shared with gemma_31b)
  # ==========================================
  embeddings:
    image: ghcr.io/huggingface/text-embeddings-inference:latest
    environment:
      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/data
    ports:
      - "8000:8000"
    command: >
      --model-id google/embeddinggemma-300m
      --max-client-batch-size 1024
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['6']
              capabilities: [gpu]
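
  # Text Embeddings Inference answers on its /embed route, so the memory tier
  # can be verified with:
  #
  #   curl http://localhost:8000/embed \
  #     -H "Content-Type: application/json" \
  #     -d '{"inputs": "hello swarm"}'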

# --- Networks ---
networks:
  default:
    name: swarm-network
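
# Because the default network has a fixed name, sibling compose projects can
# join the swarm by declaring it external in their own compose files:
#
#   networks:
#     default:
#       name: swarm-network
#       external: true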