Initial commit: Productionized Swarm with Docker support
This commit is contained in:
92
docker-compose.yml
Normal file
92
docker-compose.yml
Normal file
@@ -0,0 +1,92 @@
|
||||
services:
  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical GPUs: 0 & 7 | Container sees them as: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host  # Replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # Corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      # SECURITY: a real token was previously committed here in plaintext —
      # revoke it at https://huggingface.co/settings/tokens. Supply the token
      # via the host environment or a git-ignored .env file; Compose
      # interpolates ${...} at up-time.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    # Folded scalar (>) joins these lines into a single command string.
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 32768
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]

  # --- TTS ---
  # Physical GPU: 7 | Container sees it as: 0
  tts:
    build: ./swarm-control/indra-tts-server
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000"  # Explicitly match your host user UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to your cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]

networks:
  default:
    name: swarm-network
|
||||
Reference in New Issue
Block a user