# Docker Compose stack for the "swarm" inference cluster.
#
# GPU layout — the NVIDIA runtime re-indexes reserved devices inside each
# container, so CUDA_VISIBLE_DEVICES below refers to *container* indices:
#   persona: physical 0 & 7 -> container 0 & 1
#   tts:     physical 7     -> container 0 (deliberately shared with persona)
#   stt:     physical 0     -> container 0 (deliberately shared with persona)
services:
  # --- Persona (Gemma-4-26B-A4B-it) ---
  persona:
    build: ./swarm-control/persona
    ipc: host  # replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # container re-indexed view of physical 0 & 7
      - NCCL_P2P_DISABLE=0
      # SECURITY: never commit a real token to VCS. Export
      # HUGGING_FACE_HUB_TOKEN on the host (or put it in a git-ignored .env
      # file); Compose substitutes it here and fails fast if it is unset.
      # NOTE(review): the token previously hard-coded on this line is
      # compromised by having been committed — revoke it on huggingface.co.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the host environment}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"  # quoted: unquoted host:container pairs can hit YAML's sexagesimal trap
    # Folded scalar: runs as one long command line at container start.
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 32768
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']  # physical GPUs; quoted so they stay strings
              capabilities: [gpu]

  # --- TTS ---
  tts:
    build: ./swarm-control/indra-tts-server
    image: swarm-tts  # tag applied to the locally built image
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0  # container index 0 == physical GPU 7
      - PYTHONPATH=/app:/app/Qwen3-TTS
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']  # shares physical GPU 7 with persona — intentional
              capabilities: [gpu]

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000"  # match the host user's UID:GID so cache files stay writable
    environment:
      - CUDA_VISIBLE_DEVICES=0  # container index 0 == physical GPU 0
      # Full HF model ID; speaches auto-downloads it into the cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map the base cache folder; image runs as the 'ubuntu' (uid 1000) user,
      # so the container-side path lives under /home/ubuntu.
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"  # host 8005 -> container 8000 (speaches' internal port)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']  # shares physical GPU 0 with persona — intentional
              capabilities: [gpu]

networks:
  default:
    name: swarm-network