Initial commit: Productionized Swarm with Docker support
This commit is contained in:
92
docker-compose.yml
Normal file
92
docker-compose.yml
Normal file
@@ -0,0 +1,92 @@
|
||||
services:
  # --- Persona (Gemma-4-26B-A4B-it) ---
  # Physical GPUs: 0 & 7 | Container sees them as: 0 & 1
  persona:
    build: ./swarm-control/persona
    ipc: host  # Replaces shm_size to avoid shared-memory bottlenecks
    ulimits:
      memlock:
        soft: -1
        hard: -1
    environment:
      - CUDA_VISIBLE_DEVICES=0,1  # Corrected for container re-indexing
      - NCCL_P2P_DISABLE=0
      # SECURITY: a real token was previously committed here in plaintext —
      # revoke it at https://huggingface.co/settings/tokens. Supply the token
      # via the host environment or a git-ignored .env file; Compose
      # interpolates ${...} at up-time.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
    ports:
      - "3000:3000"
    # Folded scalar (>) joins these lines into a single command string.
    command: >
      python3 -m sglang.launch_server
      --model-path google/gemma-4-26b-a4b-it
      --tp 2
      --port 3000
      --host 0.0.0.0
      --attention-backend triton
      --mem-fraction-static 0.8
      --max-running-requests 128
      --chunked-prefill-size 4096
      --context-length 32768
      --trust-remote-code
      --enable-piecewise-cuda-graph
      --schedule-policy lpm
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '7']
              capabilities: [gpu]

  # --- TTS ---
  # Physical GPU: 7 | Container sees it as: 0
  tts:
    build: ./swarm-control/indra-tts-server
    image: swarm-tts
    depends_on:
      - persona
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - PYTHONPATH=/app:/app/Qwen3-TTS
      - NVIDIA_DRIVER_CAPABILITIES=all
    volumes:
      - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
    ports:
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['7']
              capabilities: [gpu]

  # --- STT ---
  stt:
    image: ghcr.io/speaches-ai/speaches:latest-cuda
    depends_on:
      - persona
    user: "1000:1000"  # Explicitly match your host user UID
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # Use the full HF ID. Speaches will auto-download this to your cache on first boot.
      - PRELOAD_MODELS=deepdml/faster-whisper-large-v3-turbo-ct2
    volumes:
      # Map to the base cache folder
      - /mnt/nvme3n1/swarm/huggingface_cache:/home/ubuntu/.cache/huggingface
    ports:
      - "8005:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]

networks:
  default:
    name: swarm-network
|
||||
Reference in New Issue
Block a user