Initial commit: Productionized Swarm with Docker support

damith
2026-04-16 16:46:24 +10:00
commit c2e2e52ff3
39 changed files with 331 additions and 0 deletions

@@ -0,0 +1,37 @@
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
# Prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# 1. Install Python 3.12 (deadsnakes PPA) and the audio toolchain (SoX, ffmpeg, libsndfile)
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get update && apt-get install -y \
        python3.12 \
        python3.12-dev \
        curl \
        git \
        libsndfile1 \
        ffmpeg \
        sox \
        libsox-dev && \
    rm -rf /var/lib/apt/lists/*
# 2. Use the official bootstrap to install a clean Pip for 3.12
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
WORKDIR /app
# 3. Explicitly install BOTH torch and torchaudio from the cu124 index
RUN python3.12 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
RUN python3.12 -m pip install --no-cache-dir fastapi uvicorn numpy soundfile
# 4. Install the faster-qwen3-tts inference package
RUN python3.12 -m pip install --no-cache-dir faster-qwen3-tts
COPY tts-server.py .
EXPOSE 8002
CMD ["python3.12", "tts-server.py"]

@@ -0,0 +1,80 @@
import io
import os
import wave

import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel

from faster_qwen3_tts import FasterQwen3TTS

app = FastAPI(title="Indra TTS")

if not torch.cuda.is_available():
    raise RuntimeError("Mouth cannot find CUDA. Check nvidia-container-toolkit.")

print(f"Loading model on: {torch.cuda.get_device_name(0)}")

# Load the Base model for high-fidelity mimicry
model = FasterQwen3TTS.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device="cuda:0",  # With CUDA_VISIBLE_DEVICES=7 on the host, cuda:0 maps to GPU 7
    dtype=torch.bfloat16,
)

class TTSRequest(BaseModel):
    model: str = "tts-1"  # ignored by the backend; present to satisfy the modelix router
    input: str
    voice: str = "oni"
    response_format: str = "wav"
    seed: int = 42

@app.post("/v1/audio/speech")
async def generate_speech(request: TTSRequest):
    try:
        voice_file = f"{request.voice}.wav"
        base_path = "/mnt/nvme3n1/swarm/voice-samples"
        ref_path = os.path.join(base_path, voice_file)
        txt_path = os.path.splitext(ref_path)[0] + ".txt"

        ref_text = None
        if os.path.exists(txt_path):
            with open(txt_path, "r") as f:
                ref_text = f.read().strip()

        # Fix the seed for the persona identity
        torch.manual_seed(request.seed)

        # Non-streaming call is fine here since it takes <1s on the L40S
        audio_data, sample_rate = model.generate_voice_clone(
            text=request.input,
            language="English",
            ref_audio=ref_path,
            ref_text=ref_text,
            xvec_only=(ref_text is None),
        )

        audio_data = np.asarray(audio_data).flatten()

        # Clip to [-1, 1], then convert Float32 to Int16 for standard WAV compatibility
        audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_int16.tobytes())
        wav_io.seek(0)

        return Response(content=wav_io.getvalue(), media_type="audio/wav")
    except Exception as e:
        print(f"Indra Mouth Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8002)
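A quick smoke test against the OpenAI-compatible route (the input text is illustrative; "oni" must exist as oni.wav under the voice-samples directory):

    curl -X POST http://localhost:8002/v1/audio/speech \
        -H "Content-Type: application/json" \
        -d '{"input": "Hello from the swarm.", "voice": "oni"}' \
        -o out.wav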

@@ -0,0 +1,12 @@
FROM lmsysorg/sglang:latest
# 1. Force-upgrade transformers without triggering pip's strict dependency resolver
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade --no-deps --break-system-packages \
        git+https://github.com/huggingface/transformers.git
# 2. Overlay the known-good bare-metal SGLang source over the container's default
COPY local-sglang/python /sgl-workspace/sglang/python
WORKDIR /app
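A hedged sanity check that the overlay took effect (image tag assumed; this relies on the lmsysorg/sglang base image installing sglang in editable mode from /sgl-workspace/sglang, so the printed path should point into the copied tree):

    docker build -t swarm-sglang .
    docker run --rm swarm-sglang python3 -c "import sglang; print(sglang.__file__)"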

swarm-control/start-persona.sh Executable file
@@ -0,0 +1,29 @@
#!/bin/bash
# Node Matali: Gemma-4-26B-A4B-it
# GPU Mapping: 0, 7
# 1. Point to the BIG drive
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
export SGLANG_CACHE_DIR=/mnt/nvme3n1/swarm/sglang_cache
# 2. Source the environment
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm
export CUDA_VISIBLE_DEVICES=0,7
export NCCL_P2P_DISABLE=0
# 3. Launch
python3 -m sglang.launch_server \
    --model-path google/gemma-4-26b-a4b-it \
    --tp 2 \
    --port 3000 \
    --host 0.0.0.0 \
    --attention-backend triton \
    --mem-fraction-static 0.8 \
    --max-running-requests 128 \
    --chunked-prefill-size 4096 \
    --context-length 32768 \
    --trust-remote-code \
    --enable-piecewise-cuda-graph \
    --schedule-policy lpm
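Once the server is up, a quick sanity check against SGLang's OpenAI-compatible endpoint (the prompt is illustrative):

    curl http://localhost:3000/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "google/gemma-4-26b-a4b-it", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 16}'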

swarm-control/start-tts-qwen.sh Executable file
@@ -0,0 +1,20 @@
#!/bin/bash
# --- Resident Swarm Mouth (Qwen3-TTS 1.7B) ---
# GPU Mapping: Shared with Node Matali on GPU 7
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm-voice
# Explicitly lock to GPU 7
export CUDA_VISIBLE_DEVICES=7
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/mnt/nvme3n1/swarm/Qwen3-TTS"
echo "--- Launching Resident Swarm Mouth (Port 8002) ---"
# Move to the server directory
cd /mnt/nvme3n1/swarm/swarm-control/indra-tts-server
# Launching our Turbo-Mouth server
# Because CUDA_VISIBLE_DEVICES=7, the server will see GPU 7 as 'cuda:0'
python tts-server.py
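The server resolves voices by filename, so each persona is a reference WAV plus an optional transcript beside it (oni shown as the example; without the .txt, requests fall back to xvec_only cloning):

    /mnt/nvme3n1/swarm/voice-samples/oni.wav
    /mnt/nvme3n1/swarm/voice-samples/oni.txt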

@@ -0,0 +1,17 @@
#!/bin/bash
# --- Environment Setup ---
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
export SGLANG_CACHE_DIR=/mnt/nvme3n1/swarm/sglang_cache
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm-voice
export CUDA_VISIBLE_DEVICES=0
echo "--- Launching Resident Swarm Ears (Port 8005) ---"
# 2026 syntax: the model ID is passed as a positional argument
faster-whisper-server \
    --host 0.0.0.0 \
    --port 8005 \
    whisper-v4-turbo
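A hedged smoke test, assuming the server keeps faster-whisper-server's OpenAI-compatible transcription route (sample.wav is illustrative):

    curl http://localhost:8005/v1/audio/transcriptions \
        -F file=@sample.wav \
        -F model=whisper-v4-turbo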