Initial commit: Productionized Swarm with Docker support

damith
2026-04-16 16:46:24 +10:00
commit c2e2e52ff3
39 changed files with 331 additions and 0 deletions

@@ -0,0 +1,37 @@
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
# Prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# 1. Install Python 3.12 (deadsnakes PPA) and the audio toolchain (SoX, ffmpeg, libsndfile)
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get update && apt-get install -y \
        python3.12 \
        python3.12-dev \
        curl \
        git \
        libsndfile1 \
        ffmpeg \
        sox \
        libsox-dev && \
    rm -rf /var/lib/apt/lists/*
# 2. Use the official bootstrap to install a clean Pip for 3.12
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
WORKDIR /app
# 3. Explicitly install BOTH torch and torchaudio from the cu124 index
RUN python3.12 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
RUN python3.12 -m pip install --no-cache-dir fastapi uvicorn numpy soundfile
# 4. Install the faster-qwen3-tts inference package
RUN python3.12 -m pip install --no-cache-dir faster-qwen3-tts
COPY tts-server.py .
EXPOSE 8002
CMD ["python3.12", "tts-server.py"]

@@ -0,0 +1,80 @@
import io
import os
import wave

import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel

from faster_qwen3_tts import FasterQwen3TTS

app = FastAPI(title="Indra TTS")

if not torch.cuda.is_available():
    raise RuntimeError("Mouth cannot find CUDA. Check nvidia-container-toolkit.")

print(f"Loading model on: {torch.cuda.get_device_name(0)}")

# Load the Base model for high-fidelity mimicry
model = FasterQwen3TTS.from_pretrained(
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    device="cuda:0",  # With CUDA_VISIBLE_DEVICES=7 on the host, cuda:0 maps to GPU 7
    dtype=torch.bfloat16,
)

class TTSRequest(BaseModel):
    model: str = "tts-1"  # ignored by the backend; present to satisfy the modelix router
    input: str
    voice: str = "oni"
    response_format: str = "wav"
    seed: int = 42

@app.post("/v1/audio/speech")
async def generate_speech(request: TTSRequest):
    try:
        voice_file = f"{request.voice}.wav"
        base_path = "/mnt/nvme3n1/swarm/voice-samples"
        ref_path = os.path.join(base_path, voice_file)
        txt_path = os.path.splitext(ref_path)[0] + ".txt"

        ref_text = None
        if os.path.exists(txt_path):
            with open(txt_path, "r") as f:
                ref_text = f.read().strip()

        # Fix the seed for the persona identity
        torch.manual_seed(request.seed)

        # Non-streaming call is fine here since it takes <1s on the L40S
        audio_data, sample_rate = model.generate_voice_clone(
            text=request.input,
            language="English",
            ref_audio=ref_path,
            ref_text=ref_text,
            xvec_only=(ref_text is None),
        )

        audio_data = np.asarray(audio_data).flatten()

        # Clip to [-1, 1], then convert Float32 to Int16 for standard WAV compatibility
        audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

        wav_io = io.BytesIO()
        with wave.open(wav_io, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_int16.tobytes())
        wav_io.seek(0)

        return Response(content=wav_io.getvalue(), media_type="audio/wav")
    except Exception as e:
        print(f"Indra Mouth Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8002)
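A quick smoke test against the OpenAI-compatible route (the input text is illustrative; "oni" must exist as oni.wav under the voice-samples directory):

    curl -X POST http://localhost:8002/v1/audio/speech \
        -H "Content-Type: application/json" \
        -d '{"input": "Hello from the swarm.", "voice": "oni"}' \
        -o out.wav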

@@ -0,0 +1,12 @@
FROM lmsysorg/sglang:latest
# 1. Force-upgrade transformers without triggering pip's strict dependency resolver
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade --no-deps --break-system-packages \
        git+https://github.com/huggingface/transformers.git
# 2. Overlay the known-good bare-metal SGLang source over the container's default
COPY local-sglang/python /sgl-workspace/sglang/python
WORKDIR /app
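A hedged sanity check that the overlay took effect (image tag assumed; this relies on the lmsysorg/sglang base image installing sglang in editable mode from /sgl-workspace/sglang, so the printed path should point into the copied tree):

    docker build -t swarm-sglang .
    docker run --rm swarm-sglang python3 -c "import sglang; print(sglang.__file__)"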

swarm-control/start-persona.sh Executable file
@@ -0,0 +1,29 @@
#!/bin/bash
# Node Matali: Gemma-4-26B-A4B-it
# GPU Mapping: 0, 7
# 1. Point to the BIG drive
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
export SGLANG_CACHE_DIR=/mnt/nvme3n1/swarm/sglang_cache
# 2. Source the environment
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm
export CUDA_VISIBLE_DEVICES=0,7
export NCCL_P2P_DISABLE=0
# 3. Launch
python3 -m sglang.launch_server \
    --model-path google/gemma-4-26b-a4b-it \
    --tp 2 \
    --port 3000 \
    --host 0.0.0.0 \
    --attention-backend triton \
    --mem-fraction-static 0.8 \
    --max-running-requests 128 \
    --chunked-prefill-size 4096 \
    --context-length 32768 \
    --trust-remote-code \
    --enable-piecewise-cuda-graph \
    --schedule-policy lpm
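Once the server is up, a quick sanity check against SGLang's OpenAI-compatible endpoint (the prompt is illustrative):

    curl http://localhost:3000/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "google/gemma-4-26b-a4b-it", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 16}'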

swarm-control/start-tts-qwen.sh Executable file
@@ -0,0 +1,20 @@
#!/bin/bash
# --- Resident Swarm Mouth (Qwen3-TTS 1.7B) ---
# GPU Mapping: Shared with Node Matali on GPU 7
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm-voice
# Explicitly lock to GPU 7
export CUDA_VISIBLE_DEVICES=7
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/mnt/nvme3n1/swarm/Qwen3-TTS"
echo "--- Launching Resident Swarm Mouth (Port 8002) ---"
# Move to the server directory
cd /mnt/nvme3n1/swarm/swarm-control/indra-tts-server
# Launching our Turbo-Mouth server
# Because CUDA_VISIBLE_DEVICES=7, the server will see GPU 7 as 'cuda:0'
python tts-server.py
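The server resolves voices by filename, so each persona is a reference WAV plus an optional transcript beside it (oni shown as the example; without the .txt, requests fall back to xvec_only cloning):

    /mnt/nvme3n1/swarm/voice-samples/oni.wav
    /mnt/nvme3n1/swarm/voice-samples/oni.txt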

@@ -0,0 +1,17 @@
#!/bin/bash
# --- Environment Setup ---
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
export SGLANG_CACHE_DIR=/mnt/nvme3n1/swarm/sglang_cache
source /home/isnai/anaconda3/etc/profile.d/conda.sh
conda activate swarm-voice
export CUDA_VISIBLE_DEVICES=0
echo "--- Launching Resident Swarm Ears (Port 8005) ---"
# 2026 syntax: the model ID is passed as a positional argument
faster-whisper-server \
    --host 0.0.0.0 \
    --port 8005 \
    whisper-v4-turbo
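A hedged smoke test, assuming the server keeps faster-whisper-server's OpenAI-compatible transcription route (sample.wav is illustrative):

    curl http://localhost:8005/v1/audio/transcriptions \
        -F file=@sample.wav \
        -F model=whisper-v4-turbo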