From 109084e8e464993399f2b1df9a770aa3b56c015f Mon Sep 17 00:00:00 2001
From: Nathan
Date: Tue, 5 May 2026 16:42:49 +1000
Subject: [PATCH] Swap back to Qwen3-TTS

---
 docker-compose.yml                           |  22 ++--
 swarm-control/indra-tts-server/Dockerfile    |  42 ++++---
 swarm-control/indra-tts-server/tts-server.py | 114 +++++++++++--------
 3 files changed, 100 insertions(+), 78 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 0478980..368898e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -27,7 +27,8 @@ services:
       --mem-fraction-static 0.8
       --max-running-requests 128
       --chunked-prefill-size 4096
-      --context-length 32768
+      --context-length 65536
+      --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --enable-piecewise-cuda-graph
       --schedule-policy lpm
@@ -40,20 +41,21 @@
           device_ids: ['0', '7']
           capabilities: [gpu]

-  # --- TTS ---
-  # Physical: 7 | Container: 0
+# --- TTS ---
   tts:
-    build: ./swarm-control/indra-tts-server
+    build:
+      context: .  # This allows the build to see the Qwen3-TTS folder at the root
+      dockerfile: ./swarm-control/indra-tts-server/Dockerfile
     image: swarm-tts
     depends_on:
       - persona
     environment:
       - CUDA_VISIBLE_DEVICES=0
-      - PYTHONPATH=/app:/app/Qwen3-TTS
+      - PYTHONPATH=/app:/app/Qwen3-TTS  # Keep this so the app finds the local code
       - NVIDIA_DRIVER_CAPABILITIES=all
     volumes:
       - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
-      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
+      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS  # Keep this for live code edits
     ports:
       - "8002:8002"
     deploy:
@@ -109,7 +111,7 @@ services:
       --host 0.0.0.0
       --hf-chat-template-name tool_use
       --mem-fraction-static 0.95
-      --context-length 32768
+      --context-length 131072
       --trust-remote-code
       --tool-call-parser qwen3_coder
     deploy:
@@ -180,8 +182,8 @@
       --tp 2
       --port 3003
       --host 0.0.0.0
-      --mem-fraction-static 0.85
-      --context-length 65536
+      --mem-fraction-static 0.80
+      --context-length 131072
       --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --tool-call-parser gemma4
@@ -206,7 +208,7 @@
     ports:
      - "8000:8000"
     command: >
-      --model-id jinaai/jina-embeddings-v2-base-code
+      --model-id google/embeddinggemma-300m
      --max-client-batch-size 1024
     deploy:
      resources:
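A quick smoke test for the re-tuned servers after docker compose up -d. This is a sketch, not part of the patch: the host ports (3003 for the SGLang gemma service, 8000 for TEI) come from this compose file, and it assumes the stock SGLang /v1/models route and the stock TEI /embed route.

# smoke_test.py -- hypothetical helper; ports/routes are assumptions noted above.
import json
import urllib.request

def post_json(url, payload):
    req = urllib.request.Request(
        url, data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.load(resp)

# SGLang exposes an OpenAI-compatible model list on each LLM port.
with urllib.request.urlopen("http://localhost:3003/v1/models", timeout=10) as resp:
    print("serving:", [m["id"] for m in json.load(resp)["data"]])

# TEI should now answer with google/embeddinggemma-300m vectors.
vec = post_json("http://localhost:8000/embed", {"inputs": "hello swarm"})[0]
print("embedding dims:", len(vec))

If the fp8 KV cache or the longer context lengths do not fit in VRAM, the affected container will crash at startup rather than here, so check docker compose logs first.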
diff --git a/swarm-control/indra-tts-server/Dockerfile b/swarm-control/indra-tts-server/Dockerfile
index b30dca9..0de2d31 100644
--- a/swarm-control/indra-tts-server/Dockerfile
+++ b/swarm-control/indra-tts-server/Dockerfile
@@ -1,37 +1,33 @@
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

-# Prevent interactive prompts
 ENV DEBIAN_FRONTEND=noninteractive
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

-# 1. Install Python 3.12 and SoX dependencies
-RUN apt-get update && apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa -y && \
-    apt-get update && apt-get install -y \
-    python3.12 \
-    python3.12-dev \
-    curl \
-    git \
-    libsndfile1 \
-    ffmpeg \
-    sox \
-    libsox-dev && \
+# No deadsnakes PPA needed; the base image's native Python 3.10 is sufficient.
+RUN apt-get update && apt-get install -y \
+    python3 python3-dev python3-pip curl git libsndfile1 ffmpeg sox libsox-dev ninja-build && \
     rm -rf /var/lib/apt/lists/*

-# 2. Use the official bootstrap to install a clean Pip for 3.12
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 -
 WORKDIR /app

-# 3. Explicitly install BOTH torch and torchaudio from the cu124 index
-RUN python3.12 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
-RUN python3.12 -m pip install --no-cache-dir fastapi uvicorn numpy soundfile
+# Install torch and torchaudio from the cu124 index
+RUN python3 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124

-# 4. Install the local Qwen3-TTS requirements
-RUN python3.12 -m pip install --no-cache-dir faster-qwen3-tts
+# 1. Install foundational build tools and numpy first
+RUN python3 -m pip install --no-cache-dir numpy setuptools wheel ninja packaging psutil

-COPY tts-server.py .
+# 2. Install the rest of the stack that relies on numpy being present
+RUN python3 -m pip install --no-cache-dir fastapi uvicorn soundfile librosa transformers==4.57.3 accelerate sox onnxruntime
+
+# Force the C++ ABI setting to match the PyTorch wheels for the extension build
+ENV _GLIBCXX_USE_CXX11_ABI=0
+ENV MAX_JOBS=8
+
+# Install flash-attn (no build isolation so it compiles against the torch above)
+RUN python3 -m pip install --no-cache-dir flash-attn --no-build-isolation
+
+COPY swarm-control/indra-tts-server/tts-server.py .

 EXPOSE 8002

-CMD ["python3.12", "tts-server.py"]
+CMD ["python3", "tts-server.py"]
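The main failure mode of this Dockerfile is a flash-attn build that compiles but will not import. A small sanity check can be run inside the built image; check_env.py is a hypothetical helper, not part of the patch.

# check_env.py -- hypothetical sanity check, run inside the image, e.g.:
#   docker run --rm --gpus all swarm-tts python3 check_env.py
import torch

print("torch:", torch.__version__, "built for CUDA", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())

try:
    import flash_attn  # fails here if the ABI setting or ninja build went wrong
    print("flash-attn:", flash_attn.__version__)
except Exception as exc:
    print("flash-attn import failed:", exc)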
diff --git a/swarm-control/indra-tts-server/tts-server.py b/swarm-control/indra-tts-server/tts-server.py
index 15c3c4b..45cce19 100644
--- a/swarm-control/indra-tts-server/tts-server.py
+++ b/swarm-control/indra-tts-server/tts-server.py
@@ -1,74 +1,98 @@
 import os
 import torch
-import numpy as np
 import io
-import wave
+import soundfile as sf
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import Response
 from pydantic import BaseModel
-from faster_qwen3_tts import FasterQwen3TTS
+from qwen_tts import Qwen3TTSModel

-app = FastAPI(title="Indra tts")
+# --- 1. HARDWARE OPTIMIZATIONS ---
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision('high')

-if not torch.cuda.is_available():
-    raise RuntimeError("Mouth cannot find CUDA. Check nvidia-container-toolkit.")
-print(f"Loading model on: {torch.cuda.get_device_name(0)}")
+# Confine PyTorch CPU threads. On large servers PyTorch otherwise tries to use
+# every available core for background tasks, causing severe context-switching lag.
+torch.set_num_threads(4)

-# Load the Base model for high-fidelity mimicry
-model = FasterQwen3TTS.from_pretrained(
+app = FastAPI(title="Indra Mouth - Qwen3-TTS Official (Optimized)")
+prompt_cache = {}
+
+print(f"Loading Official Qwen3-TTS on: {torch.cuda.get_device_name(0)}")
+
+model = Qwen3TTSModel.from_pretrained(
     "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-    device="cuda:0",  # Targets GPU 7
-    dtype=torch.bfloat16
+    device_map="cuda:0",
+    dtype=torch.bfloat16,
+    attn_implementation="sdpa"
+)
+model.eval()
+
+# --- 2. THE SURGICAL COMPILE ---
+# Bypass the wrapper class and compile only the heavy autoregressive LLM engine
+import torch._dynamo
+import torch._inductor.config
+
+torch._dynamo.config.suppress_errors = True
+# Expand the cache limit so it doesn't thrash when sequence lengths vary
+torch._dynamo.config.cache_size_limit = 128
+# Let inductor tune memory layouts aggressively for the L40S
+torch._inductor.config.coordinate_descent_tuning = True
+
+print("Compiling Autoregressive Engine (Dynamic Shapes)...")
+model.talker = torch.compile(
+    model.talker,
+    mode="max-autotune",  # the most aggressive optimization level available
+    dynamic=True          # CRITICAL: the generated audio length grows each step
 )
+
+
 class TTSRequest(BaseModel):
-    model: str = "tts-1"  # ignored by backend, here to satisfy modelix router
     input: str
     voice: str = "oni"
-    response_format: str = "wav"
     seed: int = 42

 @app.post("/v1/audio/speech")
 async def generate_speech(request: TTSRequest):
     try:
-        voice_file = f"{request.voice}.wav"
-        base_path = "/mnt/nvme3n1/swarm/voice-samples"
-        ref_path = os.path.join(base_path, voice_file)
-        txt_path = os.path.splitext(ref_path)[0] + ".txt"
+        voice_key = request.voice
+
+        if voice_key not in prompt_cache:
+            base_path = "/mnt/nvme3n1/swarm/voice-samples"
+            ref_path = os.path.join(base_path, f"{voice_key}.wav")
+            txt_path = os.path.join(base_path, f"{voice_key}.txt")
+
+            if not os.path.exists(ref_path):
+                raise FileNotFoundError(f"Voice sample {ref_path} not found.")
+
+            ref_text = None
+            if os.path.exists(txt_path):
+                with open(txt_path, "r") as f:
+                    ref_text = f.read().strip()

-        ref_text = None
-        if os.path.exists(txt_path):
-            with open(txt_path, "r") as f:
-                ref_text = f.read().strip()
+            prompt_cache[voice_key] = model.create_voice_clone_prompt(
+                ref_audio=ref_path,
+                ref_text=ref_text,
+                x_vector_only_mode=(ref_text is None)
+            )
+            print(f"--- Cached pristine voice prompt: {voice_key} ---")

-        # Fix the seed for the persona identity
         torch.manual_seed(request.seed)
-
-        full_audio = []
-        # Non-streaming call is fine here since it takes <1s on your L40S
-        audio_data, sample_rate = model.generate_voice_clone(
-            text=request.input,
-            language="English",
-            ref_audio=ref_path,
-            ref_text=ref_text,
-            xvec_only=(ref_text is None)
-        )
-
-        audio_data = np.array(audio_data)
-
-        audio_data = audio_data.flatten()
-
-        # Convert Float32 to Int16 for standard WAV compatibility
-        audio_int16 = (audio_data * 32767).astype(np.int16)
+
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+            wavs, sr = model.generate_voice_clone(
+                text=[request.input],
+                language=["English"],
+                voice_clone_prompt=prompt_cache[voice_key]
+            )

         wav_io = io.BytesIO()
-        with wave.open(wav_io, 'wb') as wav_file:
-            wav_file.setnchannels(1)
-            wav_file.setsampwidth(2)
-            wav_file.setframerate(sample_rate)
-            wav_file.writeframes(audio_int16.tobytes())
-
+        sf.write(wav_io, wavs[0], sr, format='WAV', subtype='FLOAT')
         wav_io.seek(0)
+
         return Response(content=wav_io.getvalue(), media_type="audio/wav")

     except Exception as e:
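A minimal client for the endpoint above; a sketch, not part of the patch. Only input, voice, and seed are accepted now that model and response_format were dropped from TTSRequest, and port 8002 is the mapping from the compose file.

# tts_client.py -- hypothetical client for the /v1/audio/speech route.
import json
import urllib.request

payload = {"input": "Voice clone smoke test.", "voice": "oni", "seed": 42}
req = urllib.request.Request(
    "http://localhost:8002/v1/audio/speech",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
# The first call per voice is slow: torch.compile warms up and the clone
# prompt is built and stored in prompt_cache; later calls reuse it.
with urllib.request.urlopen(req, timeout=300) as resp:
    with open("out.wav", "wb") as f:
        f.write(resp.read())
print("wrote out.wav")

Note the server now returns 32-bit float WAV (subtype='FLOAT'), which some players reject; transcoding with ffmpeg is an option if a client needs PCM16.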