Swapped back to qwen3-tts

2026-05-05 16:42:49 +10:00
parent e90d2b1ec2
commit 109084e8e4
3 changed files with 100 additions and 78 deletions

docker-compose.yml

@@ -27,7 +27,8 @@ services:
       --mem-fraction-static 0.8
       --max-running-requests 128
       --chunked-prefill-size 4096
-      --context-length 32768
+      --context-length 65536
+      --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --enable-piecewise-cuda-graph
       --schedule-policy lpm
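The pairing of these two changes is deliberate: fp8_e4m3 stores each KV-cache element in one byte instead of two, so doubling --context-length from 32768 to 65536 costs roughly the same VRAM as before. A back-of-envelope sketch in Python; the layer/head/dim numbers below are hypothetical placeholders, not the served model's real shape:

# Rough KV-cache sizing: 2 (K and V) * layers * kv_heads * head_dim
# * seq_len * bytes_per_element. Model dimensions here are made up.
layers, kv_heads, head_dim = 48, 8, 128  # hypothetical model shape

def kv_bytes(seq_len: int, bytes_per_elt: int) -> int:
    return 2 * layers * kv_heads * head_dim * seq_len * bytes_per_elt

fp16_32k = kv_bytes(32_768, 2)  # old: bf16/fp16 cache at 32k context
fp8_64k = kv_bytes(65_536, 1)   # new: fp8_e4m3 cache at 64k context
print(f"fp16 @32k: {fp16_32k / 2**30:.1f} GiB per full-length sequence")
print(f"fp8  @64k: {fp8_64k / 2**30:.1f} GiB per full-length sequence")  # same total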
@@ -40,20 +41,21 @@ services:
           device_ids: ['0', '7']
           capabilities: [gpu]

   # --- TTS ---
-  # Physical: 7 | Container: 0
   tts:
-    build: ./swarm-control/indra-tts-server
+    build:
+      context: . # This allows the build to see the Qwen3-TTS folder at the root
+      dockerfile: ./swarm-control/indra-tts-server/Dockerfile
     image: swarm-tts
     depends_on:
       - persona
     environment:
       - CUDA_VISIBLE_DEVICES=0
-      - PYTHONPATH=/app:/app/Qwen3-TTS
+      - PYTHONPATH=/app:/app/Qwen3-TTS # Keep this so the app finds the local code
      - NVIDIA_DRIVER_CAPABILITIES=all
     volumes:
       - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
-      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
+      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS # Keep this for live code edits
     ports:
       - "8002:8002"
     deploy:
@@ -109,7 +111,7 @@ services:
       --host 0.0.0.0
       --hf-chat-template-name tool_use
       --mem-fraction-static 0.95
-      --context-length 32768
+      --context-length 131072
       --trust-remote-code
       --tool-call-parser qwen3_coder
     deploy:
@@ -180,8 +182,8 @@ services:
       --tp 2
       --port 3003
       --host 0.0.0.0
-      --mem-fraction-static 0.85
-      --context-length 65536
+      --mem-fraction-static 0.80
+      --context-length 131072
       --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --tool-call-parser gemma4
@@ -206,7 +208,7 @@ services:
ports: ports:
- "8000:8000" - "8000:8000"
command: > command: >
--model-id jinaai/jina-embeddings-v2-base-code --model-id google/embeddinggemma-300m
--max-client-batch-size 1024 --max-client-batch-size 1024
deploy: deploy:
resources: resources:

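The --model-id and --max-client-batch-size flags match Hugging Face's text-embeddings-inference server, so the swap to google/embeddinggemma-300m can be smoke-tested against its /embed route. A minimal sketch, assuming the service is reachable on localhost via the "8000:8000" mapping above:

import requests

# POST /embed is text-embeddings-inference's embedding route; the host and
# port are assumptions taken from the compose port mapping.
resp = requests.post(
    "http://localhost:8000/embed",
    json={"inputs": "def quicksort(arr): ..."},
    timeout=30,
)
resp.raise_for_status()
vec = resp.json()[0]  # one embedding vector per input string
print(len(vec))       # dimensionality of the new embedding model's output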
swarm-control/indra-tts-server/Dockerfile

@@ -1,37 +1,33 @@
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

+# Prevent interactive prompts
 ENV DEBIAN_FRONTEND=noninteractive
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

-# 1. Install Python 3.12 and SoX dependencies
-RUN apt-get update && apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa -y && \
-    apt-get update && apt-get install -y \
-    python3.12 \
-    python3.12-dev \
-    curl \
-    git \
-    libsndfile1 \
-    ffmpeg \
-    sox \
-    libsox-dev && \
+# No deadsnakes PPA needed. Native Python 3.10 works perfectly.
+RUN apt-get update && apt-get install -y \
+    python3 python3-dev python3-pip curl git libsndfile1 ffmpeg sox libsox-dev ninja-build && \
     rm -rf /var/lib/apt/lists/*

-# 2. Use the official bootstrap to install a clean Pip for 3.12
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

 WORKDIR /app

-# 3. Explicitly install BOTH torch and torchaudio from the cu124 index
-RUN python3.12 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
-RUN python3.12 -m pip install --no-cache-dir fastapi uvicorn numpy soundfile
-# 4. Install the local Qwen3-TTS requirements
-RUN python3.12 -m pip install --no-cache-dir faster-qwen3-tts
-COPY tts-server.py .
+# Install Torch and core packages
+RUN python3 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124

+# 1. Install foundational build tools and numpy first
+RUN python3 -m pip install --no-cache-dir numpy setuptools wheel ninja packaging psutil

+# 2. Install the rest of the stack that relies on numpy being present
+RUN python3 -m pip install --no-cache-dir fastapi uvicorn soundfile librosa transformers==4.57.3 accelerate sox onnxruntime

+# Force ABI compatibility for the C++ compiler
+ENV _GLIBCXX_USE_CXX11_ABI=0
+ENV MAX_JOBS=8

+# Install flash-attn
+RUN python3 -m pip install --no-cache-dir flash-attn --no-build-isolation

+COPY swarm-control/indra-tts-server/tts-server.py .

 EXPOSE 8002
-CMD ["python3.12", "tts-server.py"]
+CMD ["python3", "tts-server.py"]

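Because flash-attn is compiled from source in the image, a C++ ABI mismatch against the torch wheel is the usual failure mode that the _GLIBCXX_USE_CXX11_ABI setting tries to head off. A quick sanity check to run inside the built container; this is a sketch using only stock torch introspection plus an import probe:

import torch

print(torch.__version__)                # expect 2.6.0+cu124 per the install line
print(torch.cuda.is_available())        # True once a GPU is attached at runtime
print(torch.compiled_with_cxx11_abi())  # ABI the torch wheel was built with

try:
    import flash_attn  # built in the image via --no-build-isolation
    print("flash-attn OK:", flash_attn.__version__)
except ImportError as e:
    print("flash-attn import failed:", e)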
swarm-control/indra-tts-server/tts-server.py

@@ -1,74 +1,98 @@
 import os
 import torch
-import numpy as np
 import io
-import wave
+import soundfile as sf
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import Response
 from pydantic import BaseModel
-from faster_qwen3_tts import FasterQwen3TTS
+from qwen_tts import Qwen3TTSModel

-app = FastAPI(title="Indra tts")
+# --- 1. HARDWARE OPTIMIZATIONS ---
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision('high')

-if not torch.cuda.is_available():
-    raise RuntimeError("Mouth cannot find CUDA. Check nvidia-container-toolkit.")
+# NEW: Confine PyTorch CPU threads. On massive servers, PyTorch tries to use
+# all available cores for background tasks, causing severe context-switching lag.
+torch.set_num_threads(4)

-print(f"Loading model on: {torch.cuda.get_device_name(0)}")
-# Load the Base model for high-fidelity mimicry
-model = FasterQwen3TTS.from_pretrained(
+app = FastAPI(title="Indra Mouth - Qwen3-TTS Official (Optimized)")
+prompt_cache = {}

+print(f"Loading Official Qwen3-TTS on: {torch.cuda.get_device_name(0)}")
+model = Qwen3TTSModel.from_pretrained(
     "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-    device="cuda:0", # Targets GPU 7
-    dtype=torch.bfloat16
+    device_map="cuda:0",
+    dtype=torch.bfloat16,
+    attn_implementation="sdpa"
 )
+model.eval()

+# --- 2. THE SURGICAL COMPILE ---
+# We bypass the wrapper class and strictly compile the heavy LLM engine
+import torch._dynamo
+import torch._inductor.config
+torch._dynamo.config.suppress_errors = True
+# Expand the cache limit so it doesn't thrash when sequence lengths vary
+torch._dynamo.config.cache_size_limit = 128
+# Force inductor to use the fastest possible memory layouts for L40S
+torch._inductor.config.coordinate_descent_tuning = True

+print("Compiling Autoregressive Engine (Dynamic Shapes)...")
+model.talker = torch.compile(
+    model.talker,
+    mode="max-autotune",  # The most aggressive optimization level available
+    dynamic=True  # CRITICAL: Tells the compiler the audio length will grow
+)

 class TTSRequest(BaseModel):
+    model: str = "tts-1"  # ignored by backend, here to satisfy modelix router
     input: str
     voice: str = "oni"
+    response_format: str = "wav"
     seed: int = 42

 @app.post("/v1/audio/speech")
 async def generate_speech(request: TTSRequest):
     try:
-        voice_file = f"{request.voice}.wav"
-        base_path = "/mnt/nvme3n1/swarm/voice-samples"
-        ref_path = os.path.join(base_path, voice_file)
-        txt_path = os.path.splitext(ref_path)[0] + ".txt"
-        if not os.path.exists(ref_path):
-            raise FileNotFoundError(f"Voice sample {ref_path} not found.")
-        ref_text = None
-        if os.path.exists(txt_path):
-            with open(txt_path, "r") as f:
-                ref_text = f.read().strip()
-        # Fix the seed for the persona identity
-        torch.manual_seed(request.seed)
-        full_audio = []
-        # Non-streaming call is fine here since it takes <1s on your L40S
-        audio_data, sample_rate = model.generate_voice_clone(
-            text=request.input,
-            language="English",
-            ref_audio=ref_path,
-            ref_text=ref_text,
-            xvec_only=(ref_text is None)
-        )
-        audio_data = np.array(audio_data)
-        audio_data = audio_data.flatten()
-        # Convert Float32 to Int16 for standard WAV compatibility
-        audio_int16 = (audio_data * 32767).astype(np.int16)
+        voice_key = request.voice
+        if voice_key not in prompt_cache:
+            base_path = "/mnt/nvme3n1/swarm/voice-samples"
+            ref_path = os.path.join(base_path, f"{voice_key}.wav")
+            txt_path = os.path.join(base_path, f"{voice_key}.txt")
+            ref_text = None
+            if os.path.exists(txt_path):
+                with open(txt_path, "r") as f:
+                    ref_text = f.read().strip()
+            prompt_cache[voice_key] = model.create_voice_clone_prompt(
+                ref_audio=ref_path,
+                ref_text=ref_text,
+                x_vector_only_mode=(ref_text is None)
+            )
+            print(f"--- Cached pristine voice prompt: {voice_key} ---")

+        torch.manual_seed(request.seed)
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+            wavs, sr = model.generate_voice_clone(
+                text=[request.input],
+                language=["English"],
+                voice_clone_prompt=prompt_cache[voice_key]
+            )

         wav_io = io.BytesIO()
-        with wave.open(wav_io, 'wb') as wav_file:
-            wav_file.setnchannels(1)
-            wav_file.setsampwidth(2)
-            wav_file.setframerate(sample_rate)
-            wav_file.writeframes(audio_int16.tobytes())
+        sf.write(wav_io, wavs[0], sr, format='WAV', subtype='FLOAT')
         wav_io.seek(0)
         return Response(content=wav_io.getvalue(), media_type="audio/wav")
     except Exception as e:
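For reference, the rewritten endpoint keeps the OpenAI-style request shape, so a client call looks like the sketch below. The host and port are assumptions taken from the compose "8002:8002" mapping; note the first request per voice is slower because it builds the cached clone prompt, and the first few generations also pay the torch.compile warm-up cost.

import requests

resp = requests.post(
    "http://localhost:8002/v1/audio/speech",  # port mapped in docker-compose
    json={
        "model": "tts-1",         # ignored by the backend (router compatibility)
        "input": "Hello from the swarm.",
        "voice": "oni",           # resolves /mnt/nvme3n1/swarm/voice-samples/oni.wav
        "response_format": "wav",
        "seed": 42,
    },
    timeout=120,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # float32 WAV, per sf.write(..., subtype='FLOAT')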