Swapped back to qwen3-tts
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -27,7 +27,8 @@ services:
       --mem-fraction-static 0.8
       --max-running-requests 128
       --chunked-prefill-size 4096
-      --context-length 32768
+      --context-length 65536
+      --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --enable-piecewise-cuda-graph
       --schedule-policy lpm
@@ -40,20 +41,21 @@ services:
             device_ids: ['0', '7']
             capabilities: [gpu]
 
   # --- TTS ---
-  # Physical: 7 | Container: 0
   tts:
-    build: ./swarm-control/indra-tts-server
+    build:
+      context: . # This allows the build to see the Qwen3-TTS folder at the root
+      dockerfile: ./swarm-control/indra-tts-server/Dockerfile
     image: swarm-tts
     depends_on:
       - persona
     environment:
       - CUDA_VISIBLE_DEVICES=0
-      - PYTHONPATH=/app:/app/Qwen3-TTS
+      - PYTHONPATH=/app:/app/Qwen3-TTS # Keep this so the app finds the local code
      - NVIDIA_DRIVER_CAPABILITIES=all
     volumes:
       - /mnt/nvme3n1/swarm/voice-samples:/mnt/nvme3n1/swarm/voice-samples:ro
-      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS
+      - /mnt/nvme3n1/swarm/Qwen3-TTS:/app/Qwen3-TTS # Keep this for live code edits
     ports:
       - "8002:8002"
     deploy:
@@ -109,7 +111,7 @@ services:
       --host 0.0.0.0
       --hf-chat-template-name tool_use
       --mem-fraction-static 0.95
-      --context-length 32768
+      --context-length 131072
       --trust-remote-code
       --tool-call-parser qwen3_coder
     deploy:
@@ -180,8 +182,8 @@ services:
       --tp 2
       --port 3003
       --host 0.0.0.0
-      --mem-fraction-static 0.85
-      --context-length 65536
+      --mem-fraction-static 0.80
+      --context-length 131072
       --kv-cache-dtype fp8_e4m3
       --trust-remote-code
       --tool-call-parser gemma4
@@ -206,7 +208,7 @@ services:
     ports:
       - "8000:8000"
     command: >
-      --model-id jinaai/jina-embeddings-v2-base-code
+      --model-id google/embeddinggemma-300m
       --max-client-batch-size 1024
     deploy:
       resources:
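A note on the first hunk: the extra headroom for --context-length 65536 comes from switching the KV cache to --kv-cache-dtype fp8_e4m3, which stores one byte per element instead of two. A back-of-envelope sizing sketch (the layer and head dimensions below are made-up stand-ins, not read from any of these models):

# Rough KV-cache sizing. fp8_e4m3 is 1 byte/element vs 2 for bf16/fp16,
# so the cache for the same context length halves.
def kv_cache_bytes(layers, kv_heads, head_dim, seq_len, batch, bytes_per_elem):
    # 2x for the separate K and V tensors
    return 2 * layers * kv_heads * head_dim * seq_len * batch * bytes_per_elem

# Hypothetical 32-layer model, 8 KV heads of dim 128, one 65536-token sequence:
bf16 = kv_cache_bytes(32, 8, 128, 65536, 1, 2)
fp8 = kv_cache_bytes(32, 8, 128, 65536, 1, 1)
print(f"bf16: {bf16 / 2**30:.1f} GiB  fp8_e4m3: {fp8 / 2**30:.1f} GiB")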
--- a/swarm-control/indra-tts-server/Dockerfile
+++ b/swarm-control/indra-tts-server/Dockerfile
@@ -1,37 +1,33 @@
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
 
-# Prevent interactive prompts
 ENV DEBIAN_FRONTEND=noninteractive
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
-# 1. Install Python 3.12 and SoX dependencies
-RUN apt-get update && apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa -y && \
-    apt-get update && apt-get install -y \
-    python3.12 \
-    python3.12-dev \
-    curl \
-    git \
-    libsndfile1 \
-    ffmpeg \
-    sox \
-    libsox-dev && \
+# No deadsnakes PPA needed. Native Python 3.10 works perfectly.
+RUN apt-get update && apt-get install -y \
+    python3 python3-dev python3-pip curl git libsndfile1 ffmpeg sox libsox-dev ninja-build && \
     rm -rf /var/lib/apt/lists/*
 
-# 2. Use the official bootstrap to install a clean Pip for 3.12
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
-
 WORKDIR /app
 
-# 3. Explicitly install BOTH torch and torchaudio from the cu124 index
-RUN python3.12 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
-RUN python3.12 -m pip install --no-cache-dir fastapi uvicorn numpy soundfile
+# Install Torch and core packages
+RUN python3 -m pip install --no-cache-dir torch==2.6.0 torchaudio --index-url https://download.pytorch.org/whl/cu124
 
-# 4. Install the local Qwen3-TTS requirements
-RUN python3.12 -m pip install --no-cache-dir faster-qwen3-tts
+# 1. Install foundational build tools and numpy first
+RUN python3 -m pip install --no-cache-dir numpy setuptools wheel ninja packaging psutil
 
-COPY tts-server.py .
+# 2. Install the rest of the stack that relies on numpy being present
+RUN python3 -m pip install --no-cache-dir fastapi uvicorn soundfile librosa transformers==4.57.3 accelerate sox onnxruntime
+
+# Force ABI compatibility for the C++ compiler
+ENV _GLIBCXX_USE_CXX11_ABI=0
+ENV MAX_JOBS=8
+
+# Install flash-attn
+RUN python3 -m pip install --no-cache-dir flash-attn --no-build-isolation
+
+COPY swarm-control/indra-tts-server/tts-server.py .
 
 EXPOSE 8002
-CMD ["python3.12", "tts-server.py"]
+CMD ["python3", "tts-server.py"]
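Worth noting about the new Dockerfile: flash-attn is installed with --no-build-isolation, which only works because torch, ninja, packaging, and psutil already exist in earlier layers; that is exactly what step 1 is for. A quick sanity check to run inside the built image (a sketch; the printed versions are whatever the build actually produced):

# Run inside the container: confirms flash-attn imports against the torch
# it was compiled for and that the GPU is actually visible.
import torch
import flash_attn

print("torch", torch.__version__, "built for CUDA", torch.version.cuda)
print("flash-attn", flash_attn.__version__)
assert torch.cuda.is_available(), "CUDA not visible; check nvidia-container-toolkit"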
--- a/swarm-control/indra-tts-server/tts-server.py
+++ b/swarm-control/indra-tts-server/tts-server.py
@@ -1,74 +1,98 @@
 import os
 import torch
-import numpy as np
 import io
-import wave
+import soundfile as sf
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import Response
 from pydantic import BaseModel
-from faster_qwen3_tts import FasterQwen3TTS
+from qwen_tts import Qwen3TTSModel
 
-app = FastAPI(title="Indra tts")
+# --- 1. HARDWARE OPTIMIZATIONS ---
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision('high')
 
-if not torch.cuda.is_available():
-    raise RuntimeError("Mouth cannot find CUDA. Check nvidia-container-toolkit.")
-print(f"Loading model on: {torch.cuda.get_device_name(0)}")
+# NEW: Confine PyTorch CPU threads. On massive servers, PyTorch tries to use
+# all available cores for background tasks, causing severe context-switching lag.
+torch.set_num_threads(4)
 
-# Load the Base model for high-fidelity mimicry
-model = FasterQwen3TTS.from_pretrained(
+app = FastAPI(title="Indra Mouth - Qwen3-TTS Official (Optimized)")
+prompt_cache = {}
+
+print(f"Loading Official Qwen3-TTS on: {torch.cuda.get_device_name(0)}")
+
+model = Qwen3TTSModel.from_pretrained(
     "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-    device="cuda:0", # Targets GPU 7
-    dtype=torch.bfloat16
+    device_map="cuda:0",
+    dtype=torch.bfloat16,
+    attn_implementation="sdpa"
+)
+model.eval()
+
+# --- 2. THE SURGICAL COMPILE ---
+# We bypass the wrapper class and strictly compile the heavy LLM engine
+import torch._dynamo
+import torch._inductor.config
+
+torch._dynamo.config.suppress_errors = True
+# Expand the cache limit so it doesn't thrash when sequence lengths vary
+torch._dynamo.config.cache_size_limit = 128
+# Force inductor to use the fastest possible memory layouts for L40S
+torch._inductor.config.coordinate_descent_tuning = True
+
+print("Compiling Autoregressive Engine (Dynamic Shapes)...")
+model.talker = torch.compile(
+    model.talker,
+    mode="max-autotune", # The most aggressive optimization level available
+    dynamic=True # CRITICAL: Tells the compiler the audio length will grow
 )
 
 class TTSRequest(BaseModel):
-    model: str = "tts-1" # ignored by backend, here to satisfy modelix router
     input: str
     voice: str = "oni"
-    response_format: str = "wav"
     seed: int = 42
 
 @app.post("/v1/audio/speech")
 async def generate_speech(request: TTSRequest):
     try:
-        voice_file = f"{request.voice}.wav"
-        base_path = "/mnt/nvme3n1/swarm/voice-samples"
-        ref_path = os.path.join(base_path, voice_file)
-        txt_path = os.path.splitext(ref_path)[0] + ".txt"
+        voice_key = request.voice
+        if voice_key not in prompt_cache:
+            base_path = "/mnt/nvme3n1/swarm/voice-samples"
+            ref_path = os.path.join(base_path, f"{voice_key}.wav")
+            txt_path = os.path.join(base_path, f"{voice_key}.txt")
 
-        ref_text = None
-        if os.path.exists(txt_path):
-            with open(txt_path, "r") as f:
-                ref_text = f.read().strip()
+            if not os.path.exists(ref_path):
+                raise FileNotFoundError(f"Voice sample {ref_path} not found.")
+
+            ref_text = None
+            if os.path.exists(txt_path):
+                with open(txt_path, "r") as f:
+                    ref_text = f.read().strip()
+
+            prompt_cache[voice_key] = model.create_voice_clone_prompt(
+                ref_audio=ref_path,
+                ref_text=ref_text,
+                x_vector_only_mode=(ref_text is None)
+            )
+            print(f"--- Cached pristine voice prompt: {voice_key} ---")
 
-        # Fix the seed for the persona identity
         torch.manual_seed(request.seed)
 
-        full_audio = []
-        # Non-streaming call is fine here since it takes <1s on your L40S
-        audio_data, sample_rate = model.generate_voice_clone(
-            text=request.input,
-            language="English",
-            ref_audio=ref_path,
-            ref_text=ref_text,
-            xvec_only=(ref_text is None)
-        )
-
-        audio_data = np.array(audio_data)
-        audio_data = audio_data.flatten()
-
-        # Convert Float32 to Int16 for standard WAV compatibility
-        audio_int16 = (audio_data * 32767).astype(np.int16)
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+            wavs, sr = model.generate_voice_clone(
+                text=[request.input],
+                language=["English"],
+                voice_clone_prompt=prompt_cache[voice_key]
+            )
 
         wav_io = io.BytesIO()
-        with wave.open(wav_io, 'wb') as wav_file:
-            wav_file.setnchannels(1)
-            wav_file.setsampwidth(2)
-            wav_file.setframerate(sample_rate)
-            wav_file.writeframes(audio_int16.tobytes())
+        sf.write(wav_io, wavs[0], sr, format='WAV', subtype='FLOAT')
 
         wav_io.seek(0)
 
         return Response(content=wav_io.getvalue(), media_type="audio/wav")
 
     except Exception as e:
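The dynamic=True flag on the talker compile is the load-bearing detail: autoregressive decode grows the sequence every step, so a static-shape compile would recompile at each new length until it exhausts the cache, which is also why cache_size_limit is raised to 128. A standalone illustration of the idea (a toy function, not the Qwen3-TTS talker):

import torch

@torch.compile(mode="max-autotune", dynamic=True)
def toy_step(x: torch.Tensor) -> torch.Tensor:
    # stand-in for one forward pass over a sequence that grows each step
    return torch.tanh(x @ x.transpose(-1, -2)).sum()

x = torch.randn(1, 16, 64)
for _ in range(3):
    x = torch.cat([x, torch.randn(1, 16, 64)], dim=1)  # sequence length grows
    toy_step(x)  # dynamic=True: one compile covers all lengths, no thrash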
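For reference, the endpoint keeps its OpenAI-style shape even with the model and response_format fields dropped from TTSRequest. A minimal client sketch against the compose port mapping (the "oni" voice assumes an oni.wav under the mounted voice-samples directory):

import requests

resp = requests.post(
    "http://localhost:8002/v1/audio/speech",
    json={"input": "Hello from the swarm.", "voice": "oni", "seed": 42},
    timeout=120,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # float32 WAV bytes, per the sf.write subtype above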