introduced the qwen_35b swarm

2026-04-27 12:05:54 +10:00
parent ea4c11e32f
commit 1a255f8205


@@ -31,6 +31,7 @@ services:
--trust-remote-code
--enable-piecewise-cuda-graph
--schedule-policy lpm
--tool-call-parser gemma4
deploy:
resources:
reservations:
@@ -39,7 +40,7 @@ services:
device_ids: ['0', '7']
capabilities: [gpu]
# --- TTS ---
# Physical: 7 | Container: 0
tts:
build: ./swarm-control/indra-tts-server
@@ -64,7 +65,7 @@ services:
capabilities: [gpu]
# --- STT ---
stt:
image: ghcr.io/speaches-ai/speaches:latest-cuda
depends_on:
@@ -87,6 +88,194 @@ services:
device_ids: ['0']
capabilities: [gpu]
# ==========================================
# TIER 1: COMMAND LOBE (GPUs 1 & 2)
# ==========================================
coder_next:
image: lmsysorg/sglang:latest
ipc: host
environment:
- CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 1, 2
- HUGGING_FACE_HUB_TOKEN=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
volumes:
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
- /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
ports:
- "3001:3001"
command: >
sglang serve
--model-path Qwen/Qwen3-Coder-Next-FP8
--tp 2
--port 3001
--host 0.0.0.0
--hf-chat-template-name tool_use
--mem-fraction-static 0.95
--context-length 32768
--trust-remote-code
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['1', '2']
capabilities: [gpu]
# ==========================================
# TIER 2: SWARM LOBE (GPUs 3, 4, 5)
# ==========================================
swarm_router:
image: lmsysorg/sgl-model-gateway:latest
ports:
- "4000:4000"
command: >
--port 4000
--worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
--policy cache_aware
depends_on:
- worker_1
- worker_2
- worker_3
networks:
- default
worker_1:
image: lmsysorg/sglang:latest
ipc: host
environment:
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 3
- HUGGING_FACE_HUB_TOKEN=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
volumes:
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
- /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
ports:
- "3002:3002"
command: >
sglang serve
--model-path Qwen/Qwen3.6-35B-A3B-FP8
--tp 1
--port 3002
--host 0.0.0.0
--mem-fraction-static 0.9
--context-length 131072
--kv-cache-dtype fp8_e4m3
--allow-auto-truncate
--max-running-requests 256
--chunked-prefill-size 2048
--schedule-policy lpm
--tool-call-parser qwen3_coder
--reasoning-parser qwen3
--hf-chat-template-name tool_use
--trust-remote-code
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['3']
capabilities: [gpu]
worker_2:
image: lmsysorg/sglang:latest
ipc: host
environment:
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 4
- HUGGING_FACE_HUB_TOKEN=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
volumes:
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
- /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
ports:
- "3003:3003"
command: >
sglang serve
--model-path Qwen/Qwen3.6-35B-A3B-FP8
--tp 1
--port 3003
--host 0.0.0.0
--mem-fraction-static 0.90
--context-length 131072
--kv-cache-dtype fp8_e4m3
--allow-auto-truncate
--max-running-requests 256
--chunked-prefill-size 2048
--schedule-policy lpm
--tool-call-parser qwen3_coder
--reasoning-parser qwen3
--hf-chat-template-name tool_use
--trust-remote-code
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['4']
capabilities: [gpu]
worker_3:
image: lmsysorg/sglang:latest
ipc: host
environment:
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 5
- HUGGING_FACE_HUB_TOKEN=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
volumes:
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
- /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
ports:
- "3004:3004"
command: >
sglang serve
--model-path Qwen/Qwen3.6-35B-A3B-FP8
--tp 1
--port 3004
--host 0.0.0.0
--mem-fraction-static 0.90
--context-length 131072
--kv-cache-dtype fp8_e4m3
--allow-auto-truncate
--max-running-requests 256
--chunked-prefill-size 2048
--schedule-policy lpm
--tool-call-parser qwen3_coder
--reasoning-parser qwen3
--hf-chat-template-name tool_use
--trust-remote-code
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['5']
capabilities: [gpu]
# ==========================================
# TIER 3: MEMORY LOBE (GPU 6)
# ==========================================
embeddings:
image: ghcr.io/huggingface/text-embeddings-inference:latest
environment:
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
- HUGGING_FACE_HUB_TOKEN=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
volumes:
# TEI uses /data as its internal cache directory
- /mnt/nvme3n1/swarm/huggingface_cache:/data
ports:
- "8000:8000"
command: >
--model-id jinaai/jina-embeddings-v2-base-code
--max-client-batch-size 1024
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['6']
capabilities: [gpu]
# --- Networks ---
networks:
default:
name: swarm-network
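
A minimal usage sketch for the new TIER 2 swarm lobe, assuming the sgl-model-gateway router forwards SGLang's OpenAI-compatible /v1/chat/completions endpoint on the published port 4000 and that the served model id matches the --model-path value; the tool definition and prompt are hypothetical, included only to exercise the workers' --tool-call-parser qwen3_coder setting:

import requests

ROUTER_URL = "http://localhost:4000"  # swarm_router, fans out to worker_1..worker_3

# One tool in OpenAI function-calling format; the workers' --tool-call-parser
# setting is what turns the model's raw output into the structured tool_calls
# field read below. The tool name and schema here are illustrative only.
tools = [{
    "type": "function",
    "function": {
        "name": "read_file",
        "description": "Read a file from the workspace and return its contents.",
        "parameters": {
            "type": "object",
            "properties": {"path": {"type": "string"}},
            "required": ["path"],
        },
    },
}]

resp = requests.post(
    f"{ROUTER_URL}/v1/chat/completions",
    json={
        # Assumption: the served model id matches the --model-path value.
        "model": "Qwen/Qwen3.6-35B-A3B-FP8",
        "messages": [{"role": "user", "content": "Open README.md and summarise it."}],
        "tools": tools,
        "max_tokens": 512,
    },
    timeout=120,
)
resp.raise_for_status()
message = resp.json()["choices"][0]["message"]
print(message.get("tool_calls") or message["content"])

Because the router runs with --policy cache_aware, repeated requests that share a prefix should tend to land on the same worker and reuse its cached prefill.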
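
A similar sketch for the TIER 3 memory lobe, assuming the text-embeddings-inference container actually listens on the published port 8000 and exposes TEI's native POST /embed endpoint:

import requests

EMBEDDINGS_URL = "http://localhost:8000"  # text-embeddings-inference on GPU 6

# TEI's /embed endpoint takes {"inputs": ...} and returns one vector per input.
resp = requests.post(
    f"{EMBEDDINGS_URL}/embed",
    json={"inputs": ["def add(a, b):\n    return a + b"]},
    timeout=30,
)
resp.raise_for_status()
vectors = resp.json()
print(len(vectors), "embeddings of dimension", len(vectors[0]))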