introduced the qwen_35b swarm
This commit is contained in:
@@ -31,6 +31,7 @@ services:
|
||||
--trust-remote-code
|
||||
--enable-piecewise-cuda-graph
|
||||
--schedule-policy lpm
|
||||
--tool-call-parser gemma4
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
@@ -87,6 +88,194 @@ services:
|
||||
device_ids: ['0']
|
||||
capabilities: [gpu]
|
||||
|
||||
|
||||
|
||||
# ==========================================
|
||||
# TIER 1: COMMAND LOBE (GPUs 1 & 2)
|
||||
# ==========================================
|
||||
coder_next:
  # Tier 1 service: a single SGLang server for Qwen3-Coder-Next-FP8, sharded
  # across two GPUs with tensor parallelism (--tp 2) and published on 3001.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: lmsysorg/sglang:latest
  ipc: host
  environment:
    # Host GPUs 1 and 2 (reserved under `deploy`) are addressed as 0 and 1
    # from inside the container.
    CUDA_VISIBLE_DEVICES: "0,1"
    # SECURITY: the token is read from the shell environment or a `.env` file
    # instead of being committed. The token that used to be hard-coded here
    # has been exposed in version control and should be revoked.
    HUGGING_FACE_HUB_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
  volumes:
    - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
    - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
  ports:
    - "3001:3001"
  # Folded scalar: the lines below are joined with spaces into one command.
  # NOTE(review): the tool_use chat template is selected but, unlike the
  # worker_* services, no --tool-call-parser is passed - confirm intended.
  command: >
    sglang serve
    --model-path Qwen/Qwen3-Coder-Next-FP8
    --tp 2
    --port 3001
    --host 0.0.0.0
    --hf-chat-template-name tool_use
    --mem-fraction-static 0.95
    --context-length 32768
    --trust-remote-code
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['1', '2']
            capabilities: [gpu]
|
||||
|
||||
# ==========================================
|
||||
# TIER 2: SWARM LOBE (GPUs 3, 4, 5)
|
||||
# ==========================================
|
||||
swarm_router:
  # Tier 2 entry point: load-balances requests across the three worker_*
  # SGLang servers using the cache_aware policy, published on port 4000.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: lmsysorg/sgl-model-gateway:latest
  ports:
    - "4000:4000"
  # The bind address is set explicitly so the published port is reachable
  # from outside the container whatever the gateway's built-in default is
  # (a loopback-only default would make "4000:4000" unreachable).
  command: >
    --host 0.0.0.0
    --port 4000
    --worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
    --policy cache_aware
  # Short-form depends_on only orders container start-up; it does not wait
  # for the workers to finish loading their models.
  depends_on:
    - worker_1
    - worker_2
    - worker_3
  networks:
    - default
|
||||
|
||||
worker_1:
  # Tier 2 worker 1 of 3: single-GPU SGLang server for Qwen3.6-35B-A3B-FP8
  # on port 3002, addressed by swarm_router as http://worker_1:3002.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: lmsysorg/sglang:latest
  ipc: host
  environment:
    # Host GPU 3 (reserved under `deploy`) is addressed as device 0 from
    # inside the container.
    CUDA_VISIBLE_DEVICES: "0"
    # SECURITY: the token is read from the shell environment or a `.env` file
    # instead of being committed. The token that used to be hard-coded here
    # has been exposed in version control and should be revoked.
    HUGGING_FACE_HUB_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
  volumes:
    - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
    - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
  ports:
    - "3002:3002"
  # Folded scalar: the lines below are joined with spaces into one command.
  # Flags are kept identical across worker_1..3 apart from --port.
  command: >
    sglang serve
    --model-path Qwen/Qwen3.6-35B-A3B-FP8
    --tp 1
    --port 3002
    --host 0.0.0.0
    --mem-fraction-static 0.90
    --context-length 131072
    --kv-cache-dtype fp8_e4m3
    --allow-auto-truncate
    --max-running-requests 256
    --chunked-prefill-size 2048
    --schedule-policy lpm
    --tool-call-parser qwen3_coder
    --reasoning-parser qwen3
    --hf-chat-template-name tool_use
    --trust-remote-code
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['3']
            capabilities: [gpu]
|
||||
|
||||
worker_2:
  # Tier 2 worker 2 of 3: single-GPU SGLang server for Qwen3.6-35B-A3B-FP8
  # on port 3003, addressed by swarm_router as http://worker_2:3003.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: lmsysorg/sglang:latest
  ipc: host
  environment:
    # Host GPU 4 (reserved under `deploy`) is addressed as device 0 from
    # inside the container.
    CUDA_VISIBLE_DEVICES: "0"
    # SECURITY: the token is read from the shell environment or a `.env` file
    # instead of being committed. The token that used to be hard-coded here
    # has been exposed in version control and should be revoked.
    HUGGING_FACE_HUB_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
  volumes:
    - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
    - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
  ports:
    - "3003:3003"
  # Folded scalar: the lines below are joined with spaces into one command.
  # Flags are kept identical across worker_1..3 apart from --port.
  command: >
    sglang serve
    --model-path Qwen/Qwen3.6-35B-A3B-FP8
    --tp 1
    --port 3003
    --host 0.0.0.0
    --mem-fraction-static 0.90
    --context-length 131072
    --kv-cache-dtype fp8_e4m3
    --allow-auto-truncate
    --max-running-requests 256
    --chunked-prefill-size 2048
    --schedule-policy lpm
    --tool-call-parser qwen3_coder
    --reasoning-parser qwen3
    --hf-chat-template-name tool_use
    --trust-remote-code
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['4']
            capabilities: [gpu]
|
||||
|
||||
worker_3:
  # Tier 2 worker 3 of 3: single-GPU SGLang server for Qwen3.6-35B-A3B-FP8
  # on port 3004, addressed by swarm_router as http://worker_3:3004.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: lmsysorg/sglang:latest
  ipc: host
  environment:
    # Host GPU 5 (reserved under `deploy`) is addressed as device 0 from
    # inside the container.
    CUDA_VISIBLE_DEVICES: "0"
    # SECURITY: the token is read from the shell environment or a `.env` file
    # instead of being committed. The token that used to be hard-coded here
    # has been exposed in version control and should be revoked.
    HUGGING_FACE_HUB_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
  volumes:
    - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
    - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
  ports:
    - "3004:3004"
  # Folded scalar: the lines below are joined with spaces into one command.
  # Flags are kept identical across worker_1..3 apart from --port.
  command: >
    sglang serve
    --model-path Qwen/Qwen3.6-35B-A3B-FP8
    --tp 1
    --port 3004
    --host 0.0.0.0
    --mem-fraction-static 0.90
    --context-length 131072
    --kv-cache-dtype fp8_e4m3
    --allow-auto-truncate
    --max-running-requests 256
    --chunked-prefill-size 2048
    --schedule-policy lpm
    --tool-call-parser qwen3_coder
    --reasoning-parser qwen3
    --hf-chat-template-name tool_use
    --trust-remote-code
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['5']
            capabilities: [gpu]
|
||||
|
||||
# ==========================================
|
||||
# TIER 3: MEMORY LOBE (GPU 6)
|
||||
# ==========================================
|
||||
embeddings:
  # Tier 3 service: Hugging Face text-embeddings-inference (TEI) serving
  # jinaai/jina-embeddings-v2-base-code on host GPU 6, published on 8000.
  # NOTE(review): `latest` is a moving tag; consider pinning a release tag.
  image: ghcr.io/huggingface/text-embeddings-inference:latest
  environment:
    # Host GPU 6 (reserved under `deploy`) is addressed as device 0 from
    # inside the container.
    CUDA_VISIBLE_DEVICES: "0"
    # SECURITY: the token is read from the shell environment or a `.env` file
    # instead of being committed. The token that used to be hard-coded here
    # has been exposed in version control and should be revoked.
    HUGGING_FACE_HUB_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
    # TEI's documented token variable is HF_TOKEN, so the same value is
    # exposed under that name as well.
    HF_TOKEN: "${HUGGING_FACE_HUB_TOKEN:?set HUGGING_FACE_HUB_TOKEN in the shell or .env}"
  volumes:
    # TEI uses /data as its internal cache directory
    - /mnt/nvme3n1/swarm/huggingface_cache:/data
  ports:
    - "8000:8000"
  # --port must match the container side of the mapping above: the TEI image
  # listens on port 80 unless told otherwise, which would leave the
  # published port 8000 with nothing behind it.
  command: >
    --model-id jinaai/jina-embeddings-v2-base-code
    --port 8000
    --max-client-batch-size 1024
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ['6']
            capabilities: [gpu]
|
||||
|
||||
|
||||
|
||||
# --- Networks ---
|
||||
|
||||
# Gives the project's `default` network the fixed name `swarm-network`
# instead of a name derived from the Compose project.
networks:
  default:
    name: "swarm-network"
|
||||
|
||||
Reference in New Issue
Block a user