Removed the qwen3.6-35b swarm and added qwen3.6-27b dense and gemma-4-31b dense
This commit is contained in:
@@ -64,7 +64,6 @@ services:
|
|||||||
device_ids: ['7']
|
device_ids: ['7']
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
|
|
||||||
# --- STT ---
|
# --- STT ---
|
||||||
stt:
|
stt:
|
||||||
image: ghcr.io/speaches-ai/speaches:latest-cuda
|
image: ghcr.io/speaches-ai/speaches:latest-cuda
|
||||||
@@ -88,8 +87,6 @@ services:
|
|||||||
device_ids: ['0']
|
device_ids: ['0']
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ==========================================
|
# ==========================================
|
||||||
# TIER 1: COMMAND LOBE (GPUs 1 & 2)
|
# TIER 1: COMMAND LOBE (GPUs 1 & 2)
|
||||||
# ==========================================
|
# ==========================================
|
||||||
@@ -114,6 +111,7 @@ services:
|
|||||||
--mem-fraction-static 0.95
|
--mem-fraction-static 0.95
|
||||||
--context-length 32768
|
--context-length 32768
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
|
--tool-call-parser qwen3_coder
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
@@ -123,28 +121,15 @@ services:
|
|||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
# ==========================================
|
# ==========================================
|
||||||
# TIER 2: SWARM LOBE (GPUs 3, 4, 5)
|
# TIER 2: HETEROGENEOUS MODELS (GPUs 3, 4, 5, 6)
|
||||||
# ==========================================
|
# ==========================================
|
||||||
swarm_router:
|
|
||||||
image: lmsysorg/sgl-model-gateway:latest
|
|
||||||
ports:
|
|
||||||
- "4000:4000"
|
|
||||||
command: >
|
|
||||||
--port 4000
|
|
||||||
--worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
|
|
||||||
--policy cache_aware
|
|
||||||
depends_on:
|
|
||||||
- worker_1
|
|
||||||
- worker_2
|
|
||||||
- worker_3
|
|
||||||
networks:
|
|
||||||
- default
|
|
||||||
|
|
||||||
worker_1:
|
|
||||||
image: lmsysorg/sglang:latest
|
qwen_27b:
|
||||||
|
image: lmsysorg/sglang:dev
|
||||||
ipc: host
|
ipc: host
|
||||||
environment:
|
environment:
|
||||||
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 3
|
- CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 3, 4
|
||||||
- HUGGING_FACE_HUB_TOKEN=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
|
- HUGGING_FACE_HUB_TOKEN=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
|
||||||
volumes:
|
volumes:
|
||||||
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
|
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
|
||||||
@@ -152,35 +137,36 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "3002:3002"
|
- "3002:3002"
|
||||||
command: >
|
command: >
|
||||||
sglang serve
|
python3 -m sglang.launch_server
|
||||||
--model-path Qwen/Qwen3.6-35B-A3B-FP8
|
--model-path Qwen/Qwen3.6-27B-FP8
|
||||||
--tp 1
|
--tp 2
|
||||||
--port 3002
|
--port 3002
|
||||||
--host 0.0.0.0
|
--host 0.0.0.0
|
||||||
--mem-fraction-static 0.9
|
--mem-fraction-static 0.85
|
||||||
--context-length 131072
|
--context-length 131072
|
||||||
--kv-cache-dtype fp8_e4m3
|
|
||||||
--allow-auto-truncate
|
|
||||||
--max-running-requests 256
|
|
||||||
--chunked-prefill-size 2048
|
|
||||||
--schedule-policy lpm
|
|
||||||
--tool-call-parser qwen3_coder
|
--tool-call-parser qwen3_coder
|
||||||
--reasoning-parser qwen3
|
--reasoning-parser qwen3
|
||||||
--hf-chat-template-name tool_use
|
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
|
|
||||||
|
cap_add:
|
||||||
|
- SYS_NICE
|
||||||
|
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
devices:
|
devices:
|
||||||
- driver: nvidia
|
- driver: nvidia
|
||||||
device_ids: ['3']
|
device_ids: ['3', '4']
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
worker_2:
|
|
||||||
image: lmsysorg/sglang:latest
|
gemma_31b:
|
||||||
|
build: ./swarm-control/persona
|
||||||
ipc: host
|
ipc: host
|
||||||
|
cap_add:
|
||||||
|
- SYS_NICE
|
||||||
environment:
|
environment:
|
||||||
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 4
|
- CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 5, 6
|
||||||
- HUGGING_FACE_HUB_TOKEN=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
|
- HUGGING_FACE_HUB_TOKEN=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
|
||||||
volumes:
|
volumes:
|
||||||
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
|
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
|
||||||
@@ -188,68 +174,27 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "3003:3003"
|
- "3003:3003"
|
||||||
command: >
|
command: >
|
||||||
sglang serve
|
python3 -m sglang.launch_server
|
||||||
--model-path Qwen/Qwen3.6-35B-A3B-FP8
|
--model-path RedHatAI/gemma-4-31B-it-FP8-Dynamic
|
||||||
--tp 1
|
--tokenizer-path google/gemma-4-31B-it
|
||||||
|
--tp 2
|
||||||
--port 3003
|
--port 3003
|
||||||
--host 0.0.0.0
|
--host 0.0.0.0
|
||||||
--mem-fraction-static 0.90
|
--mem-fraction-static 0.85
|
||||||
--context-length 131072
|
--context-length 65536
|
||||||
--kv-cache-dtype fp8_e4m3
|
--kv-cache-dtype fp8_e4m3
|
||||||
--allow-auto-truncate
|
|
||||||
--max-running-requests 256
|
|
||||||
--chunked-prefill-size 2048
|
|
||||||
--schedule-policy lpm
|
|
||||||
--tool-call-parser qwen3_coder
|
|
||||||
--reasoning-parser qwen3
|
|
||||||
--hf-chat-template-name tool_use
|
|
||||||
--trust-remote-code
|
--trust-remote-code
|
||||||
|
--tool-call-parser gemma4
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
devices:
|
devices:
|
||||||
- driver: nvidia
|
- driver: nvidia
|
||||||
device_ids: ['4']
|
device_ids: ['5', '6']
|
||||||
capabilities: [gpu]
|
|
||||||
|
|
||||||
worker_3:
|
|
||||||
image: lmsysorg/sglang:latest
|
|
||||||
ipc: host
|
|
||||||
environment:
|
|
||||||
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 5
|
|
||||||
- HUGGING_FACE_HUB_TOKEN=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
|
|
||||||
volumes:
|
|
||||||
- /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
|
|
||||||
- /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
|
|
||||||
ports:
|
|
||||||
- "3004:3004"
|
|
||||||
command: >
|
|
||||||
sglang serve
|
|
||||||
--model-path Qwen/Qwen3.6-35B-A3B-FP8
|
|
||||||
--tp 1
|
|
||||||
--port 3004
|
|
||||||
--host 0.0.0.0
|
|
||||||
--mem-fraction-static 0.90
|
|
||||||
--context-length 131072
|
|
||||||
--kv-cache-dtype fp8_e4m3
|
|
||||||
--allow-auto-truncate
|
|
||||||
--max-running-requests 256
|
|
||||||
--chunked-prefill-size 2048
|
|
||||||
--schedule-policy lpm
|
|
||||||
--tool-call-parser qwen3_coder
|
|
||||||
--reasoning-parser qwen3
|
|
||||||
--hf-chat-template-name tool_use
|
|
||||||
--trust-remote-code
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
device_ids: ['5']
|
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
# ==========================================
|
# ==========================================
|
||||||
# TIER 3: MEMORY LOBE (GPU 6)
|
# TIER 3: MEMORY (GPU 6)
|
||||||
# ==========================================
|
# ==========================================
|
||||||
embeddings:
|
embeddings:
|
||||||
image: ghcr.io/huggingface/text-embeddings-inference:latest
|
image: ghcr.io/huggingface/text-embeddings-inference:latest
|
||||||
@@ -257,7 +202,6 @@ services:
|
|||||||
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
|
- CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
|
||||||
- HUGGING_FACE_HUB_TOKEN=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
|
- HUGGING_FACE_HUB_TOKEN=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
|
||||||
volumes:
|
volumes:
|
||||||
# TEI uses /data as its internal cache directory
|
|
||||||
- /mnt/nvme3n1/swarm/huggingface_cache:/data
|
- /mnt/nvme3n1/swarm/huggingface_cache:/data
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8000:8000"
|
||||||
@@ -272,8 +216,6 @@ services:
|
|||||||
device_ids: ['6']
|
device_ids: ['6']
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# --- Networks ---
|
# --- Networks ---
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
Reference in New Issue
Block a user