From 1a255f8205555b2b32889dc2329084b72157808a Mon Sep 17 00:00:00 2001
From: Nathan
Date: Mon, 27 Apr 2026 12:05:54 +1000
Subject: [PATCH] introduced the qwen_35b swarm

---
 docker-compose.yml | 193 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 2 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 24f0054..14d7233 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -31,6 +31,7 @@ services:
       --trust-remote-code
       --enable-piecewise-cuda-graph
       --schedule-policy lpm
+      --tool-call-parser gemma4
     deploy:
       resources:
         reservations:
@@ -39,7 +40,7 @@
               device_ids: ['0', '7']
               capabilities: [gpu]
 
-# --- TTS ---
+  # --- TTS ---
   # Physical: 7 | Container: 0
   tts:
     build: ./swarm-control/indra-tts-server
@@ -64,7 +65,7 @@
               capabilities: [gpu]
 
 
-# --- STT ---
+  # --- STT ---
   stt:
     image: ghcr.io/speaches-ai/speaches:latest-cuda
     depends_on:
@@ -87,6 +88,194 @@
               device_ids: ['0']
               capabilities: [gpu]
 
+
+
+  # ==========================================
+  # TIER 1: COMMAND LOBE (GPUs 1 & 2)
+  # ==========================================
+  coder_next:
+    image: lmsysorg/sglang:latest
+    ipc: host
+    environment:
+      - CUDA_VISIBLE_DEVICES=0,1 # Mapped from physical 1, 2
+      # Token comes from the host env / .env file -- never commit a real token.
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:?set HF_TOKEN in .env}
+    volumes:
+      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
+      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
+    ports:
+      - "3001:3001"
+    command: >
+      sglang serve
+      --model-path Qwen/Qwen3-Coder-Next-FP8
+      --tp 2
+      --port 3001
+      --host 0.0.0.0
+      --hf-chat-template-name tool_use
+      --mem-fraction-static 0.95
+      --context-length 32768
+      --trust-remote-code
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['1', '2']
+              capabilities: [gpu]
+
+  # ==========================================
+  # TIER 2: SWARM LOBE (GPUs 3, 4, 5)
+  # ==========================================
+  swarm_router:
+    image: lmsysorg/sgl-model-gateway:latest
+    ports:
+      - "4000:4000"
+    command: >
+      --port 4000
+      --worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
+      --policy cache_aware
+    depends_on:
+      - worker_1
+      - worker_2
+      - worker_3
+    networks:
+      - default
+
+  worker_1:
+    image: lmsysorg/sglang:latest
+    ipc: host
+    environment:
+      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 3
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:?set HF_TOKEN in .env}
+    volumes:
+      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
+      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
+    ports:
+      - "3002:3002"
+    command: >
+      sglang serve
+      --model-path Qwen/Qwen3.6-35B-A3B-FP8
+      --tp 1
+      --port 3002
+      --host 0.0.0.0
+      --mem-fraction-static 0.90
+      --context-length 131072
+      --kv-cache-dtype fp8_e4m3
+      --allow-auto-truncate
+      --max-running-requests 256
+      --chunked-prefill-size 2048
+      --schedule-policy lpm
+      --tool-call-parser qwen3_coder
+      --reasoning-parser qwen3
+      --hf-chat-template-name tool_use
+      --trust-remote-code
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['3']
+              capabilities: [gpu]
+
+  worker_2:
+    image: lmsysorg/sglang:latest
+    ipc: host
+    environment:
+      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 4
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:?set HF_TOKEN in .env}
+    volumes:
+      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
+      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
+    ports:
+      - "3003:3003"
+    command: >
+      sglang serve
+      --model-path Qwen/Qwen3.6-35B-A3B-FP8
+      --tp 1
+      --port 3003
+      --host 0.0.0.0
+      --mem-fraction-static 0.90
+      --context-length 131072
+      --kv-cache-dtype fp8_e4m3
+      --allow-auto-truncate
+      --max-running-requests 256
+      --chunked-prefill-size 2048
+      --schedule-policy lpm
+      --tool-call-parser qwen3_coder
+      --reasoning-parser qwen3
+      --hf-chat-template-name tool_use
+      --trust-remote-code
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['4']
+              capabilities: [gpu]
+
+  worker_3:
+    image: lmsysorg/sglang:latest
+    ipc: host
+    environment:
+      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 5
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:?set HF_TOKEN in .env}
+    volumes:
+      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
+      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
+    ports:
+      - "3004:3004"
+    command: >
+      sglang serve
+      --model-path Qwen/Qwen3.6-35B-A3B-FP8
+      --tp 1
+      --port 3004
+      --host 0.0.0.0
+      --mem-fraction-static 0.90
+      --context-length 131072
+      --kv-cache-dtype fp8_e4m3
+      --allow-auto-truncate
+      --max-running-requests 256
+      --chunked-prefill-size 2048
+      --schedule-policy lpm
+      --tool-call-parser qwen3_coder
+      --reasoning-parser qwen3
+      --hf-chat-template-name tool_use
+      --trust-remote-code
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['5']
+              capabilities: [gpu]
+
+  # ==========================================
+  # TIER 3: MEMORY LOBE (GPU 6)
+  # ==========================================
+  embeddings:
+    image: ghcr.io/huggingface/text-embeddings-inference:latest
+    environment:
+      - CUDA_VISIBLE_DEVICES=0 # Mapped from physical 6
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:?set HF_TOKEN in .env}
+    volumes:
+      # TEI uses /data as its internal cache directory
+      - /mnt/nvme3n1/swarm/huggingface_cache:/data
+    ports:
+      # TEI serves on port 80 inside the container by default
+      - "8000:80"
+    command: >
+      --model-id jinaai/jina-embeddings-v2-base-code
+      --max-client-batch-size 1024
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['6']
+              capabilities: [gpu]
+
+
+
+# --- Networks ---
+
 networks:
   default:
     name: swarm-network