diff --git a/docker-compose.yml b/docker-compose.yml
index 14d7233..0478980 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -64,7 +64,6 @@ services:
             device_ids: ['7']
             capabilities: [gpu]
-
   # --- STT ---
   stt:
     image: ghcr.io/speaches-ai/speaches:latest-cuda
@@ -88,9 +87,7 @@ services:
             device_ids: ['0']
             capabilities: [gpu]
-
-
-# ==========================================
+  # ==========================================
   # TIER 1: COMMAND LOBE (GPUs 1 & 2)
   # ==========================================
   coder_next:
@@ -114,6 +111,7 @@ services:
       --mem-fraction-static 0.95
       --context-length 32768
       --trust-remote-code
+      --tool-call-parser qwen3_coder
     deploy:
       resources:
         reservations:
@@ -123,28 +121,15 @@ services:
             capabilities: [gpu]
 
   # ==========================================
-  # TIER 2: SWARM LOBE (GPUs 3, 4, 5)
+  # TIER 2: HETEROGENEOUS MODELS (GPUs 3, 4, 5)
   # ==========================================
-  swarm_router:
-    image: lmsysorg/sgl-model-gateway:latest
-    ports:
-      - "4000:4000"
-    command: >
-      --port 4000
-      --worker-urls http://worker_1:3002 http://worker_2:3003 http://worker_3:3004
-      --policy cache_aware
-    depends_on:
-      - worker_1
-      - worker_2
-      - worker_3
-    networks:
-      - default
-  worker_1:
-    image: lmsysorg/sglang:latest
+
+  qwen_27b:
+    image: lmsysorg/sglang:dev
     ipc: host
     environment:
-      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 3
+      - CUDA_VISIBLE_DEVICES=0,1  # Mapped from physical 3, 4
       - HUGGING_FACE_HUB_TOKEN=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
     volumes:
       - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
@@ -152,35 +137,36 @@ services:
     ports:
       - "3002:3002"
     command: >
-      sglang serve
-      --model-path Qwen/Qwen3.6-35B-A3B-FP8
-      --tp 1
+      python3 -m sglang.launch_server
+      --model-path Qwen/Qwen3.6-27B-FP8
+      --tp 2
       --port 3002
       --host 0.0.0.0
-      --mem-fraction-static 0.9
+      --mem-fraction-static 0.85
       --context-length 131072
-      --kv-cache-dtype fp8_e4m3
-      --allow-auto-truncate
-      --max-running-requests 256
-      --chunked-prefill-size 2048
-      --schedule-policy lpm
-      --tool-call-parser qwen3_coder
-      --reasoning-parser qwen3
-      --hf-chat-template-name tool_use
       --trust-remote-code
+
+    cap_add:
+      - SYS_NICE
+
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['3']
+              device_ids: ['3', '4']
               capabilities: [gpu]
-  worker_2:
-    image: lmsysorg/sglang:latest
+
+  gemma_31b:
+    build: ./swarm-control/persona
     ipc: host
+    cap_add:
+      - SYS_NICE
     environment:
-      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 4
+      - CUDA_VISIBLE_DEVICES=0,1  # Mapped from physical 5, 6
       - HUGGING_FACE_HUB_TOKEN=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
     volumes:
       - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
@@ -188,68 +174,27 @@ services:
     ports:
       - "3003:3003"
     command: >
-      sglang serve
-      --model-path Qwen/Qwen3.6-35B-A3B-FP8
-      --tp 1
+      python3 -m sglang.launch_server
+      --model-path RedHatAI/gemma-4-31B-it-FP8-Dynamic
+      --tokenizer-path google/gemma-4-31B-it
+      --tp 2
       --port 3003
       --host 0.0.0.0
-      --mem-fraction-static 0.90
-      --context-length 131072
-      --kv-cache-dtype fp8_e4m3
-      --allow-auto-truncate
-      --max-running-requests 256
-      --chunked-prefill-size 2048
-      --schedule-policy lpm
-      --tool-call-parser qwen3_coder
-      --reasoning-parser qwen3
-      --hf-chat-template-name tool_use
+      --mem-fraction-static 0.85
+      --context-length 65536
+      --kv-cache-dtype fp8_e4m3
       --trust-remote-code
+      --tool-call-parser gemma4
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['4']
+              device_ids: ['5', '6']
               capabilities: [gpu]
-
-  worker_3:
-    image: lmsysorg/sglang:latest
-    ipc: host
-    environment:
-      - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 5
-      - HUGGING_FACE_HUB_TOKEN=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
-    volumes:
-      - /mnt/nvme3n1/swarm/huggingface_cache:/root/.cache/huggingface
-      - /mnt/nvme3n1/swarm/sglang_cache:/root/.cache/sglang
-    ports:
-      - "3004:3004"
-    command: >
-      sglang serve
-      --model-path Qwen/Qwen3.6-35B-A3B-FP8
-      --tp 1
-      --port 3004
-      --host 0.0.0.0
-      --mem-fraction-static 0.90
-      --context-length 131072
-      --kv-cache-dtype fp8_e4m3
-      --allow-auto-truncate
-      --max-running-requests 256
-      --chunked-prefill-size 2048
-      --schedule-policy lpm
-      --tool-call-parser qwen3_coder
-      --reasoning-parser qwen3
-      --hf-chat-template-name tool_use
-      --trust-remote-code
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              device_ids: ['5']
-              capabilities: [gpu]
-
-  # ==========================================
-  # TIER 3: MEMORY LOBE (GPU 6)
+
+  # ==========================================
+  # TIER 3: MEMORY (GPU 6)
   # ==========================================
   embeddings:
     image: ghcr.io/huggingface/text-embeddings-inference:latest
@@ -257,7 +202,6 @@ services:
       - CUDA_VISIBLE_DEVICES=0  # Mapped from physical 6
       - HUGGING_FACE_HUB_TOKEN=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
     volumes:
-      # TEI uses /data as its internal cache directory
       - /mnt/nvme3n1/swarm/huggingface_cache:/data
     ports:
       - "8000:8000"
@@ -272,8 +216,6 @@ services:
             device_ids: ['6']
             capabilities: [gpu]
-
-
 # --- Networks ---
 networks: