#!/bin/bash
# Node Matali: Gemma-4-26B-A4B-it
# GPU Mapping: 0, 7

# Launches the SGLang server for google/gemma-4-26b-a4b-it on two GPUs.
# Usage: run with no arguments for the default configuration; any arguments
# given to this script are appended to the sglang.launch_server command line.
#
# Failures are checked explicitly rather than via `set -eu`: the conda
# activation hooks are third-party shell code and are not assumed to be
# clean under strict mode.

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# 1. Point to the BIG drive
export HF_HOME=/mnt/nvme3n1/swarm/huggingface_cache
export SGLANG_CACHE_DIR=/mnt/nvme3n1/swarm/sglang_cache

# 2. Source the environment
# Stop here if conda cannot be set up; otherwise the server would be started
# with whatever python3 happens to be first on PATH.
readonly CONDA_SH=/home/isnai/anaconda3/etc/profile.d/conda.sh
[[ -r "$CONDA_SH" ]] || die "conda init script not readable: $CONDA_SH"
# shellcheck source=/dev/null
source "$CONDA_SH" || die "failed to source $CONDA_SH"
conda activate swarm || die "failed to activate conda environment 'swarm'"

# Expose only physical GPUs 0 and 7 to the server process.
export CUDA_VISIBLE_DEVICES=0,7
# 0 leaves NCCL peer-to-peer transport enabled (1 would disable it).
export NCCL_P2P_DISABLE=0

# 3. Launch
# Arguments are kept in an array so each flag/value is its own word and no
# backslash line continuations are needed.
server_args=(
  --model-path google/gemma-4-26b-a4b-it
  --tp 2   # two-way tensor parallelism, one rank per visible GPU
  --port 3000
  --host 0.0.0.0
  --attention-backend triton
  --mem-fraction-static 0.8
  --max-running-requests 128
  --chunked-prefill-size 4096
  --context-length 32768
  --trust-remote-code
  --enable-piecewise-cuda-graph
  --schedule-policy lpm
)

# exec replaces this wrapper shell with the server, so signals sent to the
# script's PID reach the server directly and its exit status is reported
# unchanged.
exec python3 -m sglang.launch_server "${server_args[@]}" "$@"