# Recipe: Nemotron-3-Ultra-550B-A55B NVFP4 on 4x ASUS GX10 (TP=4, EP)
# Model: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4
# Architecture: LatentMoE + Mamba-2 + attention hybrid; 512 routed experts top-22
# Source: HF model card recommended single-node 4×GPU TP=4 EP launch, adapted
#         for our Ray-over-fabric 4-GX10 cluster.

recipe_version: "1"
name: Nemotron-3-Ultra-NVFP4-TP4-EP
description: vLLM serving Nemotron-3-Ultra 550B NVFP4 across 4 GX10s (TP=4, expert-parallel)

# NOTE: the same path is hard-coded in `command` below; keep the two in sync.
model: /root/.cache/huggingface/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4
container: vllm-node
cluster_only: true

# Mods, in list order:
#   gpu-mem-util-gb - patch vLLM to accept --gpu-memory-utilization-gb
#                     (GX10 unified-memory budget flag).
#   nemotron-ultra  - pull the model-specific reasoning parser into the
#                     workspace.
mods:
  - mods/gpu-mem-util-gb
  - mods/nemotron-ultra

# Environment variables for the serving process. Values are quoted where YAML
# would otherwise infer a non-string type (int 0) or where the value contains
# a colon, so they stay plain strings.
env:
  VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: "0"
  # Let the caching allocator return segments after the FlashInfer autotuner
  # spike so KV-cache alloc can claim the freed memory instead of hitting a
  # fragmented high-water-mark. Important with MTP added.
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

# Values substituted into the {placeholder} fields of `command`.
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4

  # Spark unified memory budget tightrope for Nemotron-3-Ultra 550B NVFP4
  # (2026-06-05):
  #   Attempt 1: 108 GiB + MTP → raylet OOM during FlashInfer autotuner spike.
  #   Attempt 2: 95 GiB + MTP → KV alloc fails.
  #   Attempt 3: 95 GiB, no MTP → KV alloc fails.
  #   Attempt 4: 105 GiB, no MTP, ctx 131072 → serves cleanly (baseline).
  #   Attempt 5: 105 GiB + MTP-5, ctx 65536, batch 16384 → serves cleanly.
  #   Attempt 6: 105 GiB + MTP-3, ctx 262144, seqs 4, batch 16384 → serves
  #     cleanly.
  #   (Attempt 7 is not recorded in this log.)
  #   Attempt 8: 110 GiB + MTP-3 + ctx 262144 + seqs 4 + batch 16384 → engine
  #     served fine but OOM-killed a Ray worker on FIRST heavy inference (200K
  #     prefill + 8K decode). NVRM driver logged NV_ERR_NO_MEMORY repeatedly,
  #     then kernel OOM-killer reaped the python worker.
  #   Attempt 9: drop budget 110 → 105 for more OS/driver headroom, drop MTP
  #     entirely (frees draft head state and activation), batch already halved
  #     to 8192. Trades single-stream decode speed for big-context stability.
  #     Validated: 4× concurrent 200K (4/4 succeeded, 22.7 min wall,
  #     agg 19 tok/s).
  #   Attempt 10 (this): use the verified headroom — bump budget 105 → 108
  #     (still 2 GiB under the 110 firm cap), bump concurrency seqs 4 → 6
  #     (1.5× KV slots), re-enable MTP-3 for decode speedup. Batch stays 8192
  #     (attempt-8's killer was batch=16384, not gpu_mem alone). Three
  #     compounding changes — if KV alloc fails or first heavy prefill OOMs,
  #     back off MTP first, then seqs.
  #   NOTE(review): 108 GiB + MTP is also the attempt-1 configuration that
  #     OOMed the raylet; presumably that run used the original batch of
  #     32768 — confirm before treating 108 GiB as safe headroom.
  gpu_memory_utilization_gb: 108
  max_model_len: 262144
  max_num_seqs: 6

  # Batch-size history: 32768 → 16384 → 8192.
  #   32768 → 16384: shrink the FlashInfer fp8 autotuner's transient workspace
  #     high-water-mark, which the CUDA caching allocator otherwise holds onto
  #     and starves the KV allocator.
  #   16384 → 8192: 16384 OOMs the NVRM driver during large-context prefill
  #     (200K input workload, attempt 8). Halved so the per-chunk activation
  #     peak fits inside the headroom the driver needs for transient
  #     allocations.
  max_num_batched_tokens: 8192

# Launch command template. {name} fields are filled from `defaults`; the
# doubled braces in the JSON arguments are escaped literal braces (presumably
# the template is expanded str.format-style — confirm against the recipe
# runner). The model path must match `model` above.
command: |
  vllm serve /root/.cache/huggingface/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4 \
    --served-model-name nvidia/nemotron-3-ultra \
    --host {host} \
    --port {port} \
    --trust-remote-code \
    --tensor-parallel-size {tensor_parallel} \
    --enable-expert-parallel \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization-gb {gpu_memory_utilization_gb} \
    --max-model-len {max_model_len} \
    --max-num-seqs {max_num_seqs} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-chunked-prefill \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --reasoning-parser nemotron_v3 \
    --tool-call-parser qwen3_coder \
    --mamba-ssm-cache-dtype float16 \
    --mamba-backend flashinfer \
    --enable-mamba-cache-stochastic-rounding \
    --mamba-cache-philox-rounds 5 \
    --moe-backend flashinfer_cutlass \
    --speculative-config '{{"method": "nemotron_h_mtp", "num_speculative_tokens": 3}}' \
    --model-loader-extra-config '{{"enable_multithread_load": true, "num_threads": 16}}' \
    --compilation-config '{{"pass_config": {{"fuse_allreduce_rms": false}}}}' \
    --distributed-timeout-seconds 3600