# Recipe: Nemotron-3-Ultra-550B-A55B NVFP4 on 4x ASUS GX10 (TP=4, EP)
# Model: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4
# Architecture: LatentMoE + Mamba-2 + attention hybrid; 512 routed experts top-22
# Source: launch command recommended on the HF model card (single node,
#         4 GPUs, TP=4 + EP), adapted for our Ray-over-fabric 4-GX10 cluster.

# Recipe identity. The version is quoted so it loads as a string, not an int.
recipe_version: "1"
name: Nemotron-3-Ultra-NVFP4-TP4-EP
description: "vLLM serving Nemotron-3-Ultra 550B NVFP4 across 4 GX10s (TP=4, expert-parallel)"

# Checkpoint location. The same path is spelled out again in the launch
# command, so change both together.
model: /root/.cache/huggingface/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4
container: vllm-node
# NOTE(review): presumably marks the recipe as multi-node only — confirm
# against the recipe runner.
cluster_only: true

# Workspace mods applied for this recipe, in list order:
#   - mods/gpu-mem-util-gb patches vLLM to accept --gpu-memory-utilization-gb
#     (GX10 unified-memory budget flag, used by the launch command).
#   - mods/nemotron-ultra pulls the model-specific reasoning parser into the
#     workspace.
# NOTE(review): the mod-to-purpose mapping is read off the mod names — confirm.
mods:
  - mods/gpu-mem-util-gb
  - mods/nemotron-ultra

# Environment variables for the vLLM launch. Values that YAML would otherwise
# type as non-strings are quoted: environment values are always strings.
env:
  VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
  # "0" leaves vLLM's max-model-len check enabled (a max_model_len larger than
  # the model config allows is rejected). Quoted because a bare 0 loads as an
  # integer rather than the string the environment needs.
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: "0"
  # Let the caching allocator return segments after the FlashInfer autotuner
  # spike so KV-cache alloc can claim the freed memory instead of hitting a
  # fragmented high-water-mark. Important with MTP added.
  # Quoted so the embedded colon can never be read as a nested mapping.
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

# Values substituted into the matching {placeholder} fields of the launch
# command.
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  # Spark unified memory budget tightrope for Nemotron-3-Ultra 550B NVFP4 (2026-06-05).
  # GiB figures are gpu_memory_utilization_gb settings:
  #   108 GiB        → raylet OOM during FlashInfer autotuner spike (attempt 1, w/ MTP)
  #   95  GiB + MTP  → KV alloc fails (attempt 2)
  #   95  GiB no MTP → KV alloc fails (attempt 3)
  #   105 GiB no MTP, ctx 131072 → serves cleanly (attempt 4, baseline)
  #   105 GiB + MTP-5, ctx 65536, batch 16384 → serves cleanly (attempt 5)
  #   105 GiB + MTP-3, ctx 262144, seqs 4, batch 16384 → serves cleanly (attempt 6)
  # (Attempt 7 is not recorded in this log.)
  # Attempt 8: 110 GiB + MTP-3 + ctx 262144 + seqs 4 + batch 16384 → engine
  # served fine but a Ray worker was OOM-killed on the FIRST heavy inference
  # (200K prefill + 8K decode). NVRM driver logged NV_ERR_NO_MEMORY repeatedly,
  # then the kernel OOM-killer reaped the python worker.
  # Attempt 9: drop budget 110 → 105 for more OS/driver headroom,
  # drop MTP entirely (frees draft head state and activation), batch already
  # halved 16384 → 8192. Trades single-stream decode speed for big-context
  # stability.
  # Validated: 4× concurrent 200K (4/4 succeeded, 22.7 min wall, agg 19 tok/s).
  # Attempt 10 (this): use the verified headroom — bump budget 105 → 108 (still
  # 2 GiB under the 110 firm cap), bump concurrency seqs 4 → 6 (1.5× KV slots),
  # re-enable MTP-3 for decode speedup. Batch stays 8192 (attempt-8's killer
  # was batch=16384, not gpu_mem alone). Three compounding changes — if KV alloc
  # fails or first heavy prefill OOMs, back off MTP first, then seqs.
  # NOTE(review): attempt 1 hit a raylet OOM at this same 108 GiB budget with
  # MTP enabled. Its batch size is not recorded here (presumably the
  # pre-halving 32768) — confirm that the smaller batch is what makes 108 GiB
  # viable now.
  gpu_memory_utilization_gb: 108
  max_model_len: 262144
  max_num_seqs: 6
  # Halved twice, 32768 → 16384 → 8192:
  #   32768 → 16384: shrinks the FlashInfer fp8 autotuner's transient
  #     workspace high-water-mark, which the CUDA caching allocator would
  #     otherwise hold onto, starving the KV allocator.
  #   16384 → 8192: 16384 OOMs the NVRM driver during large-context prefill
  #     (200K input workload, attempt 8); halving lets the per-chunk
  #     activation peak fit inside the headroom the driver needs for
  #     transient allocations.
  max_num_batched_tokens: 8192

# Launch command template.
#   - {host}, {port}, {tensor_parallel}, {gpu_memory_utilization_gb},
#     {max_model_len}, {max_num_seqs} and {max_num_batched_tokens} are filled
#     from the keys of the same name under `defaults`.
#   - Doubled braces ({{ }}) in the JSON arguments are presumably escapes that
#     render as single literal braces — confirm against the recipe runner's
#     templating.
#   - The model path is hard-coded here and must stay in sync with `model`.
#   - "num_speculative_tokens": 3 is the MTP-3 setting from the tuning log in
#     `defaults`; --speculative-config is the line to drop when backing off MTP.
#   - Do not add comments inside the block: a literal (|) scalar keeps every
#     line, including '#' lines, as part of the command text.
command: |
  vllm serve /root/.cache/huggingface/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4 \
  --served-model-name nvidia/nemotron-3-ultra \
  --host {host} \
  --port {port} \
  --trust-remote-code \
  --tensor-parallel-size {tensor_parallel} \
  --enable-expert-parallel \
  --distributed-executor-backend ray \
  --kv-cache-dtype fp8 \
  --gpu-memory-utilization-gb {gpu_memory_utilization_gb} \
  --max-model-len {max_model_len} \
  --max-num-seqs {max_num_seqs} \
  --max-num-batched-tokens {max_num_batched_tokens} \
  --enable-chunked-prefill \
  --enable-prefix-caching \
  --enable-auto-tool-choice \
  --reasoning-parser nemotron_v3 \
  --tool-call-parser qwen3_coder \
  --mamba-ssm-cache-dtype float16 \
  --mamba-backend flashinfer \
  --enable-mamba-cache-stochastic-rounding \
  --mamba-cache-philox-rounds 5 \
  --moe-backend flashinfer_cutlass \
  --speculative-config '{{"method": "nemotron_h_mtp", "num_speculative_tokens": 3}}' \
  --model-loader-extra-config '{{"enable_multithread_load": true, "num_threads": 16}}' \
  --compilation-config '{{"pass_config": {{"fuse_allreduce_rms": false}}}}' \
  --distributed-timeout-seconds 3600