From 734c1bad8a3177394df15a5bbbee87d6508cbc0b Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Thu, 19 Mar 2026 18:55:07 -0700
Subject: [PATCH 01/19] Add Qwen3.5 h200 MTP

---
 .github/configs/nvidia-master.yaml            | 22 +++++
 .../single_node/qwen3.5_fp8_h200_mtp.sh       | 89 +++++++++++++++++++
 perf-changelog.yaml                           |  6 ++
 3 files changed, 117 insertions(+)
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1dbc62841..0dec09f11 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2020,6 +2020,28 @@ qwen3.5-fp8-h200-sglang:
     search-space:
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+qwen3.5-fp8-h200-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.9-cu129-amd64
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: h200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
 glm5-fp8-h200-sglang:
   image: lmsysorg/sglang:glm5-hopper
   model: zai-org/GLM-5-FP8
diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
new file mode 100755
index 000000000..4a54a1da6
--- /dev/null
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+MAX_SEQ_LEN=$((ISL + OSL + 20))
+
+echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+python3 -m sglang.launch_server \
+  --model "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --tp "$TP" \
+  --expert-parallel-size "$EP_SIZE" \
+  --reasoning-parser qwen3 \
+  --tool-call-parser qwen3_coder \
+  --enable-flashinfer-allreduce-fusion \
+  --max-running-requests 128 \
+  --chunked-prefill-size 16384 \
+  --decode-log-interval 1 \
+  --mem-fraction-static 0.8 \
+  --cuda-graph-max-bs "$CONC" \
+  --context-length "$MAX_SEQ_LEN" \
+  --kv-cache-dtype fp8_e4m3 \
+  --quantization fp8 \
+  --attention-backend flashinfer \
+  --stream-interval 50 \
+  --tokenizer-worker-num 6 \
+  --mamba-ssm-dtype bfloat16 \
+  --disable-radix-cache \
+  --trust-remote-code \
+  --speculative-algorithm EAGLE \
+  --speculative-num-steps 2 \
+  --speculative-num-draft-tokens 3 \
+  --speculative-eagle-topk 1 \
+  > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+# After the throughput run, run the evaluation only when RUN_EVAL is exactly "true"
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$CONC"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c6a340e96..bd302b1ca 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -979,3 +979,9 @@
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
+
+- config-keys:
+    - qwen3.5-fp8-h200-sglang-mtp
+  description:
+    - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX

From 3d806875ffe65a69f6bdb6bd95327974abd2bc02 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Thu, 19 Mar 2026 19:00:35 -0700
Subject: [PATCH 02/19] extend conc

---
 .github/configs/nvidia-master.yaml | 6 +++---
 perf-changelog.yaml                | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0dec09f11..00fa8271b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2032,15 +2032,15 @@ qwen3.5-fp8-h200-sglang-mtp:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
   - isl: 1024
     osl: 8192
     search-space:
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
 
 glm5-fp8-h200-sglang:
   image: lmsysorg/sglang:glm5-hopper
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index bd302b1ca..5ae0bb20d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -984,4 +984,4 @@
     - qwen3.5-fp8-h200-sglang-mtp
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921

From 96a8f025457cf52098f15df936a89d5dd4f51652 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Thu, 19 Mar 2026 21:48:02 -0700
Subject: [PATCH 03/19] adding flag

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index 4a54a1da6..d6a3fc6ed 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -75,6 +75,7 @@ run_benchmark_serving \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
     --num-prompts "$((CONC * 10))" \
     --max-concurrency "$CONC" \
+    --use-chat-template \
     --result-filename "$RESULT_FILENAME" \
     --result-dir /workspace/
 

From 0d13e556093cb651c7eb1f3bbf76741dcd2da550 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 15:42:46 -0700
Subject: [PATCH 04/19] Update perf-changelog.yaml

---
 perf-changelog.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c2a7846b4..9140f073d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1040,4 +1040,5 @@
     - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
\ No newline at end of file
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
+  

From 0ce06cfb6055d980517265c2e06ca9275f968f64 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 15:47:55 -0700
Subject: [PATCH 05/19] add new line

---
 perf-changelog.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9140f073d..5dca29afc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1042,3 +1042,5 @@
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
   
+ 
+  

From 5dd23085e71486389e4cbf3da2459fc4a25f1976 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 15:49:53 -0700
Subject: [PATCH 06/19] Update perf-changelog.yaml

---
 perf-changelog.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5dca29afc..9140f073d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1042,5 +1042,3 @@
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
   
- 
-  

From 14a584f10052e680dadb378c5441e49fc7b55db5 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 15:57:05 -0700
Subject: [PATCH 07/19] add new line

---
 perf-changelog.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9140f073d..f94d20efd 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1041,4 +1041,3 @@
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
-  

From 55a6d039f87b8d47902e291c929aa05b47bbb9a1 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 15:58:46 -0700
Subject: [PATCH 08/19] Update perf-changelog.yaml

---
 perf-changelog.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f94d20efd..9140f073d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1041,3 +1041,4 @@
     - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
+  

From 2e489b92fe57b6e74e31b9d2cb5ad9a5ca1b49ca Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 16:12:13 -0700
Subject: [PATCH 09/19] fix: perf bug

---
 perf-changelog.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9140f073d..3b8d63dc3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1042,3 +1042,9 @@
     - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
   
+- config-keys:
+    - qwen3.5-fp8-h200-sglang-mtp
+  description:
+    - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921
+  

From cbe069dee77f315defc21a25918854c907f2e9b8 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 16:22:08 -0700
Subject: [PATCH 10/19] fix:perf

---
 docs/accuracy_evals_slides.html | 436 ++++++++++++++++++++++++++++++++
 perf-changelog.yaml             |  19 +-
 2 files changed, 445 insertions(+), 10 deletions(-)
 create mode 100644 docs/accuracy_evals_slides.html

diff --git a/docs/accuracy_evals_slides.html b/docs/accuracy_evals_slides.html
new file mode 100644
index 000000000..46b06da2c
--- /dev/null
+++ b/docs/accuracy_evals_slides.html
@@ -0,0 +1,436 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>InferenceX Accuracy Evals</title>
+<style>
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: 'Segoe UI', system-ui, sans-serif; background: #0f1117; color: #e2e8f0; }
+
+  .slide {
+    width: 100vw; height: 100vh;
+    display: flex; flex-direction: column; justify-content: center; align-items: center;
+    padding: 36px 64px;
+    border-bottom: 2px solid #1e2535;
+    overflow: hidden; position: relative;
+  }
+
+  .slide-number {
+    position: absolute; top: 20px; right: 36px;
+    font-size: 12px; color: #4a5568; letter-spacing: 2px; text-transform: uppercase;
+  }
+
+  h1 { font-size: 2rem; font-weight: 700; color: #fff; margin-bottom: 10px; text-align: center; }
+  h2 { font-size: 0.85rem; font-weight: 400; color: #76e3a0; text-transform: uppercase;
+       letter-spacing: 3px; margin-bottom: 20px; text-align: center; }
+
+  /* ── Slide 2 ── */
+  .pipeline {
+    display: flex; align-items: center; width: 100%; max-width: 1100px;
+    justify-content: center; flex-wrap: nowrap;
+  }
+  .step {
+    background: #1a2035; border: 1px solid #2d3a55; border-radius: 10px;
+    padding: 8px 10px; min-width: 100px; text-align: center; flex-shrink: 1;
+  }
+  .step .icon { font-size: 1.1rem; margin-bottom: 3px; }
+  .step .label { font-size: 0.58rem; color: #94a3b8; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 1px; }
+  .step .title { font-size: 0.68rem; font-weight: 600; color: #e2e8f0; }
+  .step .sub { font-size: 0.58rem; color: #64748b; margin-top: 2px; }
+  .arrow { font-size: 1.1rem; color: #2d3a55; margin: 0 4px; flex-shrink: 0; }
+
+  .note-row { display: flex; gap: 16px; margin-top: 20px; width: 100%; max-width: 1100px; }
+  .note {
+    flex: 1; background: #12192a; border-left: 3px solid #76e3a0;
+    border-radius: 6px; padding: 10px 14px;
+    font-size: 0.75rem; color: #94a3b8; line-height: 1.5;
+  }
+  .note strong { color: #e2e8f0; }
+
+  /* ── Shared card ── */
+  .card {
+    background: #1a2035; border: 1px solid #2d3a55; border-radius: 10px;
+    padding: 14px 16px; margin-bottom: 12px;
+  }
+  .card h3 { font-size: 0.72rem; color: #76e3a0; text-transform: uppercase;
+              letter-spacing: 2px; margin-bottom: 10px; }
+
+  .filter-list { list-style: none; }
+  .filter-list li {
+    display: flex; align-items: flex-start; gap: 8px;
+    padding: 6px 0; border-bottom: 1px solid #1e2535;
+    font-size: 0.78rem; color: #cbd5e1; line-height: 1.4;
+  }
+  .filter-list li:last-child { border-bottom: none; }
+  .badge {
+    background: #0d2a1f; color: #76e3a0; border: 1px solid #76e3a0;
+    border-radius: 4px; padding: 1px 6px; font-size: 0.65rem;
+    font-weight: 600; white-space: nowrap; margin-top: 1px; flex-shrink: 0;
+  }
+  .badge.red { background: #2a0d0d; color: #f87171; border-color: #f87171; }
+
+  .group-key { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 6px; }
+  .tag {
+    background: #1e2a45; border: 1px solid #2d3a55; border-radius: 5px;
+    padding: 3px 10px; font-size: 0.72rem; color: #94a3b8;
+  }
+
+  /* ── Slide 4 ── */
+  .metrics-grid { display: flex; gap: 20px; width: 100%; max-width: 1100px; align-items: flex-start; }
+  .task-card {
+    flex: 1; background: #1a2035; border: 1px solid #2d3a55; border-radius: 12px;
+    padding: 18px 20px;
+  }
+  .task-card .task-name { font-size: 1.05rem; font-weight: 700; color: #fff; margin-bottom: 2px; }
+  .task-card .task-desc { font-size: 0.72rem; color: #64748b; margin-bottom: 12px; }
+  .metric-item {
+    display: flex; justify-content: space-between; align-items: center;
+    padding: 6px 0; border-bottom: 1px solid #1e2535; font-size: 0.8rem;
+  }
+  .metric-item:last-child { border-bottom: none; }
+  .metric-item .m-name { color: #cbd5e1; }
+  .metric-item .m-type { color: #76e3a0; font-family: monospace; font-size: 0.73rem; }
+
+  .output-card {
+    background: #12192a; border: 1px solid #2d3a55; border-radius: 8px;
+    padding: 12px 18px; margin-top: 16px; width: 100%; max-width: 1100px;
+  }
+  .output-card h3 { font-size: 0.72rem; color: #94a3b8; text-transform: uppercase;
+                    letter-spacing: 2px; margin-bottom: 10px; }
+  .output-row { display: flex; gap: 12px; flex-wrap: wrap; }
+  .output-chip {
+    background: #1a2035; border: 1px solid #2d3a55; border-radius: 6px;
+    padding: 6px 12px; font-size: 0.75rem; color: #94a3b8;
+  }
+  .output-chip span { color: #e2e8f0; font-weight: 600; }
+</style>
+</head>
+<body>
+
+<!-- ═══════════════════════════════════════════════════ SLIDE 1 (Intro) -->
+<div class="slide">
+  <div class="slide-number">01 / 04</div>
+  <h2>LLM Evaluation Landscape</h2>
+  <h1>Common Benchmark Categories</h1>
+
+  <div style="width:100%;max-width:1200px;margin-bottom:16px;">
+    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.78rem;color:#94a3b8;line-height:1.5;">
+      <strong style="color:#e2e8f0;">What are LLM Evals?</strong>
+      &nbsp;Standardized benchmarks used to measure a language model's capabilities — from reasoning and knowledge to coding and safety.
+      Each eval tests a specific skill using curated datasets, scoring model outputs against known correct answers to produce comparable, reproducible metrics.
+    </div>
+  </div>
+
+  <div style="display:flex;gap:14px;width:100%;max-width:1200px;align-items:flex-start;">
+
+    <!-- Col 1 -->
+    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
+      <div class="card" style="margin-bottom:0;">
+        <h3>🧮 Math &amp; Logical Reasoning</h3>
+        <ul style="list-style:none;">
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">GSM8K</strong> <span style="color:#76e3a0;font-size:0.65rem;margin-left:4px;">★ used</span><br><span style="color:#64748b;font-size:0.68rem;">Multi-step math word problems</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MATH</strong><br><span style="color:#64748b;font-size:0.68rem;">Competition-level mathematics</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">ARC</strong><br><span style="color:#64748b;font-size:0.68rem;">Scientific reasoning, grade-school science</span></li>
+          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">BBH</strong><br><span style="color:#64748b;font-size:0.68rem;">BIG-bench Hard — complex logic tasks</span></li>
+        </ul>
+      </div>
+      <div class="card" style="margin-bottom:0;">
+        <h3>💻 Coding &amp; Technical</h3>
+        <ul style="list-style:none;">
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">HumanEval</strong><br><span style="color:#64748b;font-size:0.68rem;">Python code generation correctness</span></li>
+          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MBPP</strong><br><span style="color:#64748b;font-size:0.68rem;">Crowd-sourced Python problems</span></li>
+        </ul>
+      </div>
+    </div>
+
+    <!-- Col 2 -->
+    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
+      <div class="card" style="margin-bottom:0;">
+        <h3>📚 Knowledge &amp; Language</h3>
+        <ul style="list-style:none;">
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MMLU</strong><br><span style="color:#64748b;font-size:0.68rem;">57 subjects — STEM to humanities</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">HellaSwag</strong><br><span style="color:#64748b;font-size:0.68rem;">Commonsense sentence completion</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">WinoGrande</strong><br><span style="color:#64748b;font-size:0.68rem;">Context-aware pronoun resolution</span></li>
+          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">SuperGLUE</strong><br><span style="color:#64748b;font-size:0.68rem;">Comprehensive NLU suite</span></li>
+        </ul>
+      </div>
+      <div class="card" style="margin-bottom:0;">
+        <h3>🛡️ Safety &amp; Human Preference</h3>
+        <ul style="list-style:none;">
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">TruthfulQA</strong><br><span style="color:#64748b;font-size:0.68rem;">Factual accuracy, anti-hallucination</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">Chatbot Arena</strong><br><span style="color:#64748b;font-size:0.68rem;">Elo-rated human pairwise comparisons</span></li>
+          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MT-Bench</strong><br><span style="color:#64748b;font-size:0.68rem;">Multi-turn dialogue evaluation</span></li>
+        </ul>
+      </div>
+    </div>
+
+    <!-- Col 3 -->
+    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
+      <div class="card" style="margin-bottom:0;">
+        <h3>🔬 Specialized &amp; Advanced</h3>
+        <ul style="list-style:none;">
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">GPQA-Diamond</strong> <span style="color:#76e3a0;font-size:0.65rem;margin-left:4px;">★ used</span><br><span style="color:#64748b;font-size:0.68rem;">Expert-curated science questions</span></li>
+          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">DROP</strong><br><span style="color:#64748b;font-size:0.68rem;">Reading comprehension + discrete ops</span></li>
+          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">FinanceBench</strong><br><span style="color:#64748b;font-size:0.68rem;">Financial analysis (earnings reports)</span></li>
+        </ul>
+      </div>
+
+      <!-- InferenceX highlight -->
+      <div style="background:#0d2a1f;border:1px solid #76e3a0;border-radius:10px;padding:14px 16px;">
+        <h3 style="color:#76e3a0;font-size:0.72rem;text-transform:uppercase;letter-spacing:2px;margin-bottom:10px;">★ Used in InferenceX</h3>
+        <div style="display:flex;flex-direction:column;gap:8px;">
+          <div style="font-size:0.78rem;color:#e2e8f0;"><strong>GSM8K</strong> <span style="color:#94a3b8;font-size:0.68rem;">— math word problems, 5-shot, exact match</span></div>
+          <div style="font-size:0.78rem;color:#e2e8f0;"><strong>GPQA-Diamond</strong> <span style="color:#94a3b8;font-size:0.68rem;">— expert science MCQ, 2 repeats, A-D accuracy</span></div>
+        </div>
+      </div>
+    </div>
+
+  </div>
+</div>
+
+<!-- ═══════════════════════════════════════════════════ SLIDE 2 -->
+<div class="slide">
+  <div class="slide-number">02 / 04</div>
+  <h2>InferenceX Accuracy Evals</h2>
+  <h1>End-to-End Workflow</h1>
+
+  <div style="width:100%;max-width:1100px;margin-bottom:18px;">
+    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.78rem;color:#94a3b8;line-height:1.5;">
+      <strong style="color:#e2e8f0;">Purpose of Eval:</strong>
+      &nbsp;Verify that throughput optimizations — TP degree, concurrency, and spec-decoding — do not degrade model accuracy.
+    </div>
+  </div>
+
+  <div class="pipeline">
+    <div class="step">
+      <div class="icon">📄</div>
+      <div class="label">Input</div>
+      <div class="title">perf-changelog.yaml</div>
+      <div class="sub">push / PR trigger</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">⚙️</div>
+      <div class="label">Config</div>
+      <div class="title" style="font-size:0.55rem;">generate sweep configs</div>
+      <div class="sub">build matrix<br>mark run_eval=true</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">🖥️</div>
+      <div class="label">Serve</div>
+      <div class="title">Inference Server</div>
+      <div class="sub">SGLang / vLLM / TRT<br>port 8888</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">📊</div>
+      <div class="label">Benchmark</div>
+      <div class="title">Throughput Run</div>
+      <div class="sub">results*.json</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">🧪</div>
+      <div class="label">Eval</div>
+      <div class="title">lm-eval Harness</div>
+      <div class="sub">gsm8k + gpqa<br>/v1/chat/completions</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">🗂️</div>
+      <div class="label">Collect</div>
+      <div class="title">collect-evals.yml</div>
+      <div class="sub">parse artifacts<br>agg_eval_*.json</div>
+    </div>
+    <div class="arrow">→</div>
+    <div class="step">
+      <div class="icon">🗄️</div>
+      <div class="label">Ingest</div>
+      <div class="title">InferenceX-app DB</div>
+      <div class="sub">webhook dispatch</div>
+    </div>
+  </div>
+
+  <div class="note-row">
+    <div class="note">
+      <strong>Parallelism:</strong> All benchmark jobs run in parallel across hardware (H100, H200, B200, MI325X…).
+      Eval jobs only run on entries flagged <code>run_eval=true</code>.
+    </div>
+  </div>
+</div>
+
+<!-- ═══════════════════════════════════════════════════ SLIDE 3 -->
+<div class="slide">
+  <div class="slide-number">03 / 04</div>
+  <h2>Eval Selection Policy</h2>
+  <h1>How Buckets Are Made &amp; run_eval=true</h1>
+
+  <div style="display:flex;gap:20px;width:100%;max-width:1200px;align-items:flex-start;">
+
+    <!-- Col 1: Filters + Group key -->
+    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
+      <div class="card" style="margin-bottom:0;">
+        <h3>Pre-filters (hard gates)</h3>
+        <ul class="filter-list">
+          <li><span class="badge">PASS</span> Sequence length must be <strong>1k8k</strong> (ISL=1024, OSL=8192)</li>
+          <li><span class="badge red">SKIP</span> No top-level <code>tp</code> field (multinode / disaggregated)</li>
+          <li><span class="badge red">SKIP</span> 1k1k and 8k1k sequence lengths</li>
+        </ul>
+      </div>
+      <div class="card" style="margin-bottom:0;">
+        <h3>Group key — one bucket per unique combo</h3>
+        <div class="group-key">
+          <span class="tag">model</span><span class="tag">runner</span><span class="tag">framework</span>
+          <span class="tag">precision</span><span class="tag">isl</span><span class="tag">osl</span>
+          <span class="tag">spec_decoding</span><span class="tag">dp_attn</span>
+        </div>
+        <p style="font-size:0.72rem;color:#64748b;line-height:1.4;">MTP and non-MTP bucketed independently. dp_attn variants kept separate.</p>
+      </div>
+    </div>
+
+    <!-- Col 2: Real example -->
+    <div style="flex:1.3;min-width:0;">
+      <div class="card" style="margin-bottom:0;border-color:#3b4a6b;">
+        <h3 style="color:#94a3b8;">Real Example — DeepSeek-R1-0528 · B200 · TRT · FP4</h3>
+        <p style="font-size:0.68rem;color:#64748b;margin-bottom:8px;">
+          Bucket: <code style="color:#94a3b8;font-size:0.65rem;">(…FP4-V2, b200, trt, fp4, 1024, 8192, none, false)</code>
+          · spec_decoding omitted → <code>none</code> · step-size 2
+        </p>
+        <pre style="background:#0d1117;border:1px solid #1e2535;border-radius:6px;padding:8px 12px;font-size:0.67rem;color:#94a3b8;line-height:1.6;overflow:auto;margin-bottom:10px;">search-space:
+  - { tp: 4,             conc-start:  4, conc-end: 16 }
+  - { tp: 4, dp-attn: true, ... }  <span style="color:#4a5568;"># → different bucket</span>
+  - { tp: 8,             conc-start:  4, conc-end:  4 }
+  - { tp: 8, ep:8,       conc-start: 64, conc-end: 64 }
+  - { tp: 8, dp-attn: true, ... }  <span style="color:#4a5568;"># → different bucket</span></pre>
+        <table style="width:100%;border-collapse:collapse;font-size:0.76rem;">
+          <thead>
+            <tr style="border-bottom:1px solid #2d3a55;">
+              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">TP</th>
+              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">Conc</th>
+              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">run_eval</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr style="border-bottom:1px solid #1a2035;"><td style="padding:4px 8px;color:#64748b;">4</td><td style="padding:4px 8px;color:#64748b;">4, 8</td><td style="padding:4px 8px;color:#4a5568;">false</td></tr>
+            <tr style="border-bottom:1px solid #1a2035;background:#0d2210;"><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">4</td><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">16</td><td style="padding:4px 8px;"><span style="background:#0d2a1f;color:#76e3a0;border:1px solid #76e3a0;border-radius:4px;padding:1px 7px;font-size:0.68rem;font-weight:600;">true ✓</span></td></tr>
+            <tr style="border-bottom:1px solid #1a2035;"><td style="padding:4px 8px;color:#64748b;">8</td><td style="padding:4px 8px;color:#64748b;">4</td><td style="padding:4px 8px;color:#4a5568;">false</td></tr>
+            <tr style="background:#0d2210;"><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">8</td><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">64</td><td style="padding:4px 8px;"><span style="background:#0d2a1f;color:#76e3a0;border:1px solid #76e3a0;border-radius:4px;padding:1px 7px;font-size:0.68rem;font-weight:600;">true ✓</span></td></tr>
+          </tbody>
+        </table>
+        <p style="font-size:0.67rem;color:#4a5568;margin-top:6px;">dp_attn=true rows → separate bucket, own eval selections.</p>
+      </div>
+    </div>
+
+  </div>
+
+  <!-- Why only two TPs -->
+  <div style="width:100%;max-width:1200px;margin-top:14px;">
+    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.76rem;color:#94a3b8;line-height:1.5;">
+      <strong style="color:#e2e8f0;">Why only min &amp; max TP?</strong>
+      &nbsp;Different TP degrees partition tensors across GPUs differently, changing floating-point accumulation order.
+      If accuracy holds at both extremes under maximum concurrency, intermediate TP values are assumed bounded by these results.
+    </div>
+  </div>
+</div>
+
+<!-- ═══════════════════════════════════════════════════ SLIDE 4 -->
+<div class="slide">
+  <div class="slide-number">04 / 04</div>
+  <h2>lm-eval Harness</h2>
+  <h1>Eval Tasks &amp; Metrics</h1>
+
+  <div class="metrics-grid">
+    <!-- GSM8K -->
+    <div class="task-card">
+      <div class="task-name">GSM8K</div>
+      <div class="task-desc">Grade school math word problems · 5-shot</div>
+      <div class="metric-item"><span class="m-name">Exact Match (strict)</span><span class="m-type">em_strict</span></div>
+      <div class="metric-item"><span class="m-name">Exact Match (flexible)</span><span class="m-type">em_flexible</span></div>
+      <div class="metric-item"><span class="m-name">Std error (strict)</span><span class="m-type">em_strict_se</span></div>
+      <div class="metric-item"><span class="m-name">Std error (flexible)</span><span class="m-type">em_flexible_se</span></div>
+      <div class="metric-item"><span class="m-name">Effective samples</span><span class="m-type">n_eff</span></div>
+      <div style="margin-top:12px;padding:10px;background:#12192a;border-radius:7px;font-size:0.72rem;color:#64748b;line-height:1.5;">
+        <strong style="color:#94a3b8;">Strict:</strong> regex extracts after <code>#### </code><br>
+        <strong style="color:#94a3b8;">Flexible:</strong> regex matches any number<br>
+        <strong style="color:#94a3b8;">temp=0</strong>, max_tokens=8192
+      </div>
+    </div>
+
+    <!-- GPQA Diamond -->
+    <div class="task-card">
+      <div class="task-name">GPQA Diamond</div>
+      <div class="task-desc">Graduate-level multiple choice · 2 repeats</div>
+      <div class="metric-item"><span class="m-name">Accuracy (A–D)</span><span class="m-type">accuracy</span></div>
+      <div class="metric-item"><span class="m-name">Standard error</span><span class="m-type">score_se</span></div>
+      <div class="metric-item"><span class="m-name">Effective samples</span><span class="m-type">n_eff</span></div>
+      <div style="margin-top:12px;padding:10px;background:#12192a;border-radius:7px;font-size:0.72rem;color:#64748b;line-height:1.5;">
+        <strong style="color:#94a3b8;">Dataset:</strong> Idavidrein/gpqa · gpqa_diamond split<br>
+        <strong style="color:#94a3b8;">Repeats:</strong> 2× randomized choice ordering (seeded)<br>
+        <strong style="color:#94a3b8;">Filter:</strong> extract last A–D letter · <strong style="color:#94a3b8;">temp=0</strong>
+      </div>
+    </div>
+
+    <!-- Metadata -->
+    <div class="task-card" style="flex:0.8;">
+      <div class="task-name">Run Metadata</div>
+      <div class="task-desc">Captured in meta_env.json per run</div>
+      <div class="metric-item"><span class="m-name">Model</span><span class="m-type">model</span></div>
+      <div class="metric-item"><span class="m-name">Hardware</span><span class="m-type">hw</span></div>
+      <div class="metric-item"><span class="m-name">Framework</span><span class="m-type">framework</span></div>
+      <div class="metric-item"><span class="m-name">Precision</span><span class="m-type">precision</span></div>
+      <div class="metric-item"><span class="m-name">Tensor Parallel</span><span class="m-type">tp</span></div>
+      <div class="metric-item"><span class="m-name">Concurrency</span><span class="m-type">conc</span></div>
+      <div class="metric-item"><span class="m-name">Spec Decoding</span><span class="m-type">spec_decoding</span></div>
+      <div class="metric-item"><span class="m-name">DP Attention</span><span class="m-type">dp_attention</span></div>
+    </div>
+  </div>
+
+  <div class="output-card">
+    <h3>Sample Output — collect-evals summary</h3>
+    <div style="overflow:hidden;">
+      <table style="width:100%;border-collapse:collapse;font-size:0.63rem;white-space:nowrap;table-layout:fixed;">
+        <thead>
+          <tr style="border-bottom:1px solid #2d3a55;background:#1a2035;">
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:7%;">Prefix</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">HW</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:7%;">Framework</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Prec</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Spec</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:4%;">TP</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:4%;">EP</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:5%;">Conc</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">DP Attn</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Task</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">Score</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">EM Strict</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">EM Flexible</th>
+            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:5%;">N eff</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr style="background:#0d1a10;">
+            <td style="padding:5px 7px;color:#e2e8f0;font-weight:600;">qwen3.5</td>
+            <td style="padding:5px 7px;color:#94a3b8;">H200</td>
+            <td style="padding:5px 7px;color:#94a3b8;">SGLANG</td>
+            <td style="padding:5px 7px;color:#94a3b8;">FP8</td>
+            <td style="padding:5px 7px;color:#94a3b8;">mtp</td>
+            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">8</td>
+            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">8</td>
+            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">128</td>
+            <td style="padding:5px 7px;color:#94a3b8;">false</td>
+            <td style="padding:5px 7px;color:#94a3b8;">gsm8k</td>
+            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.97%<span style="color:#4a5568;font-weight:400;"> ±0.47%</span></td>
+            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.97%<span style="color:#4a5568;font-weight:400;"> ±0.47%</span></td>
+            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.89%<span style="color:#4a5568;font-weight:400;"> ±0.48%</span></td>
+            <td style="padding:5px 7px;color:#94a3b8;text-align:right;">1319</td>
+          </tr>
+        </tbody>
+      </table>
+    </div>
+  </div>
+</div>
+
+</body>
+</html>
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3b8d63dc3..2cc5f8e59 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -984,6 +984,15 @@
     - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907
 
+- config-keys:
+    - glm5-fp8-h200-sglang
+  description:
+    - "Add GLM-5 FP8 SGLang H200 single-node benchmark"
+    - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
+    - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
+    - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
+  
 - config-keys:
     - glm5-fp8-b200-sglang
   description:
@@ -1033,18 +1042,8 @@
     - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934
 
-- config-keys:
-    - glm5-fp8-h200-sglang
-  description:
-    - "Add GLM-5 FP8 SGLang H200 single-node benchmark"
-    - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper"
-    - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
-    - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
-  
 - config-keys:
     - qwen3.5-fp8-h200-sglang-mtp
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921
-  

From 250e8501c5fe11ba02a086bccb798c79c62a3025 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 16:25:24 -0700
Subject: [PATCH 11/19] Delete docs/accuracy_evals_slides.html

---
 docs/accuracy_evals_slides.html | 436 --------------------------------
 1 file changed, 436 deletions(-)
 delete mode 100644 docs/accuracy_evals_slides.html

diff --git a/docs/accuracy_evals_slides.html b/docs/accuracy_evals_slides.html
deleted file mode 100644
index 46b06da2c..000000000
--- a/docs/accuracy_evals_slides.html
+++ /dev/null
@@ -1,436 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-<meta charset="UTF-8">
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-<title>InferenceX Accuracy Evals</title>
-<style>
-  * { box-sizing: border-box; margin: 0; padding: 0; }
-  body { font-family: 'Segoe UI', system-ui, sans-serif; background: #0f1117; color: #e2e8f0; }
-
-  .slide {
-    width: 100vw; height: 100vh;
-    display: flex; flex-direction: column; justify-content: center; align-items: center;
-    padding: 36px 64px;
-    border-bottom: 2px solid #1e2535;
-    overflow: hidden; position: relative;
-  }
-
-  .slide-number {
-    position: absolute; top: 20px; right: 36px;
-    font-size: 12px; color: #4a5568; letter-spacing: 2px; text-transform: uppercase;
-  }
-
-  h1 { font-size: 2rem; font-weight: 700; color: #fff; margin-bottom: 10px; text-align: center; }
-  h2 { font-size: 0.85rem; font-weight: 400; color: #76e3a0; text-transform: uppercase;
-       letter-spacing: 3px; margin-bottom: 20px; text-align: center; }
-
-  /* ── Slide 1 ── */
-  .pipeline {
-    display: flex; align-items: center; width: 100%; max-width: 1100px;
-    justify-content: center; flex-wrap: nowrap;
-  }
-  .step {
-    background: #1a2035; border: 1px solid #2d3a55; border-radius: 10px;
-    padding: 8px 10px; min-width: 100px; text-align: center; flex-shrink: 1;
-  }
-  .step .icon { font-size: 1.1rem; margin-bottom: 3px; }
-  .step .label { font-size: 0.58rem; color: #94a3b8; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 1px; }
-  .step .title { font-size: 0.68rem; font-weight: 600; color: #e2e8f0; }
-  .step .sub { font-size: 0.58rem; color: #64748b; margin-top: 2px; }
-  .arrow { font-size: 1.1rem; color: #2d3a55; margin: 0 4px; flex-shrink: 0; }
-
-  .note-row { display: flex; gap: 16px; margin-top: 20px; width: 100%; max-width: 1100px; }
-  .note {
-    flex: 1; background: #12192a; border-left: 3px solid #76e3a0;
-    border-radius: 6px; padding: 10px 14px;
-    font-size: 0.75rem; color: #94a3b8; line-height: 1.5;
-  }
-  .note strong { color: #e2e8f0; }
-
-  /* ── Shared card ── */
-  .card {
-    background: #1a2035; border: 1px solid #2d3a55; border-radius: 10px;
-    padding: 14px 16px; margin-bottom: 12px;
-  }
-  .card h3 { font-size: 0.72rem; color: #76e3a0; text-transform: uppercase;
-              letter-spacing: 2px; margin-bottom: 10px; }
-
-  .filter-list { list-style: none; }
-  .filter-list li {
-    display: flex; align-items: flex-start; gap: 8px;
-    padding: 6px 0; border-bottom: 1px solid #1e2535;
-    font-size: 0.78rem; color: #cbd5e1; line-height: 1.4;
-  }
-  .filter-list li:last-child { border-bottom: none; }
-  .badge {
-    background: #0d2a1f; color: #76e3a0; border: 1px solid #76e3a0;
-    border-radius: 4px; padding: 1px 6px; font-size: 0.65rem;
-    font-weight: 600; white-space: nowrap; margin-top: 1px; flex-shrink: 0;
-  }
-  .badge.red { background: #2a0d0d; color: #f87171; border-color: #f87171; }
-
-  .group-key { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 6px; }
-  .tag {
-    background: #1e2a45; border: 1px solid #2d3a55; border-radius: 5px;
-    padding: 3px 10px; font-size: 0.72rem; color: #94a3b8;
-  }
-
-  /* ── Slide 3 ── */
-  .metrics-grid { display: flex; gap: 20px; width: 100%; max-width: 1100px; align-items: flex-start; }
-  .task-card {
-    flex: 1; background: #1a2035; border: 1px solid #2d3a55; border-radius: 12px;
-    padding: 18px 20px;
-  }
-  .task-card .task-name { font-size: 1.05rem; font-weight: 700; color: #fff; margin-bottom: 2px; }
-  .task-card .task-desc { font-size: 0.72rem; color: #64748b; margin-bottom: 12px; }
-  .metric-item {
-    display: flex; justify-content: space-between; align-items: center;
-    padding: 6px 0; border-bottom: 1px solid #1e2535; font-size: 0.8rem;
-  }
-  .metric-item:last-child { border-bottom: none; }
-  .metric-item .m-name { color: #cbd5e1; }
-  .metric-item .m-type { color: #76e3a0; font-family: monospace; font-size: 0.73rem; }
-
-  .output-card {
-    background: #12192a; border: 1px solid #2d3a55; border-radius: 8px;
-    padding: 12px 18px; margin-top: 16px; width: 100%; max-width: 1100px;
-  }
-  .output-card h3 { font-size: 0.72rem; color: #94a3b8; text-transform: uppercase;
-                    letter-spacing: 2px; margin-bottom: 10px; }
-  .output-row { display: flex; gap: 12px; flex-wrap: wrap; }
-  .output-chip {
-    background: #1a2035; border: 1px solid #2d3a55; border-radius: 6px;
-    padding: 6px 12px; font-size: 0.75rem; color: #94a3b8;
-  }
-  .output-chip span { color: #e2e8f0; font-weight: 600; }
-</style>
-</head>
-<body>
-
-<!-- ═══════════════════════════════════════════════════ SLIDE 1 (Intro) -->
-<div class="slide">
-  <div class="slide-number">01 / 04</div>
-  <h2>LLM Evaluation Landscape</h2>
-  <h1>Common Benchmark Categories</h1>
-
-  <div style="width:100%;max-width:1200px;margin-bottom:16px;">
-    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.78rem;color:#94a3b8;line-height:1.5;">
-      <strong style="color:#e2e8f0;">What are LLM Evals?</strong>
-      &nbsp;Standardized benchmarks used to measure a language model's capabilities — from reasoning and knowledge to coding and safety.
-      Each eval tests a specific skill using curated datasets, scoring model outputs against known correct answers to produce comparable, reproducible metrics.
-    </div>
-  </div>
-
-  <div style="display:flex;gap:14px;width:100%;max-width:1200px;align-items:flex-start;">
-
-    <!-- Col 1 -->
-    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
-      <div class="card" style="margin-bottom:0;">
-        <h3>🧮 Math &amp; Logical Reasoning</h3>
-        <ul style="list-style:none;">
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">GSM8K</strong> <span style="color:#76e3a0;font-size:0.65rem;margin-left:4px;">★ used</span><br><span style="color:#64748b;font-size:0.68rem;">Multi-step math word problems</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MATH</strong><br><span style="color:#64748b;font-size:0.68rem;">Competition-level mathematics</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">ARC</strong><br><span style="color:#64748b;font-size:0.68rem;">Scientific reasoning, grade-school science</span></li>
-          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">BBH</strong><br><span style="color:#64748b;font-size:0.68rem;">BIG-bench Hard — complex logic tasks</span></li>
-        </ul>
-      </div>
-      <div class="card" style="margin-bottom:0;">
-        <h3>💻 Coding &amp; Technical</h3>
-        <ul style="list-style:none;">
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">HumanEval</strong><br><span style="color:#64748b;font-size:0.68rem;">Python code generation correctness</span></li>
-          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MBPP</strong><br><span style="color:#64748b;font-size:0.68rem;">Crowd-sourced Python problems</span></li>
-        </ul>
-      </div>
-    </div>
-
-    <!-- Col 2 -->
-    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
-      <div class="card" style="margin-bottom:0;">
-        <h3>📚 Knowledge &amp; Language</h3>
-        <ul style="list-style:none;">
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MMLU</strong><br><span style="color:#64748b;font-size:0.68rem;">57 subjects — STEM to humanities</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">HellaSwag</strong><br><span style="color:#64748b;font-size:0.68rem;">Commonsense sentence completion</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">WinoGrande</strong><br><span style="color:#64748b;font-size:0.68rem;">Context-aware pronoun resolution</span></li>
-          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">SuperGLUE</strong><br><span style="color:#64748b;font-size:0.68rem;">Comprehensive NLU suite</span></li>
-        </ul>
-      </div>
-      <div class="card" style="margin-bottom:0;">
-        <h3>🛡️ Safety &amp; Human Preference</h3>
-        <ul style="list-style:none;">
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">TruthfulQA</strong><br><span style="color:#64748b;font-size:0.68rem;">Factual accuracy, anti-hallucination</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">Chatbot Arena</strong><br><span style="color:#64748b;font-size:0.68rem;">Elo-rated human pairwise comparisons</span></li>
-          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">MT-Bench</strong><br><span style="color:#64748b;font-size:0.68rem;">Multi-turn dialogue evaluation</span></li>
-        </ul>
-      </div>
-    </div>
-
-    <!-- Col 3 -->
-    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
-      <div class="card" style="margin-bottom:0;">
-        <h3>🔬 Specialized &amp; Advanced</h3>
-        <ul style="list-style:none;">
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">GPQA-Diamond</strong> <span style="color:#76e3a0;font-size:0.65rem;margin-left:4px;">★ used</span><br><span style="color:#64748b;font-size:0.68rem;">Expert-curated science questions</span></li>
-          <li style="padding:5px 0;border-bottom:1px solid #1e2535;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">DROP</strong><br><span style="color:#64748b;font-size:0.68rem;">Reading comprehension + discrete ops</span></li>
-          <li style="padding:5px 0;font-size:0.75rem;color:#cbd5e1;"><strong style="color:#e2e8f0;">FinanceBench</strong><br><span style="color:#64748b;font-size:0.68rem;">Financial analysis (earnings reports)</span></li>
-        </ul>
-      </div>
-
-      <!-- InferenceX highlight -->
-      <div style="background:#0d2a1f;border:1px solid #76e3a0;border-radius:10px;padding:14px 16px;">
-        <h3 style="color:#76e3a0;font-size:0.72rem;text-transform:uppercase;letter-spacing:2px;margin-bottom:10px;">★ Used in InferenceX</h3>
-        <div style="display:flex;flex-direction:column;gap:8px;">
-          <div style="font-size:0.78rem;color:#e2e8f0;"><strong>GSM8K</strong> <span style="color:#94a3b8;font-size:0.68rem;">— math word problems, 5-shot, exact match</span></div>
-          <div style="font-size:0.78rem;color:#e2e8f0;"><strong>GPQA-Diamond</strong> <span style="color:#94a3b8;font-size:0.68rem;">— expert science MCQ, 2 repeats, A-D accuracy</span></div>
-        </div>
-      </div>
-    </div>
-
-  </div>
-</div>
-
-<!-- ═══════════════════════════════════════════════════ SLIDE 1 -->
-<div class="slide">
-  <div class="slide-number">02 / 04</div>
-  <h2>InferenceX Accuracy Evals</h2>
-  <h1>End-to-End Workflow</h1>
-
-  <div style="width:100%;max-width:1100px;margin-bottom:18px;">
-    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.78rem;color:#94a3b8;line-height:1.5;">
-      <strong style="color:#e2e8f0;">Purpose of Eval:</strong>
-      &nbsp;Verify that throughput optimizations — TP degree, concurrency, and spec-decoding — do not degrade model accuracy.
-    </div>
-  </div>
-
-  <div class="pipeline">
-    <div class="step">
-      <div class="icon">📄</div>
-      <div class="label">Input</div>
-      <div class="title">perf-changelog.yaml</div>
-      <div class="sub">push / PR trigger</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">⚙️</div>
-      <div class="label">Config</div>
-      <div class="title" style="font-size:0.55rem;">generate sweep configs</div>
-      <div class="sub">build matrix<br>mark run_eval=true</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">🖥️</div>
-      <div class="label">Serve</div>
-      <div class="title">Inference Server</div>
-      <div class="sub">SGLang / vLLM / TRT<br>port 8888</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">📊</div>
-      <div class="label">Benchmark</div>
-      <div class="title">Throughput Run</div>
-      <div class="sub">results*.json</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">🧪</div>
-      <div class="label">Eval</div>
-      <div class="title">lm-eval Harness</div>
-      <div class="sub">gsm8k + gpqa<br>/v1/chat/completions</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">🗂️</div>
-      <div class="label">Collect</div>
-      <div class="title">collect-evals.yml</div>
-      <div class="sub">parse artifacts<br>agg_eval_*.json</div>
-    </div>
-    <div class="arrow">→</div>
-    <div class="step">
-      <div class="icon">🗄️</div>
-      <div class="label">Ingest</div>
-      <div class="title">InferenceX-app DB</div>
-      <div class="sub">webhook dispatch</div>
-    </div>
-  </div>
-
-  <div class="note-row">
-    <div class="note">
-      <strong>Parallelism:</strong> All benchmark jobs run in parallel across hardware (H100, H200, B200, MI325X…).
-      Eval jobs only run on entries flagged <code>run_eval=true</code>.
-    </div>
-  </div>
-</div>
-
-<!-- ═══════════════════════════════════════════════════ SLIDE 2 -->
-<div class="slide">
-  <div class="slide-number">03 / 04</div>
-  <h2>Eval Selection Policy</h2>
-  <h1>How Buckets Are Made &amp; run_eval=true</h1>
-
-  <div style="display:flex;gap:20px;width:100%;max-width:1200px;align-items:flex-start;">
-
-    <!-- Col 1: Filters + Group key -->
-    <div style="flex:1;display:flex;flex-direction:column;gap:12px;">
-      <div class="card" style="margin-bottom:0;">
-        <h3>Pre-filters (hard gates)</h3>
-        <ul class="filter-list">
-          <li><span class="badge">PASS</span> Sequence length must be <strong>1k8k</strong> (ISL=1024, OSL=8192)</li>
-          <li><span class="badge red">SKIP</span> No top-level <code>tp</code> field (multinode / disaggregated)</li>
-          <li><span class="badge red">SKIP</span> 1k1k and 8k1k sequence lengths</li>
-        </ul>
-      </div>
-      <div class="card" style="margin-bottom:0;">
-        <h3>Group key — one bucket per unique combo</h3>
-        <div class="group-key">
-          <span class="tag">model</span><span class="tag">runner</span><span class="tag">framework</span>
-          <span class="tag">precision</span><span class="tag">isl</span><span class="tag">osl</span>
-          <span class="tag">spec_decoding</span><span class="tag">dp_attn</span>
-        </div>
-        <p style="font-size:0.72rem;color:#64748b;line-height:1.4;">MTP and non-MTP bucketed independently. dp_attn variants kept separate.</p>
-      </div>
-    </div>
-
-    <!-- Col 2: Real example -->
-    <div style="flex:1.3;min-width:0;">
-      <div class="card" style="margin-bottom:0;border-color:#3b4a6b;">
-        <h3 style="color:#94a3b8;">Real Example — DeepSeek-R1-0528 · B200 · TRT · FP4</h3>
-        <p style="font-size:0.68rem;color:#64748b;margin-bottom:8px;">
-          Bucket: <code style="color:#94a3b8;font-size:0.65rem;">(…FP4-V2, b200, trt, fp4, 1024, 8192, none, false)</code>
-          · spec_decoding omitted → <code>none</code> · step-size 2
-        </p>
-        <pre style="background:#0d1117;border:1px solid #1e2535;border-radius:6px;padding:8px 12px;font-size:0.67rem;color:#94a3b8;line-height:1.6;overflow:auto;margin-bottom:10px;">search-space:
-  - { tp: 4,             conc-start:  4, conc-end: 16 }
-  - { tp: 4, dp-attn: true, ... }  <span style="color:#4a5568;"># → different bucket</span>
-  - { tp: 8,             conc-start:  4, conc-end:  4 }
-  - { tp: 8, ep:8,       conc-start: 64, conc-end: 64 }
-  - { tp: 8, dp-attn: true, ... }  <span style="color:#4a5568;"># → different bucket</span></pre>
-        <table style="width:100%;border-collapse:collapse;font-size:0.76rem;">
-          <thead>
-            <tr style="border-bottom:1px solid #2d3a55;">
-              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">TP</th>
-              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">Conc</th>
-              <th style="text-align:left;padding:4px 8px;color:#64748b;font-weight:500;">run_eval</th>
-            </tr>
-          </thead>
-          <tbody>
-            <tr style="border-bottom:1px solid #1a2035;"><td style="padding:4px 8px;color:#64748b;">4</td><td style="padding:4px 8px;color:#64748b;">4, 8</td><td style="padding:4px 8px;color:#4a5568;">false</td></tr>
-            <tr style="border-bottom:1px solid #1a2035;background:#0d2210;"><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">4</td><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">16</td><td style="padding:4px 8px;"><span style="background:#0d2a1f;color:#76e3a0;border:1px solid #76e3a0;border-radius:4px;padding:1px 7px;font-size:0.68rem;font-weight:600;">true ✓</span></td></tr>
-            <tr style="border-bottom:1px solid #1a2035;"><td style="padding:4px 8px;color:#64748b;">8</td><td style="padding:4px 8px;color:#64748b;">4</td><td style="padding:4px 8px;color:#4a5568;">false</td></tr>
-            <tr style="background:#0d2210;"><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">8</td><td style="padding:4px 8px;color:#e2e8f0;font-weight:700;">64</td><td style="padding:4px 8px;"><span style="background:#0d2a1f;color:#76e3a0;border:1px solid #76e3a0;border-radius:4px;padding:1px 7px;font-size:0.68rem;font-weight:600;">true ✓</span></td></tr>
-          </tbody>
-        </table>
-        <p style="font-size:0.67rem;color:#4a5568;margin-top:6px;">dp_attn=true rows → separate bucket, own eval selections.</p>
-      </div>
-    </div>
-
-  </div>
-
-  <!-- Why only two TPs -->
-  <div style="width:100%;max-width:1200px;margin-top:14px;">
-    <div style="background:#12192a;border-left:3px solid #76e3a0;border-radius:6px;padding:10px 16px;font-size:0.76rem;color:#94a3b8;line-height:1.5;">
-      <strong style="color:#e2e8f0;">Why only min &amp; max TP?</strong>
-      &nbsp;Different TP degrees partition tensors across GPUs differently, changing floating-point accumulation order.
-      If accuracy holds at both extremes under maximum concurrency, intermediate TP values are assumed bounded by these results.
-    </div>
-  </div>
-</div>
-
-<!-- ═══════════════════════════════════════════════════ SLIDE 3 -->
-<div class="slide">
-  <div class="slide-number">04 / 04</div>
-  <h2>lm-eval Harness</h2>
-  <h1>Eval Tasks &amp; Metrics</h1>
-
-  <div class="metrics-grid">
-    <!-- GSM8K -->
-    <div class="task-card">
-      <div class="task-name">GSM8K</div>
-      <div class="task-desc">Grade school math word problems · 5-shot</div>
-      <div class="metric-item"><span class="m-name">Exact Match (strict)</span><span class="m-type">em_strict</span></div>
-      <div class="metric-item"><span class="m-name">Exact Match (flexible)</span><span class="m-type">em_flexible</span></div>
-      <div class="metric-item"><span class="m-name">Std error (strict)</span><span class="m-type">em_strict_se</span></div>
-      <div class="metric-item"><span class="m-name">Std error (flexible)</span><span class="m-type">em_flexible_se</span></div>
-      <div class="metric-item"><span class="m-name">Effective samples</span><span class="m-type">n_eff</span></div>
-      <div style="margin-top:12px;padding:10px;background:#12192a;border-radius:7px;font-size:0.72rem;color:#64748b;line-height:1.5;">
-        <strong style="color:#94a3b8;">Strict:</strong> regex extracts after <code>#### </code><br>
-        <strong style="color:#94a3b8;">Flexible:</strong> regex matches any number<br>
-        <strong style="color:#94a3b8;">temp=0</strong>, max_tokens=8192
-      </div>
-    </div>
-
-    <!-- GPQA Diamond -->
-    <div class="task-card">
-      <div class="task-name">GPQA Diamond</div>
-      <div class="task-desc">Graduate-level multiple choice · 2 repeats</div>
-      <div class="metric-item"><span class="m-name">Accuracy (A–D)</span><span class="m-type">accuracy</span></div>
-      <div class="metric-item"><span class="m-name">Standard error</span><span class="m-type">score_se</span></div>
-      <div class="metric-item"><span class="m-name">Effective samples</span><span class="m-type">n_eff</span></div>
-      <div style="margin-top:12px;padding:10px;background:#12192a;border-radius:7px;font-size:0.72rem;color:#64748b;line-height:1.5;">
-        <strong style="color:#94a3b8;">Dataset:</strong> Idavidrein/gpqa · gpqa_diamond split<br>
-        <strong style="color:#94a3b8;">Repeats:</strong> 2× randomized choice ordering (seeded)<br>
-        <strong style="color:#94a3b8;">Filter:</strong> extract last A–D letter · <strong style="color:#94a3b8;">temp=0</strong>
-      </div>
-    </div>
-
-    <!-- Metadata -->
-    <div class="task-card" style="flex:0.8;">
-      <div class="task-name">Run Metadata</div>
-      <div class="task-desc">Captured in meta_env.json per run</div>
-      <div class="metric-item"><span class="m-name">Model</span><span class="m-type">model</span></div>
-      <div class="metric-item"><span class="m-name">Hardware</span><span class="m-type">hw</span></div>
-      <div class="metric-item"><span class="m-name">Framework</span><span class="m-type">framework</span></div>
-      <div class="metric-item"><span class="m-name">Precision</span><span class="m-type">precision</span></div>
-      <div class="metric-item"><span class="m-name">Tensor Parallel</span><span class="m-type">tp</span></div>
-      <div class="metric-item"><span class="m-name">Concurrency</span><span class="m-type">conc</span></div>
-      <div class="metric-item"><span class="m-name">Spec Decoding</span><span class="m-type">spec_decoding</span></div>
-      <div class="metric-item"><span class="m-name">DP Attention</span><span class="m-type">dp_attention</span></div>
-    </div>
-  </div>
-
-  <div class="output-card">
-    <h3>Sample Output — collect-evals summary</h3>
-    <div style="overflow:hidden;">
-      <table style="width:100%;border-collapse:collapse;font-size:0.63rem;white-space:nowrap;table-layout:fixed;">
-        <thead>
-          <tr style="border-bottom:1px solid #2d3a55;background:#1a2035;">
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:7%;">Prefix</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">HW</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:7%;">Framework</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Prec</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Spec</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:4%;">TP</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:4%;">EP</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:center;width:5%;">Conc</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">DP Attn</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:left;width:6%;">Task</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">Score</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">EM Strict</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:9%;">EM Flexible</th>
-            <th style="padding:5px 7px;color:#64748b;font-weight:500;text-align:right;width:5%;">N eff</th>
-          </tr>
-        </thead>
-        <tbody>
-          <tr style="background:#0d1a10;">
-            <td style="padding:5px 7px;color:#e2e8f0;font-weight:600;">qwen3.5</td>
-            <td style="padding:5px 7px;color:#94a3b8;">H200</td>
-            <td style="padding:5px 7px;color:#94a3b8;">SGLANG</td>
-            <td style="padding:5px 7px;color:#94a3b8;">FP8</td>
-            <td style="padding:5px 7px;color:#94a3b8;">mtp</td>
-            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">8</td>
-            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">8</td>
-            <td style="padding:5px 7px;color:#94a3b8;text-align:center;">128</td>
-            <td style="padding:5px 7px;color:#94a3b8;">false</td>
-            <td style="padding:5px 7px;color:#94a3b8;">gsm8k</td>
-            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.97%<span style="color:#4a5568;font-weight:400;"> ±0.47%</span></td>
-            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.97%<span style="color:#4a5568;font-weight:400;"> ±0.47%</span></td>
-            <td style="padding:5px 7px;color:#76e3a0;font-weight:600;text-align:right;">96.89%<span style="color:#4a5568;font-weight:400;"> ±0.48%</span></td>
-            <td style="padding:5px 7px;color:#94a3b8;text-align:right;">1319</td>
-          </tr>
-        </tbody>
-      </table>
-    </div>
-  </div>
-</div>
-
-</body>
-</html>

From af9bda17bbe815486ac49c25e2f30cf2f3b178c9 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 22:36:50 -0700
Subject: [PATCH 12/19] Move speculative decoding settings into shell variables

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index d6a3fc6ed..561b63b30 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -24,6 +24,11 @@ SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
 
+# MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
+SPECULATIVE_NUM_STEPS=2
+SPECULATIVE_DRAFT_TOKENS=3
+SPECULATIVE_EAGLE_TOPK=1
+
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
 
 # Start GPU monitoring (power, temperature, clocks every second)
@@ -54,9 +59,9 @@ python3 -m sglang.launch_server \
   --disable-radix-cache \
   --trust-remote-code \
   --speculative-algorithm EAGLE \
-  --speculative-num-steps 2 \
-  --speculative-num-draft-tokens 3 \
-  --speculative-eagle-topk 1 \
+  --speculative-num-steps "$SPECULATIVE_NUM_STEPS" \
+  --speculative-num-draft-tokens "$SPECULATIVE_DRAFT_TOKENS" \
+  --speculative-eagle-topk "$SPECULATIVE_EAGLE_TOPK" \
   > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!

From 571512b487679e2002dfd6c429f9d1cd73fcae23 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Mon, 23 Mar 2026 22:59:29 -0700
Subject: [PATCH 13/19] fix: use required MAX_MODEL_LEN for --context-length instead of computed MAX_SEQ_LEN

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index 561b63b30..b3a06570b 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -10,7 +10,8 @@ check_env_vars \
     OSL \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
-    EP_SIZE 
+    EP_SIZE \
+    MAX_MODEL_LEN
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -22,14 +23,13 @@ hf download "$MODEL"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
-MAX_SEQ_LEN=$((ISL + OSL + 20))
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
 SPECULATIVE_NUM_STEPS=2
 SPECULATIVE_DRAFT_TOKENS=3
 SPECULATIVE_EAGLE_TOPK=1
 
-echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN"
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
@@ -49,7 +49,7 @@ python3 -m sglang.launch_server \
   --decode-log-interval 1 \
   --mem-fraction-static 0.8 \
   --cuda-graph-max-bs "$CONC" \
-  --context-length "$MAX_SEQ_LEN" \
+  --context-length "$MAX_MODEL_LEN" \
   --kv-cache-dtype fp8_e4m3 \
   --quantization fp8 \
   --attention-backend flashinfer \

From cc42d88cdfb2c4505bb81b4ce35ab26f91cb164e Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Wed, 25 Mar 2026 10:52:06 -0700
Subject: [PATCH 14/19] remove --decode-log-interval flag from server launch

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index b3a06570b..6874a073f 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -46,7 +46,6 @@ python3 -m sglang.launch_server \
   --enable-flashinfer-allreduce-fusion \
   --max-running-requests 128 \
   --chunked-prefill-size 16384 \
-  --decode-log-interval 1 \
   --mem-fraction-static 0.8 \
   --cuda-graph-max-bs "$CONC" \
   --context-length "$MAX_MODEL_LEN" \

From 4a35e5c98499b4b106d80ea4bcb23ea2571d2edc Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Wed, 25 Mar 2026 11:00:36 -0700
Subject: [PATCH 15/19] fix: add perf-changelog entries ahead of the Qwen3.5 MTP entry

---
 perf-changelog.yaml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2cc5f8e59..ddb404c65 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1041,9 +1041,36 @@
     - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch to prevent node sharing during benchmarks"
     - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934
+  
+- config-keys:
+    - qwen3.5-fp8-b200-sglang
+  description:
+    - "Replace FP8 with combination of TP4 and TP8 config"
+    - "Add --enable-flashinfer-allreduce-fusion to TP8"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918
+
+- config-keys:
+    - kimik2.5-int4-b200-vllm
+  description:
+    - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935
+
+- config-keys:
+    - dsr1-fp4-b200-sglang
+    - dsr1-fp8-b200-sglang
+    - dsr1-fp8-b200-sglang-mtp
+    - dsr1-fp8-h200-sglang
+  description:
+    - "Update SGLang image to v0.5.9-cu130 for all DSR1 SGLang configs"
+    - "dsr1-fp4-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130"
+    - "dsr1-fp8-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130"
+    - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130"
+    - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 
 
 - config-keys:
     - qwen3.5-fp8-h200-sglang-mtp
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921
+  
\ No newline at end of file

From 61d531d89256c7f6aa281e22bda08a94f41819b4 Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Wed, 25 Mar 2026 12:51:11 -0700
Subject: [PATCH 16/19] adjust trailing whitespace at end of perf-changelog.yaml

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1e3de1cc0..3e7bb771b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1092,4 +1092,4 @@
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921
-  
\ No newline at end of file
+ 
\ No newline at end of file

From 15443cd1e86d3b76bbc206d36bdba91bcd111f40 Mon Sep 17 00:00:00 2001
From: Ankur Singh <ankusingh@nvidia.com>
Date: Wed, 25 Mar 2026 13:01:49 -0700
Subject: [PATCH 17/19] Clean up perf-changelog.yaml by removing old entries

Removed deprecated configurations and updated descriptions for performance changelog.
---
 perf-changelog.yaml | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3e7bb771b..34d39a15d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1068,28 +1068,9 @@
     - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943
 
-- config-keys:
-    - kimik2.5-int4-b200-vllm
-  description:
-    - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935
-
-- config-keys:
-    - dsr1-fp4-b200-sglang
-    - dsr1-fp8-b200-sglang
-    - dsr1-fp8-b200-sglang-mtp
-    - dsr1-fp8-h200-sglang
-  description:
-    - "Update SGLang image to v0.5.9-cu130 for all DSR1 SGLang configs"
-    - "dsr1-fp4-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130"
-    - "dsr1-fp8-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130"
-    - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130"
-    - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 
-
 - config-keys:
     - qwen3.5-fp8-h200-sglang-mtp
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921
- 
\ No newline at end of file
+

From 75a416034313050537ad9ad1fc0f1e50221493ef Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Wed, 25 Mar 2026 14:43:50 -0700
Subject: [PATCH 18/19] update: raise SPECULATIVE_NUM_STEPS from 2 to 3

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index 6874a073f..fe856bd8a 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -25,7 +25,7 @@ SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
-SPECULATIVE_NUM_STEPS=2
+SPECULATIVE_NUM_STEPS=3
 SPECULATIVE_DRAFT_TOKENS=3
 SPECULATIVE_EAGLE_TOPK=1
 

From 1e93eae1b29babddeb3bae01f382182eeacdef0a Mon Sep 17 00:00:00 2001
From: hshrivastava-droid <hshrivastava@nvidia.com>
Date: Wed, 25 Mar 2026 14:48:30 -0700
Subject: [PATCH 19/19] update: raise SPECULATIVE_DRAFT_TOKENS from 3 to 4

---
 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
index fe856bd8a..fc2525f34 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
@@ -26,7 +26,7 @@ PORT=${PORT:-8888}
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
 SPECULATIVE_NUM_STEPS=3
-SPECULATIVE_DRAFT_TOKENS=3
+SPECULATIVE_DRAFT_TOKENS=4
 SPECULATIVE_EAGLE_TOPK=1
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN"