From 734c1bad8a3177394df15a5bbbee87d6508cbc0b Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 19 Mar 2026 18:55:07 -0700 Subject: [PATCH 01/19] Add Qwen3.5 h200 MTP --- .github/configs/nvidia-master.yaml | 22 +++++ .../single_node/qwen3.5_fp8_h200_mtp.sh | 89 +++++++++++++++++++ perf-changelog.yaml | 6 ++ 3 files changed, 117 insertions(+) create mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1dbc62841..0dec09f11 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2020,6 +2020,28 @@ qwen3.5-fp8-h200-sglang: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +qwen3.5-fp8-h200-sglang-mtp: + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: h200 + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + glm5-fp8-h200-sglang: image: lmsysorg/sglang:glm5-hopper model: zai-org/GLM-5-FP8 diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh new file mode 100755 index 000000000..4a54a1da6 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} +MAX_SEQ_LEN=$((ISL + OSL + 20)) + +echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +python3 -m sglang.launch_server \ + --model "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --tp "$TP" \ + --expert-parallel-size "$EP_SIZE" \ + --reasoning-parser qwen3 \ + --tool-call-parser qwen3_coder \ + --enable-flashinfer-allreduce-fusion \ + --max-running-requests 128 \ + --chunked-prefill-size 16384 \ + --decode-log-interval 1 \ + --mem-fraction-static 0.8 \ + --cuda-graph-max-bs "$CONC" \ + --context-length "$MAX_SEQ_LEN" \ + --kv-cache-dtype fp8_e4m3 \ + --quantization fp8 \ + --attention-backend flashinfer \ + --stream-interval 50 \ + --tokenizer-worker-num 6 \ + --mamba-ssm-dtype bfloat16 \ + --disable-radix-cache \ + --trust-remote-code \ + --speculative-algorithm EAGLE \ + --speculative-num-steps 2 \ + --speculative-num-draft-tokens 3 \ + --speculative-eagle-topk 1 \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c6a340e96..bd302b1ca 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -979,3 +979,9 @@ - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + +- config-keys: + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 3d806875ffe65a69f6bdb6bd95327974abd2bc02 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 19 Mar 2026 19:00:35 -0700 Subject: [PATCH 02/19] extend conc --- .github/configs/nvidia-master.yaml | 6 +++--- perf-changelog.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0dec09f11..00fa8271b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2032,15 +2032,15 @@ qwen3.5-fp8-h200-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - isl: 1024 osl: 8192 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-h200-sglang: image: lmsysorg/sglang:glm5-hopper diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bd302b1ca..5ae0bb20d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -984,4 +984,4 @@ - qwen3.5-fp8-h200-sglang-mtp description: - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 From 96a8f025457cf52098f15df936a89d5dd4f51652 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 19 Mar 2026 21:48:02 -0700 Subject: [PATCH 03/19] adding flag --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index 4a54a1da6..d6a3fc6ed 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -75,6 +75,7 @@ run_benchmark_serving \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ + --use-chat-template \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ From 0d13e556093cb651c7eb1f3bbf76741dcd2da550 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 15:42:46 -0700 Subject: [PATCH 04/19] Update perf-changelog.yaml --- perf-changelog.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c2a7846b4..9140f073d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1040,4 +1040,5 @@ - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 \ No newline at end of file + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + From 0ce06cfb6055d980517265c2e06ca9275f968f64 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 15:47:55 -0700 Subject: [PATCH 05/19] add new line --- perf-changelog.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9140f073d..5dca29afc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1042,3 +1042,5 @@ - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + + From 5dd23085e71486389e4cbf3da2459fc4a25f1976 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 15:49:53 -0700 Subject: [PATCH 06/19] Update perf-changelog.yaml --- perf-changelog.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5dca29afc..9140f073d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1042,5 +1042,3 @@ - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - - From 14a584f10052e680dadb378c5441e49fc7b55db5 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 15:57:05 -0700 Subject: [PATCH 07/19] add new line --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9140f073d..f94d20efd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1041,4 +1041,3 @@ - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - From 55a6d039f87b8d47902e291c929aa05b47bbb9a1 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 15:58:46 -0700 Subject: [PATCH 08/19] Update perf-changelog.yaml --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f94d20efd..9140f073d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1041,3 +1041,4 @@ - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + From 2e489b92fe57b6e74e31b9d2cb5ad9a5ca1b49ca Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 16:12:13 -0700 Subject: [PATCH 09/19] fix: perf bug --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9140f073d..3b8d63dc3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1042,3 +1042,9 @@ - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 +- config-keys: + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 + From cbe069dee77f315defc21a25918854c907f2e9b8 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 16:22:08 -0700 Subject: [PATCH 10/19] fix:perf --- docs/accuracy_evals_slides.html | 436 ++++++++++++++++++++++++++++++++ perf-changelog.yaml | 19 +- 2 files changed, 445 insertions(+), 10 deletions(-) create mode 100644 docs/accuracy_evals_slides.html diff --git a/docs/accuracy_evals_slides.html b/docs/accuracy_evals_slides.html new file mode 100644 index 000000000..46b06da2c --- /dev/null +++ b/docs/accuracy_evals_slides.html @@ -0,0 +1,436 @@ + + + + + +InferenceX Accuracy Evals + + + + + +
+
01 / 04
+

LLM Evaluation Landscape

+

Common Benchmark Categories

+ +
+
+ What are LLM Evals? +  Standardized benchmarks used to measure a language model's capabilities — from reasoning and knowledge to coding and safety. + Each eval tests a specific skill using curated datasets, scoring model outputs against known correct answers to produce comparable, reproducible metrics. +
+
+ +
+ + +
+
+

🧮 Math & Logical Reasoning

+
    +
  • GSM8K ★ used
    Multi-step math word problems
  • +
  • MATH
    Competition-level mathematics
  • +
  • ARC
    Scientific reasoning, grade-school science
  • +
  • BBH
    BIG-bench Hard — complex logic tasks
  • +
+
+
+

💻 Coding & Technical

+
    +
  • HumanEval
    Python code generation correctness
  • +
  • MBPP
    Crowd-sourced Python problems
  • +
+
+
+ + +
+
+

📚 Knowledge & Language

+
    +
  • MMLU
    57 subjects — STEM to humanities
  • +
  • HellaSwag
    Commonsense sentence completion
  • +
  • WinoGrande
    Context-aware pronoun resolution
  • +
  • SuperGLUE
    Comprehensive NLU suite
  • +
+
+
+

🛡️ Safety & Human Preference

+
    +
  • TruthfulQA
    Factual accuracy, anti-hallucination
  • +
  • Chatbot Arena
    Elo-rated human pairwise comparisons
  • +
  • MT-Bench
    Multi-turn dialogue evaluation
  • +
+
+
+ + +
+
+

🔬 Specialized & Advanced

+
    +
  • GPQA-Diamond ★ used
    Expert-curated science questions
  • +
  • DROP
    Reading comprehension + discrete ops
  • +
  • FinanceBench
    Financial analysis (earnings reports)
  • +
+
+ + +
+

★ Used in InferenceX

+
+
GSM8K — math word problems, 5-shot, exact match
+
GPQA-Diamond — expert science MCQ, 2 repeats, A-D accuracy
+
+
+
+ +
+
+ + +
+
02 / 04
+

InferenceX Accuracy Evals

+

End-to-End Workflow

+ +
+
+ Purpose of Eval: +  Verify that throughput optimizations — TP degree, concurrency, and spec-decoding — do not degrade model accuracy. +
+
+ +
+
+
📄
+
Input
+
perf-changelog.yaml
+
push / PR trigger
+
+
+
+
⚙️
+
Config
+
generate sweep configs
+
build matrix
mark run_eval=true
+
+
+
+
🖥️
+
Serve
+
Inference Server
+
SGLang / vLLM / TRT
port 8888
+
+
+
+
📊
+
Benchmark
+
Throughput Run
+
results*.json
+
+
+
+
🧪
+
Eval
+
lm-eval Harness
+
gsm8k + gpqa
/v1/chat/completions
+
+
+
+
🗂️
+
Collect
+
collect-evals.yml
+
parse artifacts
agg_eval_*.json
+
+
+
+
🗄️
+
Ingest
+
InferenceX-app DB
+
webhook dispatch
+
+
+ +
+
+ Parallelism: All benchmark jobs run in parallel across hardware (H100, H200, B200, MI325X…). + Eval jobs only run on entries flagged run_eval=true. +
+
+
+ + +
+
03 / 04
+

Eval Selection Policy

+

How Buckets Are Made & run_eval=true

+ +
+ + +
+
+

Pre-filters (hard gates)

+
    +
  • PASS Sequence length must be 1k8k (ISL=1024, OSL=8192)
  • +
  • SKIP No top-level tp field (multinode / disaggregated)
  • +
  • SKIP 1k1k and 8k1k sequence lengths
  • +
+
+
+

Group key — one bucket per unique combo

+
+ modelrunnerframework + precisionislosl + spec_decodingdp_attn +
+

MTP and non-MTP bucketed independently. dp_attn variants kept separate.

+
+
+ + +
+
+

Real Example — DeepSeek-R1-0528 · B200 · TRT · FP4

+

+ Bucket: (…FP4-V2, b200, trt, fp4, 1024, 8192, none, false) + · spec_decoding omitted → none · step-size 2 +

+
search-space:
+  - { tp: 4,             conc-start:  4, conc-end: 16 }
+  - { tp: 4, dp-attn: true, ... }  # → different bucket
+  - { tp: 8,             conc-start:  4, conc-end:  4 }
+  - { tp: 8, ep:8,       conc-start: 64, conc-end: 64 }
+  - { tp: 8, dp-attn: true, ... }  # → different bucket
+ + + + + + + + + + + + + + +
TPConcrun_eval
44, 8false
416true ✓
84false
864true ✓
+

dp_attn=true rows → separate bucket, own eval selections.

+
+
+ +
+ + +
+
+ Why only min & max TP? +  Different TP degrees partition tensors across GPUs differently, changing floating-point accumulation order. + If accuracy holds at both extremes under maximum concurrency, intermediate TP values are assumed bounded by these results. +
+
+
+ + +
+
04 / 04
+

lm-eval Harness

+

Eval Tasks & Metrics

+ +
+ +
+
GSM8K
+
Grade school math word problems · 5-shot
+
Exact Match (strict)em_strict
+
Exact Match (flexible)em_flexible
+
Std error (strict)em_strict_se
+
Std error (flexible)em_flexible_se
+
Effective samplesn_eff
+
+ Strict: regex extracts after ####
+ Flexible: regex matches any number
+ temp=0, max_tokens=8192 +
+
+ + +
+
GPQA Diamond
+
Graduate-level multiple choice · 2 repeats
+
Accuracy (A–D)accuracy
+
Standard errorscore_se
+
Effective samplesn_eff
+
+ Dataset: Idavidrein/gpqa · gpqa_diamond split
+ Repeats: 2× randomized choice ordering (seeded)
+ Filter: extract last A–D letter · temp=0 +
+
+ + +
+
Run Metadata
+
Captured in meta_env.json per run
+
Modelmodel
+
Hardwarehw
+
Frameworkframework
+
Precisionprecision
+
Tensor Paralleltp
+
Concurrencyconc
+
Spec Decodingspec_decoding
+
DP Attentiondp_attention
+
+
+ +
+

Sample Output — collect-evals summary

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PrefixHWFrameworkPrecSpecTPEPConcDP AttnTaskScoreEM StrictEM FlexibleN eff
qwen3.5H200SGLANGFP8mtp88128falsegsm8k96.97% ±0.47%96.97% ±0.47%96.89% ±0.48%1319
+
+
+
+ + + diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3b8d63dc3..2cc5f8e59 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -984,6 +984,15 @@ - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Add GLM-5 FP8 SGLang H200 single-node benchmark" + - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" + - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" + - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + - config-keys: - glm5-fp8-b200-sglang description: @@ -1033,18 +1042,8 @@ - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 -- config-keys: - - glm5-fp8-h200-sglang - description: - - "Add GLM-5 FP8 SGLang H200 single-node benchmark" - - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - - config-keys: - qwen3.5-fp8-h200-sglang-mtp description: - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 - From 250e8501c5fe11ba02a086bccb798c79c62a3025 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 16:25:24 -0700 Subject: [PATCH 11/19] Delete docs/accuracy_evals_slides.html --- docs/accuracy_evals_slides.html | 436 -------------------------------- 1 file changed, 436 deletions(-) delete mode 100644 docs/accuracy_evals_slides.html diff --git a/docs/accuracy_evals_slides.html b/docs/accuracy_evals_slides.html deleted file mode 100644 index 46b06da2c..000000000 --- a/docs/accuracy_evals_slides.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - -InferenceX Accuracy Evals - - - - - -
-
01 / 04
-

LLM Evaluation Landscape

-

Common Benchmark Categories

- -
-
- What are LLM Evals? -  Standardized benchmarks used to measure a language model's capabilities — from reasoning and knowledge to coding and safety. - Each eval tests a specific skill using curated datasets, scoring model outputs against known correct answers to produce comparable, reproducible metrics. -
-
- -
- - -
-
-

🧮 Math & Logical Reasoning

-
    -
  • GSM8K ★ used
    Multi-step math word problems
  • -
  • MATH
    Competition-level mathematics
  • -
  • ARC
    Scientific reasoning, grade-school science
  • -
  • BBH
    BIG-bench Hard — complex logic tasks
  • -
-
-
-

💻 Coding & Technical

-
    -
  • HumanEval
    Python code generation correctness
  • -
  • MBPP
    Crowd-sourced Python problems
  • -
-
-
- - -
-
-

📚 Knowledge & Language

-
    -
  • MMLU
    57 subjects — STEM to humanities
  • -
  • HellaSwag
    Commonsense sentence completion
  • -
  • WinoGrande
    Context-aware pronoun resolution
  • -
  • SuperGLUE
    Comprehensive NLU suite
  • -
-
-
-

🛡️ Safety & Human Preference

-
    -
  • TruthfulQA
    Factual accuracy, anti-hallucination
  • -
  • Chatbot Arena
    Elo-rated human pairwise comparisons
  • -
  • MT-Bench
    Multi-turn dialogue evaluation
  • -
-
-
- - -
-
-

🔬 Specialized & Advanced

-
    -
  • GPQA-Diamond ★ used
    Expert-curated science questions
  • -
  • DROP
    Reading comprehension + discrete ops
  • -
  • FinanceBench
    Financial analysis (earnings reports)
  • -
-
- - -
-

★ Used in InferenceX

-
-
GSM8K — math word problems, 5-shot, exact match
-
GPQA-Diamond — expert science MCQ, 2 repeats, A-D accuracy
-
-
-
- -
-
- - -
-
02 / 04
-

InferenceX Accuracy Evals

-

End-to-End Workflow

- -
-
- Purpose of Eval: -  Verify that throughput optimizations — TP degree, concurrency, and spec-decoding — do not degrade model accuracy. -
-
- -
-
-
📄
-
Input
-
perf-changelog.yaml
-
push / PR trigger
-
-
-
-
⚙️
-
Config
-
generate sweep configs
-
build matrix
mark run_eval=true
-
-
-
-
🖥️
-
Serve
-
Inference Server
-
SGLang / vLLM / TRT
port 8888
-
-
-
-
📊
-
Benchmark
-
Throughput Run
-
results*.json
-
-
-
-
🧪
-
Eval
-
lm-eval Harness
-
gsm8k + gpqa
/v1/chat/completions
-
-
-
-
🗂️
-
Collect
-
collect-evals.yml
-
parse artifacts
agg_eval_*.json
-
-
-
-
🗄️
-
Ingest
-
InferenceX-app DB
-
webhook dispatch
-
-
- -
-
- Parallelism: All benchmark jobs run in parallel across hardware (H100, H200, B200, MI325X…). - Eval jobs only run on entries flagged run_eval=true. -
-
-
- - -
-
03 / 04
-

Eval Selection Policy

-

How Buckets Are Made & run_eval=true

- -
- - -
-
-

Pre-filters (hard gates)

-
    -
  • PASS Sequence length must be 1k8k (ISL=1024, OSL=8192)
  • -
  • SKIP No top-level tp field (multinode / disaggregated)
  • -
  • SKIP 1k1k and 8k1k sequence lengths
  • -
-
-
-

Group key — one bucket per unique combo

-
- modelrunnerframework - precisionislosl - spec_decodingdp_attn -
-

MTP and non-MTP bucketed independently. dp_attn variants kept separate.

-
-
- - -
-
-

Real Example — DeepSeek-R1-0528 · B200 · TRT · FP4

-

- Bucket: (…FP4-V2, b200, trt, fp4, 1024, 8192, none, false) - · spec_decoding omitted → none · step-size 2 -

-
search-space:
-  - { tp: 4,             conc-start:  4, conc-end: 16 }
-  - { tp: 4, dp-attn: true, ... }  # → different bucket
-  - { tp: 8,             conc-start:  4, conc-end:  4 }
-  - { tp: 8, ep:8,       conc-start: 64, conc-end: 64 }
-  - { tp: 8, dp-attn: true, ... }  # → different bucket
- - - - - - - - - - - - - - -
TPConcrun_eval
44, 8false
416true ✓
84false
864true ✓
-

dp_attn=true rows → separate bucket, own eval selections.

-
-
- -
- - -
-
- Why only min & max TP? -  Different TP degrees partition tensors across GPUs differently, changing floating-point accumulation order. - If accuracy holds at both extremes under maximum concurrency, intermediate TP values are assumed bounded by these results. -
-
-
- - -
-
04 / 04
-

lm-eval Harness

-

Eval Tasks & Metrics

- -
- -
-
GSM8K
-
Grade school math word problems · 5-shot
-
Exact Match (strict)em_strict
-
Exact Match (flexible)em_flexible
-
Std error (strict)em_strict_se
-
Std error (flexible)em_flexible_se
-
Effective samplesn_eff
-
- Strict: regex extracts after ####
- Flexible: regex matches any number
- temp=0, max_tokens=8192 -
-
- - -
-
GPQA Diamond
-
Graduate-level multiple choice · 2 repeats
-
Accuracy (A–D)accuracy
-
Standard errorscore_se
-
Effective samplesn_eff
-
- Dataset: Idavidrein/gpqa · gpqa_diamond split
- Repeats: 2× randomized choice ordering (seeded)
- Filter: extract last A–D letter · temp=0 -
-
- - -
-
Run Metadata
-
Captured in meta_env.json per run
-
Modelmodel
-
Hardwarehw
-
Frameworkframework
-
Precisionprecision
-
Tensor Paralleltp
-
Concurrencyconc
-
Spec Decodingspec_decoding
-
DP Attentiondp_attention
-
-
- -
-

Sample Output — collect-evals summary

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PrefixHWFrameworkPrecSpecTPEPConcDP AttnTaskScoreEM StrictEM FlexibleN eff
qwen3.5H200SGLANGFP8mtp88128falsegsm8k96.97% ±0.47%96.97% ±0.47%96.89% ±0.48%1319
-
-
-
- - - From af9bda17bbe815486ac49c25e2f30cf2f3b178c9 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 22:36:50 -0700 Subject: [PATCH 12/19] change env variable --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index d6a3fc6ed..561b63b30 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -24,6 +24,11 @@ SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) +# MTP (Multi-Token Prediction) Config - EAGLE speculative decoding +SPECULATIVE_NUM_STEPS=2 +SPECULATIVE_DRAFT_TOKENS=3 +SPECULATIVE_EAGLE_TOPK=1 + echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" # Start GPU monitoring (power, temperature, clocks every second) @@ -54,9 +59,9 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --trust-remote-code \ --speculative-algorithm EAGLE \ - --speculative-num-steps 2 \ - --speculative-num-draft-tokens 3 \ - --speculative-eagle-topk 1 \ + --speculative-num-steps "$SPECULATIVE_NUM_STEPS" \ + --speculative-num-draft-tokens "$SPECULATIVE_DRAFT_TOKENS" \ + --speculative-eagle-topk "$SPECULATIVE_EAGLE_TOPK" \ > "$SERVER_LOG" 2>&1 & SERVER_PID=$! From 571512b487679e2002dfd6c429f9d1cd73fcae23 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Mon, 23 Mar 2026 22:59:29 -0700 Subject: [PATCH 13/19] fix: max seq len --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index 561b63b30..b3a06570b 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -10,7 +10,8 @@ check_env_vars \ OSL \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ - EP_SIZE + EP_SIZE \ + MAX_MODEL_LEN if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -22,14 +23,13 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -MAX_SEQ_LEN=$((ISL + OSL + 20)) # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding SPECULATIVE_NUM_STEPS=2 SPECULATIVE_DRAFT_TOKENS=3 SPECULATIVE_EAGLE_TOPK=1 -echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" +echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN" # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -49,7 +49,7 @@ python3 -m sglang.launch_server \ --decode-log-interval 1 \ --mem-fraction-static 0.8 \ --cuda-graph-max-bs "$CONC" \ - --context-length "$MAX_SEQ_LEN" \ + --context-length "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8_e4m3 \ --quantization fp8 \ --attention-backend flashinfer \ From cc42d88cdfb2c4505bb81b4ce35ab26f91cb164e Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 25 Mar 2026 10:52:06 -0700 Subject: [PATCH 14/19] remove extra flag --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index b3a06570b..6874a073f 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -46,7 +46,6 @@ python3 -m sglang.launch_server \ --enable-flashinfer-allreduce-fusion \ --max-running-requests 128 \ --chunked-prefill-size 16384 \ - --decode-log-interval 1 \ --mem-fraction-static 0.8 \ --cuda-graph-max-bs "$CONC" \ --context-length "$MAX_MODEL_LEN" \ From 4a35e5c98499b4b106d80ea4bcb23ea2571d2edc Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 25 Mar 2026 11:00:36 -0700 Subject: [PATCH 15/19] fix:perf --- perf-changelog.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2cc5f8e59..ddb404c65 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1041,9 +1041,36 @@ - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch to prevent node sharing during benchmarks" - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 with combination of TP4 and TP8 config" + - "Add --enable-flashinfer-allreduce-fusion to TP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + +- config-keys: + - kimik2.5-int4-b200-vllm + description: + - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 + +- config-keys: + - dsr1-fp4-b200-sglang + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-h200-sglang + description: + - "Update SGLang image to v0.5.9-cu130 for all DSR1 SGLang configs" + - "dsr1-fp4-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130" + - "dsr1-fp8-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130" + - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" + - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - config-keys: - qwen3.5-fp8-h200-sglang-mtp description: - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 + \ No newline at end of file From 61d531d89256c7f6aa281e22bda08a94f41819b4 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 25 Mar 2026 12:51:11 -0700 Subject: [PATCH 16/19] add new line --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1e3de1cc0..3e7bb771b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1092,4 +1092,4 @@ description: - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 - \ No newline at end of file + \ No newline at end of file From 15443cd1e86d3b76bbc206d36bdba91bcd111f40 Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Wed, 25 Mar 2026 13:01:49 -0700 Subject: [PATCH 17/19] Clean up perf-changelog.yaml by removing old entries Removed deprecated configurations and updated descriptions for performance changelog. --- perf-changelog.yaml | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3e7bb771b..34d39a15d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1068,28 +1068,9 @@ - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 -- config-keys: - - kimik2.5-int4-b200-vllm - description: - - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 - -- config-keys: - - dsr1-fp4-b200-sglang - - dsr1-fp8-b200-sglang - - dsr1-fp8-b200-sglang-mtp - - dsr1-fp8-h200-sglang - description: - - "Update SGLang image to v0.5.9-cu130 for all DSR1 SGLang configs" - - "dsr1-fp4-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130" - - "dsr1-fp8-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu130" - - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - - config-keys: - qwen3.5-fp8-h200-sglang-mtp description: - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/921 - \ No newline at end of file + From 75a416034313050537ad9ad1fc0f1e50221493ef Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 25 Mar 2026 14:43:50 -0700 Subject: [PATCH 18/19] update:spec num --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index 6874a073f..fe856bd8a 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -25,7 +25,7 @@ SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding -SPECULATIVE_NUM_STEPS=2 +SPECULATIVE_NUM_STEPS=3 SPECULATIVE_DRAFT_TOKENS=3 SPECULATIVE_EAGLE_TOPK=1 From 1e93eae1b29babddeb3bae01f382182eeacdef0a Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 25 Mar 2026 14:48:30 -0700 Subject: [PATCH 19/19] update: spec token --- benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh index fe856bd8a..fc2525f34 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh @@ -26,7 +26,7 @@ PORT=${PORT:-8888} # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding SPECULATIVE_NUM_STEPS=3 -SPECULATIVE_DRAFT_TOKENS=3 +SPECULATIVE_DRAFT_TOKENS=4 SPECULATIVE_EAGLE_TOPK=1 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_MODEL_LEN: $MAX_MODEL_LEN"