Skip to content

Commit 7d9bba8

Browse files
committed
add new cases for ckpts on HF
Signed-off-by: noeyy-mino <174223378+noeyy-mino@users.noreply.github.com>
1 parent d78797b commit 7d9bba8

File tree

2 files changed

+200
-74
lines changed

2 files changed

+200
-74
lines changed

tests/_test_utils/deploy_utils.py

Lines changed: 160 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
# limitations under the License.
1515

1616
import itertools
17+
import os
1718
import subprocess
19+
import sys
1820

1921
import pytest
2022
import torch
@@ -61,6 +63,121 @@ def get_available_backends():
6163
return _AVAILABLE_BACKENDS
6264

6365

66+
def _run_trtllm_deploy(
    model_id: str,
    tensor_parallel_size: int,
    mini_sm: int,
    attn_backend: str,
    base_model: str,
    eagle3_one_model: bool,
) -> None:
    """Top-level entry for subprocess: run TensorRT-LLM deploy in a child process."""
    try:
        # Build the deployer for the trtllm backend and run its deploy step.
        ModelDeployer(
            backend="trtllm",
            model_id=model_id,
            tensor_parallel_size=tensor_parallel_size,
            mini_sm=mini_sm,
            attn_backend=attn_backend,
            base_model=base_model,
            eagle3_one_model=eagle3_one_model,
        )._deploy_trtllm_impl()
    except Exception:
        # Surface the full traceback on this process's stderr, then fail the
        # test with the same text so the parent sees a useful message.
        import traceback

        tb_text = traceback.format_exc()
        print(tb_text, file=sys.stderr, end="")
        pytest.fail(tb_text)
91+
92+
93+
def _run_vllm_deploy(
    model_id: str,
    tensor_parallel_size: int,
    mini_sm: int,
    attn_backend: str,
    base_model: str,
    eagle3_one_model: bool,
) -> None:
    """Top-level entry for subprocess: run vLLM deploy in a child process."""
    try:
        # Build the deployer for the vllm backend and run its deploy step.
        ModelDeployer(
            backend="vllm",
            model_id=model_id,
            tensor_parallel_size=tensor_parallel_size,
            mini_sm=mini_sm,
            attn_backend=attn_backend,
            base_model=base_model,
            eagle3_one_model=eagle3_one_model,
        )._deploy_vllm_impl()
    except Exception:
        # Surface the full traceback on this process's stderr, then fail the
        # test with the same text so the parent sees a useful message.
        import traceback

        tb_text = traceback.format_exc()
        print(tb_text, file=sys.stderr, end="")
        pytest.fail(tb_text)
118+
119+
120+
def _run_sglang_deploy(
    model_id: str,
    tensor_parallel_size: int,
    mini_sm: int,
    attn_backend: str,
    base_model: str,
    eagle3_one_model: bool,
) -> None:
    """Top-level entry for subprocess: run SGLang deploy in a child process."""
    try:
        # Build the deployer for the sglang backend and run its deploy step.
        ModelDeployer(
            backend="sglang",
            model_id=model_id,
            tensor_parallel_size=tensor_parallel_size,
            mini_sm=mini_sm,
            attn_backend=attn_backend,
            base_model=base_model,
            eagle3_one_model=eagle3_one_model,
        )._deploy_sglang_impl()
    except Exception:
        # Surface the full traceback on this process's stderr, then fail the
        # test with the same text so the parent sees a useful message.
        import traceback

        tb_text = traceback.format_exc()
        print(tb_text, file=sys.stderr, end="")
        pytest.fail(tb_text)
145+
146+
147+
def _run_deploy_via_subprocess(
    backend: str,
    model_id: str,
    tensor_parallel_size: int,
    mini_sm: int,
    attn_backend: str,
    base_model: str,
    eagle3_one_model: bool,
) -> None:
    """Run deploy in a subprocess and print its stdout/stderr so pytest capture=tee-sys captures to DB."""
    # tests/ directory (this module lives in tests/_test_utils/) and repo root.
    tests_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    project_root = os.path.dirname(tests_dir)

    # Make the project importable in the child by prepending the repo root.
    child_env = dict(os.environ)
    child_env["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")

    # Small driver script: import the per-backend entry point and call it with
    # the serialized arguments.
    script = f"""from _test_utils.deploy_utils import _run_{backend}_deploy
_run_{backend}_deploy(
    {model_id!r}, {tensor_parallel_size}, {mini_sm}, {attn_backend!r}, {base_model!r}, {eagle3_one_model}
)
"""
    # Merge stderr into stdout so all child output lands in one stream.
    completed = subprocess.run(
        [sys.executable, "-c", script],
        cwd=tests_dir,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    if completed.stdout:
        # Echo child output so pytest's tee-sys capture records it.
        print(completed.stdout, end="", flush=True)
    # Propagate a non-zero exit as CalledProcessError (includes the output).
    completed.check_returncode()
179+
180+
64181
# Common test prompts for all backends
65182
COMMON_PROMPTS = [
66183
"Hello, my name is",
@@ -114,95 +231,70 @@ def run(self):
114231
if torch.cuda.device_count() < self.tensor_parallel_size:
115232
pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
116233
return
117-
if self.backend == "vllm":
118-
self._deploy_vllm()
119-
elif self.backend == "trtllm":
120-
self._deploy_trtllm()
121-
elif self.backend == "sglang":
122-
self._deploy_sglang()
234+
235+
print(f"Deploying model: {self.model_id} with backend: {self.backend}")
236+
print(f"Tensor parallel size: {self.tensor_parallel_size}")
237+
# Use subprocess + capture so pytest capture=tee-sys (and DB plugins) see deploy output.
238+
if self.backend in ("vllm", "trtllm", "sglang"):
239+
_run_deploy_via_subprocess(
240+
backend=self.backend,
241+
model_id=self.model_id,
242+
tensor_parallel_size=self.tensor_parallel_size,
243+
mini_sm=self.mini_sm,
244+
attn_backend=self.attn_backend,
245+
base_model=self.base_model,
246+
eagle3_one_model=self.eagle3_one_model,
247+
)
123248
else:
124249
raise ValueError(f"Unknown backend: {self.backend}")
125-
# check gpu status
126-
gpu_status = subprocess.run(
127-
"nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
128-
)
129-
print("\n=== GPU Status Before Test ===")
130-
print(gpu_status.stdout)
131-
print("=============================\n")
132250

133-
def _deploy_trtllm(self):
134-
"""Deploy a model using TensorRT-LLM."""
251+
def _deploy_trtllm_impl(self):
252+
"""Run TensorRT-LLM deploy (used by subprocess in run())."""
135253
from tensorrt_llm import LLM, SamplingParams
136254
from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
137255

138256
sampling_params = SamplingParams(max_tokens=32)
139-
spec_config = None
140-
llm = None
141-
kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
142-
143-
if self.model_id in (
257+
qwen3_models = (
258+
"nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
259+
"nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
260+
)
261+
nemotron_models = (
144262
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
145263
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
146-
):
147-
llm = LLM(
148-
model=self.model_id,
149-
tensor_parallel_size=self.tensor_parallel_size,
150-
enable_attention_dp=False,
151-
attn_backend=self.attn_backend,
152-
trust_remote_code=True,
153-
max_batch_size=8,
154-
kv_cache_config=KvCacheConfig(
155-
enable_block_reuse=False,
156-
mamba_ssm_cache_dtype="float32",
157-
),
158-
)
159-
elif self.model_id == "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16":
160-
spec_config = EagleDecodingConfig(
161-
max_draft_len=3,
162-
speculative_model_dir=self.model_id,
163-
eagle3_one_model=self.eagle3_one_model,
164-
)
165-
llm = LLM(
166-
model=self.model_id,
167-
tensor_parallel_size=self.tensor_parallel_size,
168-
enable_attention_dp=False,
169-
attn_backend=self.attn_backend,
170-
trust_remote_code=True,
171-
max_batch_size=8,
172-
speculative_config=spec_config,
173-
kv_cache_config=KvCacheConfig(
174-
enable_block_reuse=False,
175-
mamba_ssm_cache_dtype="float32",
176-
),
177-
)
178-
elif "eagle" in self.model_id.lower():
264+
)
265+
kv_cache_config = KvCacheConfig(
266+
enable_block_reuse=self.model_id not in qwen3_models,
267+
free_gpu_memory_fraction=0.8,
268+
mamba_ssm_cache_dtype="float32" if self.model_id not in nemotron_models else "auto",
269+
)
270+
base_kw = {
271+
"tensor_parallel_size": self.tensor_parallel_size,
272+
"enable_attention_dp": False,
273+
"attn_backend": self.attn_backend,
274+
"trust_remote_code": True,
275+
"max_batch_size": 8,
276+
}
277+
278+
if "eagle" in self.model_id.lower():
179279
spec_config = EagleDecodingConfig(
180280
max_draft_len=3,
181281
speculative_model_dir=self.model_id,
182282
eagle3_one_model=self.eagle3_one_model,
183283
)
184-
cuda_graph = CudaGraphConfig(
185-
max_batch_size=1,
186-
)
187284
llm = LLM(
188285
model=self.base_model,
189-
tensor_parallel_size=self.tensor_parallel_size,
190-
enable_attention_dp=False,
191286
disable_overlap_scheduler=True,
192287
enable_autotuner=False,
193288
speculative_config=spec_config,
194-
cuda_graph_config=cuda_graph,
289+
cuda_graph_config=CudaGraphConfig(max_batch_size=1),
195290
kv_cache_config=kv_cache_config,
291+
**base_kw,
196292
)
197293
else:
198294
llm = LLM(
199295
model=self.model_id,
200-
tensor_parallel_size=self.tensor_parallel_size,
201-
enable_attention_dp=False,
202-
attn_backend=self.attn_backend,
203-
trust_remote_code=True,
204-
max_batch_size=8,
205296
kv_cache_config=kv_cache_config,
297+
**base_kw,
206298
)
207299

208300
outputs = llm.generate(COMMON_PROMPTS, sampling_params)
@@ -213,8 +305,8 @@ def _deploy_trtllm(self):
213305
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
214306
del llm
215307

216-
def _deploy_vllm(self):
217-
"""Deploy a model using vLLM."""
308+
def _deploy_vllm_impl(self):
309+
"""Run vLLM deploy (used by subprocess in run())."""
218310
from vllm import LLM, SamplingParams
219311

220312
quantization_method = "modelopt"
@@ -247,8 +339,8 @@ def _deploy_vllm(self):
247339
print("-" * 50)
248340
del llm
249341

250-
def _deploy_sglang(self):
251-
"""Deploy a model using SGLang."""
342+
def _deploy_sglang_impl(self):
343+
"""Run SGLang deploy (used by subprocess in run())."""
252344
import sglang as sgl
253345

254346
quantization_method = "modelopt"

tests/examples/llm_ptq/test_deploy.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,9 @@ def test_llama(command):
224224
mini_sm=89,
225225
),
226226
*ModelDeployerList(
227-
model_id="nvidia/QwQ-32B-NVFP4", backend=("trtllm", "vllm", "sglang"), mini_sm=100
227+
model_id="nvidia/QwQ-32B-NVFP4",
228+
backend=("trtllm", "vllm", "sglang"),
229+
mini_sm=100,
228230
),
229231
*ModelDeployerList(
230232
model_id="nvidia/Qwen3-32B-NVFP4",
@@ -262,6 +264,30 @@ def test_llama(command):
262264
tensor_parallel_size=8,
263265
mini_sm=100,
264266
),
267+
*ModelDeployerList(
268+
model_id="nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4",
269+
backend=("trtllm", "vllm", "sglang"),
270+
tensor_parallel_size=8,
271+
mini_sm=100,
272+
),
273+
*ModelDeployerList(
274+
model_id="nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4",
275+
backend=("trtllm", "vllm", "sglang"),
276+
tensor_parallel_size=8,
277+
mini_sm=100,
278+
),
279+
*ModelDeployerList(
280+
model_id="nvidia/Qwen3-235B-A22B-Thinking-2507-NVFP4",
281+
backend=("trtllm", "vllm", "sglang"),
282+
tensor_parallel_size=8,
283+
mini_sm=100,
284+
),
285+
*ModelDeployerList(
286+
model_id="nvidia/Qwen3.5-397B-A17B-NVFP4",
287+
backend=("trtllm", "vllm", "sglang"),
288+
tensor_parallel_size=8,
289+
mini_sm=100,
290+
),
265291
],
266292
ids=idfn,
267293
)
@@ -273,7 +299,9 @@ def test_qwen(command):
273299
"command",
274300
[
275301
*ModelDeployerList(
276-
model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", backend=("trtllm", "vllm", "sglang")
302+
model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8",
303+
backend=("trtllm", "vllm", "sglang"),
304+
mini_sm=89,
277305
),
278306
*ModelDeployerList(
279307
model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-NVFP4",
@@ -375,6 +403,12 @@ def test_phi(command):
375403
tensor_parallel_size=8,
376404
mini_sm=100,
377405
),
406+
*ModelDeployerList(
407+
model_id="nvidia/Kimi-K2.5-NVFP4",
408+
backend=("trtllm", "vllm", "sglang"),
409+
tensor_parallel_size=8,
410+
mini_sm=100,
411+
),
378412
],
379413
ids=idfn,
380414
)
@@ -399,7 +433,7 @@ def test_kimi(command):
399433
),
400434
*ModelDeployerList(
401435
model_id="nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
402-
backend=("vllm",),
436+
backend=("trtllm", "vllm", "sglang"),
403437
tensor_parallel_size=8,
404438
mini_sm=89,
405439
),
@@ -541,11 +575,11 @@ def test_medusa(command):
541575
ids=idfn,
542576
)
543577
def test_eagle(command):
544-
"""Skip test if MODELOPT_LOCAL_MODEL_ROOT is set but model doesn't exist locally.
578+
"""Skip test if MODELOPT_LOCAL_EAGLE_MODEL is set but model doesn't exist locally.
545579
speculative models should be loaded by a local path"""
546-
local_root = os.getenv("MODELOPT_LOCAL_MODEL_ROOT")
580+
local_root = os.getenv("MODELOPT_LOCAL_EAGLE_MODEL")
547581
if not local_root:
548-
return
582+
pytest.skip("MODELOPT_LOCAL_EAGLE_MODEL is not set")
549583

550584
local_path = os.path.join(local_root, command.model_id)
551585
if os.path.isdir(local_path):

0 commit comments

Comments
 (0)