@@ -14,7 +14,9 @@
 # limitations under the License.
 
 import itertools
+import os
 import subprocess
+import sys
 
 import pytest
 import torch
@@ -61,6 +63,121 @@ def get_available_backends():
     return _AVAILABLE_BACKENDS
 
 
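+# These per-backend helpers must stay at module top level so the spawned child
+# process can import them by name (see _run_deploy_via_subprocess below).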
+def _run_trtllm_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the TensorRT-LLM deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="trtllm",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_trtllm_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_vllm_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the vLLM deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="vllm",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_vllm_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_sglang_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the SGLang deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="sglang",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_sglang_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_deploy_via_subprocess(
+    backend: str,
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Run the deploy in a subprocess, printing its stdout/stderr so pytest capture=tee-sys records it to the DB."""
+    tests_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    project_root = os.path.dirname(tests_dir)
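+    # Assumption about the repo layout: _test_utils resolves from tests_dir (the
+    # child's cwd below), while the package under test comes from the project root.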
+    env = {
+        **os.environ,
+        "PYTHONPATH": project_root + os.pathsep + os.environ.get("PYTHONPATH", ""),
+    }
+    code = f"""from _test_utils.deploy_utils import _run_{backend}_deploy
+_run_{backend}_deploy(
+    {model_id!r}, {tensor_parallel_size}, {mini_sm}, {attn_backend!r}, {base_model!r}, {eagle3_one_model}
+)
+"""
+    result = subprocess.run(
+        [sys.executable, "-c", code],
+        cwd=tests_dir,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
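+    # Echo the child's combined stdout/stderr; with pytest run under
+    # --capture=tee-sys (assumed invocation) this lands in the captured log.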
+    if result.stdout:
+        print(result.stdout, end="", flush=True)
+    result.check_returncode()
+
+
 # Common test prompts for all backends
 COMMON_PROMPTS = [
     "Hello, my name is",
@@ -114,95 +231,70 @@ def run(self):
         if torch.cuda.device_count() < self.tensor_parallel_size:
             pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
             return
-        if self.backend == "vllm":
-            self._deploy_vllm()
-        elif self.backend == "trtllm":
-            self._deploy_trtllm()
-        elif self.backend == "sglang":
-            self._deploy_sglang()
+
+        print(f"Deploying model: {self.model_id} with backend: {self.backend}")
+        print(f"Tensor parallel size: {self.tensor_parallel_size}")
+        # Use a subprocess with captured output so pytest capture=tee-sys (and DB plugins) see the deploy output.
+        if self.backend in ("vllm", "trtllm", "sglang"):
+            _run_deploy_via_subprocess(
+                backend=self.backend,
+                model_id=self.model_id,
+                tensor_parallel_size=self.tensor_parallel_size,
+                mini_sm=self.mini_sm,
+                attn_backend=self.attn_backend,
+                base_model=self.base_model,
+                eagle3_one_model=self.eagle3_one_model,
+            )
         else:
             raise ValueError(f"Unknown backend: {self.backend}")
-        # check gpu status
-        gpu_status = subprocess.run(
-            "nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
-        )
-        print("\n=== GPU Status Before Test ===")
-        print(gpu_status.stdout)
-        print("=============================\n")
 
-    def _deploy_trtllm(self):
-        """Deploy a model using TensorRT-LLM."""
+    def _deploy_trtllm_impl(self):
+        """Run the TensorRT-LLM deploy; invoked in a subprocess from run()."""
         from tensorrt_llm import LLM, SamplingParams
         from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
 
         sampling_params = SamplingParams(max_tokens=32)
-        spec_config = None
-        llm = None
-        kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
-
-        if self.model_id in (
+        qwen3_models = (
+            "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
+            "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+        )
+        nemotron_models = (
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
-        ):
-            llm = LLM(
-                model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
-                kv_cache_config=KvCacheConfig(
-                    enable_block_reuse=False,
-                    mamba_ssm_cache_dtype="float32",
-                ),
-            )
-        elif self.model_id == "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16":
-            spec_config = EagleDecodingConfig(
-                max_draft_len=3,
-                speculative_model_dir=self.model_id,
-                eagle3_one_model=self.eagle3_one_model,
-            )
-            llm = LLM(
-                model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
-                speculative_config=spec_config,
-                kv_cache_config=KvCacheConfig(
-                    enable_block_reuse=False,
-                    mamba_ssm_cache_dtype="float32",
-                ),
-            )
-        elif "eagle" in self.model_id.lower():
+        )
+        kv_cache_config = KvCacheConfig(
+            # Hybrid SSM models (Qwen3-Next and Nemotron Nano) need block reuse
+            # disabled and a float32 mamba SSM cache, as in the branches removed above.
+            enable_block_reuse=self.model_id not in qwen3_models + nemotron_models,
+            free_gpu_memory_fraction=0.8,
+            mamba_ssm_cache_dtype="float32" if self.model_id in qwen3_models + nemotron_models else "auto",
+        )
+        # Keyword arguments shared by both LLM constructions below.
+        base_kw = {
+            "tensor_parallel_size": self.tensor_parallel_size,
+            "enable_attention_dp": False,
+            "attn_backend": self.attn_backend,
+            "trust_remote_code": True,
+            "max_batch_size": 8,
+        }
+
+        if "eagle" in self.model_id.lower():
             spec_config = EagleDecodingConfig(
                 max_draft_len=3,
                 speculative_model_dir=self.model_id,
                 eagle3_one_model=self.eagle3_one_model,
             )
-            cuda_graph = CudaGraphConfig(
-                max_batch_size=1,
-            )
             llm = LLM(
                 model=self.base_model,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
                 disable_overlap_scheduler=True,
                 enable_autotuner=False,
                 speculative_config=spec_config,
-                cuda_graph_config=cuda_graph,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=1),
                 kv_cache_config=kv_cache_config,
+                **base_kw,
             )
         else:
             llm = LLM(
                 model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
                 kv_cache_config=kv_cache_config,
+                **base_kw,
             )
 
         outputs = llm.generate(COMMON_PROMPTS, sampling_params)
@@ -213,8 +305,8 @@ def _deploy_trtllm(self):
             print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         del llm
 
-    def _deploy_vllm(self):
-        """Deploy a model using vLLM."""
+    def _deploy_vllm_impl(self):
+        """Run the vLLM deploy; invoked in a subprocess from run()."""
         from vllm import LLM, SamplingParams
 
         quantization_method = "modelopt"
@@ -247,8 +339,8 @@ def _deploy_vllm(self):
             print("-" * 50)
         del llm
 
-    def _deploy_sglang(self):
-        """Deploy a model using SGLang."""
+    def _deploy_sglang_impl(self):
+        """Run the SGLang deploy; invoked in a subprocess from run()."""
         import sglang as sgl
 
         quantization_method = "modelopt"