@@ -14,7 +14,9 @@
 # limitations under the License.
 
 import itertools
+import os
 import subprocess
+import sys
 
 import pytest
 import torch
@@ -61,6 +63,121 @@ def get_available_backends():
     return _AVAILABLE_BACKENDS
 
 
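+# These per-backend helpers must stay at module top level so the spawned child
+# process can import them by name (see _run_deploy_via_subprocess below).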
+def _run_trtllm_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the TensorRT-LLM deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="trtllm",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_trtllm_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_vllm_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the vLLM deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="vllm",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_vllm_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_sglang_deploy(
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Top-level subprocess entry point: run the SGLang deploy in a child process."""
+    try:
+        deployer = ModelDeployer(
+            backend="sglang",
+            model_id=model_id,
+            tensor_parallel_size=tensor_parallel_size,
+            mini_sm=mini_sm,
+            attn_backend=attn_backend,
+            base_model=base_model,
+            eagle3_one_model=eagle3_one_model,
+        )
+        deployer._deploy_sglang_impl()
+    except Exception:
+        import traceback
+
+        traceback.print_exc()
+        pytest.fail(traceback.format_exc())
+
+
+def _run_deploy_via_subprocess(
+    backend: str,
+    model_id: str,
+    tensor_parallel_size: int,
+    mini_sm: int,
+    attn_backend: str,
+    base_model: str,
+    eagle3_one_model: bool,
+) -> None:
+    """Run the deploy in a subprocess, printing its stdout/stderr so pytest capture=tee-sys records it to the DB."""
+    tests_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    project_root = os.path.dirname(tests_dir)
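+    # Assumption about the repo layout: _test_utils resolves from tests_dir (the
+    # child's cwd below), while the package under test comes from the project root.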
+    env = {
+        **os.environ,
+        "PYTHONPATH": project_root + os.pathsep + os.environ.get("PYTHONPATH", ""),
+    }
+    code = f"""from _test_utils.deploy_utils import _run_{backend}_deploy
+_run_{backend}_deploy(
+    {model_id!r}, {tensor_parallel_size}, {mini_sm}, {attn_backend!r}, {base_model!r}, {eagle3_one_model}
+)
+"""
+    result = subprocess.run(
+        [sys.executable, "-c", code],
+        cwd=tests_dir,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
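+    # Echo the child's combined stdout/stderr; with pytest run under
+    # --capture=tee-sys (assumed invocation) this lands in the captured log.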
+    if result.stdout:
+        print(result.stdout, end="", flush=True)
+    result.check_returncode()
+
+
 # Common test prompts for all backends
 COMMON_PROMPTS = [
     "Hello, my name is",
@@ -114,95 +231,70 @@ def run(self):
         if torch.cuda.device_count() < self.tensor_parallel_size:
             pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
             return
-        if self.backend == "vllm":
-            self._deploy_vllm()
-        elif self.backend == "trtllm":
-            self._deploy_trtllm()
-        elif self.backend == "sglang":
-            self._deploy_sglang()
+
+        print(f"Deploying model: {self.model_id} with backend: {self.backend}")
+        print(f"Tensor parallel size: {self.tensor_parallel_size}")
+        # Use a subprocess with captured output so pytest capture=tee-sys (and DB plugins) see the deploy output.
+        if self.backend in ("vllm", "trtllm", "sglang"):
+            _run_deploy_via_subprocess(
+                backend=self.backend,
+                model_id=self.model_id,
+                tensor_parallel_size=self.tensor_parallel_size,
+                mini_sm=self.mini_sm,
+                attn_backend=self.attn_backend,
+                base_model=self.base_model,
+                eagle3_one_model=self.eagle3_one_model,
+            )
         else:
             raise ValueError(f"Unknown backend: {self.backend}")
-        # check gpu status
-        gpu_status = subprocess.run(
-            "nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
-        )
-        print("\n=== GPU Status Before Test ===")
-        print(gpu_status.stdout)
-        print("=============================\n")
 
-    def _deploy_trtllm(self):
-        """Deploy a model using TensorRT-LLM."""
+    def _deploy_trtllm_impl(self):
+        """Run the TensorRT-LLM deploy; invoked in a subprocess from run()."""
         from tensorrt_llm import LLM, SamplingParams
         from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
 
         sampling_params = SamplingParams(max_tokens=32)
-        spec_config = None
-        llm = None
-        kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
-
-        if self.model_id in (
+        qwen3_models = (
+            "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
+            "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+        )
+        nemotron_models = (
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
             "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
-        ):
-            llm = LLM(
-                model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
-                kv_cache_config=KvCacheConfig(
-                    enable_block_reuse=False,
-                    mamba_ssm_cache_dtype="float32",
-                ),
-            )
-        elif self.model_id == "nvidia/EAGLE3-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16":
-            spec_config = EagleDecodingConfig(
-                max_draft_len=3,
-                speculative_model_dir=self.model_id,
-                eagle3_one_model=self.eagle3_one_model,
-            )
-            llm = LLM(
-                model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
-                speculative_config=spec_config,
-                kv_cache_config=KvCacheConfig(
-                    enable_block_reuse=False,
-                    mamba_ssm_cache_dtype="float32",
-                ),
-            )
-        elif "eagle" in self.model_id.lower():
+        )
+        kv_cache_config = KvCacheConfig(
+            # Hybrid SSM models (Qwen3-Next and Nemotron Nano) need block reuse
+            # disabled and a float32 mamba SSM cache, as in the branches removed above.
+            enable_block_reuse=self.model_id not in qwen3_models + nemotron_models,
+            free_gpu_memory_fraction=0.8,
+            mamba_ssm_cache_dtype="float32" if self.model_id in qwen3_models + nemotron_models else "auto",
+        )
+        # Keyword arguments shared by both LLM constructions below.
+        base_kw = {
+            "tensor_parallel_size": self.tensor_parallel_size,
+            "enable_attention_dp": False,
+            "attn_backend": self.attn_backend,
+            "trust_remote_code": True,
+            "max_batch_size": 8,
+        }
+
+        if "eagle" in self.model_id.lower():
             spec_config = EagleDecodingConfig(
                 max_draft_len=3,
                 speculative_model_dir=self.model_id,
                 eagle3_one_model=self.eagle3_one_model,
             )
-            cuda_graph = CudaGraphConfig(
-                max_batch_size=1,
-            )
             llm = LLM(
                 model=self.base_model,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
                 disable_overlap_scheduler=True,
                 enable_autotuner=False,
                 speculative_config=spec_config,
-                cuda_graph_config=cuda_graph,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=1),
                 kv_cache_config=kv_cache_config,
+                **base_kw,
             )
         else:
             llm = LLM(
                 model=self.model_id,
-                tensor_parallel_size=self.tensor_parallel_size,
-                enable_attention_dp=False,
-                attn_backend=self.attn_backend,
-                trust_remote_code=True,
-                max_batch_size=8,
                 kv_cache_config=kv_cache_config,
+                **base_kw,
             )
 
         outputs = llm.generate(COMMON_PROMPTS, sampling_params)
@@ -213,8 +305,8 @@ def _deploy_trtllm(self):
             print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         del llm
 
-    def _deploy_vllm(self):
-        """Deploy a model using vLLM."""
+    def _deploy_vllm_impl(self):
+        """Run the vLLM deploy; invoked in a subprocess from run()."""
         from vllm import LLM, SamplingParams
 
         quantization_method = "modelopt"
@@ -247,8 +339,8 @@ def _deploy_vllm(self):
             print("-" * 50)
         del llm
 
-    def _deploy_sglang(self):
-        """Deploy a model using SGLang."""
+    def _deploy_sglang_impl(self):
+        """Run the SGLang deploy; invoked in a subprocess from run()."""
         import sglang as sgl
 
         quantization_method = "modelopt"